npm - @lythos/skill-arena - Versions diffs - 0.11.2 → 0.12.0 - Mend

@lythos/skill-arena 0.11.2 → 0.12.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (6) hide show

package/README.md CHANGED Viewed

@@ -49,20 +49,20 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
 ```bash
 bun add -d @lythos/skill-arena
 # or use directly
-bunx @lythos/skill-arena@0.11.2 <command>
+bunx @lythos/skill-arena@0.12.0 <command>
 ```
 ## Quick Start
 ```bash
 # Single: test a deck with one agent
-bunx @lythos/skill-arena@0.11.2 single \
+bunx @lythos/skill-arena@0.12.0 single \
   --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \
   --brief "Generate auth flow diagram"
 # Vs: compare multiple decks side by side
 curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/research-compare/arena.toml > arena.toml
-bunx @lythos/skill-arena@0.11.2 vs --config ./arena.toml
+bunx @lythos/skill-arena@0.12.0 vs --config ./arena.toml
 ```
 ## Commands
@@ -71,23 +71,23 @@ bunx @lythos/skill-arena@0.11.2 vs --config ./arena.toml
 ```bash
 # Print execution plan without running
-bunx @lythos/skill-arena@0.11.2 vs --config arena.toml --dry-run
+bunx @lythos/skill-arena@0.12.0 vs --config arena.toml --dry-run
 # Execute with per-side runs_per_side and statistical aggregation
-bunx @lythos/skill-arena@0.11.2 vs --config arena.toml
+bunx @lythos/skill-arena@0.12.0 vs --config arena.toml
 ```
 ### Scaffold mode (legacy, manual execution)
 ```
-bunx @lythos/skill-arena@0.11.2 scaffold --task "Generate auth flow diagram" \
+bunx @lythos/skill-arena@0.12.0 scaffold --task "Generate auth flow diagram" \
   --decks https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml,https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/documents.toml
 ```
 ### Viz
 ```bash
-bunx @lythos/skill-arena@0.11.2 viz runs/arena-<id>/
+bunx @lythos/skill-arena@0.12.0 viz runs/arena-<id>/
 ```
 ## Skill Documentation
@@ -101,7 +101,7 @@ The agent-visible **Skill** layer documentation is here:
 Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
 ```
-Starter (this package) → npm publish → bunx @lythos/skill-arena@0.11.2 ...
+Starter (this package) → npm publish → bunx @lythos/skill-arena@0.12.0 ...
 Skill   (packages/<name>/skill/)     → build → SKILL.md + thin scripts
 Output  (skills/<name>/)             → git commit → agent-visible skill
 ```

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lythos/skill-arena",
-  "version": "0.11.2",
+  "version": "0.12.0",
   "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
   "keywords": [
     "ai-agent",
@@ -42,13 +42,13 @@
     "bun": ">=1.0.0"
   },
   "dependencies": {
-    "@lythos/cold-pool": "^0.11.2",
-    "@lythos/infra": "^0.11.2",
-    "@lythos/test-utils": "^0.11.2",
+    "@lythos/cold-pool": "^0.12.0",
+    "@lythos/infra": "^0.12.0",
+    "@lythos/test-utils": "^0.12.0",
     "zod": "^3.24.0",
     "zod-to-json-schema": "^3.25.2"
   },
   "optionalDependencies": {
-    "@lythos/agent-adapter-claude-sdk": "^0.11.2"
+    "@lythos/agent-adapter-claude-sdk": "^0.12.0"
   }
 }

package/src/arena-toml.test.ts CHANGED Viewed

@@ -18,6 +18,22 @@ player = "claude-code"
 deck = "./decks/b.toml"
 `
+const judgeToml = `
+[arena]
+task = "Test task"
+judge = "Evaluate completeness and correctness. Return JSON."
+[[side]]
+name = "runner-a"
+player = "claude-code"
+deck = "./decks/a.toml"
+[[side]]
+name = "runner-b"
+player = "claude-code"
+deck = "./decks/b.toml"
+`
 const fullToml = `
 [arena]
 task = "Generate auth flow diagram"
@@ -46,19 +62,24 @@ pre_run = ["npm ci", "npm run build"]
 working_dir = "/workspace"
 `
-// ── Schema + Parser ────────────────────────────────────────────────────────
 describe('parseArenaToml', () => {
-  test('parses minimal two-side arena', () => {
+  test('parses minimal two-side arena with criteria', () => {
     const result = parseArenaToml(minimalToml)
     expect(result.arena.task).toBe('Test task')
     expect(result.arena.criteria).toEqual(['a', 'b'])
-    expect(result.arena.runs_per_side).toBe(1)       // default
+    expect(result.arena.runs_per_side).toBe(1)
     expect(result.side).toHaveLength(2)
     expect(result.side[0].name).toBe('runner-a')
     expect(result.side[0].player).toBe('claude-code')
     expect(result.side[0].deck).toBe('./decks/a.toml')
-    expect(result.side[0].control).toBe(false)         // default
+    expect(result.side[0].control).toBe(false)
+  })
+  test('parses arena with judge field (preferred over criteria)', () => {
+    const result = parseArenaToml(judgeToml)
+    expect(result.arena.judge).toContain('Evaluate completeness')
+    expect(result.arena.criteria).toBeUndefined()
+    expect(result.side).toHaveLength(2)
   })
   test('parses full arena with runs_per_side and control', () => {
@@ -83,7 +104,17 @@ describe('parseArenaToml', () => {
     expect(() => parseArenaToml(bad)).toThrow()
   })
-  test('rejects empty criteria', () => {
+  test('rejects neither judge nor criteria provided', () => {
+    const bad = `[arena]\ntask = "x"\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
+    expect(() => parseArenaToml(bad)).toThrow()
+  })
+  test('accepts judge without criteria (either is sufficient)', () => {
+    const toml = `[arena]\ntask = "x"\njudge = "Evaluate this."\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
+    expect(() => parseArenaToml(toml)).not.toThrow()
+  })
+  test('rejects empty criteria and no judge', () => {
     const bad = `[arena]\ntask = "x"\ncriteria = []\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
     expect(() => parseArenaToml(bad)).toThrow()
   })
@@ -115,18 +146,20 @@ describe('parseArenaToml', () => {
   })
 })
-// ── Execution Plan ─────────────────────────────────────────────────────────
 describe('buildExecutionPlan', () => {
   test('generates plan: 2 sides × 1 run = 2 cells', () => {
     const toml = parseArenaToml(minimalToml)
     const plan = buildExecutionPlan(toml)
     expect(plan.task).toBe('Test task')
-    expect(plan.criteria).toEqual(['a', 'b'])
+    expect(plan.judge).toBeNull()
     expect(plan.cells).toHaveLength(2)
     expect(plan.total_runs).toBe(2)
-    expect(plan.cells[0]).toEqual({ side: 'runner-a', player: 'claude-code', deck: './decks/a.toml', run: 1, control: false })
-    expect(plan.cells[1]).toEqual({ side: 'runner-b', player: 'claude-code', deck: './decks/b.toml', run: 1, control: false })
+  })
+  test('generates plan with judge field populated', () => {
+    const toml = parseArenaToml(judgeToml)
+    const plan = buildExecutionPlan(toml)
+    expect(plan.judge).toContain('Evaluate completeness')
   })
   test('generates plan: 3 sides × 3 runs = 9 cells', () => {
@@ -134,13 +167,6 @@ describe('buildExecutionPlan', () => {
     const plan = buildExecutionPlan(toml)
     expect(plan.cells).toHaveLength(9)
     expect(plan.total_runs).toBe(9)
-    // Cells are ordered: side 0 run 1, side 0 run 2, side 0 run 3, side 1 run 1, ...
-    expect(plan.cells[0]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 1, control: false })
-    expect(plan.cells[1]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 2, control: false })
-    expect(plan.cells[2]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 3, control: false })
-    expect(plan.cells[3]).toEqual({ side: 'rich', player: 'expert-architect', deck: './decks/rich.toml', run: 1, control: false })
-    expect(plan.cells[8]).toEqual({ side: 'baseline', player: 'standard-coder', deck: './decks/baseline.toml', run: 3, control: true })
   })
   test('control flag preserved in plan cells', () => {
@@ -151,37 +177,9 @@ describe('buildExecutionPlan', () => {
     expect(baselineCells.every(c => c.control)).toBe(true)
   })
-  test('dry-run output format matches expected log', () => {
-    const toml = parseArenaToml(minimalToml)
-    const plan = buildExecutionPlan(toml)
-    // Simulate what --dry-run would log
-    const logs: string[] = []
-    for (const line of formatPlanOutput(plan)) {
-      logs.push(line)
-    }
-    expect(logs.some(l => l.includes('2 cells'))).toBe(true)
-    expect(logs.some(l => l.includes('runner-a'))).toBe(true)
-    expect(logs.some(l => l.includes('runner-b'))).toBe(true)
-    expect(logs.some(l => l.includes('claude-code'))).toBe(true)
-    expect(logs.every(l => !l.includes('control'))).toBe(true) // no control flags in minimal
-  })
-  test('dry-run output shows control flag for control sides', () => {
-    const toml = parseArenaToml(fullToml)
-    const plan = buildExecutionPlan(toml)
-    const lines = formatPlanOutput(plan)
-    const baselineLines = lines.filter(l => l.includes('baseline'))
-    // All baseline cells should have [control] flag
-    expect(baselineLines.every(l => l.includes('[control]'))).toBe(true)
-  })
   test('dry-run: plan is pure data, no side effects', () => {
-    // The entire plan generation is a pure function — dry-run is just printing it
     const toml = parseArenaToml(fullToml)
     const plan = buildExecutionPlan(toml)
-    // Verify plan is self-describing for a --dry-run output
     expect(plan.total_runs).toBeGreaterThan(0)
     expect(plan.cells.every(c => typeof c.side === 'string')).toBe(true)
     expect(plan.cells.every(c => typeof c.player === 'string')).toBe(true)

package/src/arena-toml.ts CHANGED Viewed

@@ -23,14 +23,21 @@ export type Side = z.infer<typeof Side>
 export const ArenaToml = z.object({
   arena: z.object({
-    task: z.string(),              // task description or path to TASK-arena.md
-    criteria: z.array(z.string()).min(1),
+    task: z.string(),              // task description or path to TASK.agent.md
+    // judge: path to judge.md file (preferred) or inline natural-language criteria text.
+    // When present, readFileSync + pass as JudgeInput.criteria directly — no parsing.
+    // If absent, fall back to criteria string[] (legacy, each string becomes a bullet).
+    judge: z.string().optional().describe('Path to judge.md (natural-language criteria for the judge LLM) or inline criteria text. No parsing — passed directly as JudgeInput.criteria.'),
+    criteria: z.array(z.string()).optional().describe('Legacy string criteria. Each becomes a bullet in generated judge prompt. Use judge for full natural-language criteria.'),
     runs_per_side: z.number().int().positive().default(1),
     max_participants: z.number().int().min(2).max(5).default(5),
     model: z.string().optional(),  // e.g. "claude-sonnet-4-6"
     endpoint: z.string().optional(), // e.g. "api.anthropic.com"
     notes: z.string().optional(),  // freeform reproducibility notes
-  }),
+  }).refine(
+    data => !!(data.judge || (data.criteria && data.criteria.length > 0)),
+    { message: 'At least one of arena.judge or arena.criteria must be provided' }
+  ),
   side: z.array(Side).min(2).max(5),
 })
 export type ArenaToml = z.infer<typeof ArenaToml>
@@ -38,7 +45,6 @@ export type ArenaToml = z.infer<typeof ArenaToml>
 // ── Parser ─────────────────────────────────────────────────────────────────
 export function parseArenaToml(content: string): ArenaToml {
-  // Simple inline TOML parser for arena.toml (no external dep needed for this subset)
   const parsed = parseToml(content)
   return ArenaToml.parse(parsed)
 }
@@ -55,7 +61,7 @@ export interface ExecutionCell {
 export interface ExecutionPlan {
   task: string
-  criteria: string[]
+  judge: string | null             // resolved judge text (from judge.md or inline)
   cells: ExecutionCell[]
   total_runs: number
 }
@@ -75,7 +81,7 @@ export function buildExecutionPlan(toml: ArenaToml): ExecutionPlan {
   }
   return {
     task: toml.arena.task,
-    criteria: toml.arena.criteria,
+    judge: toml.arena.judge ?? null,
     cells,
     total_runs: cells.length,
   }
@@ -108,7 +114,6 @@ function parseToml(text: string): Record<string, unknown> {
     const sectionMatch = line.match(/^\[(.+?)\]$/)
     if (sectionMatch) {
       const key = sectionMatch[1]
-      // nested key like "side.env"
       if (key.includes('.')) {
         const [parent, child] = key.split('.')
         const parentArr = arrayTables.get(parent)
@@ -130,13 +135,8 @@ function parseToml(text: string): Record<string, unknown> {
       const key = line.slice(0, eqIdx).trim()
       let value = line.slice(eqIdx + 1).trim()
-      // String value
       if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
         value = value.slice(1, -1)
-      } else if (value === 'true') {
-        value = 'true'
-      } else if (value === 'false') {
-        value = 'false'
       }
       // Array value: ["a", "b"]
@@ -166,7 +166,6 @@ function parseToml(text: string): Record<string, unknown> {
     }
   }
-  // Materialize array tables into result
   for (const [key, arr] of arrayTables) {
     result[key] = arr
   }