@lythos/skill-arena 0.9.3 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # @lythos/skill-arena
2
2
 
3
- > Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis.
3
+ ![CI](https://img.shields.io/badge/CI-41%20unit%20tests-brightgreen) ![Intent/Plan](https://img.shields.io/badge/arch-intent%2Fplan%2Fexecute-8A2BE2)
4
+
5
+ > Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis. Now with a declarative `arena.toml` config (k8s-manifest style) and a deterministic Pareto frontier.
4
6
 
5
7
  ## Why
6
8
 
@@ -40,25 +42,36 @@ bunx @lythos/skill-arena viz tmp/arena-<id>/
40
42
 
41
43
  ## Commands
42
44
 
45
+ ### Declarative mode (k8s-style, recommended)
46
+
47
+ ```bash
48
+ # Print execution plan without running
49
+ bunx @lythos/skill-arena run --config arena.toml --dry-run
50
+
51
+ # Execute the plan: each side runs runs_per_side times, with statistical aggregation of the results
52
+ bunx @lythos/skill-arena run --config arena.toml
53
+ ```
54
+
55
+ ### CLI-flag mode (backward compatible)
56
+
57
+ ```
58
+ bunx @lythos/skill-arena run \
59
+ --task ./TASK-arena.md \
60
+ --players ./players/claude.toml \
61
+ --decks ./decks/run-01.toml,./decks/run-02.toml \
62
+ --criteria coverage,relevance,actionability,depth
43
63
  ```
44
- Usage: bunx @lythos/skill-arena <options> | bunx @lythos/skill-arena viz <dir>
45
64
 
46
- Mode 1 Single-Skill Comparison:
47
- --task, -t <desc> Task description (required)
48
- --skills, -s <list> Comma-separated skills, 2–5 (Mode 1)
49
- --criteria, -c <list> Evaluation dimensions (default: syntax,context,logic,token)
50
- --control <skill> Control skill (default: lythoskill-project-scribe)
65
+ ### Scaffold mode (legacy, manual execution)
51
66
 
52
- Mode 2 — Full-Deck Comparison:
53
- --decks <paths> Comma-separated deck toml paths, 2–5 (Mode 2)
54
- --criteria, -c <list> Evaluation dimensions
67
+ ```
68
+ bunx @lythos/skill-arena scaffold --task "..." --skills a,b
69
+ ```
55
70
 
56
- Common:
57
- --dir, -d <path> Arena parent directory (default: tmp)
58
- --project, -p <path> Project root (default: .)
71
+ ### Viz
59
72
 
60
- Viz:
61
- viz <dir> Render ASCII charts from report.md
73
+ ```bash
74
+ bunx @lythos/skill-arena viz runs/arena-<id>/
62
75
  ```
63
76
 
64
77
  ## Skill Documentation
@@ -77,6 +90,31 @@ Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
77
90
  Output (skills/<name>/) → git commit → agent-visible skill
78
91
  ```
79
92
 
93
+ ### Runtime architecture (intent/plan/execute)
94
+
95
+ ```
96
+ arena.toml → ArenaToml (Zod) → ExecutionPlan (pure) → per-cell agent spawn (IO)
97
+
98
+ aggregateAllStats (pure) ← verdicts[]
99
+
100
+ runComparativeJudge (IO) → report.md + Pareto frontier
101
+ ```
102
+
103
+ - **Intent**: `arena.toml` declarative config (k8s-manifest style)
104
+ - **Plan**: `buildExecutionPlan()`, `aggregateSideStats()`, `computePareto()` — pure functions
105
+ - **Execute**: `runAgentScenario` per cell, `runComparativeJudge` — IO via `AgentAdapter`
106
+
107
+ Built on `@lythos/test-utils` shared infrastructure.
108
+
109
+ ## Test Coverage
110
+
111
+ | Layer | Count | CI | Notes |
112
+ |-------|-------|----|-------|
113
+ | Unit tests | 41 | ✅ | TOML parser, player resolution, Pareto, stats |
114
+ | Agent BDD | — | ❌ | Requires `claude` CLI; run locally |
115
+
116
+ The Pareto frontier is computed by a **deterministic algorithm** — it is never delegated to an LLM. 8 unit tests cover dominance, cross-dominance, transitive chains, partial criteria, and empty scores.
117
+
80
118
  ## License
81
119
 
82
120
  MIT
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.3",
3
+ "version": "0.9.7",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -1,5 +1,6 @@
1
1
  import { describe, test, expect } from 'bun:test'
2
2
  import { parseArenaToml, buildExecutionPlan, ArenaToml } from './arena-toml'
3
+ import { formatPlanOutput } from './runner'
3
4
 
4
5
  const minimalToml = `
5
6
  [arena]
@@ -150,6 +151,32 @@ describe('buildExecutionPlan', () => {
150
151
  expect(baselineCells.every(c => c.control)).toBe(true)
151
152
  })
152
153
 
154
+ test('dry-run output format matches expected log', () => {
155
+ const toml = parseArenaToml(minimalToml)
156
+ const plan = buildExecutionPlan(toml)
157
+
158
+ // Simulate what --dry-run would log: collect the formatted plan lines instead of printing them
159
+ const logs: string[] = []
160
+ for (const line of formatPlanOutput(plan)) {
161
+ logs.push(line)
162
+ }
163
+
164
+ expect(logs.some(l => l.includes('2 cells'))).toBe(true)
165
+ expect(logs.some(l => l.includes('runner-a'))).toBe(true)
166
+ expect(logs.some(l => l.includes('runner-b'))).toBe(true)
167
+ expect(logs.some(l => l.includes('claude-code'))).toBe(true)
168
+ expect(logs.every(l => !l.includes('control'))).toBe(true) // no control flags in minimal; this substring check also rejects any line whose side/player/deck text contains 'control'
169
+ })
170
+
171
+ test('dry-run output shows control flag for control sides', () => {
172
+ const toml = parseArenaToml(fullToml)
173
+ const plan = buildExecutionPlan(toml)
174
+ const lines = formatPlanOutput(plan)
175
+ const baselineLines = lines.filter(l => l.includes('baseline'))
176
+ // Every output line that mentions 'baseline' must carry the [control] flag (the check passes vacuously if no line matches)
177
+ expect(baselineLines.every(l => l.includes('[control]'))).toBe(true)
178
+ })
179
+
153
180
  test('dry-run: plan is pure data, no side effects', () => {
154
181
  // The entire plan generation is a pure function — dry-run is just printing it
155
182
  const toml = parseArenaToml(fullToml)
package/src/cli.ts CHANGED
@@ -80,6 +80,9 @@ function parseArgs(argv: string[]) {
80
80
  control: 'lythoskill-project-scribe',
81
81
  dir: 'tmp',
82
82
  project: '.',
83
+ config: undefined,
84
+ out: undefined,
85
+ players: undefined,
83
86
  }
84
87
  const positionals: string[] = []
85
88
 
@@ -99,6 +102,12 @@ function parseArgs(argv: string[]) {
99
102
  options.dir = argv[++i]
100
103
  } else if (arg === '--project' || arg === '-p') {
101
104
  options.project = argv[++i]
105
+ } else if (arg === '--config') {
106
+ options.config = argv[++i]
107
+ } else if (arg === '--out') {
108
+ options.out = argv[++i]
109
+ } else if (arg === '--players') {
110
+ options.players = argv[++i]
102
111
  } else if (!arg.startsWith('-')) {
103
112
  positionals.push(arg)
104
113
  }
@@ -584,11 +593,11 @@ async function runProgrammaticArena(argv: string[]) {
584
593
  const configPath = (options as Record<string, string | undefined>).config!
585
594
 
586
595
  const toml = parseArenaToml(readFileSync(configPath, 'utf-8'))
596
+ const { dirname } = await import('node:path')
587
597
  const result = await runArenaFromToml({
588
598
  toml,
589
- taskPath: toml.arena.task.startsWith('/') || toml.arena.task.startsWith('./')
590
- ? toml.arena.task
591
- : (options as Record<string, string | undefined>).task ?? toml.arena.task,
599
+ taskPath: toml.arena.task,
600
+ configDir: dirname(configPath), // resolve relative paths against config file dir
592
601
  outDir: (options as Record<string, string | undefined>).out,
593
602
  dryRun,
594
603
  })
package/src/runner.ts CHANGED
@@ -5,7 +5,7 @@ import { useAgent } from '@lythos/test-utils/agents'
5
5
  import { ArenaManifest, Player } from '@lythos/test-utils/schema'
6
6
  import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
7
7
  import { runComparativeJudge } from './comparative-judge'
8
- import { parseArenaToml, buildExecutionPlan, type ArenaToml } from './arena-toml'
8
+ import { parseArenaToml, buildExecutionPlan, type ArenaToml, type ExecutionPlan } from './arena-toml'
9
9
  import { resolvePlayer, resolveSides } from './player'
10
10
  import { aggregateAllStats } from './stats'
11
11
  import type { SideStats } from './stats'
@@ -26,30 +26,58 @@ export interface ArenaResult {
26
26
  artifactsDir: string
27
27
  }
28
28
 
29
+ /**
+  * Format an execution plan as readable CLI output (pure: builds and returns strings, prints nothing).
+  *
+  * The first entry is a summary line (it starts with a newline character) reporting
+  * `plan.total_runs` cells, the number of distinct `side` values among the cells, and
+  * `cells.length / sideCount` runs; `Math.max(1, sideCount)` avoids dividing by zero when
+  * the plan has no cells. One line follows per cell, in plan order:
+  * `side/run-N: player × deck`, with ` [control]` appended when `cell.control` is truthy.
+  *
+  * NOTE(review): the runs figure is a plain division, so it is an average and would print as a
+  * fraction if sides contributed different numbers of cells — confirm whether that can happen.
+  *
+  * @param plan - Execution plan whose `cells` and `total_runs` are rendered.
+  * @returns The output lines in display order; the caller decides where to send them.
+  */
30
+ export function formatPlanOutput(plan: ExecutionPlan): string[] {
31
+ const lines: string[] = []
32
+ const sideCount = new Set(plan.cells.map(c => c.side)).size
33
+ lines.push(`\n📋 Dry-run: ${plan.total_runs} cells across ${sideCount} sides × ${plan.cells.length / Math.max(1, sideCount)} runs`)
34
+ for (const cell of plan.cells) {
35
+ lines.push(` ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${cell.control ? ' [control]' : ''}`)
36
+ }
37
+ return lines
38
+ }
39
+
29
40
  export async function runArenaFromToml(opts: {
30
41
  toml: ArenaToml
31
42
  taskPath: string
32
43
  outDir?: string
33
44
  dryRun?: boolean
45
+ log?: (msg: string) => void
46
+ configDir?: string // for resolving relative paths
34
47
  }): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
35
- const { toml, taskPath, outDir, dryRun } = opts
48
+ const { toml, taskPath, outDir, dryRun, log, configDir } = opts
36
49
 
37
- const plan = buildExecutionPlan(toml)
50
+ // Resolve relative paths against config dir (anti-footgun: cwd may differ)
51
+ const resolvePath = (p: string) => {
52
+ if (p.startsWith('/')) return p
53
+ if (configDir) return resolve(configDir, p)
54
+ return resolve(p)
55
+ }
56
+ const taskAbs = resolvePath(taskPath)
57
+ const resolvedToml: ArenaToml = {
58
+ ...toml,
59
+ side: toml.side.map(s => ({ ...s, deck: resolvePath(s.deck) })),
60
+ }
61
+
62
+ const plan = buildExecutionPlan(resolvedToml)
38
63
 
39
64
  // dry-run: return plan without executing
40
65
  if (dryRun) {
66
+ for (const line of formatPlanOutput(plan)) {
67
+ log?.(line)
68
+ }
41
69
  return { plan }
42
70
  }
43
71
 
44
72
  const arenaId = `arena-${stamp()}`
45
73
  const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)
46
- const resolved = resolveSides(toml)
74
+ const resolved = resolveSides(resolvedToml)
47
75
 
48
76
  // Build manifest
49
77
  const manifest = ArenaManifest.parse({
50
78
  id: arenaId,
51
79
  created_at: new Date().toISOString(),
52
- task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
80
+ task: readFileSync(taskAbs, 'utf-8').slice(0, 200),
53
81
  mode: 'decks',
54
82
  participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
55
83
  id: r.side.name,
@@ -58,7 +86,7 @@ export async function runArenaFromToml(opts: {
58
86
  deck: r.side.deck,
59
87
  description: `${r.playerName} × ${r.side.deck}`,
60
88
  })),
61
- criteria: toml.arena.criteria,
89
+ criteria: resolvedToml.arena.criteria,
62
90
  status: 'running',
63
91
  })
64
92