@lythos/skill-arena 0.9.3 → 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # @lythos/skill-arena
2
2
 
3
- > Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis.
3
+ ![CI](https://img.shields.io/badge/CI-41%20unit%20tests-brightgreen) ![Intent/Plan](https://img.shields.io/badge/arch-intent%2Fplan%2Fexecute-8A2BE2)
4
+
5
+ > Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis. Now with a declarative `arena.toml` (k8s-manifest style) and a deterministic Pareto frontier.
4
6
 
5
7
  ## Why
6
8
 
@@ -40,25 +42,36 @@ bunx @lythos/skill-arena viz tmp/arena-<id>/
40
42
 
41
43
  ## Commands
42
44
 
45
+ ### Declarative mode (k8s-style, recommended)
46
+
47
+ ```bash
48
+ # Print execution plan without running
49
+ bunx @lythos/skill-arena run --config arena.toml --dry-run
50
+
51
+ # Execute the plan, honoring each side's runs_per_side, then aggregate statistics
52
+ bunx @lythos/skill-arena run --config arena.toml
53
+ ```
54
+
55
+ ### CLI-flag mode (backward compat)
56
+
57
+ ```
58
+ bunx @lythos/skill-arena run \
59
+ --task ./TASK-arena.md \
60
+ --players ./players/claude.toml \
61
+ --decks ./decks/run-01.toml,./decks/run-02.toml \
62
+ --criteria coverage,relevance,actionability,depth
43
63
  ```
44
- Usage: bunx @lythos/skill-arena <options> | bunx @lythos/skill-arena viz <dir>
45
64
 
46
- Mode 1 Single-Skill Comparison:
47
- --task, -t <desc> Task description (required)
48
- --skills, -s <list> Comma-separated skills, 2–5 (Mode 1)
49
- --criteria, -c <list> Evaluation dimensions (default: syntax,context,logic,token)
50
- --control <skill> Control skill (default: lythoskill-project-scribe)
65
+ ### Scaffold mode (legacy, manual execution)
51
66
 
52
- Mode 2 — Full-Deck Comparison:
53
- --decks <paths> Comma-separated deck toml paths, 2–5 (Mode 2)
54
- --criteria, -c <list> Evaluation dimensions
67
+ ```
68
+ bunx @lythos/skill-arena scaffold --task "..." --skills a,b
69
+ ```
55
70
 
56
- Common:
57
- --dir, -d <path> Arena parent directory (default: tmp)
58
- --project, -p <path> Project root (default: .)
71
+ ### Viz
59
72
 
60
- Viz:
61
- viz <dir> Render ASCII charts from report.md
73
+ ```bash
74
+ bunx @lythos/skill-arena viz runs/arena-<id>/
62
75
  ```
63
76
 
64
77
  ## Skill Documentation
@@ -77,6 +90,31 @@ Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
77
90
  Output (skills/<name>/) → git commit → agent-visible skill
78
91
  ```
79
92
 
93
+ ### Runtime architecture (intent/plan/execute)
94
+
95
+ ```
96
+ arena.toml → ArenaToml (Zod) → ExecutionPlan (pure) → per-cell agent spawn (IO)
97
+
98
+ aggregateAllStats (pure) ← verdicts[]
99
+
100
+ runComparativeJudge (IO) → report.md + Pareto frontier
101
+ ```
102
+
103
+ - **Intent**: `arena.toml` declarative config (k8s-manifest style)
104
+ - **Plan**: `buildExecutionPlan()`, `aggregateSideStats()`, `computePareto()` — pure functions
105
+ - **Execute**: `runAgentScenario` per cell, `runComparativeJudge` — IO via `AgentAdapter`
106
+
107
+ Built on `@lythos/test-utils` shared infrastructure.
108
+
109
+ ## Test Coverage
110
+
111
+ | Layer | Count | CI | Notes |
112
+ |-------|-------|----|-------|
113
+ | Unit tests | 41 | ✅ | TOML parser, player resolution, Pareto, stats |
114
+ | Agent BDD | — | ❌ | Requires `claude` CLI; run locally |
115
+
116
+ The Pareto frontier is computed by a **deterministic algorithm** — it is never delegated to an LLM. Eight unit tests cover dominance, cross-dominance, transitive chains, partial criteria, and empty scores.
117
+
80
118
  ## License
81
119
 
82
120
  MIT
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.3",
3
+ "version": "0.9.6",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -1,5 +1,6 @@
1
1
  import { describe, test, expect } from 'bun:test'
2
2
  import { parseArenaToml, buildExecutionPlan, ArenaToml } from './arena-toml'
3
+ import { formatPlanOutput } from './runner'
3
4
 
4
5
  const minimalToml = `
5
6
  [arena]
@@ -150,6 +151,32 @@ describe('buildExecutionPlan', () => {
150
151
  expect(baselineCells.every(c => c.control)).toBe(true)
151
152
  })
152
153
 
154
+ test('dry-run output format matches expected log', () => {
155
+ const toml = parseArenaToml(minimalToml)
156
+ const plan = buildExecutionPlan(toml)
157
+
158
+ // Simulate what --dry-run would log
159
+ const logs: string[] = []
160
+ for (const line of formatPlanOutput(plan)) {
161
+ logs.push(line)
162
+ }
163
+
164
+ expect(logs.some(l => l.includes('2 cells'))).toBe(true)
165
+ expect(logs.some(l => l.includes('runner-a'))).toBe(true)
166
+ expect(logs.some(l => l.includes('runner-b'))).toBe(true)
167
+ expect(logs.some(l => l.includes('claude-code'))).toBe(true)
168
+ expect(logs.every(l => !l.includes('control'))).toBe(true) // no control flags in minimal
169
+ })
170
+
171
+ test('dry-run output shows control flag for control sides', () => {
172
+ const toml = parseArenaToml(fullToml)
173
+ const plan = buildExecutionPlan(toml)
174
+ const lines = formatPlanOutput(plan)
175
+ const baselineLines = lines.filter(l => l.includes('baseline'))
176
+ // All baseline cells should have [control] flag
177
+ expect(baselineLines.every(l => l.includes('[control]'))).toBe(true)
178
+ })
179
+
153
180
  test('dry-run: plan is pure data, no side effects', () => {
154
181
  // The entire plan generation is a pure function — dry-run is just printing it
155
182
  const toml = parseArenaToml(fullToml)
package/src/cli.ts CHANGED
@@ -80,6 +80,9 @@ function parseArgs(argv: string[]) {
80
80
  control: 'lythoskill-project-scribe',
81
81
  dir: 'tmp',
82
82
  project: '.',
83
+ config: undefined,
84
+ out: undefined,
85
+ players: undefined,
83
86
  }
84
87
  const positionals: string[] = []
85
88
 
@@ -99,6 +102,12 @@ function parseArgs(argv: string[]) {
99
102
  options.dir = argv[++i]
100
103
  } else if (arg === '--project' || arg === '-p') {
101
104
  options.project = argv[++i]
105
+ } else if (arg === '--config') {
106
+ options.config = argv[++i]
107
+ } else if (arg === '--out') {
108
+ options.out = argv[++i]
109
+ } else if (arg === '--players') {
110
+ options.players = argv[++i]
102
111
  } else if (!arg.startsWith('-')) {
103
112
  positionals.push(arg)
104
113
  }
package/src/runner.ts CHANGED
@@ -5,7 +5,7 @@ import { useAgent } from '@lythos/test-utils/agents'
5
5
  import { ArenaManifest, Player } from '@lythos/test-utils/schema'
6
6
  import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
7
7
  import { runComparativeJudge } from './comparative-judge'
8
- import { parseArenaToml, buildExecutionPlan, type ArenaToml } from './arena-toml'
8
+ import { parseArenaToml, buildExecutionPlan, type ArenaToml, type ExecutionPlan } from './arena-toml'
9
9
  import { resolvePlayer, resolveSides } from './player'
10
10
  import { aggregateAllStats } from './stats'
11
11
  import type { SideStats } from './stats'
@@ -26,18 +26,33 @@ export interface ArenaResult {
26
26
  artifactsDir: string
27
27
  }
28
28
 
29
+ /** Format an execution plan as readable CLI output (pure). */
30
+ export function formatPlanOutput(plan: ExecutionPlan): string[] {
31
+ const lines: string[] = []
32
+ const sideCount = new Set(plan.cells.map(c => c.side)).size
33
+ lines.push(`\n📋 Dry-run: ${plan.total_runs} cells across ${sideCount} sides × ${plan.cells.length / Math.max(1, sideCount)} runs`)
34
+ for (const cell of plan.cells) {
35
+ lines.push(` ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${cell.control ? ' [control]' : ''}`)
36
+ }
37
+ return lines
38
+ }
39
+
29
40
  export async function runArenaFromToml(opts: {
30
41
  toml: ArenaToml
31
42
  taskPath: string
32
43
  outDir?: string
33
44
  dryRun?: boolean
45
+ log?: (msg: string) => void
34
46
  }): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
35
- const { toml, taskPath, outDir, dryRun } = opts
47
+ const { toml, taskPath, outDir, dryRun, log } = opts
36
48
 
37
49
  const plan = buildExecutionPlan(toml)
38
50
 
39
51
  // dry-run: return plan without executing
40
52
  if (dryRun) {
53
+ for (const line of formatPlanOutput(plan)) {
54
+ log?.(line)
55
+ }
41
56
  return { plan }
42
57
  }
43
58