@lythos/skill-arena 0.9.3 → 0.9.7
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +53 -15
- package/package.json +1 -1
- package/src/arena-toml.test.ts +27 -0
- package/src/cli.ts +12 -3
- package/src/runner.ts +34 -6
package/README.md
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# @lythos/skill-arena
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
 
|
|
4
|
+
|
|
5
|
+
> Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis. Now with a declarative `arena.toml` config (k8s-manifest style) and a deterministic Pareto frontier.
|
|
4
6
|
|
|
5
7
|
## Why
|
|
6
8
|
|
|
@@ -40,25 +42,36 @@ bunx @lythos/skill-arena viz tmp/arena-<id>/
|
|
|
40
42
|
|
|
41
43
|
## Commands
|
|
42
44
|
|
|
45
|
+
### Declarative mode (k8s-style, recommended)
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# Print execution plan without running
|
|
49
|
+
bunx @lythos/skill-arena run --config arena.toml --dry-run
|
|
50
|
+
|
|
51
|
+
# Execute with per-side runs_per_side and statistical aggregation
|
|
52
|
+
bunx @lythos/skill-arena run --config arena.toml
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### CLI-flag mode (backward compat)
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
bunx @lythos/skill-arena run \
|
|
59
|
+
--task ./TASK-arena.md \
|
|
60
|
+
--players ./players/claude.toml \
|
|
61
|
+
--decks ./decks/run-01.toml,./decks/run-02.toml \
|
|
62
|
+
--criteria coverage,relevance,actionability,depth
|
|
43
63
|
```
|
|
44
|
-
Usage: bunx @lythos/skill-arena <options> | bunx @lythos/skill-arena viz <dir>
|
|
45
64
|
|
|
46
|
-
|
|
47
|
-
--task, -t <desc> Task description (required)
|
|
48
|
-
--skills, -s <list> Comma-separated skills, 2–5 (Mode 1)
|
|
49
|
-
--criteria, -c <list> Evaluation dimensions (default: syntax,context,logic,token)
|
|
50
|
-
--control <skill> Control skill (default: lythoskill-project-scribe)
|
|
65
|
+
### Scaffold mode (legacy, manual execution)
|
|
51
66
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
67
|
+
```
|
|
68
|
+
bunx @lythos/skill-arena scaffold --task "..." --skills a,b
|
|
69
|
+
```
|
|
55
70
|
|
|
56
|
-
|
|
57
|
-
--dir, -d <path> Arena parent directory (default: tmp)
|
|
58
|
-
--project, -p <path> Project root (default: .)
|
|
71
|
+
### Viz
|
|
59
72
|
|
|
60
|
-
|
|
61
|
-
|
|
73
|
+
```bash
|
|
74
|
+
bunx @lythos/skill-arena viz runs/arena-<id>/
|
|
62
75
|
```
|
|
63
76
|
|
|
64
77
|
## Skill Documentation
|
|
@@ -77,6 +90,31 @@ Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
|
77
90
|
Output (skills/<name>/) → git commit → agent-visible skill
|
|
78
91
|
```
|
|
79
92
|
|
|
93
|
+
### Runtime architecture (intent/plan/execute)
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
arena.toml → ArenaToml (Zod) → ExecutionPlan (pure) → per-cell agent spawn (IO)
|
|
97
|
+
↓
|
|
98
|
+
aggregateAllStats (pure) ← verdicts[]
|
|
99
|
+
↓
|
|
100
|
+
runComparativeJudge (IO) → report.md + Pareto frontier
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
- **Intent**: `arena.toml` declarative config (k8s-manifest style)
|
|
104
|
+
- **Plan**: `buildExecutionPlan()`, `aggregateSideStats()`, `computePareto()` — pure functions
|
|
105
|
+
- **Execute**: `runAgentScenario` per cell, `runComparativeJudge` — IO via `AgentAdapter`
|
|
106
|
+
|
|
107
|
+
Built on `@lythos/test-utils` shared infrastructure.
|
|
108
|
+
|
|
109
|
+
## Test Coverage
|
|
110
|
+
|
|
111
|
+
| Layer | Count | CI | Notes |
|
|
112
|
+
|-------|-------|----|-------|
|
|
113
|
+
| Unit tests | 41 | ✅ | TOML parser, player resolution, Pareto, stats |
|
|
114
|
+
| Agent BDD | — | ❌ | Requires `claude` CLI; run locally |
|
|
115
|
+
|
|
116
|
+
The Pareto frontier is computed by a **deterministic algorithm** — it is never delegated to an LLM. Eight unit tests cover dominance, cross-dominance, transitive chains, partial criteria, and empty scores.
|
|
117
|
+
|
|
80
118
|
## License
|
|
81
119
|
|
|
82
120
|
MIT
|
package/package.json
CHANGED
package/src/arena-toml.test.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { describe, test, expect } from 'bun:test'
|
|
2
2
|
import { parseArenaToml, buildExecutionPlan, ArenaToml } from './arena-toml'
|
|
3
|
+
import { formatPlanOutput } from './runner'
|
|
3
4
|
|
|
4
5
|
const minimalToml = `
|
|
5
6
|
[arena]
|
|
@@ -150,6 +151,32 @@ describe('buildExecutionPlan', () => {
|
|
|
150
151
|
expect(baselineCells.every(c => c.control)).toBe(true)
|
|
151
152
|
})
|
|
152
153
|
|
|
154
|
+
test('dry-run output format matches expected log', () => {
|
|
155
|
+
const toml = parseArenaToml(minimalToml)
|
|
156
|
+
const plan = buildExecutionPlan(toml)
|
|
157
|
+
|
|
158
|
+
// Simulate what --dry-run would log
|
|
159
|
+
const logs: string[] = []
|
|
160
|
+
for (const line of formatPlanOutput(plan)) {
|
|
161
|
+
logs.push(line)
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
expect(logs.some(l => l.includes('2 cells'))).toBe(true)
|
|
165
|
+
expect(logs.some(l => l.includes('runner-a'))).toBe(true)
|
|
166
|
+
expect(logs.some(l => l.includes('runner-b'))).toBe(true)
|
|
167
|
+
expect(logs.some(l => l.includes('claude-code'))).toBe(true)
|
|
168
|
+
expect(logs.every(l => !l.includes('control'))).toBe(true) // no control flags in minimal
|
|
169
|
+
})
|
|
170
|
+
|
|
171
|
+
test('dry-run output shows control flag for control sides', () => {
|
|
172
|
+
const toml = parseArenaToml(fullToml)
|
|
173
|
+
const plan = buildExecutionPlan(toml)
|
|
174
|
+
const lines = formatPlanOutput(plan)
|
|
175
|
+
const baselineLines = lines.filter(l => l.includes('baseline'))
|
|
176
|
+
// All baseline cells should have [control] flag
|
|
177
|
+
expect(baselineLines.every(l => l.includes('[control]'))).toBe(true)
|
|
178
|
+
})
|
|
179
|
+
|
|
153
180
|
test('dry-run: plan is pure data, no side effects', () => {
|
|
154
181
|
// The entire plan generation is a pure function — dry-run is just printing it
|
|
155
182
|
const toml = parseArenaToml(fullToml)
|
package/src/cli.ts
CHANGED
|
@@ -80,6 +80,9 @@ function parseArgs(argv: string[]) {
|
|
|
80
80
|
control: 'lythoskill-project-scribe',
|
|
81
81
|
dir: 'tmp',
|
|
82
82
|
project: '.',
|
|
83
|
+
config: undefined,
|
|
84
|
+
out: undefined,
|
|
85
|
+
players: undefined,
|
|
83
86
|
}
|
|
84
87
|
const positionals: string[] = []
|
|
85
88
|
|
|
@@ -99,6 +102,12 @@ function parseArgs(argv: string[]) {
|
|
|
99
102
|
options.dir = argv[++i]
|
|
100
103
|
} else if (arg === '--project' || arg === '-p') {
|
|
101
104
|
options.project = argv[++i]
|
|
105
|
+
} else if (arg === '--config') {
|
|
106
|
+
options.config = argv[++i]
|
|
107
|
+
} else if (arg === '--out') {
|
|
108
|
+
options.out = argv[++i]
|
|
109
|
+
} else if (arg === '--players') {
|
|
110
|
+
options.players = argv[++i]
|
|
102
111
|
} else if (!arg.startsWith('-')) {
|
|
103
112
|
positionals.push(arg)
|
|
104
113
|
}
|
|
@@ -584,11 +593,11 @@ async function runProgrammaticArena(argv: string[]) {
|
|
|
584
593
|
const configPath = (options as Record<string, string | undefined>).config!
|
|
585
594
|
|
|
586
595
|
const toml = parseArenaToml(readFileSync(configPath, 'utf-8'))
|
|
596
|
+
const { dirname } = await import('node:path')
|
|
587
597
|
const result = await runArenaFromToml({
|
|
588
598
|
toml,
|
|
589
|
-
taskPath: toml.arena.task
|
|
590
|
-
|
|
591
|
-
: (options as Record<string, string | undefined>).task ?? toml.arena.task,
|
|
599
|
+
taskPath: toml.arena.task,
|
|
600
|
+
configDir: dirname(configPath), // resolve relative paths against config file dir
|
|
592
601
|
outDir: (options as Record<string, string | undefined>).out,
|
|
593
602
|
dryRun,
|
|
594
603
|
})
|
package/src/runner.ts
CHANGED
|
@@ -5,7 +5,7 @@ import { useAgent } from '@lythos/test-utils/agents'
|
|
|
5
5
|
import { ArenaManifest, Player } from '@lythos/test-utils/schema'
|
|
6
6
|
import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
|
|
7
7
|
import { runComparativeJudge } from './comparative-judge'
|
|
8
|
-
import { parseArenaToml, buildExecutionPlan, type ArenaToml } from './arena-toml'
|
|
8
|
+
import { parseArenaToml, buildExecutionPlan, type ArenaToml, type ExecutionPlan } from './arena-toml'
|
|
9
9
|
import { resolvePlayer, resolveSides } from './player'
|
|
10
10
|
import { aggregateAllStats } from './stats'
|
|
11
11
|
import type { SideStats } from './stats'
|
|
@@ -26,30 +26,58 @@ export interface ArenaResult {
|
|
|
26
26
|
artifactsDir: string
|
|
27
27
|
}
|
|
28
28
|
|
|
29
|
+
/** Format an execution plan as readable CLI output (pure). */
|
|
30
|
+
export function formatPlanOutput(plan: ExecutionPlan): string[] {
|
|
31
|
+
const lines: string[] = []
|
|
32
|
+
const sideCount = new Set(plan.cells.map(c => c.side)).size
|
|
33
|
+
lines.push(`\n📋 Dry-run: ${plan.total_runs} cells across ${sideCount} sides × ${plan.cells.length / Math.max(1, sideCount)} runs`)
|
|
34
|
+
for (const cell of plan.cells) {
|
|
35
|
+
lines.push(` ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${cell.control ? ' [control]' : ''}`)
|
|
36
|
+
}
|
|
37
|
+
return lines
|
|
38
|
+
}
|
|
39
|
+
|
|
29
40
|
export async function runArenaFromToml(opts: {
|
|
30
41
|
toml: ArenaToml
|
|
31
42
|
taskPath: string
|
|
32
43
|
outDir?: string
|
|
33
44
|
dryRun?: boolean
|
|
45
|
+
log?: (msg: string) => void
|
|
46
|
+
configDir?: string // for resolving relative paths
|
|
34
47
|
}): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
|
|
35
|
-
const { toml, taskPath, outDir, dryRun } = opts
|
|
48
|
+
const { toml, taskPath, outDir, dryRun, log, configDir } = opts
|
|
36
49
|
|
|
37
|
-
|
|
50
|
+
// Resolve relative paths against config dir (anti-footgun: cwd may differ)
|
|
51
|
+
const resolvePath = (p: string) => {
|
|
52
|
+
if (p.startsWith('/')) return p
|
|
53
|
+
if (configDir) return resolve(configDir, p)
|
|
54
|
+
return resolve(p)
|
|
55
|
+
}
|
|
56
|
+
const taskAbs = resolvePath(taskPath)
|
|
57
|
+
const resolvedToml: ArenaToml = {
|
|
58
|
+
...toml,
|
|
59
|
+
side: toml.side.map(s => ({ ...s, deck: resolvePath(s.deck) })),
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
const plan = buildExecutionPlan(resolvedToml)
|
|
38
63
|
|
|
39
64
|
// dry-run: return plan without executing
|
|
40
65
|
if (dryRun) {
|
|
66
|
+
for (const line of formatPlanOutput(plan)) {
|
|
67
|
+
log?.(line)
|
|
68
|
+
}
|
|
41
69
|
return { plan }
|
|
42
70
|
}
|
|
43
71
|
|
|
44
72
|
const arenaId = `arena-${stamp()}`
|
|
45
73
|
const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)
|
|
46
|
-
const resolved = resolveSides(
|
|
74
|
+
const resolved = resolveSides(resolvedToml)
|
|
47
75
|
|
|
48
76
|
// Build manifest
|
|
49
77
|
const manifest = ArenaManifest.parse({
|
|
50
78
|
id: arenaId,
|
|
51
79
|
created_at: new Date().toISOString(),
|
|
52
|
-
task: readFileSync(
|
|
80
|
+
task: readFileSync(taskAbs, 'utf-8').slice(0, 200),
|
|
53
81
|
mode: 'decks',
|
|
54
82
|
participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
|
|
55
83
|
id: r.side.name,
|
|
@@ -58,7 +86,7 @@ export async function runArenaFromToml(opts: {
|
|
|
58
86
|
deck: r.side.deck,
|
|
59
87
|
description: `${r.playerName} × ${r.side.deck}`,
|
|
60
88
|
})),
|
|
61
|
-
criteria:
|
|
89
|
+
criteria: resolvedToml.arena.criteria,
|
|
62
90
|
status: 'running',
|
|
63
91
|
})
|
|
64
92
|
|