@lythos/skill-arena 0.9.19 → 0.9.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -10
- package/package.json +1 -1
- package/src/cli.ts +122 -1
- package/src/runner.ts +5 -6
package/README.md
CHANGED
|
@@ -13,31 +13,62 @@
|
|
|
13
13
|
- **Mode 1**: Single-skill comparison (controlled variable — same helper skills, different test skill).
|
|
14
14
|
- **Mode 2**: Full-deck comparison (Pareto frontier — no single winner, only optimal trade-offs).
|
|
15
15
|
|
|
16
|
+
## Prerequisites
|
|
17
|
+
|
|
18
|
+
Arena runs AI agents as subprocesses. You need at least one agent CLI installed:
|
|
19
|
+
|
|
20
|
+
### Kimi CLI (recommended default)
|
|
21
|
+
|
|
22
|
+
Kimi Code CLI is the default player for arena — it has reliable headless execution with eager tool loading (no deferred tool deadlock).
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# Install via uv (recommended) — uv's `uvx` is the Python counterpart of bunx
|
|
26
|
+
uv tool install kimi-cli
|
|
27
|
+
# Or run without installing:
|
|
28
|
+
uvx kimi-cli --print -p "hello"
|
|
29
|
+
|
|
30
|
+
# Authenticate
|
|
31
|
+
kimi login
|
|
32
|
+
# Or set API key:
|
|
33
|
+
export KIMI_API_KEY=your_key
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Docs: [https://github.com/MoonshotAI/kimi-cli](https://github.com/MoonshotAI/kimi-cli)
|
|
37
|
+
|
|
38
|
+
### Claude CLI (secondary)
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
npm install -g @anthropic-ai/claude-code
|
|
42
|
+
claude --version # should be ≥ 2.1.128
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Note: Claude's `-p` mode has known issues with web tools when it is launched through `Bun.spawn` (deferred tool deadlock), which is why Kimi is the default player.
|
|
46
|
+
|
|
16
47
|
## Install
|
|
17
48
|
|
|
18
49
|
```bash
|
|
19
50
|
bun add -d @lythos/skill-arena
|
|
20
51
|
# or use directly
|
|
21
|
-
bunx @lythos/skill-arena@0.9.
|
|
52
|
+
bunx @lythos/skill-arena@0.9.20 <command>
|
|
22
53
|
```
|
|
23
54
|
|
|
24
55
|
## Quick Start
|
|
25
56
|
|
|
26
57
|
```bash
|
|
27
58
|
# Mode 1: Compare two skills on the same task
|
|
28
|
-
bunx @lythos/skill-arena@0.9.
|
|
59
|
+
bunx @lythos/skill-arena@0.9.20 \
|
|
29
60
|
--task "Generate auth flow diagram" \
|
|
30
61
|
--skills "design-doc-mermaid,mermaid-tools" \
|
|
31
62
|
--criteria "syntax,context,token"
|
|
32
63
|
|
|
33
64
|
# Mode 2: Compare full deck configurations
|
|
34
|
-
bunx @lythos/skill-arena@0.9.
|
|
65
|
+
bunx @lythos/skill-arena@0.9.20 \
|
|
35
66
|
--task "Generate auth flow diagram" \
|
|
36
67
|
--decks "./decks/minimal.toml,./decks/rich.toml" \
|
|
37
68
|
--criteria "quality,token,maintainability"
|
|
38
69
|
|
|
39
70
|
# Visualize results
|
|
40
|
-
bunx @lythos/skill-arena@0.9.19 viz tmp/arena-<id>/
|
|
71
|
+
bunx @lythos/skill-arena@0.9.20 viz tmp/arena-<id>/
|
|
41
72
|
```
|
|
42
73
|
|
|
43
74
|
## Commands
|
|
@@ -46,16 +77,16 @@ bunx @lythos/skill-arena@0.9.19 viz tmp/arena-<id>/
|
|
|
46
77
|
|
|
47
78
|
```bash
|
|
48
79
|
# Print execution plan without running
|
|
49
|
-
bunx @lythos/skill-arena@0.9.
|
|
80
|
+
bunx @lythos/skill-arena@0.9.20 run --config arena.toml --dry-run
|
|
50
81
|
|
|
51
82
|
# Execute with per-side runs_per_side and statistical aggregation
|
|
52
|
-
bunx @lythos/skill-arena@0.9.
|
|
83
|
+
bunx @lythos/skill-arena@0.9.20 run --config arena.toml
|
|
53
84
|
```
|
|
54
85
|
|
|
55
86
|
### CLI-flag mode (backward compat)
|
|
56
87
|
|
|
57
88
|
```
|
|
58
|
-
bunx @lythos/skill-arena@0.9.19 run \
|
|
89
|
+
bunx @lythos/skill-arena@0.9.20 run \
|
|
59
90
|
--task ./TASK-arena.md \
|
|
60
91
|
--players ./players/claude.toml \
|
|
61
92
|
--decks ./decks/run-01.toml,./decks/run-02.toml \
|
|
@@ -65,13 +96,13 @@ bunx @lythos/skill-arena@0.9.19 run \
|
|
|
65
96
|
### Scaffold mode (legacy, manual execution)
|
|
66
97
|
|
|
67
98
|
```
|
|
68
|
-
bunx @lythos/skill-arena@0.9.
|
|
99
|
+
bunx @lythos/skill-arena@0.9.20 scaffold --task "..." --skills a,b
|
|
69
100
|
```
|
|
70
101
|
|
|
71
102
|
### Viz
|
|
72
103
|
|
|
73
104
|
```bash
|
|
74
|
-
bunx @lythos/skill-arena@0.9.
|
|
105
|
+
bunx @lythos/skill-arena@0.9.20 viz runs/arena-<id>/
|
|
75
106
|
```
|
|
76
107
|
|
|
77
108
|
## Skill Documentation
|
|
@@ -85,7 +116,7 @@ The agent-visible **Skill** layer documentation is here:
|
|
|
85
116
|
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
|
|
86
117
|
|
|
87
118
|
```
|
|
88
|
-
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.
|
|
119
|
+
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.20 ...
|
|
89
120
|
Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
90
121
|
Output (skills/<name>/) → git commit → agent-visible skill
|
|
91
122
|
```
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -29,6 +29,8 @@ function printHelp(): void {
|
|
|
29
29
|
console.log(`🎭 lythoskill-arena — Skill comparison runner
|
|
30
30
|
|
|
31
31
|
Usage:
|
|
32
|
+
lythoskill-arena agent-run --task <path> --deck <path> [--player kimi] [--out <dir>]
|
|
33
|
+
lythoskill-arena agent-run --brief "<prompt>" --deck <path> [--out <dir>]
|
|
32
34
|
lythoskill-arena run --task <path> --players <A.toml,B.toml> --decks <A.toml,B.toml> --criteria <c1,c2,...> [--out <dir>]
|
|
33
35
|
lythoskill-arena scaffold --task "<description>" --skills <skill1,skill2,...>
|
|
34
36
|
lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
|
|
@@ -53,6 +55,10 @@ Options:
|
|
|
53
55
|
-p, --project <dir> Project directory (default: .)
|
|
54
56
|
|
|
55
57
|
Examples:
|
|
58
|
+
# Single agent run (simplest path)
|
|
59
|
+
lythoskill-arena agent-run --task ./TASK.md --deck ./deck.toml
|
|
60
|
+
lythoskill-arena agent-run --task ./TASK.md --deck ./deck.toml --player kimi --out ./output
|
|
61
|
+
|
|
56
62
|
# Declarative mode (k8s-style)
|
|
57
63
|
lythoskill-arena run --config ./arena.toml
|
|
58
64
|
lythoskill-arena run --config ./arena.toml --dry-run
|
|
@@ -66,6 +72,119 @@ Examples:
|
|
|
66
72
|
`)
|
|
67
73
|
}
|
|
68
74
|
|
|
75
|
+
// ── agent-run: single agent execution (simplest path) ────────────────────
|
|
76
|
+
|
|
77
|
+
/** Raw string values collected for the flags that `agent-run` understands. */
type AgentRunCliOptions = Partial<Record<'task' | 'brief' | 'deck' | 'player' | 'out', string>>

/** Long and short spellings of every `agent-run` flag, mapped to the option they set. */
const AGENT_RUN_FLAGS: ReadonlyMap<string, keyof AgentRunCliOptions> = new Map<string, keyof AgentRunCliOptions>([
  ['--task', 'task'],
  ['-t', 'task'],
  ['--brief', 'brief'],
  ['-b', 'brief'],
  ['--deck', 'deck'],
  ['-d', 'deck'],
  ['--player', 'player'],
  ['-p', 'player'],
  ['--out', 'out'],
  ['-o', 'out'],
])

/**
 * Collects `agent-run` flag values from the argument list.
 *
 * Each recognised flag consumes the argument that follows it; unrecognised
 * arguments are ignored. A flag that is the last argument yields `undefined`.
 */
function parseAgentRunArgs(args: readonly string[]): AgentRunCliOptions {
  const opts: AgentRunCliOptions = {}
  for (let i = 0; i < args.length; i++) {
    const key = AGENT_RUN_FLAGS.get(args[i] ?? '')
    if (key) opts[key] = args[++i]
  }
  return opts
}

/**
 * Renders the ad-hoc task document used when the caller passes `--brief`.
 *
 * The full brief goes into the `## When` section. The front-matter
 * `description` is a single line, so whitespace (including newlines) is
 * collapsed before truncating to 80 characters; otherwise a multi-line brief
 * would spill out of the `description:` field and corrupt the front matter.
 */
function buildBriefTask(brief: string): string {
  const description = brief.replace(/\s+/g, ' ').trim().slice(0, 80)
  return [
    '---',
    'name: ad-hoc task',
    `description: ${description}`,
    'timeout: 120000',
    '---',
    '',
    '## Given',
    '- You are an AI agent with the skills declared in the deck',
    '',
    '## When',
    brief,
    '',
    '## Then',
    '- Write your output to output.md',
    '- The output should be complete and well-structured',
    '',
    '## Judge',
    'Evaluate whether the output is complete, accurate, and well-structured.',
    '',
  ].join('\n')
}

/**
 * `agent-run` subcommand: runs one agent against one deck and one task.
 *
 * Flow:
 *  1. Parse flags; `--deck` and one of `--task` / `--brief` are required
 *     (otherwise an error is printed and the process exits with code 1).
 *  2. Resolve the task file, or write a temporary `TASK.md` generated from
 *     the `--brief` text.
 *  3. Run the scenario via `runAgentScenario`; the workdir setup hook copies
 *     the deck in as `skill-deck.toml` and runs `bunx @lythos/skill-deck link`.
 *  4. Write the agent's stdout / stderr / verdict to the output directory
 *     (`--out`, default `./agent-run-output`) and copy the regular files the
 *     agent left at the top level of its workdir.
 *
 * @param args Command-line arguments that follow the `agent-run` subcommand.
 */
async function agentRun(args: string[]) {
  const opts = parseAgentRunArgs(args)

  if (!opts.deck) {
    console.error('❌ --deck <path> is required')
    process.exit(1)
  }
  if (!opts.task && !opts.brief) {
    console.error('❌ --task <path> or --brief "<prompt>" is required')
    process.exit(1)
  }

  const { resolve, join } = await import('node:path')
  const {
    copyFileSync,
    mkdirSync,
    mkdtempSync,
    readFileSync,
    readdirSync,
    statSync,
    writeFileSync,
  } = await import('node:fs')

  const deckPath = resolve(opts.deck)
  if (!existsSync(deckPath)) {
    console.error(`❌ Deck file not found: ${deckPath}`)
    process.exit(1)
  }

  // Resolve task: either from file, or create temp task from --brief
  let taskPath: string
  if (opts.task) {
    taskPath = resolve(opts.task)
    if (!existsSync(taskPath)) {
      console.error(`❌ Task file not found: ${taskPath}`)
      process.exit(1)
    }
  } else {
    const { tmpdir } = await import('node:os')
    const tmpDir = mkdtempSync(join(tmpdir(), 'arena-brief-'))
    taskPath = join(tmpDir, 'TASK.md')
    // The required-flag check above guarantees a brief when no task was given.
    writeFileSync(taskPath, buildBriefTask(opts.brief ?? ''), 'utf-8')
  }

  const { useAgent } = await import('@lythos/test-utils/agents')
  const { runAgentScenario } = await import('@lythos/test-utils/agent-bdd')
  const { resolvePlayer } = await import('./player')

  const player = resolvePlayer(opts.player ?? 'kimi')
  const agent = useAgent(player)
  const outDir = opts.out ? resolve(opts.out) : join(process.cwd(), 'agent-run-output')
  mkdirSync(outDir, { recursive: true })

  console.log(`🤖 agent-run: ${player} × ${deckPath}`)
  console.log(`📋 task: ${taskPath}`)

  // Captured from the setup hook so agent-produced files can be collected afterwards.
  let agentWorkdir = ''
  const result = await runAgentScenario({
    scenarioPath: taskPath,
    agent,
    async setupWorkdir(_scenario, workdir) {
      agentWorkdir = workdir
      mkdirSync(workdir, { recursive: true })
      writeFileSync(join(workdir, 'skill-deck.toml'), readFileSync(deckPath, 'utf-8'))

      // Link skills via bunx; the child inherits this process's environment (HOME included).
      const linkProc = Bun.spawn(
        ['bunx', '@lythos/skill-deck', 'link'],
        { cwd: workdir, env: { ...process.env } },
      )
      const linkExit = await linkProc.exited
      if (linkExit !== 0) {
        // Keep going (best effort), but make the failure visible instead of silent.
        console.error(`⚠️  deck link exited with code ${linkExit}; skills may be missing in ${workdir}`)
      }
    },
  })

  // Copy agent output to outDir
  writeFileSync(join(outDir, 'agent-stdout.txt'), result.agentResult.stdout, 'utf-8')
  if (result.agentResult.stderr) {
    writeFileSync(join(outDir, 'agent-stderr.txt'), result.agentResult.stderr, 'utf-8')
  }
  if (result.verdict) {
    writeFileSync(join(outDir, 'judge-verdict.json'), JSON.stringify(result.verdict, null, 2) + '\n', 'utf-8')
  }

  // Copy agent-produced files from workdir (output.md, output.docx, etc.).
  // Best effort: a failure is reported as a warning and never aborts the run.
  if (agentWorkdir) {
    const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err))

    let entries: string[] = []
    try {
      entries = readdirSync(agentWorkdir)
    } catch (err: unknown) {
      console.error(`⚠️  Could not list agent workdir ${agentWorkdir}: ${describe(err)}`)
    }

    for (const entry of entries) {
      // Skip dotfiles and the deck files that were placed in the workdir for linking.
      if (entry.startsWith('.') || entry === 'skill-deck.toml' || entry === 'skill-deck.lock') continue
      const src = join(agentWorkdir, entry)
      try {
        // Only regular files at the top level are copied; directories are left behind.
        if (statSync(src).isFile()) copyFileSync(src, join(outDir, entry))
      } catch (err: unknown) {
        console.error(`⚠️  Could not copy ${src}: ${describe(err)}`)
      }
    }
  }

  console.log(`\n✅ Agent complete (${result.agentResult.durationMs}ms)`)
  console.log(`📁 Output: ${outDir}`)
  if (result.verdict) {
    console.log(`🏆 Verdict: ${result.verdict.verdict} — ${result.verdict.reason.slice(0, 120)}`)
  }
}
|
|
187
|
+
|
|
69
188
|
function parseArgs(argv: string[]) {
|
|
70
189
|
if (argv.includes('--help') || argv.includes('-h')) {
|
|
71
190
|
printHelp()
|
|
@@ -644,7 +763,9 @@ if (import.meta.main) {
|
|
|
644
763
|
const args = process.argv.slice(2)
|
|
645
764
|
const cmd = args[0]
|
|
646
765
|
|
|
647
|
-
if (cmd === '
|
|
766
|
+
if (cmd === 'agent-run') {
|
|
767
|
+
agentRun(args.slice(1))
|
|
768
|
+
} else if (cmd === 'viz') {
|
|
648
769
|
runViz(args.slice(1))
|
|
649
770
|
} else if (cmd === 'run') {
|
|
650
771
|
runProgrammaticArena(args.slice(1))
|
package/src/runner.ts
CHANGED
|
@@ -111,12 +111,11 @@ export async function runArenaFromToml(opts: {
|
|
|
111
111
|
const deckContent = readFileSync(cell.deck, 'utf-8')
|
|
112
112
|
writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
|
|
113
113
|
|
|
114
|
-
// Link skills
|
|
115
|
-
const
|
|
116
|
-
|
|
117
|
-
cwd: workdir,
|
|
118
|
-
|
|
119
|
-
})
|
|
114
|
+
// Link skills via bunx (works both locally and when installed via bunx)
|
|
115
|
+
const linkProc = Bun.spawn(
|
|
116
|
+
['bunx', '@lythos/skill-deck', 'link'],
|
|
117
|
+
{ cwd: workdir, env: { ...process.env, HOME: process.env.HOME! } },
|
|
118
|
+
)
|
|
120
119
|
await linkProc.exited
|
|
121
120
|
log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
|
|
122
121
|
},
|