@lythos/skill-arena 0.9.19 → 0.9.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,31 +13,62 @@
13
13
  - **Mode 1**: Single-skill comparison (controlled variable — same helper skills, different test skill).
14
14
  - **Mode 2**: Full-deck comparison (Pareto frontier — no single winner, only optimal trade-offs).
15
15
 
16
+ ## Prerequisites
17
+
18
+ Arena runs AI agents as subprocesses. You need at least one agent CLI installed:
19
+
20
+ ### Kimi CLI (recommended default)
21
+
22
+ Kimi Code CLI is the default player for arena — it has reliable headless execution with eager tool loading (no deferred tool deadlock).
23
+
24
+ ```bash
25
+ # Install via uv (recommended) — uv's `uvx` command is the Python analogue of bunx
26
+ uv tool install kimi-cli
27
+ # Or run without installing:
28
+ uvx kimi-cli --print -p "hello"
29
+
30
+ # Authenticate
31
+ kimi login
32
+ # Or set API key:
33
+ export KIMI_API_KEY=your_key
34
+ ```
35
+
36
+ Docs: [https://github.com/MoonshotAI/kimi-cli](https://github.com/MoonshotAI/kimi-cli)
37
+
38
+ ### Claude CLI (secondary)
39
+
40
+ ```bash
41
+ npm install -g @anthropic-ai/claude-code
42
+ claude --version # should be ≥ 2.1.128
43
+ ```
44
+
45
+ Note: Claude's `-p` (print) mode has known issues with web tools when it is launched through `Bun.spawn` (deferred tool deadlock), which is why Kimi is the default player.
46
+
16
47
  ## Install
17
48
 
18
49
  ```bash
19
50
  bun add -d @lythos/skill-arena
20
51
  # or use directly
21
- bunx @lythos/skill-arena@0.9.19 <command>
52
+ bunx @lythos/skill-arena@0.9.21 <command>
22
53
  ```
23
54
 
24
55
  ## Quick Start
25
56
 
26
57
  ```bash
27
58
  # Mode 1: Compare two skills on the same task
28
- bunx @lythos/skill-arena@0.9.19 \
59
+ bunx @lythos/skill-arena@0.9.21 \
29
60
  --task "Generate auth flow diagram" \
30
61
  --skills "design-doc-mermaid,mermaid-tools" \
31
62
  --criteria "syntax,context,token"
32
63
 
33
64
  # Mode 2: Compare full deck configurations
34
- bunx @lythos/skill-arena@0.9.19 \
65
+ bunx @lythos/skill-arena@0.9.21 \
35
66
  --task "Generate auth flow diagram" \
36
67
  --decks "./decks/minimal.toml,./decks/rich.toml" \
37
68
  --criteria "quality,token,maintainability"
38
69
 
39
70
  # Visualize results
40
- bunx @lythos/skill-arena@0.9.19 viz tmp/arena-<id>/
71
+ bunx @lythos/skill-arena@0.9.21 viz tmp/arena-<id>/
41
72
  ```
42
73
 
43
74
  ## Commands
@@ -46,16 +77,16 @@ bunx @lythos/skill-arena@0.9.19 viz tmp/arena-<id>/
46
77
 
47
78
  ```bash
48
79
  # Print execution plan without running
49
- bunx @lythos/skill-arena@0.9.19 run --config arena.toml --dry-run
80
+ bunx @lythos/skill-arena@0.9.21 run --config arena.toml --dry-run
50
81
 
51
82
  # Execute with per-side runs_per_side and statistical aggregation
52
- bunx @lythos/skill-arena@0.9.19 run --config arena.toml
83
+ bunx @lythos/skill-arena@0.9.21 run --config arena.toml
53
84
  ```
54
85
 
55
86
  ### CLI-flag mode (backward compat)
56
87
 
57
88
  ```
58
- bunx @lythos/skill-arena@0.9.19 run \
89
+ bunx @lythos/skill-arena@0.9.21 run \
59
90
  --task ./TASK-arena.md \
60
91
  --players ./players/claude.toml \
61
92
  --decks ./decks/run-01.toml,./decks/run-02.toml \
@@ -65,13 +96,13 @@ bunx @lythos/skill-arena@0.9.19 run \
65
96
  ### Scaffold mode (legacy, manual execution)
66
97
 
67
98
  ```
68
- bunx @lythos/skill-arena@0.9.19 scaffold --task "..." --skills a,b
99
+ bunx @lythos/skill-arena@0.9.21 scaffold --task "..." --skills a,b
69
100
  ```
70
101
 
71
102
  ### Viz
72
103
 
73
104
  ```bash
74
- bunx @lythos/skill-arena@0.9.19 viz runs/arena-<id>/
105
+ bunx @lythos/skill-arena@0.9.21 viz runs/arena-<id>/
75
106
  ```
76
107
 
77
108
  ## Skill Documentation
@@ -85,7 +116,7 @@ The agent-visible **Skill** layer documentation is here:
85
116
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
86
117
 
87
118
  ```
88
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.19 ...
119
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.21 ...
89
120
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
90
121
  Output (skills/<name>/) → git commit → agent-visible skill
91
122
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.19",
3
+ "version": "0.9.21",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
package/src/cli.ts CHANGED
@@ -29,6 +29,8 @@ function printHelp(): void {
29
29
  console.log(`🎭 lythoskill-arena — Skill comparison runner
30
30
 
31
31
  Usage:
32
+ lythoskill-arena agent-run --task <path> --deck <path> [--player kimi] [--out <dir>]
33
+ lythoskill-arena agent-run --brief "<prompt>" --deck <path> [--out <dir>]
32
34
  lythoskill-arena run --task <path> --players <A.toml,B.toml> --decks <A.toml,B.toml> --criteria <c1,c2,...> [--out <dir>]
33
35
  lythoskill-arena scaffold --task "<description>" --skills <skill1,skill2,...>
34
36
  lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
@@ -53,6 +55,10 @@ Options:
53
55
  -p, --project <dir> Project directory (default: .)
54
56
 
55
57
  Examples:
58
+ # Single agent run (simplest path)
59
+ lythoskill-arena agent-run --task ./TASK.md --deck ./deck.toml
60
+ lythoskill-arena agent-run --task ./TASK.md --deck ./deck.toml --player kimi --out ./output
61
+
56
62
  # Declarative mode (k8s-style)
57
63
  lythoskill-arena run --config ./arena.toml
58
64
  lythoskill-arena run --config ./arena.toml --dry-run
@@ -66,6 +72,120 @@ Examples:
66
72
  `)
67
73
  }
68
74
 
75
+ // ── agent-run: single agent execution (simplest path) ────────────────────
76
+
77
/**
 * `agent-run` subcommand: run a single agent against a single deck and collect
 * everything it produced into an output directory.
 *
 * Recognised flags (parsed from `args`, i.e. everything after the subcommand):
 *   --task,   -t <path>    scenario file handed to runAgentScenario
 *   --brief,  -b <prompt>  inline prompt; wrapped in a generated TASK.md when no --task is given
 *   --deck,   -d <path>    deck file copied into the agent workdir as skill-deck.toml (required)
 *   --player, -p <name>    name passed to resolvePlayer (default: 'kimi')
 *   --out,    -o <dir>     output directory (default: ./agent-output-<timestamp>)
 * Unrecognised arguments are ignored.
 *
 * Terminates the process with exit code 1 when a flag has no value, a required
 * flag is missing, or the deck/task file does not exist.
 *
 * @param args Command-line arguments following `agent-run`.
 */
async function agentRun(args: string[]) {
  /** Print an error in the CLI's usual style and terminate with exit code 1. */
  function fail(message: string): never {
    console.error(`❌ ${message}`)
    process.exit(1)
  }

  /** Best-effort human-readable text for a caught value. */
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err)

  const flagToOption = new Map<string, string>([
    ['--task', 'task'], ['-t', 'task'],
    ['--brief', 'brief'], ['-b', 'brief'],
    ['--deck', 'deck'], ['-d', 'deck'],
    ['--player', 'player'], ['-p', 'player'],
    ['--out', 'out'], ['-o', 'out'],
  ])

  const opts: Record<string, string | undefined> = {}
  for (let i = 0; i < args.length; i++) {
    const flag = args[i] ?? ''
    const option = flagToOption.get(flag)
    if (option === undefined) continue
    const value = args[++i]
    // A flag given as the very last argument has nothing to consume.
    if (value === undefined) return fail(`${flag} requires a value`)
    opts[option] = value
  }

  const { task, brief, deck, out } = opts
  if (!deck) return fail('--deck <path> is required')
  if (!task && !brief) return fail('--task <path> or --brief "<prompt>" is required')

  const { resolve, join } = await import('node:path')
  const { mkdtempSync, readFileSync, writeFileSync, mkdirSync, cpSync, readdirSync } =
    await import('node:fs')

  const deckPath = resolve(deck)
  if (!existsSync(deckPath)) return fail(`Deck file not found: ${deckPath}`)

  // Resolve task: either from file, or create a temp task from --brief
  let taskPath: string
  if (task) {
    taskPath = resolve(task)
    if (!existsSync(taskPath)) return fail(`Task file not found: ${taskPath}`)
  } else {
    const prompt = brief ?? '' // non-empty here: validated above
    const { tmpdir } = await import('node:os')
    taskPath = join(mkdtempSync(join(tmpdir(), 'arena-brief-')), 'TASK.md')

    // The description sits inside YAML front matter, so it must stay on one
    // line and must not be read as YAML syntax. Collapse whitespace and emit it
    // as a double-quoted scalar (a JSON string is a valid YAML double-quoted
    // scalar). NOTE(review): assumes the scenario loader parses front matter
    // as YAML — confirm against @lythos/test-utils.
    const summary = prompt.replace(/\s+/g, ' ').trim().slice(0, 80)
    const briefTask = [
      '---',
      'name: ad-hoc task',
      `description: ${JSON.stringify(summary)}`,
      'timeout: 120000',
      '---',
      '',
      '## Given',
      '- You are an AI agent with the skills declared in the deck',
      '',
      '## When',
      prompt,
      '',
      '## Then',
      '- Write your output to output.md',
      '- The output should be complete and well-structured',
      '',
      '## Judge',
      'Evaluate whether the output is complete, accurate, and well-structured.',
      '',
    ].join('\n')
    writeFileSync(taskPath, briefTask, 'utf-8')
  }

  const { useAgent } = await import('@lythos/test-utils/agents')
  const { runAgentScenario } = await import('@lythos/test-utils/agent-bdd')
  const { resolvePlayer } = await import('./player')

  const player = resolvePlayer(opts.player ?? 'kimi')
  const agent = useAgent(player)

  // Default output directory name carries a second-resolution timestamp,
  // e.g. agent-output-2024-01-31T12-34-56
  const stamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19)
  const outDir = out ? resolve(out) : join(process.cwd(), `agent-output-${stamp}`)
  mkdirSync(outDir, { recursive: true })

  console.log(`🤖 agent-run: ${player} × ${deckPath}`)
  console.log(`📋 task: ${taskPath}`)

  // Captured from the setup hook so produced files can be collected afterwards.
  let agentWorkdir = ''
  const result = await runAgentScenario({
    scenarioPath: taskPath,
    agent,
    async setupWorkdir(_scenario, workdir) {
      agentWorkdir = workdir
      mkdirSync(workdir, { recursive: true })
      writeFileSync(join(workdir, 'skill-deck.toml'), readFileSync(deckPath, 'utf-8'))

      // Link skills via bunx, the same command the arena runner uses.
      const linkProc = Bun.spawn(['bunx', '@lythos/skill-deck', 'link'], {
        cwd: workdir,
        env: { ...process.env },
      })
      const linkExit = await linkProc.exited
      if (linkExit !== 0) {
        // Keep going (best effort), but make it visible that the agent may be
        // running without the skills declared in the deck.
        console.error(`⚠️  skill-deck link exited with code ${linkExit}; skills may not be linked`)
      }
    },
  })

  // Persist the agent's streams and, when present, the judge verdict
  writeFileSync(join(outDir, 'agent-stdout.txt'), result.agentResult.stdout, 'utf-8')
  if (result.agentResult.stderr) {
    writeFileSync(join(outDir, 'agent-stderr.txt'), result.agentResult.stderr, 'utf-8')
  }
  if (result.verdict) {
    writeFileSync(
      join(outDir, 'judge-verdict.json'),
      JSON.stringify(result.verdict, null, 2) + '\n',
      'utf-8',
    )
  }

  // Copy all agent-produced files from workdir (output.md, output.docx, etc.)
  // Skip .claude/ (symlink dir) and deck artifacts. Recursive so directories
  // produced by the agent are copied whole.
  if (agentWorkdir) {
    const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock'])
    try {
      for (const entry of readdirSync(agentWorkdir)) {
        if (skipSet.has(entry)) continue
        try {
          cpSync(join(agentWorkdir, entry), join(outDir, entry), { recursive: true })
        } catch (err: unknown) {
          // One unreadable artifact should not prevent collecting the rest.
          console.error(`⚠️  could not copy ${entry}: ${describeError(err)}`)
        }
      }
    } catch (err: unknown) {
      console.error(`⚠️  could not read agent workdir ${agentWorkdir}: ${describeError(err)}`)
    }
  }

  console.log(`\n✅ Agent complete (${result.agentResult.durationMs}ms)`)
  console.log(`📁 Output: ${outDir}`)
  if (result.verdict) {
    console.log(`🏆 Verdict: ${result.verdict.verdict} — ${result.verdict.reason.slice(0, 120)}`)
  }
}
188
+
69
189
  function parseArgs(argv: string[]) {
70
190
  if (argv.includes('--help') || argv.includes('-h')) {
71
191
  printHelp()
@@ -644,7 +764,9 @@ if (import.meta.main) {
644
764
  const args = process.argv.slice(2)
645
765
  const cmd = args[0]
646
766
 
647
- if (cmd === 'viz') {
767
+ if (cmd === 'agent-run') {
768
+ agentRun(args.slice(1))
769
+ } else if (cmd === 'viz') {
648
770
  runViz(args.slice(1))
649
771
  } else if (cmd === 'run') {
650
772
  runProgrammaticArena(args.slice(1))
package/src/runner.ts CHANGED
@@ -111,12 +111,11 @@ export async function runArenaFromToml(opts: {
111
111
  const deckContent = readFileSync(cell.deck, 'utf-8')
112
112
  writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
113
113
 
114
- // Link skills into .claude/skills/ so claude -p can discover them
115
- const deckCli = resolve(import.meta.dir, '..', '..', 'lythoskill-deck', 'src', 'cli.ts')
116
- const linkProc = Bun.spawn(['bun', 'run', deckCli, 'link'], {
117
- cwd: workdir,
118
- env: { ...process.env, HOME: process.env.HOME },
119
- })
114
+ // Link skills via bunx (works both locally and when installed via bunx)
115
+ const linkProc = Bun.spawn(
116
+ ['bunx', '@lythos/skill-deck', 'link'],
117
+ { cwd: workdir, env: { ...process.env, HOME: process.env.HOME! } },
118
+ )
120
119
  await linkProc.exited
121
120
  log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
122
121
  },