npm - @lythos/skill-arena - Versions diffs - 0.9.19 → 0.9.21 - Mend

@lythos/skill-arena 0.9.19 → 0.9.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md CHANGED

@@ -13,31 +13,62 @@
 - **Mode 1**: Single-skill comparison (controlled variable — same helper skills, different test skill).
 - **Mode 2**: Full-deck comparison (Pareto frontier — no single winner, only optimal trade-offs).
+## Prerequisites
+Arena runs AI agents as subprocesses. You need at least one agent CLI installed:
+### Kimi CLI (recommended default)
+Kimi Code CLI is the default player for arena — it has reliable headless execution with eager tool loading (no deferred tool deadlock).
+```bash
+# Install via uv (recommended) — uv is Python's bunx equivalent
+uv tool install kimi-cli
+# Or run without installing:
+uvx kimi-cli --print -p "hello"
+# Authenticate
+kimi login
+# Or set API key:
+export KIMI_API_KEY=your_key
+```
+Docs: [https://github.com/MoonshotAI/kimi-cli](https://github.com/MoonshotAI/kimi-cli)
+### Claude CLI (secondary)
+```bash
+npm install -g @anthropic-ai/claude-code
+claude --version  # should be ≥ 2.1.128
+```
+Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred tool deadlock). Kimi is the default for reliability.
 ## Install
 ```bash
 bun add -d @lythos/skill-arena
 # or use directly
-bunx @lythos/skill-arena@0.9.19 <command>
+bunx @lythos/skill-arena@0.9.21 <command>
 ```
 ## Quick Start
 ```bash
 # Mode 1: Compare two skills on the same task
-bunx @lythos/skill-arena@0.9.19 \
+bunx @lythos/skill-arena@0.9.21 \
   --task "Generate auth flow diagram" \
   --skills "design-doc-mermaid,mermaid-tools" \
   --criteria "syntax,context,token"
 # Mode 2: Compare full deck configurations
-bunx @lythos/skill-arena@0.9.19 \
+bunx @lythos/skill-arena@0.9.21 \
   --task "Generate auth flow diagram" \
   --decks "./decks/minimal.toml,./decks/rich.toml" \
   --criteria "quality,token,maintainability"
 # Visualize results
-bunx @lythos/skill-arena@0.9.19 viz tmp/arena-<id>/
+bunx @lythos/skill-arena@0.9.21 viz tmp/arena-<id>/
 ```
 ## Commands
@@ -46,16 +77,16 @@ bunx @lythos/skill-arena@0.9.19 viz tmp/arena-<id>/
 ```bash
 # Print execution plan without running
-bunx @lythos/skill-arena@0.9.19 run --config arena.toml --dry-run
+bunx @lythos/skill-arena@0.9.21 run --config arena.toml --dry-run
 # Execute with per-side runs_per_side and statistical aggregation
-bunx @lythos/skill-arena@0.9.19 run --config arena.toml
+bunx @lythos/skill-arena@0.9.21 run --config arena.toml
 ```
 ### CLI-flag mode (backward compat)
 ```
-bunx @lythos/skill-arena@0.9.19 run \
+bunx @lythos/skill-arena@0.9.21 run \
   --task ./TASK-arena.md \
   --players ./players/claude.toml \
   --decks ./decks/run-01.toml,./decks/run-02.toml \
@@ -65,13 +96,13 @@ bunx @lythos/skill-arena@0.9.19 run \
 ### Scaffold mode (legacy, manual execution)
 ```
-bunx @lythos/skill-arena@0.9.19 scaffold --task "..." --skills a,b
+bunx @lythos/skill-arena@0.9.21 scaffold --task "..." --skills a,b
 ```
 ### Viz
 ```bash
-bunx @lythos/skill-arena@0.9.19 viz runs/arena-<id>/
+bunx @lythos/skill-arena@0.9.21 viz runs/arena-<id>/
 ```
 ## Skill Documentation
@@ -85,7 +116,7 @@ The agent-visible **Skill** layer documentation is here:
 Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
 ```
-Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.19 ...
+Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.21 ...
 Skill   (packages/<name>/skill/)     → build → SKILL.md + thin scripts
 Output  (skills/<name>/)             → git commit → agent-visible skill
 ```

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@lythos/skill-arena",
-  "version": "0.9.19",
+  "version": "0.9.21",
   "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
   "keywords": [
     "ai-agent",

package/src/cli.ts CHANGED

@@ -29,6 +29,8 @@ function printHelp(): void {
   console.log(`🎭 lythoskill-arena — Skill comparison runner
 Usage:
+  lythoskill-arena agent-run --task <path> --deck <path> [--player kimi] [--out <dir>]
+  lythoskill-arena agent-run --brief "<prompt>" --deck <path> [--out <dir>]
   lythoskill-arena run --task <path> --players <A.toml,B.toml> --decks <A.toml,B.toml> --criteria <c1,c2,...> [--out <dir>]
   lythoskill-arena scaffold --task "<description>" --skills <skill1,skill2,...>
   lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
@@ -53,6 +55,10 @@ Options:
   -p, --project <dir>    Project directory (default: .)
 Examples:
+  # Single agent run (simplest path)
+  lythoskill-arena agent-run --task ./TASK.md --deck ./deck.toml
+  lythoskill-arena agent-run --task ./TASK.md --deck ./deck.toml --player kimi --out ./output
   # Declarative mode (k8s-style)
   lythoskill-arena run --config ./arena.toml
   lythoskill-arena run --config ./arena.toml --dry-run
@@ -66,6 +72,120 @@ Examples:
 `)
 }
+// ── agent-run: single agent execution (simplest path) ────────────────────
+async function agentRun(args: string[]) { // Runs one agent against one deck and one task, then gathers its output into an output directory.
+  const opts: Record<string, string | undefined> = {} // flag values keyed by option name
+  for (let i = 0; i < args.length; i++) { // args that match no branch below are ignored
+    if (args[i] === '--task' || args[i] === '-t') opts.task = args[++i] // ++i consumes the following arg as the flag's value
+    else if (args[i] === '--brief' || args[i] === '-b') opts.brief = args[++i]
+    else if (args[i] === '--deck' || args[i] === '-d') opts.deck = args[++i]
+    else if (args[i] === '--player' || args[i] === '-p') opts.player = args[++i] // NOTE(review): the help text's Options section lists -p as --project; here it selects the player
+    else if (args[i] === '--out' || args[i] === '-o') opts.out = args[++i]
+  }
+  if (!opts.deck) { // the deck is mandatory
+    console.error('❌ --deck <path> is required')
+    process.exit(1)
+  }
+  if (!opts.task && !opts.brief) { // at least one of --task / --brief must be given; --task wins below when both are present
+    console.error('❌ --task <path> or --brief "<prompt>" is required')
+    process.exit(1)
+  }
+  const { resolve, join } = await import('node:path') // path helpers are loaded lazily
+  const deckPath = resolve(opts.deck)
+  if (!existsSync(deckPath)) { console.error(`❌ Deck file not found: ${deckPath}`); process.exit(1) } // NOTE(review): existsSync is not imported inside this function — presumably a file-level import; confirm
+  // Resolve task: either from file, or create temp task from --brief
+  let taskPath: string
+  if (opts.task) {
+    taskPath = resolve(opts.task)
+    if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}`); process.exit(1) }
+  } else { // reached only when --brief was supplied (the validation above exits otherwise), hence opts.brief! below
+    const { mkdtempSync, writeFileSync } = await import('node:fs')
+    const { tmpdir } = await import('node:os')
+    const tmpDir = mkdtempSync(join(tmpdir(), 'arena-brief-')) // fresh directory under the OS temp dir, prefixed arena-brief-
+    taskPath = join(tmpDir, 'TASK.md')
+    const briefTask = `---
+name: ad-hoc task
+description: ${opts.brief!.slice(0, 80)}
+timeout: 120000
+---
+## Given
+- You are an AI agent with the skills declared in the deck
+## When
+${opts.brief}
+## Then
+- Write your output to output.md
+- The output should be complete and well-structured
+## Judge
+Evaluate whether the output is complete, accurate, and well-structured.
+`
+    writeFileSync(taskPath, briefTask, 'utf-8') // the generated task embeds the brief twice: its first 80 chars as the description, and the full text under the When heading
+  }
+  const { useAgent } = await import('@lythos/test-utils/agents')
+  const { runAgentScenario } = await import('@lythos/test-utils/agent-bdd')
+  const { resolvePlayer } = await import('./player')
+  const { readFileSync, writeFileSync, mkdirSync } = await import('node:fs')
+  const player = resolvePlayer(opts.player ?? 'kimi') // player defaults to 'kimi' when --player is omitted
+  const agent = useAgent(player)
+  const outDir = opts.out ? resolve(opts.out) : join(process.cwd(), `agent-output-${new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19)}`) // default: <cwd>/agent-output-<ISO timestamp with ':' and '.' replaced by '-', cut to 19 chars>
+  mkdirSync(outDir, { recursive: true })
+  console.log(`🤖 agent-run: ${player} × ${deckPath}`)
+  console.log(`📋 task: ${taskPath}`)
+  let agentWorkdir = '' // filled in by setupWorkdir so the workdir is still known after the run
+  const result = await runAgentScenario({
+    scenarioPath: taskPath,
+    agent,
+    async setupWorkdir(_scenario, workdir) {
+      agentWorkdir = workdir
+      mkdirSync(workdir, { recursive: true })
+      writeFileSync(join(workdir, 'skill-deck.toml'), readFileSync(deckPath, 'utf-8')) // the deck is copied into the workdir under the fixed name skill-deck.toml
+      const linkProc = Bun.spawn(
+        ['bunx', '@lythos/skill-deck', 'link'],
+        { cwd: workdir, env: { ...process.env, HOME: process.env.HOME! } },
+      )
+      await linkProc.exited // NOTE(review): waits for the link process but never inspects its exit code
+    },
+  })
+  // Copy agent output to outDir
+  writeFileSync(join(outDir, 'agent-stdout.txt'), result.agentResult.stdout, 'utf-8')
+  if (result.agentResult.stderr) writeFileSync(join(outDir, 'agent-stderr.txt'), result.agentResult.stderr, 'utf-8') // stderr file is written only when stderr is non-empty
+  if (result.verdict) writeFileSync(join(outDir, 'judge-verdict.json'), JSON.stringify(result.verdict, null, 2) + '\n', 'utf-8') // verdict file is written only when a verdict exists
+  // Copy all agent-produced files from workdir (output.md, output.docx, etc.)
+  // Skip .claude/ (symlink dir) and deck artifacts. Recursive so docx/pdf work.
+  if (agentWorkdir) { // empty string means setupWorkdir never ran, so there is nothing to copy
+    const { cpSync, readdirSync } = await import('node:fs')
+    const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock'])
+    try {
+      for (const entry of readdirSync(agentWorkdir)) { // top-level entries only; cpSync handles nested content
+        if (skipSet.has(entry)) continue
+        const src = join(agentWorkdir, entry)
+        const dest = join(outDir, entry)
+        try { cpSync(src, dest, { recursive: true }) } catch {} // NOTE(review): per-entry copy failures are silently ignored
+      }
+    } catch {} // NOTE(review): a failure to list the workdir is also swallowed
+  }
+  console.log(`\n✅ Agent complete (${result.agentResult.durationMs}ms)`)
+  console.log(`📁 Output: ${outDir}`)
+  if (result.verdict) {
+    console.log(`🏆 Verdict: ${result.verdict.verdict} — ${result.verdict.reason.slice(0, 120)}`) // reason is truncated to 120 chars for the console
+  }
+}
 function parseArgs(argv: string[]) {
   if (argv.includes('--help') || argv.includes('-h')) {
     printHelp()
@@ -644,7 +764,9 @@ if (import.meta.main) {
   const args = process.argv.slice(2)
   const cmd = args[0]
-  if (cmd === 'viz') {
+  if (cmd === 'agent-run') {
+    agentRun(args.slice(1))
+  } else if (cmd === 'viz') {
     runViz(args.slice(1))
   } else if (cmd === 'run') {
     runProgrammaticArena(args.slice(1))

package/src/runner.ts CHANGED

@@ -111,12 +111,11 @@ export async function runArenaFromToml(opts: {
           const deckContent = readFileSync(cell.deck, 'utf-8')
           writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
-          // Link skills into .claude/skills/ so claude -p can discover them
-          const deckCli = resolve(import.meta.dir, '..', '..', 'lythoskill-deck', 'src', 'cli.ts')
-          const linkProc = Bun.spawn(['bun', 'run', deckCli, 'link'], {
-            cwd: workdir,
-            env: { ...process.env, HOME: process.env.HOME },
-          })
+          // Link skills via bunx (works both locally and when installed via bunx)
+          const linkProc = Bun.spawn(
+            ['bunx', '@lythos/skill-deck', 'link'],
+            { cwd: workdir, env: { ...process.env, HOME: process.env.HOME! } },
+          )
           await linkProc.exited
           log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
         },