@lythos/skill-arena 0.9.20 → 0.9.22

This diff shows the changes between two publicly released versions of this package, as they appear in the public registry they were published to. It is provided for informational purposes only.
Files changed (3)
  1. package/README.md +10 -10
  2. package/package.json +1 -1
  3. package/src/cli.ts +9 -8
package/README.md CHANGED
@@ -49,26 +49,26 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
49
49
  ```bash
50
50
  bun add -d @lythos/skill-arena
51
51
  # or use directly
52
- bunx @lythos/skill-arena@0.9.20 <command>
52
+ bunx @lythos/skill-arena@0.9.22 <command>
53
53
  ```
54
54
 
55
55
  ## Quick Start
56
56
 
57
57
  ```bash
58
58
  # Mode 1: Compare two skills on the same task
59
- bunx @lythos/skill-arena@0.9.20 \
59
+ bunx @lythos/skill-arena@0.9.22 \
60
60
  --task "Generate auth flow diagram" \
61
61
  --skills "design-doc-mermaid,mermaid-tools" \
62
62
  --criteria "syntax,context,token"
63
63
 
64
64
  # Mode 2: Compare full deck configurations
65
- bunx @lythos/skill-arena@0.9.20 \
65
+ bunx @lythos/skill-arena@0.9.22 \
66
66
  --task "Generate auth flow diagram" \
67
67
  --decks "./decks/minimal.toml,./decks/rich.toml" \
68
68
  --criteria "quality,token,maintainability"
69
69
 
70
70
  # Visualize results
71
- bunx @lythos/skill-arena@0.9.20 viz tmp/arena-<id>/
71
+ bunx @lythos/skill-arena@0.9.22 viz tmp/arena-<id>/
72
72
  ```
73
73
 
74
74
  ## Commands
@@ -77,16 +77,16 @@ bunx @lythos/skill-arena@0.9.20 viz tmp/arena-<id>/
77
77
 
78
78
  ```bash
79
79
  # Print execution plan without running
80
- bunx @lythos/skill-arena@0.9.20 run --config arena.toml --dry-run
80
+ bunx @lythos/skill-arena@0.9.22 run --config arena.toml --dry-run
81
81
 
82
82
  # Execute with per-side runs_per_side and statistical aggregation
83
- bunx @lythos/skill-arena@0.9.20 run --config arena.toml
83
+ bunx @lythos/skill-arena@0.9.22 run --config arena.toml
84
84
  ```
85
85
 
86
86
  ### CLI-flag mode (backward compat)
87
87
 
88
88
  ```
89
- bunx @lythos/skill-arena@0.9.20 run \
89
+ bunx @lythos/skill-arena@0.9.22 run \
90
90
  --task ./TASK-arena.md \
91
91
  --players ./players/claude.toml \
92
92
  --decks ./decks/run-01.toml,./decks/run-02.toml \
@@ -96,13 +96,13 @@ bunx @lythos/skill-arena@0.9.20 run \
96
96
  ### Scaffold mode (legacy, manual execution)
97
97
 
98
98
  ```
99
- bunx @lythos/skill-arena@0.9.20 scaffold --task "..." --skills a,b
99
+ bunx @lythos/skill-arena@0.9.22 scaffold --task "..." --skills a,b
100
100
  ```
101
101
 
102
102
  ### Viz
103
103
 
104
104
  ```bash
105
- bunx @lythos/skill-arena@0.9.20 viz runs/arena-<id>/
105
+ bunx @lythos/skill-arena@0.9.22 viz runs/arena-<id>/
106
106
  ```
107
107
 
108
108
  ## Skill Documentation
@@ -116,7 +116,7 @@ The agent-visible **Skill** layer documentation is here:
116
116
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
117
117
 
118
118
  ```
119
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.20 ...
119
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.22 ...
120
120
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
121
121
  Output (skills/<name>/) → git commit → agent-visible skill
122
122
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.20",
3
+ "version": "0.9.22",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
package/src/cli.ts CHANGED
@@ -109,7 +109,7 @@ async function agentRun(args: string[]) {
109
109
  taskPath = join(tmpDir, 'TASK.md')
110
110
  const briefTask = `---
111
111
  name: ad-hoc task
112
- description: ${opts.brief!.slice(0, 80)}
112
+ description: ${opts.brief!.replace(/"/g, '\\"').slice(0, 80)}
113
113
  timeout: 120000
114
114
  ---
115
115
 
@@ -136,7 +136,7 @@ Evaluate whether the output is complete, accurate, and well-structured.
136
136
 
137
137
  const player = resolvePlayer(opts.player ?? 'kimi')
138
138
  const agent = useAgent(player)
139
- const outDir = opts.out ? resolve(opts.out) : join(process.cwd(), 'agent-run-output')
139
+ const outDir = opts.out ? resolve(opts.out) : join(process.cwd(), `agent-output-${new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19)}`)
140
140
  mkdirSync(outDir, { recursive: true })
141
141
 
142
142
  console.log(`🤖 agent-run: ${player} × ${deckPath}`)
@@ -164,16 +164,17 @@ Evaluate whether the output is complete, accurate, and well-structured.
164
164
  if (result.agentResult.stderr) writeFileSync(join(outDir, 'agent-stderr.txt'), result.agentResult.stderr, 'utf-8')
165
165
  if (result.verdict) writeFileSync(join(outDir, 'judge-verdict.json'), JSON.stringify(result.verdict, null, 2) + '\n', 'utf-8')
166
166
 
167
- // Copy agent-produced files from workdir (output.md, output.docx, etc.)
167
+ // Copy all agent-produced files from workdir (output.md, output.docx, etc.)
168
+ // Skip .claude/ (symlink dir) and deck artifacts. Recursive so docx/pdf work.
168
169
  if (agentWorkdir) {
169
- const { readdirSync, statSync, copyFileSync } = await import('node:fs')
170
+ const { cpSync, readdirSync } = await import('node:fs')
171
+ const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock'])
170
172
  try {
171
173
  for (const entry of readdirSync(agentWorkdir)) {
172
- if (entry.startsWith('.') || entry === 'skill-deck.toml' || entry === 'skill-deck.lock') continue
174
+ if (skipSet.has(entry)) continue
173
175
  const src = join(agentWorkdir, entry)
174
- try {
175
- if (statSync(src).isFile()) copyFileSync(src, join(outDir, entry))
176
- } catch {}
176
+ const dest = join(outDir, entry)
177
+ try { cpSync(src, dest, { recursive: true }) } catch {}
177
178
  }
178
179
  } catch {}
179
180
  }