@lythos/skill-arena 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +53 -0
- package/package.json +19 -0
- package/src/cli.ts +225 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 lythos-labs
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# @lythos/skill-arena
|
|
2
|
+
|
|
3
|
+
> Skill comparison benchmark tool. Run control-variable decks against the same task to compare skill effectiveness.
|
|
4
|
+
|
|
5
|
+
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) meta-skill ecosystem.
|
|
6
|
+
|
|
7
|
+
## What it does
|
|
8
|
+
|
|
9
|
+
Creates an arena directory with isolated decks for each skill under test, generates task cards for subagent dispatch, and produces a structured output for judge evaluation. Core principle: **control variables** — only the tested skill differs between decks.
|
|
10
|
+
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
bun add -d @lythos/skill-arena
|
|
15
|
+
# or
|
|
16
|
+
bunx @lythos/skill-arena <args>
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Commands
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
# Initialize an arena with 2-5 skills
|
|
23
|
+
bunx @lythos/skill-arena \
|
|
24
|
+
--task "Generate user auth flow diagram" \
|
|
25
|
+
--skills "design-doc-mermaid,mermaid-tools" \
|
|
26
|
+
--criteria "syntax,context,token"
|
|
27
|
+
|
|
28
|
+
# Options
|
|
29
|
+
# --task, -t Task description (required; positional arguments are used when the flag is omitted)
|
|
30
|
+
# --skills, -s Comma-separated skill list (required; min 2, max 5)
|
|
31
|
+
# --criteria, -c Evaluation criteria (default: syntax,context,logic,token)
|
|
32
|
+
# --control Comma-separated control-variable skill(s) added to every deck (default: project-scribe)
|
|
33
|
+
# --dir, -d Arena parent directory (default: tmp)
|
|
34
|
+
# --project, -p Project root (default: .)
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Output
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
tmp/arena-<timestamp>-<slug>/
|
|
41
|
+
├── arena.json # metadata + config
|
|
42
|
+
├── decks/ # one control-variable deck per skill
|
|
43
|
+
├── runs/ # subagent output (you fill this)
|
|
44
|
+
└── TASK-arena.md # task card with subagent instructions
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Architecture
|
|
48
|
+
|
|
49
|
+
This is the **Starter** layer of the thin-skill pattern. The agent-visible **Skill** layer is in `packages/lythoskill-arena/skill/`.
|
|
50
|
+
|
|
51
|
+
## License
|
|
52
|
+
|
|
53
|
+
MIT
|
package/package.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@lythos/skill-arena",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"bin": {
|
|
8
|
+
"lythoskill-arena": "src/cli.ts"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"src",
|
|
12
|
+
"README.md",
|
|
13
|
+
"LICENSE"
|
|
14
|
+
],
|
|
15
|
+
"engines": {
|
|
16
|
+
"bun": ">=1.0.0"
|
|
17
|
+
},
|
|
18
|
+
"license": "MIT"
|
|
19
|
+
}
|
package/src/cli.ts
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* lythoskill-arena CLI — Skill Arena 编排器
|
|
4
|
+
*
|
|
5
|
+
* 创建标准化的 arena 目录结构,为每个被测 skill 生成控制变量 deck。
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import {
|
|
9
|
+
existsSync, mkdirSync, writeFileSync,
|
|
10
|
+
} from 'node:fs'
|
|
11
|
+
import { join, resolve } from 'node:path'
|
|
12
|
+
|
|
13
|
+
// ── 简单的 slugify ──────────────────────────────────────────
|
|
14
|
+
/**
 * Turn free-form text into a short slug usable inside a directory name.
 *
 * The input is lower-cased, every run of characters other than `a-z`, `0-9`
 * and the CJK range U+4E00–U+9FA5 collapses into a single `-`, and the result
 * is capped at 40 characters with no leading or trailing hyphen.
 *
 * @param input Arbitrary text, e.g. a task description.
 * @returns The slug; an empty string when `input` has no allowed characters.
 */
function slugify(input: string): string {
  return input
    .toLowerCase()
    .replace(/[^a-z0-9\u4e00-\u9fa5]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 40)
    // Truncation may cut right after a separator, so trim the tail again.
    .replace(/-+$/, '')
}
|
|
21
|
+
|
|
22
|
+
/**
 * Current time as a compact, sortable UTC stamp.
 *
 * @returns 17 digits in the form `yyyyMMddHHmmssSSS`.
 */
function timestamp(): string {
  const iso = new Date().toISOString() // e.g. 2026-01-02T03:04:05.678Z
  // Drop the ISO punctuation and keep the digits only.
  const compact = iso.split(/[-:T.Z]/).join('')
  return compact.substring(0, 17)
}
|
|
26
|
+
|
|
27
|
+
// ── 解析参数(简单 slice 风格)──────────────────────────────
|
|
28
|
+
/**
 * Minimal argv parser for the arena CLI.
 *
 * Recognised flags (each consumes the following argv entry as its value):
 * `--task/-t`, `--skills/-s`, `--criteria/-c`, `--control`, `--dir/-d`,
 * `--project/-p`. Entries that do not start with `-` and are not consumed as
 * a flag value are collected as positionals. Unknown `-`-prefixed entries are
 * ignored.
 *
 * A flag that appears last with no value after it does NOT overwrite the
 * current value (default or earlier occurrence); a warning is printed instead.
 *
 * @param argv Command-line arguments without the runtime/script prefix.
 * @returns The option map (with defaults applied) and the positional arguments.
 */
function parseArgs(argv: string[]) {
  const options: Record<string, string | undefined> = {
    task: undefined,
    skills: undefined,
    criteria: 'syntax,context,logic,token',
    control: 'project-scribe',
    dir: 'tmp',
    project: '.',
  }
  const positionals: string[] = []

  // Flag spelling → key in `options`.
  const flagToKey = new Map<string, string>([
    ['--task', 'task'],
    ['-t', 'task'],
    ['--skills', 'skills'],
    ['-s', 'skills'],
    ['--criteria', 'criteria'],
    ['-c', 'criteria'],
    ['--control', 'control'],
    ['--dir', 'dir'],
    ['-d', 'dir'],
    ['--project', 'project'],
    ['-p', 'project'],
  ])

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i]
    if (arg === undefined) continue

    const key = flagToKey.get(arg)
    if (key !== undefined) {
      const value = argv[i + 1]
      if (value === undefined) {
        // Trailing flag without a value: keep whatever is already set.
        console.warn(`⚠️ 选项 ${arg} 缺少值,已忽略`)
        continue
      }
      options[key] = value
      i++ // skip the value we just consumed
    } else if (!arg.startsWith('-')) {
      positionals.push(arg)
    }
  }

  return { options, positionals }
}
|
|
59
|
+
|
|
60
|
+
// ── 主流程 ──────────────────────────────────────────────────
|
|
61
|
+
/**
 * Scaffold a skill arena on disk.
 *
 * Creates `<project>/<dir>/arena-<timestamp>[-<slug>]/` containing:
 * - `decks/arena-run-NN.toml` — one deck per tested skill; each deck lists the
 *   tested skill plus the shared control skills,
 * - `runs/` — empty directory for subagent output,
 * - `arena.json` — metadata, participants, criteria and expected run paths,
 * - `TASK-arena.md` — task card with YAML front matter and shell instructions.
 *
 * Prints an error and calls `process.exit(1)` when no task is given or when
 * the number of distinct skills is below 2 or above 5.
 *
 * @param argv CLI arguments without the runtime/script prefix.
 */
export function runArena(argv: string[]) {
  const { options, positionals } = parseArgs(argv)

  /** Comma-separated text → trimmed, non-empty, de-duplicated entries. */
  const parseList = (raw: string): string[] => [
    ...new Set(raw.split(',').map(s => s.trim()).filter(Boolean)),
  ]
  /** Like parseList, but falls back to `fallback` when nothing usable remains. */
  const listOption = (raw: string | undefined, fallback: string): string[] => {
    const items = parseList(raw ?? '')
    return items.length > 0 ? items : parseList(fallback)
  }

  const task = options.task?.trim() || positionals.join(' ').trim()
  if (!task) {
    console.error('❌ 请提供 --task 或位置参数')
    process.exit(1)
  }

  // De-duplicated: comparing a skill against itself is not a comparison.
  const skills = parseList(options.skills ?? '')
  if (skills.length < 2) {
    console.error('❌ 至少需要 2 个 skill 才能进行 arena')
    process.exit(1)
  }
  if (skills.length > 5) {
    console.error('❌ 一次 arena 最多 5 个 skill')
    process.exit(1)
  }

  const criteriaNames = listOption(options.criteria, 'syntax,context,logic,token')
  // Same default as parseArgs and the README.
  const controlSkills = listOption(options.control, 'project-scribe')

  const projectDir = resolve(options.project || '.')
  const arenaSlug = slugify(task)
  // Re-trim after shortening, and omit the slug part entirely when it is
  // empty, so the directory name never ends with a dangling hyphen.
  const shortSlug = arenaSlug.slice(0, 30).replace(/-+$/, '')
  const arenaId = ['arena', timestamp(), shortSlug].filter(Boolean).join('-')
  const arenaDir = resolve(projectDir, options.dir || 'tmp', arenaId)

  /**
   * Express a path relative to the project root (`./…`) when it lives inside
   * it; otherwise return it unchanged. Only a true leading-prefix match is
   * rewritten, never an occurrence in the middle of the path.
   */
  const toProjectRelative = (absPath: string): string => {
    if (absPath === projectDir) return '.'
    if (!absPath.startsWith(projectDir)) return absPath
    const rest = absPath.slice(projectDir.length)
    const lastChar = projectDir.slice(-1)
    // Project dir is a filesystem root such as "/" (already ends with a separator).
    if (lastChar === '/' || lastChar === '\\') return `.${lastChar}${rest}`
    return /^[\\/]/.test(rest) ? `.${rest}` : absPath
  }

  // ── Directory layout ───────────────────────────────────────
  mkdirSync(join(arenaDir, 'decks'), { recursive: true })
  mkdirSync(join(arenaDir, 'runs'), { recursive: true })

  // ── Participants and decks ─────────────────────────────────
  const participants = skills.map((skill, i) => {
    const id = `run-${String(i + 1).padStart(2, '0')}`
    return {
      id,
      name: skill,
      skill_name: skill,
      deck_path: join(arenaDir, 'decks', `arena-${id}.toml`),
    }
  })

  const criteria = criteriaNames.map(c => ({
    name: c,
    label: c,
    weight: 1,
  }))

  for (const p of participants) {
    // JSON string escaping is a subset of TOML basic-string escaping, so
    // quotes/backslashes in a skill name cannot break the deck file.
    // A skill that is both tested and a control is listed once.
    const deckSkills = [...new Set([p.skill_name, ...controlSkills])]
      .map(s => `  ${JSON.stringify(s)},`)
      .join('\n')

    const deckContent = `# ============================================================
# Arena Deck: ${p.id} — ${p.name}
# ============================================================
# 变量:${p.name}
# 控制变量:${controlSkills.join(', ')}
# ============================================================

[deck]
working_set = ".claude/skills"
cold_pool = "~/.agents/skill-repos"
max_cards = 10

[tool]
skills = [
${deckSkills}
]
`
    writeFileSync(p.deck_path, deckContent)
  }

  // ── arena.json ─────────────────────────────────────────────
  const arenaJson = {
    version: '1.0.0',
    metadata: {
      id: arenaId,
      slug: arenaSlug,
      created_at: new Date().toISOString(),
      task_description: task,
      participants,
      criteria,
      working_dir: arenaDir,
    },
    status: 'setup',
    runs: participants.map(p => ({
      participant_id: p.id,
      output_path: join(arenaDir, 'runs', `${p.id}.md`),
    })),
  }

  writeFileSync(join(arenaDir, 'arena.json'), JSON.stringify(arenaJson, null, 2) + '\n')

  // ── Task card ──────────────────────────────────────────────
  const taskCardPath = join(arenaDir, 'TASK-arena.md')
  const relArenaDir = toProjectRelative(arenaDir)
  const parentDeck = join(projectDir, 'skill-deck.toml')
  const fence = '```'

  // Every line of a YAML literal block scalar must carry the indent,
  // otherwise a multi-line task terminates the front matter early.
  const taskYamlBlock = task.split(/\r?\n/).map(line => `  ${line}`).join('\n')
  // Markdown headings are single-line.
  const taskOneLine = task.replace(/\s*\r?\n\s*/g, ' ')

  const subagentSections = participants
    .map(p =>
      [
        `### ${p.id} (${p.name})`,
        `${fence}bash`,
        `cd "${projectDir}"`,
        `bunx @lythos/skill-deck link --deck "${p.deck_path}"`,
        `# 然后执行任务,输出写入 "${join(arenaDir, 'runs', `${p.id}.md`)}"`,
        `bunx @lythos/skill-deck link --deck "${parentDeck}"`,
        fence,
      ].join('\n'),
    )
    .join('\n\n')

  const taskCardContent = `---
type: arena
objective: |
${taskYamlBlock}
evaluation_criteria:
${criteria.map(c => `  - ${c.label}`).join('\n')}
arena_decks:
${participants.map(p => `  - ${toProjectRelative(p.deck_path)}`).join('\n')}
judge_persona: |
  你是一个中立的技能评测员。对比所有 subagent 的输出,
  按 evaluation_criteria 给出 1-5 分评分,最终给出 Winner 和选型建议。
acceptance:
${participants.map(p => `  - Subagent ${p.id} 使用 ${toProjectRelative(p.deck_path)} 完成任务并写入 runs/${p.id}.md`).join('\n')}
  - Judge 读取所有 run 文件并生成 report.md
  - 所有 subagent 完成后恢复父 deck
managed_dirs:
  - ${relArenaDir}/
---

# Arena Task: ${taskOneLine}

## Subagent 指令

${subagentSections}

### Judge
${fence}bash
cd "${projectDir}"
bunx @lythos/skill-deck link --deck "${parentDeck}"
# 读取所有 run 文件,生成 "${join(arenaDir, 'report.md')}"
${fence}
`

  writeFileSync(taskCardPath, taskCardContent)

  // ── Summary ────────────────────────────────────────────────
  console.log(`
🎮 Skill Arena 初始化完成

  ID: ${arenaId}
  任务: ${task}
  目录: ${arenaDir}
  参与者: ${skills.join(', ')}
  控制变量: ${controlSkills.join(', ')}
  评测维度: ${criteriaNames.join(', ')}

生成文件:
  📋 ${join(arenaDir, 'arena.json')}
  🎴 ${participants.length} 个 arena deck → ${join(arenaDir, 'decks')}
  📝 Task Card → ${taskCardPath}

下一步:
  1. 阅读 Task Card: cat "${taskCardPath}"
  2. 按指令逐个/并行启动 subagent
  3. Judge 生成 report.md
`)
}
|
|
222
|
+
|
|
223
|
+
// Run the CLI only when this file is the entry script, not when it is imported
// for `runArena` (relies on the runtime exposing `import.meta.main`, as Bun does).
if (import.meta.main) runArena(process.argv.slice(2))
|