@lythos/skill-arena 0.9.37 → 0.9.39
This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
- package/README.md +13 -24
- package/package.json +6 -1
- package/src/cli.ts +32 -108
package/README.md
CHANGED
|
@@ -49,26 +49,24 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
|
|
|
49
49
|
```bash
|
|
50
50
|
bun add -d @lythos/skill-arena
|
|
51
51
|
# or use directly
|
|
52
|
-
bunx @lythos/skill-arena@0.9.
|
|
52
|
+
bunx @lythos/skill-arena@0.9.39 <command>
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
## Quick Start
|
|
56
56
|
|
|
57
57
|
```bash
|
|
58
|
-
# Mode 1: Compare two
|
|
59
|
-
bunx @lythos/skill-arena@0.9.
|
|
60
|
-
--
|
|
61
|
-
--skills "design-doc-mermaid,mermaid-tools" \
|
|
62
|
-
--criteria "syntax,context,token"
|
|
58
|
+
# Mode 1: Compare two decks on the same task (declarative)
|
|
59
|
+
bunx @lythos/skill-arena@0.9.39 run \
|
|
60
|
+
--config examples/arena/research-compare/arena.toml
|
|
63
61
|
|
|
64
|
-
# Mode 2: Compare full deck configurations
|
|
65
|
-
bunx @lythos/skill-arena@0.9.
|
|
62
|
+
# Mode 2: Compare full deck configurations via CLI flags
|
|
63
|
+
bunx @lythos/skill-arena@0.9.39 run \
|
|
66
64
|
--task "Generate auth flow diagram" \
|
|
67
65
|
--decks "./decks/minimal.toml,./decks/rich.toml" \
|
|
68
66
|
--criteria "quality,token,maintainability"
|
|
69
67
|
|
|
70
68
|
# Visualize results
|
|
71
|
-
bunx @lythos/skill-arena@0.9.37 viz tmp/arena-<id>/
|
|
69
|
+
bunx @lythos/skill-arena@0.9.39 viz tmp/arena-<id>/
|
|
72
70
|
```
|
|
73
71
|
|
|
74
72
|
## Commands
|
|
@@ -77,16 +75,16 @@ bunx @lythos/skill-arena@0.9.37 viz tmp/arena-<id>/
|
|
|
77
75
|
|
|
78
76
|
```bash
|
|
79
77
|
# Print execution plan without running
|
|
80
|
-
bunx @lythos/skill-arena@0.9.
|
|
78
|
+
bunx @lythos/skill-arena@0.9.39 run --config arena.toml --dry-run
|
|
81
79
|
|
|
82
80
|
# Execute with per-side runs_per_side and statistical aggregation
|
|
83
|
-
bunx @lythos/skill-arena@0.9.
|
|
81
|
+
bunx @lythos/skill-arena@0.9.39 run --config arena.toml
|
|
84
82
|
```
|
|
85
83
|
|
|
86
84
|
### CLI-flag mode (backward compat)
|
|
87
85
|
|
|
88
86
|
```
|
|
89
|
-
bunx @lythos/skill-arena@0.9.37 run \
|
|
87
|
+
bunx @lythos/skill-arena@0.9.39 run \
|
|
90
88
|
--task ./TASK-arena.md \
|
|
91
89
|
--players ./players/claude.toml \
|
|
92
90
|
--decks ./decks/run-01.toml,./decks/run-02.toml \
|
|
@@ -96,13 +94,13 @@ bunx @lythos/skill-arena@0.9.37 run \
|
|
|
96
94
|
### Scaffold mode (legacy, manual execution)
|
|
97
95
|
|
|
98
96
|
```
|
|
99
|
-
bunx @lythos/skill-arena@0.9.
|
|
97
|
+
bunx @lythos/skill-arena@0.9.39 scaffold --task "..." --decks a.toml,b.toml
|
|
100
98
|
```
|
|
101
99
|
|
|
102
100
|
### Viz
|
|
103
101
|
|
|
104
102
|
```bash
|
|
105
|
-
bunx @lythos/skill-arena@0.9.
|
|
103
|
+
bunx @lythos/skill-arena@0.9.39 viz runs/arena-<id>/
|
|
106
104
|
```
|
|
107
105
|
|
|
108
106
|
## Skill Documentation
|
|
@@ -116,7 +114,7 @@ The agent-visible **Skill** layer documentation is here:
|
|
|
116
114
|
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
|
|
117
115
|
|
|
118
116
|
```
|
|
119
|
-
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.
|
|
117
|
+
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.39 ...
|
|
120
118
|
Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
121
119
|
Output (skills/<name>/) → git commit → agent-visible skill
|
|
122
120
|
```
|
|
@@ -137,15 +135,6 @@ arena.toml → ArenaToml (Zod) → ExecutionPlan (pure) → per-cell agent
|
|
|
137
135
|
|
|
138
136
|
Built on `@lythos/test-utils` shared infrastructure.
|
|
139
137
|
|
|
140
|
-
## Test Coverage
|
|
141
|
-
|
|
142
|
-
| Layer | Count | CI | Notes |
|
|
143
|
-
|-------|-------|----|-------|
|
|
144
|
-
| Unit tests | 41 | ✅ | TOML parser, player resolution, Pareto, stats |
|
|
145
|
-
| Agent BDD | — | ❌ | Requires `claude` CLI; run locally |
|
|
146
|
-
|
|
147
|
-
Pareto frontier is a **deterministic algorithm** — never delegated to LLM. 8 unit tests cover dominance, cross-dominance, transitive chains, partial criteria, and empty scores.
|
|
148
|
-
|
|
149
138
|
## License
|
|
150
139
|
|
|
151
140
|
MIT
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lythos/skill-arena",
|
|
3
|
-
"version": "0.9.37",
|
|
3
|
+
"version": "0.9.39",
|
|
4
4
|
"description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-agent",
|
|
@@ -16,6 +16,11 @@
|
|
|
16
16
|
"access": "public"
|
|
17
17
|
},
|
|
18
18
|
"type": "module",
|
|
19
|
+
"scripts": {
|
|
20
|
+
"test": "bun test src/ --pass-with-no-tests",
|
|
21
|
+
"test:coverage": "bun test src/ --coverage --coverage-reporter=lcov --coverage-dir=coverage --pass-with-no-tests",
|
|
22
|
+
"test:watch": "bun test src/ --watch"
|
|
23
|
+
},
|
|
19
24
|
"bin": {
|
|
20
25
|
"lythoskill-arena": "src/cli.ts"
|
|
21
26
|
},
|
package/src/cli.ts
CHANGED
|
@@ -40,7 +40,6 @@ Usage:
|
|
|
40
40
|
lythoskill-arena agent-run --task <path> --deck <path> [--player kimi] [--out <dir>] [--timeout <ms>]
|
|
41
41
|
lythoskill-arena agent-run --brief "<prompt>" --deck <path> [--out <dir>] [--timeout <ms>]
|
|
42
42
|
lythoskill-arena run --task <path> --players <A.toml,B.toml> --decks <A.toml,B.toml> --criteria <c1,c2,...> [--out <dir>]
|
|
43
|
-
lythoskill-arena scaffold --task "<description>" --skills <skill1,skill2,...>
|
|
44
43
|
lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
|
|
45
44
|
lythoskill-arena viz <arena-dir>
|
|
46
45
|
|
|
@@ -51,13 +50,11 @@ Commands:
|
|
|
51
50
|
|
|
52
51
|
Options:
|
|
53
52
|
-t, --task <path|desc> Task description or path to TASK-arena.md
|
|
54
|
-
-s, --skills <list> Comma-separated skill names (scaffold only)
|
|
55
53
|
--decks <list> Comma-separated deck paths
|
|
56
54
|
-c, --criteria <list> Evaluation criteria (default: syntax,context,logic,token)
|
|
57
55
|
--players <list> Comma-separated player.toml paths (CLI run only)
|
|
58
56
|
--config <path> Path to arena.toml (declarative mode, k8s-style)
|
|
59
57
|
--dry-run Print execution plan without running (with --config)
|
|
60
|
-
--control <skill> Control skill for comparison (scaffold only)
|
|
61
58
|
--out <dir> Output directory (run: defaults to runs/arena-<id>)
|
|
62
59
|
-d, --dir <dir> Output directory (scaffold: defaults to tmp)
|
|
63
60
|
-p, --project <dir> Project directory (default: .)
|
|
@@ -75,7 +72,7 @@ Examples:
|
|
|
75
72
|
lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
|
|
76
73
|
|
|
77
74
|
# Legacy scaffolding
|
|
78
|
-
lythoskill-arena scaffold --task "Refactor auth module" --
|
|
75
|
+
lythoskill-arena scaffold --task "Refactor auth module" --decks ./decks/a.toml,./decks/b.toml
|
|
79
76
|
lythoskill-arena viz runs/arena-20260504
|
|
80
77
|
`)
|
|
81
78
|
}
|
|
@@ -107,9 +104,12 @@ async function agentRun(args: string[]) {
|
|
|
107
104
|
let deckPath: string
|
|
108
105
|
if (opts.deck.startsWith('http://') || opts.deck.startsWith('https://')) {
|
|
109
106
|
let url = opts.deck
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
107
|
+
try {
|
|
108
|
+
const u = new URL(url)
|
|
109
|
+
if (u.hostname === 'github.com' && u.pathname.includes('/blob/')) {
|
|
110
|
+
url = `https://raw.githubusercontent.com${u.pathname.replace('/blob/', '/')}`
|
|
111
|
+
}
|
|
112
|
+
} catch { /* keep original url */ }
|
|
113
113
|
const dest = resolve(process.cwd(), 'arena-deck.toml')
|
|
114
114
|
console.log(`📥 Fetching arena deck: ${url}`)
|
|
115
115
|
const res = await fetch(url, { signal: AbortSignal.timeout(30_000) })
|
|
@@ -125,6 +125,7 @@ async function agentRun(args: string[]) {
|
|
|
125
125
|
const { useAgent } = await import('@lythos/test-utils/agents')
|
|
126
126
|
// Optional: register claude-sdk adapter if the package is installed
|
|
127
127
|
try { await import('@lythos/agent-adapter-claude-sdk') } catch { /* package not installed */ }
|
|
128
|
+
try { await import('@lythos/agent-adapter-deepseek-serve') } catch { /* package not installed */ }
|
|
128
129
|
const { runAgentScenario } = await import('@lythos/test-utils/agent-bdd')
|
|
129
130
|
const { resolvePlayer } = await import('./player')
|
|
130
131
|
const { readFileSync, writeFileSync, mkdirSync } = await import('node:fs')
|
|
@@ -264,10 +265,8 @@ function parseArgs(argv: string[]) {
|
|
|
264
265
|
|
|
265
266
|
const options: Record<string, string | undefined> = {
|
|
266
267
|
task: undefined,
|
|
267
|
-
skills: undefined,
|
|
268
268
|
decks: undefined,
|
|
269
269
|
criteria: 'syntax,context,logic,token',
|
|
270
|
-
control: 'lythoskill-project-scribe',
|
|
271
270
|
dir: 'tmp',
|
|
272
271
|
project: '.',
|
|
273
272
|
config: undefined,
|
|
@@ -280,13 +279,10 @@ function parseArgs(argv: string[]) {
|
|
|
280
279
|
const arg = argv[i]
|
|
281
280
|
if (arg === '--task' || arg === '-t') {
|
|
282
281
|
options.task = argv[++i]
|
|
283
|
-
} else if (arg === '--skills' || arg === '-s') {
|
|
284
|
-
options.skills = argv[++i]
|
|
285
282
|
} else if (arg === '--decks') {
|
|
286
283
|
options.decks = argv[++i]
|
|
287
284
|
} else if (arg === '--criteria' || arg === '-c') {
|
|
288
285
|
options.criteria = argv[++i]
|
|
289
|
-
} else if (arg === '--control') {
|
|
290
286
|
options.control = argv[++i]
|
|
291
287
|
} else if (arg === '--dir' || arg === '-d') {
|
|
292
288
|
options.dir = argv[++i]
|
|
@@ -315,39 +311,13 @@ export function runArena(argv: string[]) {
|
|
|
315
311
|
process.exit(1)
|
|
316
312
|
}
|
|
317
313
|
|
|
318
|
-
const
|
|
319
|
-
const HAS_SKILLS = !!options.skills
|
|
320
|
-
|
|
321
|
-
if (!HAS_DECKS && !HAS_SKILLS) {
|
|
322
|
-
console.error('❌ 请提供 --skills 或 --decks')
|
|
323
|
-
process.exit(1)
|
|
324
|
-
}
|
|
325
|
-
if (HAS_DECKS && HAS_SKILLS) {
|
|
326
|
-
console.error('❌ --skills 和 --decks 不能同时使用')
|
|
327
|
-
process.exit(1)
|
|
328
|
-
}
|
|
329
|
-
|
|
330
|
-
const DECK_PATHS = HAS_DECKS
|
|
331
|
-
? (options.decks || '').split(',').map(s => s.trim()).filter(Boolean)
|
|
332
|
-
: []
|
|
314
|
+
const DECK_PATHS = (options.decks || '').split(',').map(s => s.trim()).filter(Boolean)
|
|
333
315
|
|
|
334
|
-
|
|
335
|
-
? (options.skills || '').split(',').map(s => s.trim()).filter(Boolean)
|
|
336
|
-
: []
|
|
337
|
-
|
|
338
|
-
if (HAS_SKILLS && SKILLS.length < 2) {
|
|
339
|
-
console.error('❌ 至少需要 2 个 skill 才能进行 arena')
|
|
340
|
-
process.exit(1)
|
|
341
|
-
}
|
|
342
|
-
if (HAS_SKILLS && SKILLS.length > 5) {
|
|
343
|
-
console.error('❌ 一次 arena 最多 5 个 skill')
|
|
344
|
-
process.exit(1)
|
|
345
|
-
}
|
|
346
|
-
if (HAS_DECKS && DECK_PATHS.length < 2) {
|
|
316
|
+
if (DECK_PATHS.length < 2) {
|
|
347
317
|
console.error('❌ 至少需要 2 个 deck 才能进行 arena')
|
|
348
318
|
process.exit(1)
|
|
349
319
|
}
|
|
350
|
-
if (
|
|
320
|
+
if (DECK_PATHS.length > 5) {
|
|
351
321
|
console.error('❌ 一次 arena 最多 5 个 deck')
|
|
352
322
|
process.exit(1)
|
|
353
323
|
}
|
|
@@ -355,9 +325,6 @@ export function runArena(argv: string[]) {
|
|
|
355
325
|
const CRITERIA = (options.criteria || 'syntax,context,logic,token')
|
|
356
326
|
.split(',').map(s => s.trim()).filter(Boolean)
|
|
357
327
|
|
|
358
|
-
const CONTROL_SKILLS = (options.control || 'lythoskill-project-scribe')
|
|
359
|
-
.split(',').map(s => s.trim()).filter(Boolean)
|
|
360
|
-
|
|
361
328
|
const PROJECT_DIR = resolve(options.project!)
|
|
362
329
|
const ARENA_SLUG = slugify(TASK)
|
|
363
330
|
const ARENA_ID = `arena-${timestamp()}-${ARENA_SLUG.slice(0, 30)}`
|
|
@@ -369,37 +336,20 @@ export function runArena(argv: string[]) {
|
|
|
369
336
|
mkdirSync(join(ARENA_DIR, 'sides'), { recursive: true })
|
|
370
337
|
|
|
371
338
|
// ── 生成参与者与 deck ───────────────────────────────────────
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
const
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
console.error(`❌ Deck 文件不存在: ${deckPath}`)
|
|
387
|
-
process.exit(1)
|
|
388
|
-
}
|
|
389
|
-
return { id, name, skill_name: name, deck_path: destPath }
|
|
390
|
-
})
|
|
391
|
-
} else {
|
|
392
|
-
mode = 'single-skill'
|
|
393
|
-
participants = SKILLS.map((skill, i) => {
|
|
394
|
-
const id = `run-${String(i + 1).padStart(2, '0')}`
|
|
395
|
-
return {
|
|
396
|
-
id,
|
|
397
|
-
name: skill,
|
|
398
|
-
skill_name: skill,
|
|
399
|
-
deck_path: join(ARENA_DIR, 'decks', `arena-${id}.toml`),
|
|
400
|
-
}
|
|
401
|
-
})
|
|
402
|
-
}
|
|
339
|
+
const participants = DECK_PATHS.map((deckPath, i) => {
|
|
340
|
+
const id = `run-${String(i + 1).padStart(2, '0')}`
|
|
341
|
+
const name = basename(deckPath).replace(/\.toml$/, '')
|
|
342
|
+
const destPath = join(ARENA_DIR, 'decks', `arena-${id}.toml`)
|
|
343
|
+
// Copy the provided deck to arena directory
|
|
344
|
+
if (existsSync(deckPath)) {
|
|
345
|
+
const content = readFileSync(deckPath, 'utf-8')
|
|
346
|
+
writeFileSync(destPath, content)
|
|
347
|
+
} else {
|
|
348
|
+
console.error(`❌ Deck 文件不存在: ${deckPath}`)
|
|
349
|
+
process.exit(1)
|
|
350
|
+
}
|
|
351
|
+
return { id, name, skill_name: name, deck_path: destPath }
|
|
352
|
+
})
|
|
403
353
|
|
|
404
354
|
const criteria = CRITERIA.map((c) => ({
|
|
405
355
|
name: c,
|
|
@@ -407,29 +357,6 @@ export function runArena(argv: string[]) {
|
|
|
407
357
|
weight: 1,
|
|
408
358
|
}))
|
|
409
359
|
|
|
410
|
-
if (mode === 'single-skill') {
|
|
411
|
-
for (const p of participants) {
|
|
412
|
-
const deckContent = `# ============================================================
|
|
413
|
-
# Arena Deck: ${p.id} — ${p.name}
|
|
414
|
-
# ============================================================
|
|
415
|
-
# 变量:${p.name}
|
|
416
|
-
# 控制变量:${CONTROL_SKILLS.join(', ')}
|
|
417
|
-
# ============================================================
|
|
418
|
-
|
|
419
|
-
[deck]
|
|
420
|
-
working_set = ".claude/skills"
|
|
421
|
-
cold_pool = "~/.agents/skill-repos"
|
|
422
|
-
max_cards = 10
|
|
423
|
-
|
|
424
|
-
[tool]
|
|
425
|
-
skills = [
|
|
426
|
-
${[...new Set([p.skill_name, ...CONTROL_SKILLS])].map(s => ` "${s}",`).join('\n')}
|
|
427
|
-
]
|
|
428
|
-
`
|
|
429
|
-
writeFileSync(p.deck_path, deckContent)
|
|
430
|
-
}
|
|
431
|
-
}
|
|
432
|
-
|
|
433
360
|
// ── 为每个 side 创建隔离工作空间 ────────────────────────────
|
|
434
361
|
for (const p of participants) {
|
|
435
362
|
const sideDir = join(ARENA_DIR, 'sides', p.id)
|
|
@@ -477,14 +404,11 @@ ${criteria.map(c => ` - ${c.label}`).join('\n')}
|
|
|
477
404
|
arena_decks:
|
|
478
405
|
${participants.map(p => ` - ${p.deck_path.replace(PROJECT_DIR, '.')}`).join('\n')}
|
|
479
406
|
judge_persona: |
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
如果发现任何涌现 combo(多个 skill 组合产生 1+1>2 的效果),单独标注。`
|
|
486
|
-
: `你是一个中立的技能评测员。对比所有 subagent 的输出,
|
|
487
|
-
按 evaluation_criteria 给出 1-5 分评分,最终给出 Winner 和选型建议。`}
|
|
407
|
+
你是一个多目标优化分析师。不要选 Winner。
|
|
408
|
+
对每个 deck 配置,按 evaluation_criteria 输出评分向量(1-5 分)。
|
|
409
|
+
识别 Pareto 非支配解集——没有"最强",只有"在不同维度上的最优权衡"。
|
|
410
|
+
对被支配的解,说明它被谁支配、在哪个维度上劣势。
|
|
411
|
+
如果发现任何涌现 combo(多个 skill 组合产生 1+1>2 的效果),单独标注。
|
|
488
412
|
acceptance:
|
|
489
413
|
${participants.map(p => ` - Subagent ${p.id} 在 sides/${p.id}/ 隔离环境完成任务并写入 runs/${p.id}.md`).join('\n')}
|
|
490
414
|
- Judge 读取所有 run 文件并生成 report.md
|
|
@@ -523,9 +447,9 @@ cd "${ARENA_DIR}"
|
|
|
523
447
|
ID: ${ARENA_ID}
|
|
524
448
|
任务: ${TASK}
|
|
525
449
|
目录: ${ARENA_DIR}
|
|
526
|
-
模式:
|
|
450
|
+
模式: deck 配置对比
|
|
527
451
|
参与者: ${participants.map(p => p.name).join(', ')}
|
|
528
|
-
|
|
452
|
+
评测维度: ${CRITERIA.join(', ')}
|
|
529
453
|
|
|
530
454
|
生成文件:
|
|
531
455
|
📋 ${join(ARENA_DIR, 'arena.json')}
|