@lythos/skill-arena 0.3.0 → 0.5.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +52 -23
- package/package.json +20 -3
- package/src/cli.ts +30 -1
package/README.md
CHANGED
|
@@ -1,52 +1,81 @@
|
|
|
1
1
|
# @lythos/skill-arena
|
|
2
2
|
|
|
3
|
-
>
|
|
3
|
+
> Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
## Why
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
"Which skill is better?" is the wrong question. The right question is "which skill is better for what."
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
`skill-arena` scaffolds isolated environments where subagents complete the same task under different decks. A judge agent scores outputs across multiple dimensions. Supports:
|
|
10
|
+
|
|
11
|
+
- **Mode 1**: Single-skill comparison (controlled variable — same helper skills, different test skill).
|
|
12
|
+
- **Mode 2**: Full-deck comparison (Pareto frontier — no single winner, only optimal trade-offs).
|
|
10
13
|
|
|
11
14
|
## Install
|
|
12
15
|
|
|
13
16
|
```bash
|
|
14
17
|
bun add -d @lythos/skill-arena
|
|
15
|
-
# or
|
|
16
|
-
bunx @lythos/skill-arena <
|
|
18
|
+
# or use directly
|
|
19
|
+
bunx @lythos/skill-arena <command>
|
|
17
20
|
```
|
|
18
21
|
|
|
19
|
-
##
|
|
22
|
+
## Quick Start
|
|
20
23
|
|
|
21
24
|
```bash
|
|
22
|
-
#
|
|
25
|
+
# Mode 1: Compare two skills on the same task
|
|
23
26
|
bunx @lythos/skill-arena \
|
|
24
|
-
--task "Generate
|
|
27
|
+
--task "Generate auth flow diagram" \
|
|
25
28
|
--skills "design-doc-mermaid,mermaid-tools" \
|
|
26
29
|
--criteria "syntax,context,token"
|
|
27
30
|
|
|
28
|
-
#
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
#
|
|
31
|
+
# Mode 2: Compare full deck configurations
|
|
32
|
+
bunx @lythos/skill-arena \
|
|
33
|
+
--task "Generate auth flow diagram" \
|
|
34
|
+
--decks "./decks/minimal.toml,./decks/rich.toml" \
|
|
35
|
+
--criteria "quality,token,maintainability"
|
|
36
|
+
|
|
37
|
+
# Visualize results
|
|
38
|
+
bunx @lythos/skill-arena viz tmp/arena-<id>/
|
|
35
39
|
```
|
|
36
40
|
|
|
37
|
-
##
|
|
41
|
+
## Commands
|
|
38
42
|
|
|
39
43
|
```
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
44
|
+
Usage: bunx @lythos/skill-arena <options> | bunx @lythos/skill-arena viz <dir>
|
|
45
|
+
|
|
46
|
+
Mode 1 — Single-Skill Comparison:
|
|
47
|
+
--task, -t <desc> Task description (required)
|
|
48
|
+
--skills, -s <list> Comma-separated skills, 2–5 (Mode 1)
|
|
49
|
+
--criteria, -c <list> Evaluation dimensions (default: syntax,context,logic,token)
|
|
50
|
+
--control <skill> Control skill (default: lythoskill-project-scribe)
|
|
51
|
+
|
|
52
|
+
Mode 2 — Full-Deck Comparison:
|
|
53
|
+
--decks <paths> Comma-separated deck toml paths, 2–5 (Mode 2)
|
|
54
|
+
--criteria, -c <list> Evaluation dimensions
|
|
55
|
+
|
|
56
|
+
Common:
|
|
57
|
+
--dir, -d <path> Arena parent directory (default: tmp)
|
|
58
|
+
--project, -p <path> Project root (default: .)
|
|
59
|
+
|
|
60
|
+
Viz:
|
|
61
|
+
viz <dir> Render ASCII charts from report.md
|
|
45
62
|
```
|
|
46
63
|
|
|
64
|
+
## Skill Documentation
|
|
65
|
+
|
|
66
|
+
This package is the **Starter** layer (CLI implementation).
|
|
67
|
+
The agent-visible **Skill** layer documentation is here:
|
|
68
|
+
[packages/lythoskill-arena/skill/SKILL.md](../../packages/lythoskill-arena/skill/SKILL.md)
|
|
69
|
+
|
|
47
70
|
## Architecture
|
|
48
71
|
|
|
49
|
-
|
|
72
|
+
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
Starter (this package) → npm publish → bunx @lythos/skill-arena ...
|
|
76
|
+
Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
77
|
+
Output (skills/<name>/) → git commit → agent-visible skill
|
|
78
|
+
```
|
|
50
79
|
|
|
51
80
|
## License
|
|
52
81
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,16 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lythos/skill-arena",
|
|
3
|
-
"version": "0.3.0",
|
|
3
|
+
"version": "0.5.0",
|
|
4
4
|
"description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"ai-agent",
|
|
7
|
+
"skill",
|
|
8
|
+
"claude-code",
|
|
9
|
+
"agent-skills",
|
|
10
|
+
"llm-tooling",
|
|
11
|
+
"lythoskill"
|
|
12
|
+
],
|
|
13
|
+
"author": "lythos-labs",
|
|
5
14
|
"license": "MIT",
|
|
6
15
|
"type": "module",
|
|
7
16
|
"bin": {
|
|
@@ -12,8 +21,16 @@
|
|
|
12
21
|
"README.md",
|
|
13
22
|
"LICENSE"
|
|
14
23
|
],
|
|
24
|
+
"repository": {
|
|
25
|
+
"type": "git",
|
|
26
|
+
"url": "git+https://github.com/lythos-labs/lythoskill.git",
|
|
27
|
+
"directory": "packages/lythoskill-arena"
|
|
28
|
+
},
|
|
29
|
+
"bugs": {
|
|
30
|
+
"url": "https://github.com/lythos-labs/lythoskill/issues"
|
|
31
|
+
},
|
|
32
|
+
"homepage": "https://github.com/lythos-labs/lythoskill/tree/main/packages/lythoskill-arena#readme",
|
|
15
33
|
"engines": {
|
|
16
34
|
"bun": ">=1.0.0"
|
|
17
|
-
}
|
|
18
|
-
"license": "MIT"
|
|
35
|
+
}
|
|
19
36
|
}
|
package/src/cli.ts
CHANGED
|
@@ -25,7 +25,36 @@ function timestamp(): string {
|
|
|
25
25
|
}
|
|
26
26
|
|
|
27
27
|
// ── 解析参数(简单 slice 风格)──────────────────────────────
|
|
28
|
+
function printHelp(): void {
|
|
29
|
+
console.log(`🎭 lythoskill-arena — Skill comparison runner
|
|
30
|
+
|
|
31
|
+
Usage:
|
|
32
|
+
lythoskill-arena --task "<task description>" --skills <skill1,skill2,...>
|
|
33
|
+
lythoskill-arena --task "<task description>" --decks <deck1,deck2,...>
|
|
34
|
+
lythoskill-arena viz <arena-dir>
|
|
35
|
+
|
|
36
|
+
Options:
|
|
37
|
+
-t, --task <desc> Task description (required)
|
|
38
|
+
-s, --skills <list> Comma-separated skill names
|
|
39
|
+
--decks <list> Comma-separated deck paths
|
|
40
|
+
-c, --criteria <list> Evaluation criteria (default: syntax,context,logic,token)
|
|
41
|
+
--control <skill> Control skill for comparison (default: lythoskill-project-scribe)
|
|
42
|
+
-d, --dir <dir> Output directory (default: tmp)
|
|
43
|
+
-p, --project <dir> Project directory (default: .)
|
|
44
|
+
|
|
45
|
+
Examples:
|
|
46
|
+
lythoskill-arena --task "Refactor auth module" --skills skill-a,skill-b
|
|
47
|
+
lythoskill-arena --task "Write tests" --decks ./decks/minimal.toml,./decks/full.toml
|
|
48
|
+
lythoskill-arena viz tmp/arena-20260430
|
|
49
|
+
`)
|
|
50
|
+
}
|
|
51
|
+
|
|
28
52
|
function parseArgs(argv: string[]) {
|
|
53
|
+
if (argv.includes('--help') || argv.includes('-h')) {
|
|
54
|
+
printHelp()
|
|
55
|
+
process.exit(0)
|
|
56
|
+
}
|
|
57
|
+
|
|
29
58
|
const options: Record<string, string | undefined> = {
|
|
30
59
|
task: undefined,
|
|
31
60
|
skills: undefined,
|
|
@@ -110,7 +139,7 @@ export function runArena(argv: string[]) {
|
|
|
110
139
|
const CRITERIA = (options.criteria || 'syntax,context,logic,token')
|
|
111
140
|
.split(',').map(s => s.trim()).filter(Boolean)
|
|
112
141
|
|
|
113
|
-
const CONTROL_SKILLS = (options.control || 'lythoskill-project-
|
|
142
|
+
const CONTROL_SKILLS = (options.control || 'lythoskill-project-scribe')
|
|
114
143
|
.split(',').map(s => s.trim()).filter(Boolean)
|
|
115
144
|
|
|
116
145
|
const PROJECT_DIR = resolve(options.project!)
|