@lythos/skill-arena 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -23
- package/package.json +20 -3
- package/src/cli.ts +1 -1
package/README.md
CHANGED
|
@@ -1,52 +1,81 @@
|
|
|
1
1
|
# @lythos/skill-arena
|
|
2
2
|
|
|
3
|
-
>
|
|
3
|
+
> Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
## Why
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
"Which skill is better?" is the wrong question. The right question is "which skill is better for what."
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
`skill-arena` scaffolds isolated environments in which subagents complete the same task under different decks. A judge agent then scores the outputs across multiple dimensions. Two modes are supported:
|
|
10
|
+
|
|
11
|
+
- **Mode 1**: Single-skill comparison (controlled variable — same helper skills, different test skill).
|
|
12
|
+
- **Mode 2**: Full-deck comparison (Pareto frontier — no single winner, only optimal trade-offs).
|
|
10
13
|
|
|
11
14
|
## Install
|
|
12
15
|
|
|
13
16
|
```bash
|
|
14
17
|
bun add -d @lythos/skill-arena
|
|
15
|
-
# or
|
|
16
|
-
bunx @lythos/skill-arena <
|
|
18
|
+
# or use directly
|
|
19
|
+
bunx @lythos/skill-arena <command>
|
|
17
20
|
```
|
|
18
21
|
|
|
19
|
-
##
|
|
22
|
+
## Quick Start
|
|
20
23
|
|
|
21
24
|
```bash
|
|
22
|
-
#
|
|
25
|
+
# Mode 1: Compare two skills on the same task
|
|
23
26
|
bunx @lythos/skill-arena \
|
|
24
|
-
--task "Generate
|
|
27
|
+
--task "Generate auth flow diagram" \
|
|
25
28
|
--skills "design-doc-mermaid,mermaid-tools" \
|
|
26
29
|
--criteria "syntax,context,token"
|
|
27
30
|
|
|
28
|
-
#
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
#
|
|
31
|
+
# Mode 2: Compare full deck configurations
|
|
32
|
+
bunx @lythos/skill-arena \
|
|
33
|
+
--task "Generate auth flow diagram" \
|
|
34
|
+
--decks "./decks/minimal.toml,./decks/rich.toml" \
|
|
35
|
+
--criteria "quality,token,maintainability"
|
|
36
|
+
|
|
37
|
+
# Visualize results
|
|
38
|
+
bunx @lythos/skill-arena viz tmp/arena-<id>/
|
|
35
39
|
```
|
|
36
40
|
|
|
37
|
-
##
|
|
41
|
+
## Commands
|
|
38
42
|
|
|
39
43
|
```
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
44
|
+
Usage: bunx @lythos/skill-arena <options> | bunx @lythos/skill-arena viz <dir>
|
|
45
|
+
|
|
46
|
+
Mode 1 — Single-Skill Comparison:
|
|
47
|
+
--task, -t <desc> Task description (required)
|
|
48
|
+
--skills, -s <list> Comma-separated skills, 2–5 (Mode 1)
|
|
49
|
+
--criteria, -c <list> Evaluation dimensions (default: syntax,context,logic,token)
|
|
50
|
+
--control <skill> Control skill (default: lythoskill-project-scribe)
|
|
51
|
+
|
|
52
|
+
Mode 2 — Full-Deck Comparison:
|
|
53
|
+
--decks <paths> Comma-separated deck toml paths, 2–5 (Mode 2)
|
|
54
|
+
--criteria, -c <list> Evaluation dimensions (default: syntax,context,logic,token)
|
|
55
|
+
|
|
56
|
+
Common:
|
|
57
|
+
--dir, -d <path> Arena parent directory (default: tmp)
|
|
58
|
+
--project, -p <path> Project root (default: .)
|
|
59
|
+
|
|
60
|
+
Viz:
|
|
61
|
+
viz <dir> Render ASCII charts from report.md
|
|
45
62
|
```
|
|
46
63
|
|
|
64
|
+
## Skill Documentation
|
|
65
|
+
|
|
66
|
+
This package is the **Starter** layer (CLI implementation).
|
|
67
|
+
The agent-visible **Skill** layer documentation is here:
|
|
68
|
+
[packages/lythoskill-arena/skill/SKILL.md](https://github.com/lythos-labs/lythoskill/blob/main/packages/lythoskill-arena/skill/SKILL.md)
|
|
69
|
+
|
|
47
70
|
## Architecture
|
|
48
71
|
|
|
49
|
-
|
|
72
|
+
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
Starter (this package) → npm publish → bunx @lythos/skill-arena ...
|
|
76
|
+
Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
77
|
+
Output (skills/<name>/) → git commit → agent-visible skill
|
|
78
|
+
```
|
|
50
79
|
|
|
51
80
|
## License
|
|
52
81
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,16 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lythos/skill-arena",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"ai-agent",
|
|
7
|
+
"skill",
|
|
8
|
+
"claude-code",
|
|
9
|
+
"agent-skills",
|
|
10
|
+
"llm-tooling",
|
|
11
|
+
"lythoskill"
|
|
12
|
+
],
|
|
13
|
+
"author": "lythos-labs",
|
|
5
14
|
"license": "MIT",
|
|
6
15
|
"type": "module",
|
|
7
16
|
"bin": {
|
|
@@ -12,8 +21,16 @@
|
|
|
12
21
|
"README.md",
|
|
13
22
|
"LICENSE"
|
|
14
23
|
],
|
|
24
|
+
"repository": {
|
|
25
|
+
"type": "git",
|
|
26
|
+
"url": "git+https://github.com/lythos-labs/lythoskill.git",
|
|
27
|
+
"directory": "packages/lythoskill-arena"
|
|
28
|
+
},
|
|
29
|
+
"bugs": {
|
|
30
|
+
"url": "https://github.com/lythos-labs/lythoskill/issues"
|
|
31
|
+
},
|
|
32
|
+
"homepage": "https://github.com/lythos-labs/lythoskill/tree/main/packages/lythoskill-arena#readme",
|
|
15
33
|
"engines": {
|
|
16
34
|
"bun": ">=1.0.0"
|
|
17
|
-
}
|
|
18
|
-
"license": "MIT"
|
|
35
|
+
}
|
|
19
36
|
}
|
package/src/cli.ts
CHANGED
|
@@ -110,7 +110,7 @@ export function runArena(argv: string[]) {
|
|
|
110
110
|
const CRITERIA = (options.criteria || 'syntax,context,logic,token')
|
|
111
111
|
.split(',').map(s => s.trim()).filter(Boolean)
|
|
112
112
|
|
|
113
|
-
const CONTROL_SKILLS = (options.control || 'lythoskill-project-
|
|
113
|
+
const CONTROL_SKILLS = (options.control || 'lythoskill-project-scribe')
|
|
114
114
|
.split(',').map(s => s.trim()).filter(Boolean)
|
|
115
115
|
|
|
116
116
|
const PROJECT_DIR = resolve(options.project!)
|