@lythos/skill-arena 0.9.39 → 0.9.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -16
- package/package.json +1 -1
- package/src/cli.ts +57 -42
package/README.md
CHANGED
|
@@ -49,24 +49,23 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
|
|
|
49
49
|
```bash
|
|
50
50
|
bun add -d @lythos/skill-arena
|
|
51
51
|
# or use directly
|
|
52
|
-
bunx @lythos/skill-arena@0.9.39 <command>
|
|
52
|
+
bunx @lythos/skill-arena@0.9.40 <command>
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
## Quick Start
|
|
56
56
|
|
|
57
57
|
```bash
|
|
58
|
-
#
|
|
59
|
-
bunx @lythos/skill-arena@0.9.
|
|
60
|
-
--
|
|
58
|
+
# Single: test a deck with one agent (exec shortcut)
|
|
59
|
+
bunx @lythos/skill-arena@0.9.40 single \
|
|
60
|
+
--brief "Generate auth flow diagram" \
|
|
61
|
+
--deck ./examples/decks/documents.toml
|
|
61
62
|
|
|
62
|
-
#
|
|
63
|
-
bunx @lythos/skill-arena@0.9.
|
|
64
|
-
--
|
|
65
|
-
--decks "./decks/minimal.toml,./decks/rich.toml" \
|
|
66
|
-
--criteria "quality,token,maintainability"
|
|
63
|
+
# Vs: compare multiple decks side by side (declarative)
|
|
64
|
+
bunx @lythos/skill-arena@0.9.40 vs \
|
|
65
|
+
--config examples/arena/research-compare/arena.toml
|
|
67
66
|
|
|
68
67
|
# Visualize results
|
|
69
|
-
bunx @lythos/skill-arena@0.9.39 viz tmp/arena-<id>/
|
|
68
|
+
bunx @lythos/skill-arena@0.9.40 viz tmp/arena-<id>/
|
|
70
69
|
```
|
|
71
70
|
|
|
72
71
|
## Commands
|
|
@@ -75,16 +74,16 @@ bunx @lythos/skill-arena@0.9.39 viz tmp/arena-<id>/
|
|
|
75
74
|
|
|
76
75
|
```bash
|
|
77
76
|
# Print execution plan without running
|
|
78
|
-
bunx @lythos/skill-arena@0.9.
|
|
77
|
+
bunx @lythos/skill-arena@0.9.40 vs --config arena.toml --dry-run
|
|
79
78
|
|
|
80
79
|
# Execute with per-side runs_per_side and statistical aggregation
|
|
81
|
-
bunx @lythos/skill-arena@0.9.
|
|
80
|
+
bunx @lythos/skill-arena@0.9.40 vs --config arena.toml
|
|
82
81
|
```
|
|
83
82
|
|
|
84
83
|
### CLI-flag mode (backward compat)
|
|
85
84
|
|
|
86
85
|
```
|
|
87
|
-
bunx @lythos/skill-arena@0.9.39 run \
|
|
86
|
+
bunx @lythos/skill-arena@0.9.40 run \
|
|
88
87
|
--task ./TASK-arena.md \
|
|
89
88
|
--players ./players/claude.toml \
|
|
90
89
|
--decks ./decks/run-01.toml,./decks/run-02.toml \
|
|
@@ -94,13 +93,13 @@ bunx @lythos/skill-arena@0.9.39 run \
|
|
|
94
93
|
### Scaffold mode (legacy, manual execution)
|
|
95
94
|
|
|
96
95
|
```
|
|
97
|
-
bunx @lythos/skill-arena@0.9.39 scaffold --task "..." --decks a.toml,b.toml
|
|
96
|
+
bunx @lythos/skill-arena@0.9.40 scaffold --task "..." --decks a.toml,b.toml
|
|
98
97
|
```
|
|
99
98
|
|
|
100
99
|
### Viz
|
|
101
100
|
|
|
102
101
|
```bash
|
|
103
|
-
bunx @lythos/skill-arena@0.9.39 viz runs/arena-<id>/
|
|
102
|
+
bunx @lythos/skill-arena@0.9.40 viz runs/arena-<id>/
|
|
104
103
|
```
|
|
105
104
|
|
|
106
105
|
## Skill Documentation
|
|
@@ -114,7 +113,7 @@ The agent-visible **Skill** layer documentation is here:
|
|
|
114
113
|
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
|
|
115
114
|
|
|
116
115
|
```
|
|
117
|
-
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.39 ...
|
|
116
|
+
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.40 ...
|
|
118
117
|
Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
119
118
|
Output (skills/<name>/) → git commit → agent-visible skill
|
|
120
119
|
```
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -37,39 +37,39 @@ function printHelp(): void {
|
|
|
37
37
|
console.log(`🎭 lythoskill-arena — Skill comparison runner
|
|
38
38
|
|
|
39
39
|
Usage:
|
|
40
|
-
lythoskill-arena
|
|
41
|
-
lythoskill-arena
|
|
42
|
-
lythoskill-arena
|
|
40
|
+
lythoskill-arena single --task <path> --deck <path> [--player kimi] [--out <dir>] [--timeout <ms>]
|
|
41
|
+
lythoskill-arena single --brief "<prompt>" --deck <path> [--out <dir>] [--timeout <ms>]
|
|
42
|
+
lythoskill-arena vs --config arena.toml [--dry-run]
|
|
43
43
|
lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
|
|
44
44
|
lythoskill-arena viz <arena-dir>
|
|
45
45
|
|
|
46
46
|
Commands:
|
|
47
|
-
|
|
47
|
+
single Single-player deck test (exec shortcut): test a deck with one player
|
|
48
|
+
vs Multi-side comparison: run arena from declarative arena.toml
|
|
48
49
|
scaffold Create arena directory structure (legacy, manual subagent execution)
|
|
49
50
|
viz Visualize arena report (ASCII charts)
|
|
50
51
|
|
|
51
52
|
Options:
|
|
52
|
-
-t, --task <path|desc> Task description or path to TASK-arena.md
|
|
53
|
-
--
|
|
54
|
-
|
|
55
|
-
--
|
|
56
|
-
|
|
57
|
-
--
|
|
58
|
-
--
|
|
59
|
-
|
|
60
|
-
-
|
|
53
|
+
-t, --task <path|desc> Task description or path to TASK-arena.md / .agent.md
|
|
54
|
+
--deck <path> Deck path (single only)
|
|
55
|
+
--brief "<text>" Inline task description (single only, alternative to --task)
|
|
56
|
+
--player <name> Agent player (single only, default: kimi)
|
|
57
|
+
-c, --criteria <list> Evaluation criteria (scaffold only, default: syntax,context,logic,token)
|
|
58
|
+
--config <path> Path to arena.toml (vs only)
|
|
59
|
+
--dry-run Print execution plan without running (vs --config only)
|
|
60
|
+
--out <dir> Output directory
|
|
61
|
+
-d, --dir <dir> Parent dir (scaffold: defaults to tmp)
|
|
62
|
+
-p, --project <dir> Project root (default: .)
|
|
63
|
+
--timeout <ms> Subagent timeout (single only)
|
|
61
64
|
|
|
62
65
|
Examples:
|
|
63
|
-
# Single
|
|
64
|
-
lythoskill-arena
|
|
65
|
-
lythoskill-arena
|
|
66
|
+
# Single-player deck test (exec shortcut)
|
|
67
|
+
lythoskill-arena single --task ./TASK.agent.md --deck ./deck.toml
|
|
68
|
+
lythoskill-arena single --brief "Generate auth flow diagram" --deck ./deck.toml --player kimi
|
|
66
69
|
|
|
67
|
-
#
|
|
68
|
-
lythoskill-arena
|
|
69
|
-
lythoskill-arena
|
|
70
|
-
|
|
71
|
-
# CLI-flag mode (backward compat)
|
|
72
|
-
lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
|
|
70
|
+
# Multi-side comparison (declarative)
|
|
71
|
+
lythoskill-arena vs --config ./arena.toml
|
|
72
|
+
lythoskill-arena vs --config ./arena.toml --dry-run
|
|
73
73
|
|
|
74
74
|
# Legacy scaffolding
|
|
75
75
|
lythoskill-arena scaffold --task "Refactor auth module" --decks ./decks/a.toml,./decks/b.toml
|
|
@@ -77,9 +77,9 @@ Examples:
|
|
|
77
77
|
`)
|
|
78
78
|
}
|
|
79
79
|
|
|
80
|
-
// ──
|
|
80
|
+
// ── single: single-player deck test (exec shortcut) ──────────────────────
|
|
81
81
|
|
|
82
|
-
async function agentRun(args: string[]) {
|
|
82
|
+
async function singleRun(args: string[]) {
|
|
83
83
|
const opts: Record<string, string | undefined> = {}
|
|
84
84
|
for (let i = 0; i < args.length; i++) {
|
|
85
85
|
if (args[i] === '--task' || args[i] === '-t') opts.task = args[++i]
|
|
@@ -91,11 +91,16 @@ async function agentRun(args: string[]) {
|
|
|
91
91
|
}
|
|
92
92
|
|
|
93
93
|
if (!opts.deck) {
|
|
94
|
-
console.error(
|
|
94
|
+
console.error(`❌ --deck <path> is required.
|
|
95
|
+
Usage: lythoskill-arena single --deck ./deck.toml --task ./scenario.agent.md
|
|
96
|
+
lythoskill-arena single --deck ./deck.toml --brief "your task description"
|
|
97
|
+
Example decks: examples/decks/scout.toml, examples/decks/documents.toml`)
|
|
95
98
|
process.exit(1)
|
|
96
99
|
}
|
|
97
100
|
if (!opts.task && (!opts.brief || !opts.brief.trim())) {
|
|
98
|
-
console.error(
|
|
101
|
+
console.error(`❌ --task <path> or --brief "<prompt>" is required.
|
|
102
|
+
Usage: lythoskill-arena single --deck ./deck.toml --task ./scenario.agent.md
|
|
103
|
+
lythoskill-arena single --deck ./deck.toml --brief "your task description"`)
|
|
99
104
|
process.exit(1)
|
|
100
105
|
}
|
|
101
106
|
|
|
@@ -119,7 +124,10 @@ async function agentRun(args: string[]) {
|
|
|
119
124
|
deckPath = dest
|
|
120
125
|
} else {
|
|
121
126
|
deckPath = resolve(opts.deck)
|
|
122
|
-
if (!deckExists(deckPath)) { console.error(`❌ Deck file not found: ${deckPath}
|
|
127
|
+
if (!deckExists(deckPath)) { console.error(`❌ Deck file not found: ${deckPath}
|
|
128
|
+
Create one: examples/decks/scout.toml (minimal), examples/decks/documents.toml (documents)
|
|
129
|
+
Or fetch: curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml > deck.toml
|
|
130
|
+
Or create: see https://github.com/lythos-labs/lythoskill/tree/main/examples/decks/`); process.exit(1) }
|
|
123
131
|
}
|
|
124
132
|
|
|
125
133
|
const { useAgent } = await import('@lythos/test-utils/agents')
|
|
@@ -139,7 +147,10 @@ async function agentRun(args: string[]) {
|
|
|
139
147
|
const scenarioOpt: Record<string, unknown> = {}
|
|
140
148
|
if (opts.task) {
|
|
141
149
|
const taskPath = resolve(opts.task)
|
|
142
|
-
if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}
|
|
150
|
+
if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}
|
|
151
|
+
Create a .agent.md scenario or use --brief for inline tasks.
|
|
152
|
+
Format: frontmatter (name, description, timeout) + Given/When/Then/Judge sections.
|
|
153
|
+
Example: see playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
|
|
143
154
|
scenarioOpt.scenarioPath = taskPath
|
|
144
155
|
} else {
|
|
145
156
|
scenarioOpt.scenario = {
|
|
@@ -693,7 +704,7 @@ function runViz(argv: string[]) {
|
|
|
693
704
|
|
|
694
705
|
// ── Run: programmatic arena execution ───────────────────────
|
|
695
706
|
|
|
696
|
-
async function runProgrammaticArena(argv: string[]) {
|
|
707
|
+
async function vsRun(argv: string[]) {
|
|
697
708
|
const { options } = parseArgs(argv)
|
|
698
709
|
const { readFileSync } = await import('node:fs')
|
|
699
710
|
|
|
@@ -731,13 +742,15 @@ async function runProgrammaticArena(argv: string[]) {
|
|
|
731
742
|
return
|
|
732
743
|
}
|
|
733
744
|
|
|
734
|
-
//
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
745
|
+
// --config was not provided
|
|
746
|
+
console.error(`❌ --config <arena.toml> is required.
|
|
747
|
+
Usage: lythoskill-arena vs --config ./arena.toml
|
|
748
|
+
lythoskill-arena vs --config ./arena.toml --dry-run
|
|
749
|
+
Example configs:
|
|
750
|
+
examples/arena/research-compare/arena.toml — two-side A/B
|
|
751
|
+
examples/arena/add-remove/arena.toml — three-side Pareto
|
|
752
|
+
Create one: cp examples/arena/research-compare/arena.toml ./arena.toml`)
|
|
753
|
+
process.exit(1)
|
|
741
754
|
|
|
742
755
|
const result = await runArenaProgrammatic({
|
|
743
756
|
taskPath: options.task,
|
|
@@ -758,18 +771,20 @@ if (import.meta.main) {
|
|
|
758
771
|
const args = process.argv.slice(2)
|
|
759
772
|
const cmd = args[0]
|
|
760
773
|
|
|
761
|
-
if (cmd === '
|
|
762
|
-
|
|
774
|
+
if (cmd === 'single') {
|
|
775
|
+
singleRun(args.slice(1))
|
|
763
776
|
} else if (cmd === 'viz') {
|
|
764
777
|
runViz(args.slice(1))
|
|
765
|
-
} else if (cmd === '
|
|
766
|
-
|
|
778
|
+
} else if (cmd === 'vs') {
|
|
779
|
+
vsRun(args.slice(1))
|
|
767
780
|
} else if (cmd === 'scaffold' || !cmd || args[0]?.startsWith('-')) {
|
|
768
781
|
// Legacy behavior: if no subcommand or starts with flags, treat as scaffold
|
|
769
782
|
runArena(cmd === 'scaffold' ? args.slice(1) : args)
|
|
770
783
|
} else {
|
|
771
|
-
console.error(`❌ Unknown command: ${cmd}
|
|
772
|
-
|
|
784
|
+
console.error(`❌ Unknown command: "${cmd}"
|
|
785
|
+
Available: single, vs, scaffold, viz
|
|
786
|
+
Usage: lythoskill-arena <command> [options]
|
|
787
|
+
Help: lythoskill-arena --help`)
|
|
773
788
|
process.exit(1)
|
|
774
789
|
}
|
|
775
790
|
}
|