@lythos/skill-arena 0.9.39 → 0.9.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -16
- package/package.json +1 -1
- package/src/cli.ts +78 -42
- package/src/runner.ts +34 -3
package/README.md
CHANGED
|
@@ -49,24 +49,23 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
|
|
|
49
49
|
```bash
|
|
50
50
|
bun add -d @lythos/skill-arena
|
|
51
51
|
# or use directly
|
|
52
|
-
bunx @lythos/skill-arena@0.9.
|
|
52
|
+
bunx @lythos/skill-arena@0.9.41 <command>
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
## Quick Start
|
|
56
56
|
|
|
57
57
|
```bash
|
|
58
|
-
#
|
|
59
|
-
bunx @lythos/skill-arena@0.9.
|
|
60
|
-
--
|
|
58
|
+
# Single: test a deck with one agent (exec shortcut)
|
|
59
|
+
bunx @lythos/skill-arena@0.9.41 single \
|
|
60
|
+
--brief "Generate auth flow diagram" \
|
|
61
|
+
--deck ./examples/decks/documents.toml
|
|
61
62
|
|
|
62
|
-
#
|
|
63
|
-
bunx @lythos/skill-arena@0.9.
|
|
64
|
-
--
|
|
65
|
-
--decks "./decks/minimal.toml,./decks/rich.toml" \
|
|
66
|
-
--criteria "quality,token,maintainability"
|
|
63
|
+
# Vs: compare multiple decks side by side (declarative)
|
|
64
|
+
bunx @lythos/skill-arena@0.9.41 vs \
|
|
65
|
+
--config examples/arena/research-compare/arena.toml
|
|
67
66
|
|
|
68
67
|
# Visualize results
|
|
69
|
-
bunx @lythos/skill-arena@0.9.39 viz tmp/arena-<id>/
|
|
68
|
+
bunx @lythos/skill-arena@0.9.41 viz tmp/arena-<id>/
|
|
70
69
|
```
|
|
71
70
|
|
|
72
71
|
## Commands
|
|
@@ -75,16 +74,16 @@ bunx @lythos/skill-arena@0.9.39 viz tmp/arena-<id>/
|
|
|
75
74
|
|
|
76
75
|
```bash
|
|
77
76
|
# Print execution plan without running
|
|
78
|
-
bunx @lythos/skill-arena@0.9.
|
|
77
|
+
bunx @lythos/skill-arena@0.9.41 vs --config arena.toml --dry-run
|
|
79
78
|
|
|
80
79
|
# Execute with per-side runs_per_side and statistical aggregation
|
|
81
|
-
bunx @lythos/skill-arena@0.9.
|
|
80
|
+
bunx @lythos/skill-arena@0.9.41 vs --config arena.toml
|
|
82
81
|
```
|
|
83
82
|
|
|
84
83
|
### CLI-flag mode (backward compat)
|
|
85
84
|
|
|
86
85
|
```
|
|
87
|
-
bunx @lythos/skill-arena@0.9.39 run \
|
|
86
|
+
bunx @lythos/skill-arena@0.9.41 run \
|
|
88
87
|
--task ./TASK-arena.md \
|
|
89
88
|
--players ./players/claude.toml \
|
|
90
89
|
--decks ./decks/run-01.toml,./decks/run-02.toml \
|
|
@@ -94,13 +93,13 @@ bunx @lythos/skill-arena@0.9.39 run \
|
|
|
94
93
|
### Scaffold mode (legacy, manual execution)
|
|
95
94
|
|
|
96
95
|
```
|
|
97
|
-
bunx @lythos/skill-arena@0.9.
|
|
96
|
+
bunx @lythos/skill-arena@0.9.41 scaffold --task "..." --decks a.toml,b.toml
|
|
98
97
|
```
|
|
99
98
|
|
|
100
99
|
### Viz
|
|
101
100
|
|
|
102
101
|
```bash
|
|
103
|
-
bunx @lythos/skill-arena@0.9.
|
|
102
|
+
bunx @lythos/skill-arena@0.9.41 viz runs/arena-<id>/
|
|
104
103
|
```
|
|
105
104
|
|
|
106
105
|
## Skill Documentation
|
|
@@ -114,7 +113,7 @@ The agent-visible **Skill** layer documentation is here:
|
|
|
114
113
|
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
|
|
115
114
|
|
|
116
115
|
```
|
|
117
|
-
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.
|
|
116
|
+
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.41 ...
|
|
118
117
|
Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
119
118
|
Output (skills/<name>/) → git commit → agent-visible skill
|
|
120
119
|
```
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -37,39 +37,39 @@ function printHelp(): void {
|
|
|
37
37
|
console.log(`🎭 lythoskill-arena — Skill comparison runner
|
|
38
38
|
|
|
39
39
|
Usage:
|
|
40
|
-
lythoskill-arena
|
|
41
|
-
lythoskill-arena
|
|
42
|
-
lythoskill-arena
|
|
40
|
+
lythoskill-arena single --task <path> --deck <path> [--player kimi] [--out <dir>] [--timeout <ms>]
|
|
41
|
+
lythoskill-arena single --brief "<prompt>" --deck <path> [--out <dir>] [--timeout <ms>]
|
|
42
|
+
lythoskill-arena vs --config arena.toml [--dry-run]
|
|
43
43
|
lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
|
|
44
44
|
lythoskill-arena viz <arena-dir>
|
|
45
45
|
|
|
46
46
|
Commands:
|
|
47
|
-
|
|
47
|
+
single Single-player deck test (exec shortcut): test a deck with one player
|
|
48
|
+
vs Multi-side comparison: run arena from declarative arena.toml
|
|
48
49
|
scaffold Create arena directory structure (legacy, manual subagent execution)
|
|
49
50
|
viz Visualize arena report (ASCII charts)
|
|
50
51
|
|
|
51
52
|
Options:
|
|
52
|
-
-t, --task <path|desc> Task description or path to TASK-arena.md
|
|
53
|
-
--
|
|
54
|
-
|
|
55
|
-
--
|
|
56
|
-
|
|
57
|
-
--
|
|
58
|
-
--
|
|
59
|
-
|
|
60
|
-
-
|
|
53
|
+
-t, --task <path|desc> Task description or path to TASK-arena.md / .agent.md
|
|
54
|
+
--deck <path> Deck path (single only)
|
|
55
|
+
--brief "<text>" Inline task description (single only, alternative to --task)
|
|
56
|
+
--player <name> Agent player (single only, default: kimi)
|
|
57
|
+
-c, --criteria <list> Evaluation criteria (scaffold only, default: syntax,context,logic,token)
|
|
58
|
+
--config <path> Path to arena.toml (vs only)
|
|
59
|
+
--dry-run Print execution plan without running (vs --config only)
|
|
60
|
+
--out <dir> Output directory
|
|
61
|
+
-d, --dir <dir> Parent dir (scaffold: defaults to tmp)
|
|
62
|
+
-p, --project <dir> Project root (default: .)
|
|
63
|
+
--timeout <ms> Subagent timeout (single only)
|
|
61
64
|
|
|
62
65
|
Examples:
|
|
63
|
-
# Single
|
|
64
|
-
lythoskill-arena
|
|
65
|
-
lythoskill-arena
|
|
66
|
+
# Single-player deck test (exec shortcut)
|
|
67
|
+
lythoskill-arena single --task ./TASK.agent.md --deck ./deck.toml
|
|
68
|
+
lythoskill-arena single --brief "Generate auth flow diagram" --deck ./deck.toml --player kimi
|
|
66
69
|
|
|
67
|
-
#
|
|
68
|
-
lythoskill-arena
|
|
69
|
-
lythoskill-arena
|
|
70
|
-
|
|
71
|
-
# CLI-flag mode (backward compat)
|
|
72
|
-
lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
|
|
70
|
+
# Multi-side comparison (declarative)
|
|
71
|
+
lythoskill-arena vs --config ./arena.toml
|
|
72
|
+
lythoskill-arena vs --config ./arena.toml --dry-run
|
|
73
73
|
|
|
74
74
|
# Legacy scaffolding
|
|
75
75
|
lythoskill-arena scaffold --task "Refactor auth module" --decks ./decks/a.toml,./decks/b.toml
|
|
@@ -77,9 +77,9 @@ Examples:
|
|
|
77
77
|
`)
|
|
78
78
|
}
|
|
79
79
|
|
|
80
|
-
// ──
|
|
80
|
+
// ── single: single-player deck test (exec shortcut) ──────────────────────
|
|
81
81
|
|
|
82
|
-
async function agentRun(args: string[]) {
|
|
82
|
+
async function singleRun(args: string[]) {
|
|
83
83
|
const opts: Record<string, string | undefined> = {}
|
|
84
84
|
for (let i = 0; i < args.length; i++) {
|
|
85
85
|
if (args[i] === '--task' || args[i] === '-t') opts.task = args[++i]
|
|
@@ -91,11 +91,16 @@ async function agentRun(args: string[]) {
|
|
|
91
91
|
}
|
|
92
92
|
|
|
93
93
|
if (!opts.deck) {
|
|
94
|
-
console.error(
|
|
94
|
+
console.error(`❌ --deck <path> is required.
|
|
95
|
+
Usage: lythoskill-arena single --deck ./deck.toml --task ./scenario.agent.md
|
|
96
|
+
lythoskill-arena single --deck ./deck.toml --brief "your task description"
|
|
97
|
+
Example decks: examples/decks/scout.toml, examples/decks/documents.toml`)
|
|
95
98
|
process.exit(1)
|
|
96
99
|
}
|
|
97
100
|
if (!opts.task && (!opts.brief || !opts.brief.trim())) {
|
|
98
|
-
console.error(
|
|
101
|
+
console.error(`❌ --task <path> or --brief "<prompt>" is required.
|
|
102
|
+
Usage: lythoskill-arena single --deck ./deck.toml --task ./scenario.agent.md
|
|
103
|
+
lythoskill-arena single --deck ./deck.toml --brief "your task description"`)
|
|
99
104
|
process.exit(1)
|
|
100
105
|
}
|
|
101
106
|
|
|
@@ -119,7 +124,10 @@ async function agentRun(args: string[]) {
|
|
|
119
124
|
deckPath = dest
|
|
120
125
|
} else {
|
|
121
126
|
deckPath = resolve(opts.deck)
|
|
122
|
-
if (!deckExists(deckPath)) { console.error(`❌ Deck file not found: ${deckPath}
|
|
127
|
+
if (!deckExists(deckPath)) { console.error(`❌ Deck file not found: ${deckPath}
|
|
128
|
+
Create one: examples/decks/scout.toml (minimal), examples/decks/documents.toml (documents)
|
|
129
|
+
Or fetch: curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml > deck.toml
|
|
130
|
+
Or create: see https://github.com/lythos-labs/lythoskill/tree/main/examples/decks/`); process.exit(1) }
|
|
123
131
|
}
|
|
124
132
|
|
|
125
133
|
const { useAgent } = await import('@lythos/test-utils/agents')
|
|
@@ -139,8 +147,32 @@ async function agentRun(args: string[]) {
|
|
|
139
147
|
const scenarioOpt: Record<string, unknown> = {}
|
|
140
148
|
if (opts.task) {
|
|
141
149
|
const taskPath = resolve(opts.task)
|
|
142
|
-
if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}
|
|
150
|
+
if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}
|
|
151
|
+
Create a .agent.md scenario or use --brief for inline tasks.
|
|
152
|
+
Format: frontmatter (name, description, timeout) + Given/When/Then/Judge sections.
|
|
153
|
+
Example: see playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
|
|
143
154
|
scenarioOpt.scenarioPath = taskPath
|
|
155
|
+
// Quick validation: check frontmatter presence
|
|
156
|
+
const raw = readFileSync(taskPath, 'utf-8')
|
|
157
|
+
if (!raw.startsWith('---')) { console.error(`❌ Invalid .agent.md: missing frontmatter (must start with "---")
|
|
158
|
+
Correct format:
|
|
159
|
+
---
|
|
160
|
+
name: my-scenario
|
|
161
|
+
description: what this tests
|
|
162
|
+
timeout: 120000
|
|
163
|
+
---
|
|
164
|
+
## Given
|
|
165
|
+
...
|
|
166
|
+
## When
|
|
167
|
+
...
|
|
168
|
+
## Then
|
|
169
|
+
...
|
|
170
|
+
## Judge
|
|
171
|
+
...
|
|
172
|
+
Template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
|
|
173
|
+
if (!raw.includes('## When')) { console.error(`❌ Invalid .agent.md: missing "## When" section.
|
|
174
|
+
The ## When section defines what the agent should do.
|
|
175
|
+
See template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
|
|
144
176
|
} else {
|
|
145
177
|
scenarioOpt.scenario = {
|
|
146
178
|
name: 'ad-hoc task',
|
|
@@ -693,7 +725,7 @@ function runViz(argv: string[]) {
|
|
|
693
725
|
|
|
694
726
|
// ── Run: programmatic arena execution ───────────────────────
|
|
695
727
|
|
|
696
|
-
async function runProgrammaticArena(argv: string[]) {
|
|
728
|
+
async function vsRun(argv: string[]) {
|
|
697
729
|
const { options } = parseArgs(argv)
|
|
698
730
|
const { readFileSync } = await import('node:fs')
|
|
699
731
|
|
|
@@ -731,13 +763,15 @@ async function runProgrammaticArena(argv: string[]) {
|
|
|
731
763
|
return
|
|
732
764
|
}
|
|
733
765
|
|
|
734
|
-
//
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
766
|
+
// --config was not provided
|
|
767
|
+
console.error(`❌ --config <arena.toml> is required.
|
|
768
|
+
Usage: lythoskill-arena vs --config ./arena.toml
|
|
769
|
+
lythoskill-arena vs --config ./arena.toml --dry-run
|
|
770
|
+
Example configs:
|
|
771
|
+
examples/arena/research-compare/arena.toml — two-side A/B
|
|
772
|
+
examples/arena/add-remove/arena.toml — three-side Pareto
|
|
773
|
+
Create one: cp examples/arena/research-compare/arena.toml ./arena.toml`)
|
|
774
|
+
process.exit(1)
|
|
741
775
|
|
|
742
776
|
const result = await runArenaProgrammatic({
|
|
743
777
|
taskPath: options.task,
|
|
@@ -758,18 +792,20 @@ if (import.meta.main) {
|
|
|
758
792
|
const args = process.argv.slice(2)
|
|
759
793
|
const cmd = args[0]
|
|
760
794
|
|
|
761
|
-
if (cmd === '
|
|
762
|
-
|
|
795
|
+
if (cmd === 'single') {
|
|
796
|
+
singleRun(args.slice(1))
|
|
763
797
|
} else if (cmd === 'viz') {
|
|
764
798
|
runViz(args.slice(1))
|
|
765
|
-
} else if (cmd === '
|
|
766
|
-
|
|
799
|
+
} else if (cmd === 'vs') {
|
|
800
|
+
vsRun(args.slice(1))
|
|
767
801
|
} else if (cmd === 'scaffold' || !cmd || args[0]?.startsWith('-')) {
|
|
768
802
|
// Legacy behavior: if no subcommand or starts with flags, treat as scaffold
|
|
769
803
|
runArena(cmd === 'scaffold' ? args.slice(1) : args)
|
|
770
804
|
} else {
|
|
771
|
-
console.error(`❌ Unknown command: ${cmd}
|
|
772
|
-
|
|
805
|
+
console.error(`❌ Unknown command: "${cmd}"
|
|
806
|
+
Available: single, vs, scaffold, viz
|
|
807
|
+
Usage: lythoskill-arena <command> [options]
|
|
808
|
+
Help: lythoskill-arena --help`)
|
|
773
809
|
process.exit(1)
|
|
774
810
|
}
|
|
775
811
|
}
|
package/src/runner.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
|
|
1
|
+
import { existsSync, mkdirSync, writeFileSync, readFileSync, rmSync } from 'node:fs'
|
|
2
2
|
import { join, resolve } from 'node:path'
|
|
3
3
|
import { tmpdir } from 'node:os'
|
|
4
4
|
import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
|
|
@@ -57,7 +57,35 @@ export async function runArenaFromToml(opts: {
|
|
|
57
57
|
if (configDir) return resolve(configDir, p)
|
|
58
58
|
return resolve(p)
|
|
59
59
|
}
|
|
60
|
-
const
|
|
60
|
+
const resolveOrCreateTask = (): { path: string; cleanup?: () => void } => {
|
|
61
|
+
const candidate = resolvePath(taskPath)
|
|
62
|
+
if (existsSync(candidate)) return { path: candidate }
|
|
63
|
+
// taskPath is inline text — write temp scenario file
|
|
64
|
+
const tmp = join(tmpdir(), `arena-task-${stamp()}.agent.md`)
|
|
65
|
+
writeFileSync(tmp, `---
|
|
66
|
+
name: arena task
|
|
67
|
+
description: ${taskPath.slice(0, 80)}
|
|
68
|
+
timeout: 120000
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Given
|
|
72
|
+
- Working directory with an empty project
|
|
73
|
+
- bun is available
|
|
74
|
+
|
|
75
|
+
## When
|
|
76
|
+
${taskPath}
|
|
77
|
+
|
|
78
|
+
## Then
|
|
79
|
+
- Complete the task above
|
|
80
|
+
- Write a summary to output.md
|
|
81
|
+
|
|
82
|
+
## Judge
|
|
83
|
+
- completeness
|
|
84
|
+
- correctness
|
|
85
|
+
`)
|
|
86
|
+
return { path: tmp, cleanup: () => { try { rmSync(tmp) } catch {} } }
|
|
87
|
+
}
|
|
88
|
+
const { path: taskAbs, cleanup: taskCleanup } = resolveOrCreateTask()
|
|
61
89
|
const resolvedToml: ArenaToml = {
|
|
62
90
|
...toml,
|
|
63
91
|
side: toml.side.map(s => ({ ...s, deck: resolvePath(s.deck) })),
|
|
@@ -78,10 +106,13 @@ export async function runArenaFromToml(opts: {
|
|
|
78
106
|
const resolved = resolveSides(resolvedToml)
|
|
79
107
|
|
|
80
108
|
// Build manifest
|
|
109
|
+
const taskContent = existsSync(taskAbs)
|
|
110
|
+
? readFileSync(taskAbs, 'utf-8').slice(0, 200)
|
|
111
|
+
: taskPath // inline description, not a file path
|
|
81
112
|
const manifest = ArenaManifest.parse({
|
|
82
113
|
id: arenaId,
|
|
83
114
|
created_at: new Date().toISOString(),
|
|
84
|
-
task:
|
|
115
|
+
task: taskContent,
|
|
85
116
|
mode: 'decks',
|
|
86
117
|
participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
|
|
87
118
|
id: r.side.name,
|