@lythos/skill-arena 0.9.39 → 0.9.40

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +15 -16
  2. package/package.json +1 -1
  3. package/src/cli.ts +57 -42
package/README.md CHANGED
@@ -49,24 +49,23 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
49
49
  ```bash
50
50
  bun add -d @lythos/skill-arena
51
51
  # or use directly
52
- bunx @lythos/skill-arena@0.9.39 <command>
52
+ bunx @lythos/skill-arena@0.9.40 <command>
53
53
  ```
54
54
 
55
55
  ## Quick Start
56
56
 
57
57
  ```bash
58
- # Mode 1: Compare two decks on the same task (declarative)
59
- bunx @lythos/skill-arena@0.9.39 run \
60
- --config examples/arena/research-compare/arena.toml
58
+ # Single: test a deck with one agent (exec shortcut)
59
+ bunx @lythos/skill-arena@0.9.40 single \
60
+ --brief "Generate auth flow diagram" \
61
+ --deck ./examples/decks/documents.toml
61
62
 
62
- # Mode 2: Compare full deck configurations via CLI flags
63
- bunx @lythos/skill-arena@0.9.39 run \
64
- --task "Generate auth flow diagram" \
65
- --decks "./decks/minimal.toml,./decks/rich.toml" \
66
- --criteria "quality,token,maintainability"
63
+ # Vs: compare multiple decks side by side (declarative)
64
+ bunx @lythos/skill-arena@0.9.40 vs \
65
+ --config examples/arena/research-compare/arena.toml
67
66
 
68
67
  # Visualize results
69
- bunx @lythos/skill-arena@0.9.39 viz tmp/arena-<id>/
68
+ bunx @lythos/skill-arena@0.9.40 viz tmp/arena-<id>/
70
69
  ```
71
70
 
72
71
  ## Commands
@@ -75,16 +74,16 @@ bunx @lythos/skill-arena@0.9.39 viz tmp/arena-<id>/
75
74
 
76
75
  ```bash
77
76
  # Print execution plan without running
78
- bunx @lythos/skill-arena@0.9.39 run --config arena.toml --dry-run
77
+ bunx @lythos/skill-arena@0.9.40 vs --config arena.toml --dry-run
79
78
 
80
79
  # Execute with per-side runs_per_side and statistical aggregation
81
- bunx @lythos/skill-arena@0.9.39 run --config arena.toml
80
+ bunx @lythos/skill-arena@0.9.40 vs --config arena.toml
82
81
  ```
83
82
 
84
83
  ### CLI-flag mode (backward compat)
85
84
 
86
85
  ```
87
- bunx @lythos/skill-arena@0.9.39 run \
86
+ bunx @lythos/skill-arena@0.9.40 run \
88
87
  --task ./TASK-arena.md \
89
88
  --players ./players/claude.toml \
90
89
  --decks ./decks/run-01.toml,./decks/run-02.toml \
@@ -94,13 +93,13 @@ bunx @lythos/skill-arena@0.9.39 run \
94
93
  ### Scaffold mode (legacy, manual execution)
95
94
 
96
95
  ```
97
- bunx @lythos/skill-arena@0.9.39 scaffold --task "..." --decks a.toml,b.toml
96
+ bunx @lythos/skill-arena@0.9.40 scaffold --task "..." --decks a.toml,b.toml
98
97
  ```
99
98
 
100
99
  ### Viz
101
100
 
102
101
  ```bash
103
- bunx @lythos/skill-arena@0.9.39 viz runs/arena-<id>/
102
+ bunx @lythos/skill-arena@0.9.40 viz runs/arena-<id>/
104
103
  ```
105
104
 
106
105
  ## Skill Documentation
@@ -114,7 +113,7 @@ The agent-visible **Skill** layer documentation is here:
114
113
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
115
114
 
116
115
  ```
117
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.39 ...
116
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.40 ...
118
117
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
119
118
  Output (skills/<name>/) → git commit → agent-visible skill
120
119
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.39",
3
+ "version": "0.9.40",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
package/src/cli.ts CHANGED
@@ -37,39 +37,39 @@ function printHelp(): void {
37
37
  console.log(`🎭 lythoskill-arena — Skill comparison runner
38
38
 
39
39
  Usage:
40
- lythoskill-arena agent-run --task <path> --deck <path> [--player kimi] [--out <dir>] [--timeout <ms>]
41
- lythoskill-arena agent-run --brief "<prompt>" --deck <path> [--out <dir>] [--timeout <ms>]
42
- lythoskill-arena run --task <path> --players <A.toml,B.toml> --decks <A.toml,B.toml> --criteria <c1,c2,...> [--out <dir>]
40
+ lythoskill-arena single --task <path> --deck <path> [--player kimi] [--out <dir>] [--timeout <ms>]
41
+ lythoskill-arena single --brief "<prompt>" --deck <path> [--out <dir>] [--timeout <ms>]
42
+ lythoskill-arena vs --config arena.toml [--dry-run]
43
43
  lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
44
44
  lythoskill-arena viz <arena-dir>
45
45
 
46
46
  Commands:
47
- run Run arena programmatically (declarative arena.toml or CLI flags)
47
+ single Single-player deck test (exec shortcut): test a deck with one player
48
+ vs Multi-side comparison: run arena from declarative arena.toml
48
49
  scaffold Create arena directory structure (legacy, manual subagent execution)
49
50
  viz Visualize arena report (ASCII charts)
50
51
 
51
52
  Options:
52
- -t, --task <path|desc> Task description or path to TASK-arena.md
53
- --decks <list> Comma-separated deck paths
54
- -c, --criteria <list> Evaluation criteria (default: syntax,context,logic,token)
55
- --players <list> Comma-separated player.toml paths (CLI run only)
56
- --config <path> Path to arena.toml (declarative mode, k8s-style)
57
- --dry-run Print execution plan without running (with --config)
58
- --out <dir> Output directory (run: defaults to runs/arena-<id>)
59
- -d, --dir <dir> Output directory (scaffold: defaults to tmp)
60
- -p, --project <dir> Project directory (default: .)
53
+ -t, --task <path|desc> Task description or path to TASK-arena.md / .agent.md
54
+ --deck <path> Deck path (single only)
55
+ --brief "<text>" Inline task description (single only, alternative to --task)
56
+ --player <name> Agent player (single only, default: kimi)
57
+ -c, --criteria <list> Evaluation criteria (scaffold only, default: syntax,context,logic,token)
58
+ --config <path> Path to arena.toml (vs only)
59
+ --dry-run Print execution plan without running (vs --config only)
60
+ --out <dir> Output directory
61
+ -d, --dir <dir> Parent dir (scaffold: defaults to tmp)
62
+ -p, --project <dir> Project root (default: .)
63
+ --timeout <ms> Subagent timeout (single only)
61
64
 
62
65
  Examples:
63
- # Single agent run (simplest path)
64
- lythoskill-arena agent-run --task ./TASK.md --deck ./deck.toml
65
- lythoskill-arena agent-run --task ./TASK.md --deck ./deck.toml --player kimi --out ./output
66
+ # Single-player deck test (exec shortcut)
67
+ lythoskill-arena single --task ./TASK.agent.md --deck ./deck.toml
68
+ lythoskill-arena single --brief "Generate auth flow diagram" --deck ./deck.toml --player kimi
66
69
 
67
- # Declarative mode (k8s-style)
68
- lythoskill-arena run --config ./arena.toml
69
- lythoskill-arena run --config ./arena.toml --dry-run
70
-
71
- # CLI-flag mode (backward compat)
72
- lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
70
+ # Multi-side comparison (declarative)
71
+ lythoskill-arena vs --config ./arena.toml
72
+ lythoskill-arena vs --config ./arena.toml --dry-run
73
73
 
74
74
  # Legacy scaffolding
75
75
  lythoskill-arena scaffold --task "Refactor auth module" --decks ./decks/a.toml,./decks/b.toml
@@ -77,9 +77,9 @@ Examples:
77
77
  `)
78
78
  }
79
79
 
80
- // ── agent-run: single agent execution (simplest path) ────────────────────
80
+ // ── single: single-player deck test (exec shortcut) ──────────────────────
81
81
 
82
- async function agentRun(args: string[]) {
82
+ async function singleRun(args: string[]) {
83
83
  const opts: Record<string, string | undefined> = {}
84
84
  for (let i = 0; i < args.length; i++) {
85
85
  if (args[i] === '--task' || args[i] === '-t') opts.task = args[++i]
@@ -91,11 +91,16 @@ async function agentRun(args: string[]) {
91
91
  }
92
92
 
93
93
  if (!opts.deck) {
94
- console.error('❌ --deck <path> is required')
94
+ console.error(`❌ --deck <path> is required.
95
+ Usage: lythoskill-arena single --deck ./deck.toml --task ./scenario.agent.md
96
+ lythoskill-arena single --deck ./deck.toml --brief "your task description"
97
+ Example decks: examples/decks/scout.toml, examples/decks/documents.toml`)
95
98
  process.exit(1)
96
99
  }
97
100
  if (!opts.task && (!opts.brief || !opts.brief.trim())) {
98
- console.error('❌ --task <path> or --brief "<prompt>" is required and cannot be empty')
101
+ console.error(`❌ --task <path> or --brief "<prompt>" is required.
102
+ Usage: lythoskill-arena single --deck ./deck.toml --task ./scenario.agent.md
103
+ lythoskill-arena single --deck ./deck.toml --brief "your task description"`)
99
104
  process.exit(1)
100
105
  }
101
106
 
@@ -119,7 +124,10 @@ async function agentRun(args: string[]) {
119
124
  deckPath = dest
120
125
  } else {
121
126
  deckPath = resolve(opts.deck)
122
- if (!deckExists(deckPath)) { console.error(`❌ Deck file not found: ${deckPath}`); process.exit(1) }
127
+ if (!deckExists(deckPath)) { console.error(`❌ Deck file not found: ${deckPath}
128
+ Create one: examples/decks/scout.toml (minimal), examples/decks/documents.toml (documents)
129
+ Or fetch: curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml > deck.toml
130
+ Or create: see https://github.com/lythos-labs/lythoskill/tree/main/examples/decks/`); process.exit(1) }
123
131
  }
124
132
 
125
133
  const { useAgent } = await import('@lythos/test-utils/agents')
@@ -139,7 +147,10 @@ async function agentRun(args: string[]) {
139
147
  const scenarioOpt: Record<string, unknown> = {}
140
148
  if (opts.task) {
141
149
  const taskPath = resolve(opts.task)
142
- if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}`); process.exit(1) }
150
+ if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}
151
+ Create a .agent.md scenario or use --brief for inline tasks.
152
+ Format: frontmatter (name, description, timeout) + Given/When/Then/Judge sections.
153
+ Example: see playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
143
154
  scenarioOpt.scenarioPath = taskPath
144
155
  } else {
145
156
  scenarioOpt.scenario = {
@@ -693,7 +704,7 @@ function runViz(argv: string[]) {
693
704
 
694
705
  // ── Run: programmatic arena execution ───────────────────────
695
706
 
696
- async function runProgrammaticArena(argv: string[]) {
707
+ async function vsRun(argv: string[]) {
697
708
  const { options } = parseArgs(argv)
698
709
  const { readFileSync } = await import('node:fs')
699
710
 
@@ -731,13 +742,15 @@ async function runProgrammaticArena(argv: string[]) {
731
742
  return
732
743
  }
733
744
 
734
- // CLI-flag mode (backward compat)
735
- if (!options.task || !options.decks) {
736
- console.error('❌ --task <path> and --decks <list> are required for "run" (or use --config <arena.toml>)')
737
- process.exit(1)
738
- }
739
-
740
- const { runArena: runArenaProgrammatic } = await import('./runner')
745
+ // --config was not provided
746
+ console.error(`❌ --config <arena.toml> is required.
747
+ Usage: lythoskill-arena vs --config ./arena.toml
748
+ lythoskill-arena vs --config ./arena.toml --dry-run
749
+ Example configs:
750
+ examples/arena/research-compare/arena.toml — two-side A/B
751
+ examples/arena/add-remove/arena.toml — three-side Pareto
752
+ Create one: cp examples/arena/research-compare/arena.toml ./arena.toml`)
753
+ process.exit(1)
741
754
 
742
755
  const result = await runArenaProgrammatic({
743
756
  taskPath: options.task,
@@ -758,18 +771,20 @@ if (import.meta.main) {
758
771
  const args = process.argv.slice(2)
759
772
  const cmd = args[0]
760
773
 
761
- if (cmd === 'agent-run') {
762
- agentRun(args.slice(1))
774
+ if (cmd === 'single') {
775
+ singleRun(args.slice(1))
763
776
  } else if (cmd === 'viz') {
764
777
  runViz(args.slice(1))
765
- } else if (cmd === 'run') {
766
- runProgrammaticArena(args.slice(1))
778
+ } else if (cmd === 'vs') {
779
+ vsRun(args.slice(1))
767
780
  } else if (cmd === 'scaffold' || !cmd || args[0]?.startsWith('-')) {
768
781
  // Legacy behavior: if no subcommand or starts with flags, treat as scaffold
769
782
  runArena(cmd === 'scaffold' ? args.slice(1) : args)
770
783
  } else {
771
- console.error(`❌ Unknown command: ${cmd}`)
772
- printHelp()
784
+ console.error(`❌ Unknown command: "${cmd}"
785
+ Available: single, vs, scaffold, viz
786
+ Usage: lythoskill-arena <command> [options]
787
+ Help: lythoskill-arena --help`)
773
788
  process.exit(1)
774
789
  }
775
790
  }