@lythos/skill-arena 0.9.39 → 0.9.41

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -49,24 +49,23 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
49
49
  ```bash
50
50
  bun add -d @lythos/skill-arena
51
51
  # or use directly
52
- bunx @lythos/skill-arena@0.9.39 <command>
52
+ bunx @lythos/skill-arena@0.9.41 <command>
53
53
  ```
54
54
 
55
55
  ## Quick Start
56
56
 
57
57
  ```bash
58
- # Mode 1: Compare two decks on the same task (declarative)
59
- bunx @lythos/skill-arena@0.9.39 run \
60
- --config examples/arena/research-compare/arena.toml
58
+ # Single: test a deck with one agent (exec shortcut)
59
+ bunx @lythos/skill-arena@0.9.41 single \
60
+ --brief "Generate auth flow diagram" \
61
+ --deck ./examples/decks/documents.toml
61
62
 
62
- # Mode 2: Compare full deck configurations via CLI flags
63
- bunx @lythos/skill-arena@0.9.39 run \
64
- --task "Generate auth flow diagram" \
65
- --decks "./decks/minimal.toml,./decks/rich.toml" \
66
- --criteria "quality,token,maintainability"
63
+ # Vs: compare multiple decks side by side (declarative)
64
+ bunx @lythos/skill-arena@0.9.41 vs \
65
+ --config examples/arena/research-compare/arena.toml
67
66
 
68
67
  # Visualize results
69
- bunx @lythos/skill-arena@0.9.39 viz tmp/arena-<id>/
68
+ bunx @lythos/skill-arena@0.9.41 viz tmp/arena-<id>/
70
69
  ```
71
70
 
72
71
  ## Commands
@@ -75,16 +74,16 @@ bunx @lythos/skill-arena@0.9.39 viz tmp/arena-<id>/
75
74
 
76
75
  ```bash
77
76
  # Print execution plan without running
78
- bunx @lythos/skill-arena@0.9.39 run --config arena.toml --dry-run
77
+ bunx @lythos/skill-arena@0.9.41 vs --config arena.toml --dry-run
79
78
 
80
79
  # Execute with per-side runs_per_side and statistical aggregation
81
- bunx @lythos/skill-arena@0.9.39 run --config arena.toml
80
+ bunx @lythos/skill-arena@0.9.41 vs --config arena.toml
82
81
  ```
83
82
 
84
83
  ### CLI-flag mode (backward compat)
85
84
 
86
85
  ```
87
- bunx @lythos/skill-arena@0.9.39 run \
86
+ bunx @lythos/skill-arena@0.9.41 run \
88
87
  --task ./TASK-arena.md \
89
88
  --players ./players/claude.toml \
90
89
  --decks ./decks/run-01.toml,./decks/run-02.toml \
@@ -94,13 +93,13 @@ bunx @lythos/skill-arena@0.9.39 run \
94
93
  ### Scaffold mode (legacy, manual execution)
95
94
 
96
95
  ```
97
- bunx @lythos/skill-arena@0.9.39 scaffold --task "..." --decks a.toml,b.toml
96
+ bunx @lythos/skill-arena@0.9.41 scaffold --task "..." --decks a.toml,b.toml
98
97
  ```
99
98
 
100
99
  ### Viz
101
100
 
102
101
  ```bash
103
- bunx @lythos/skill-arena@0.9.39 viz runs/arena-<id>/
102
+ bunx @lythos/skill-arena@0.9.41 viz runs/arena-<id>/
104
103
  ```
105
104
 
106
105
  ## Skill Documentation
@@ -114,7 +113,7 @@ The agent-visible **Skill** layer documentation is here:
114
113
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
115
114
 
116
115
  ```
117
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.39 ...
116
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.41 ...
118
117
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
119
118
  Output (skills/<name>/) → git commit → agent-visible skill
120
119
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.39",
3
+ "version": "0.9.41",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
package/src/cli.ts CHANGED
@@ -37,39 +37,39 @@ function printHelp(): void {
37
37
  console.log(`🎭 lythoskill-arena — Skill comparison runner
38
38
 
39
39
  Usage:
40
- lythoskill-arena agent-run --task <path> --deck <path> [--player kimi] [--out <dir>] [--timeout <ms>]
41
- lythoskill-arena agent-run --brief "<prompt>" --deck <path> [--out <dir>] [--timeout <ms>]
42
- lythoskill-arena run --task <path> --players <A.toml,B.toml> --decks <A.toml,B.toml> --criteria <c1,c2,...> [--out <dir>]
40
+ lythoskill-arena single --task <path> --deck <path> [--player kimi] [--out <dir>] [--timeout <ms>]
41
+ lythoskill-arena single --brief "<prompt>" --deck <path> [--out <dir>] [--timeout <ms>]
42
+ lythoskill-arena vs --config arena.toml [--dry-run]
43
43
  lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
44
44
  lythoskill-arena viz <arena-dir>
45
45
 
46
46
  Commands:
47
- run Run arena programmatically (declarative arena.toml or CLI flags)
47
+ single Single-player deck test (exec shortcut): test a deck with one player
48
+ vs Multi-side comparison: run arena from declarative arena.toml
48
49
  scaffold Create arena directory structure (legacy, manual subagent execution)
49
50
  viz Visualize arena report (ASCII charts)
50
51
 
51
52
  Options:
52
- -t, --task <path|desc> Task description or path to TASK-arena.md
53
- --decks <list> Comma-separated deck paths
54
- -c, --criteria <list> Evaluation criteria (default: syntax,context,logic,token)
55
- --players <list> Comma-separated player.toml paths (CLI run only)
56
- --config <path> Path to arena.toml (declarative mode, k8s-style)
57
- --dry-run Print execution plan without running (with --config)
58
- --out <dir> Output directory (run: defaults to runs/arena-<id>)
59
- -d, --dir <dir> Output directory (scaffold: defaults to tmp)
60
- -p, --project <dir> Project directory (default: .)
53
+ -t, --task <path|desc> Task description or path to TASK-arena.md / .agent.md
54
+ --deck <path> Deck path (single only)
55
+ --brief "<text>" Inline task description (single only, alternative to --task)
56
+ --player <name> Agent player (single only, default: kimi)
57
+ -c, --criteria <list> Evaluation criteria (scaffold only, default: syntax,context,logic,token)
58
+ --config <path> Path to arena.toml (vs only)
59
+ --dry-run Print execution plan without running (vs --config only)
60
+ --out <dir> Output directory
61
+ -d, --dir <dir> Parent dir (scaffold: defaults to tmp)
62
+ -p, --project <dir> Project root (default: .)
63
+ --timeout <ms> Subagent timeout (single only)
61
64
 
62
65
  Examples:
63
- # Single agent run (simplest path)
64
- lythoskill-arena agent-run --task ./TASK.md --deck ./deck.toml
65
- lythoskill-arena agent-run --task ./TASK.md --deck ./deck.toml --player kimi --out ./output
66
+ # Single-player deck test (exec shortcut)
67
+ lythoskill-arena single --task ./TASK.agent.md --deck ./deck.toml
68
+ lythoskill-arena single --brief "Generate auth flow diagram" --deck ./deck.toml --player kimi
66
69
 
67
- # Declarative mode (k8s-style)
68
- lythoskill-arena run --config ./arena.toml
69
- lythoskill-arena run --config ./arena.toml --dry-run
70
-
71
- # CLI-flag mode (backward compat)
72
- lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
70
+ # Multi-side comparison (declarative)
71
+ lythoskill-arena vs --config ./arena.toml
72
+ lythoskill-arena vs --config ./arena.toml --dry-run
73
73
 
74
74
  # Legacy scaffolding
75
75
  lythoskill-arena scaffold --task "Refactor auth module" --decks ./decks/a.toml,./decks/b.toml
@@ -77,9 +77,9 @@ Examples:
77
77
  `)
78
78
  }
79
79
 
80
- // ── agent-run: single agent execution (simplest path) ────────────────────
80
+ // ── single: single-player deck test (exec shortcut) ──────────────────────
81
81
 
82
- async function agentRun(args: string[]) {
82
+ async function singleRun(args: string[]) {
83
83
  const opts: Record<string, string | undefined> = {}
84
84
  for (let i = 0; i < args.length; i++) {
85
85
  if (args[i] === '--task' || args[i] === '-t') opts.task = args[++i]
@@ -91,11 +91,16 @@ async function agentRun(args: string[]) {
91
91
  }
92
92
 
93
93
  if (!opts.deck) {
94
- console.error('❌ --deck <path> is required')
94
+ console.error(`❌ --deck <path> is required.
95
+ Usage: lythoskill-arena single --deck ./deck.toml --task ./scenario.agent.md
96
+ lythoskill-arena single --deck ./deck.toml --brief "your task description"
97
+ Example decks: examples/decks/scout.toml, examples/decks/documents.toml`)
95
98
  process.exit(1)
96
99
  }
97
100
  if (!opts.task && (!opts.brief || !opts.brief.trim())) {
98
- console.error('❌ --task <path> or --brief "<prompt>" is required and cannot be empty')
101
+ console.error(`❌ --task <path> or --brief "<prompt>" is required.
102
+ Usage: lythoskill-arena single --deck ./deck.toml --task ./scenario.agent.md
103
+ lythoskill-arena single --deck ./deck.toml --brief "your task description"`)
99
104
  process.exit(1)
100
105
  }
101
106
 
@@ -119,7 +124,10 @@ async function agentRun(args: string[]) {
119
124
  deckPath = dest
120
125
  } else {
121
126
  deckPath = resolve(opts.deck)
122
- if (!deckExists(deckPath)) { console.error(`❌ Deck file not found: ${deckPath}`); process.exit(1) }
127
+ if (!deckExists(deckPath)) { console.error(`❌ Deck file not found: ${deckPath}
128
+ Create one: examples/decks/scout.toml (minimal), examples/decks/documents.toml (documents)
129
+ Or fetch: curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml > deck.toml
130
+ Or create: see https://github.com/lythos-labs/lythoskill/tree/main/examples/decks/`); process.exit(1) }
123
131
  }
124
132
 
125
133
  const { useAgent } = await import('@lythos/test-utils/agents')
@@ -139,8 +147,32 @@ async function agentRun(args: string[]) {
139
147
  const scenarioOpt: Record<string, unknown> = {}
140
148
  if (opts.task) {
141
149
  const taskPath = resolve(opts.task)
142
- if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}`); process.exit(1) }
150
+ if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}
151
+ Create a .agent.md scenario or use --brief for inline tasks.
152
+ Format: frontmatter (name, description, timeout) + Given/When/Then/Judge sections.
153
+ Example: see playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
143
154
  scenarioOpt.scenarioPath = taskPath
155
+ // Quick validation: check frontmatter presence
156
+ const raw = readFileSync(taskPath, 'utf-8')
157
+ if (!raw.startsWith('---')) { console.error(`❌ Invalid .agent.md: missing frontmatter (must start with "---")
158
+ Correct format:
159
+ ---
160
+ name: my-scenario
161
+ description: what this tests
162
+ timeout: 120000
163
+ ---
164
+ ## Given
165
+ ...
166
+ ## When
167
+ ...
168
+ ## Then
169
+ ...
170
+ ## Judge
171
+ ...
172
+ Template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
173
+ if (!raw.includes('## When')) { console.error(`❌ Invalid .agent.md: missing "## When" section.
174
+ The ## When section defines what the agent should do.
175
+ See template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
144
176
  } else {
145
177
  scenarioOpt.scenario = {
146
178
  name: 'ad-hoc task',
@@ -693,7 +725,7 @@ function runViz(argv: string[]) {
693
725
 
694
726
  // ── Run: programmatic arena execution ───────────────────────
695
727
 
696
- async function runProgrammaticArena(argv: string[]) {
728
+ async function vsRun(argv: string[]) {
697
729
  const { options } = parseArgs(argv)
698
730
  const { readFileSync } = await import('node:fs')
699
731
 
@@ -731,13 +763,15 @@ async function runProgrammaticArena(argv: string[]) {
731
763
  return
732
764
  }
733
765
 
734
- // CLI-flag mode (backward compat)
735
- if (!options.task || !options.decks) {
736
- console.error('❌ --task <path> and --decks <list> are required for "run" (or use --config <arena.toml>)')
737
- process.exit(1)
738
- }
739
-
740
- const { runArena: runArenaProgrammatic } = await import('./runner')
766
+ // --config was not provided
767
+ console.error(`❌ --config <arena.toml> is required.
768
+ Usage: lythoskill-arena vs --config ./arena.toml
769
+ lythoskill-arena vs --config ./arena.toml --dry-run
770
+ Example configs:
771
+ examples/arena/research-compare/arena.toml — two-side A/B
772
+ examples/arena/add-remove/arena.toml — three-side Pareto
773
+ Create one: cp examples/arena/research-compare/arena.toml ./arena.toml`)
774
+ process.exit(1)
741
775
 
742
776
  const result = await runArenaProgrammatic({
743
777
  taskPath: options.task,
@@ -758,18 +792,20 @@ if (import.meta.main) {
758
792
  const args = process.argv.slice(2)
759
793
  const cmd = args[0]
760
794
 
761
- if (cmd === 'agent-run') {
762
- agentRun(args.slice(1))
795
+ if (cmd === 'single') {
796
+ singleRun(args.slice(1))
763
797
  } else if (cmd === 'viz') {
764
798
  runViz(args.slice(1))
765
- } else if (cmd === 'run') {
766
- runProgrammaticArena(args.slice(1))
799
+ } else if (cmd === 'vs') {
800
+ vsRun(args.slice(1))
767
801
  } else if (cmd === 'scaffold' || !cmd || args[0]?.startsWith('-')) {
768
802
  // Legacy behavior: if no subcommand or starts with flags, treat as scaffold
769
803
  runArena(cmd === 'scaffold' ? args.slice(1) : args)
770
804
  } else {
771
- console.error(`❌ Unknown command: ${cmd}`)
772
- printHelp()
805
+ console.error(`❌ Unknown command: "${cmd}"
806
+ Available: single, vs, scaffold, viz
807
+ Usage: lythoskill-arena <command> [options]
808
+ Help: lythoskill-arena --help`)
773
809
  process.exit(1)
774
810
  }
775
811
  }
package/src/runner.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
1
+ import { existsSync, mkdirSync, writeFileSync, readFileSync, rmSync } from 'node:fs'
2
2
  import { join, resolve } from 'node:path'
3
3
  import { tmpdir } from 'node:os'
4
4
  import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
@@ -57,7 +57,35 @@ export async function runArenaFromToml(opts: {
57
57
  if (configDir) return resolve(configDir, p)
58
58
  return resolve(p)
59
59
  }
60
- const taskAbs = resolvePath(taskPath)
60
+ const resolveOrCreateTask = (): { path: string; cleanup?: () => void } => {
61
+ const candidate = resolvePath(taskPath)
62
+ if (existsSync(candidate)) return { path: candidate }
63
+ // taskPath is inline text — write temp scenario file
64
+ const tmp = join(tmpdir(), `arena-task-${stamp()}.agent.md`)
65
+ writeFileSync(tmp, `---
66
+ name: arena task
67
+ description: ${taskPath.slice(0, 80)}
68
+ timeout: 120000
69
+ ---
70
+
71
+ ## Given
72
+ - Working directory with an empty project
73
+ - bun is available
74
+
75
+ ## When
76
+ ${taskPath}
77
+
78
+ ## Then
79
+ - Complete the task above
80
+ - Write a summary to output.md
81
+
82
+ ## Judge
83
+ - completeness
84
+ - correctness
85
+ `)
86
+ return { path: tmp, cleanup: () => { try { rmSync(tmp) } catch {} } }
87
+ }
88
+ const { path: taskAbs, cleanup: taskCleanup } = resolveOrCreateTask()
61
89
  const resolvedToml: ArenaToml = {
62
90
  ...toml,
63
91
  side: toml.side.map(s => ({ ...s, deck: resolvePath(s.deck) })),
@@ -78,10 +106,13 @@ export async function runArenaFromToml(opts: {
78
106
  const resolved = resolveSides(resolvedToml)
79
107
 
80
108
  // Build manifest
109
+ const taskContent = existsSync(taskAbs)
110
+ ? readFileSync(taskAbs, 'utf-8').slice(0, 200)
111
+ : taskPath // inline description, not a file path
81
112
  const manifest = ArenaManifest.parse({
82
113
  id: arenaId,
83
114
  created_at: new Date().toISOString(),
84
- task: readFileSync(taskAbs, 'utf-8').slice(0, 200),
115
+ task: taskContent,
85
116
  mode: 'decks',
86
117
  participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
87
118
  id: r.side.name,