@lythos/skill-arena 0.9.40 → 0.9.42

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -49,23 +49,20 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
49
49
  ```bash
50
50
  bun add -d @lythos/skill-arena
51
51
  # or use directly
52
- bunx @lythos/skill-arena@0.9.40 <command>
52
+ bunx @lythos/skill-arena@0.9.42 <command>
53
53
  ```
54
54
 
55
55
  ## Quick Start
56
56
 
57
57
  ```bash
58
- # Single: test a deck with one agent (exec shortcut)
59
- bunx @lythos/skill-arena@0.9.40 single \
60
- --brief "Generate auth flow diagram" \
61
- --deck ./examples/decks/documents.toml
62
-
63
- # Vs: compare multiple decks side by side (declarative)
64
- bunx @lythos/skill-arena@0.9.40 vs \
65
- --config examples/arena/research-compare/arena.toml
66
-
67
- # Visualize results
68
- bunx @lythos/skill-arena@0.9.40 viz tmp/arena-<id>/
58
+ # Single: test a deck with one agent
59
+ bunx @lythos/skill-arena@0.9.42 single \
60
+ --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \
61
+ --brief "Generate auth flow diagram"
62
+
63
+ # Vs: compare multiple decks side by side
64
+ curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/research-compare/arena.toml > arena.toml
65
+ bunx @lythos/skill-arena@0.9.42 vs --config ./arena.toml
69
66
  ```
70
67
 
71
68
  ## Commands
@@ -74,32 +71,23 @@ bunx @lythos/skill-arena@0.9.40 viz tmp/arena-<id>/
74
71
 
75
72
  ```bash
76
73
  # Print execution plan without running
77
- bunx @lythos/skill-arena@0.9.40 vs --config arena.toml --dry-run
74
+ bunx @lythos/skill-arena@0.9.42 vs --config arena.toml --dry-run
78
75
 
79
76
  # Execute with per-side runs_per_side and statistical aggregation
80
- bunx @lythos/skill-arena@0.9.40 vs --config arena.toml
81
- ```
82
-
83
- ### CLI-flag mode (backward compat)
84
-
85
- ```
86
- bunx @lythos/skill-arena@0.9.40 run \
87
- --task ./TASK-arena.md \
88
- --players ./players/claude.toml \
89
- --decks ./decks/run-01.toml,./decks/run-02.toml \
90
- --criteria coverage,relevance,actionability,depth
77
+ bunx @lythos/skill-arena@0.9.42 vs --config arena.toml
91
78
  ```
92
79
 
93
80
  ### Scaffold mode (legacy, manual execution)
94
81
 
95
82
  ```
96
- bunx @lythos/skill-arena@0.9.40 scaffold --task "..." --decks a.toml,b.toml
83
+ bunx @lythos/skill-arena@0.9.42 scaffold --task "Generate auth flow diagram" \
84
+ --decks https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml,https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/documents.toml
97
85
  ```
98
86
 
99
87
  ### Viz
100
88
 
101
89
  ```bash
102
- bunx @lythos/skill-arena@0.9.40 viz runs/arena-<id>/
90
+ bunx @lythos/skill-arena@0.9.42 viz runs/arena-<id>/
103
91
  ```
104
92
 
105
93
  ## Skill Documentation
@@ -113,7 +101,7 @@ The agent-visible **Skill** layer documentation is here:
113
101
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
114
102
 
115
103
  ```
116
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.40 ...
104
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.42 ...
117
105
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
118
106
  Output (skills/<name>/) → git commit → agent-visible skill
119
107
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.40",
3
+ "version": "0.9.42",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
package/src/cli.ts CHANGED
@@ -63,16 +63,20 @@ Options:
63
63
  --timeout <ms> Subagent timeout (single only)
64
64
 
65
65
  Examples:
66
- # Single-player deck test (exec shortcut)
67
- lythoskill-arena single --task ./TASK.agent.md --deck ./deck.toml
68
- lythoskill-arena single --brief "Generate auth flow diagram" --deck ./deck.toml --player kimi
66
+ # Single-player deck test (--deck accepts local paths and http/https URLs)
67
+ lythoskill-arena single \\
68
+ --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
69
+ --brief "Generate auth flow diagram" --player kimi
69
70
 
70
71
  # Multi-side comparison (declarative)
72
+ curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/add-remove/arena.toml > arena.toml
71
73
  lythoskill-arena vs --config ./arena.toml
72
74
  lythoskill-arena vs --config ./arena.toml --dry-run
73
75
 
74
76
  # Legacy scaffolding
75
- lythoskill-arena scaffold --task "Refactor auth module" --decks ./decks/a.toml,./decks/b.toml
77
+ # scaffold creates structure; decks via URL (auto-downloaded during link):
78
+ lythoskill-arena scaffold --task "Refactor auth module" \\
79
+ --decks https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml,https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/documents.toml
76
80
  lythoskill-arena viz runs/arena-20260504
77
81
  `)
78
82
  }
@@ -91,16 +95,19 @@ async function singleRun(args: string[]) {
91
95
  }
92
96
 
93
97
  if (!opts.deck) {
94
- console.error(`❌ --deck <path> is required.
95
- Usage: lythoskill-arena single --deck ./deck.toml --task ./scenario.agent.md
96
- lythoskill-arena single --deck ./deck.toml --brief "your task description"
97
- Example decks: examples/decks/scout.toml, examples/decks/documents.toml`)
98
+ console.error(`❌ --deck <path|url> is required.
99
+ --deck accepts local paths and http/https URLs (auto-fetched).
100
+ Example: lythoskill-arena single \\
101
+ --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
102
+ --brief "your task"`)
98
103
  process.exit(1)
99
104
  }
100
105
  if (!opts.task && (!opts.brief || !opts.brief.trim())) {
101
- console.error(`❌ --task <path> or --brief "<prompt>" is required.
102
- Usage: lythoskill-arena single --deck ./deck.toml --task ./scenario.agent.md
103
- lythoskill-arena single --deck ./deck.toml --brief "your task description"`)
106
+ console.error(`❌ --task <path> or --brief "<text>" is required.
107
+ --task reads a .agent.md scenario file; --brief takes inline text.
108
+ Example: lythoskill-arena single \\
109
+ --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
110
+ --brief "your task"`)
104
111
  process.exit(1)
105
112
  }
106
113
 
@@ -125,9 +132,9 @@ async function singleRun(args: string[]) {
125
132
  } else {
126
133
  deckPath = resolve(opts.deck)
127
134
  if (!deckExists(deckPath)) { console.error(`❌ Deck file not found: ${deckPath}
128
- Create one: examples/decks/scout.toml (minimal), examples/decks/documents.toml (documents)
129
- Or fetch: curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml > deck.toml
130
- Or create: see https://github.com/lythos-labs/lythoskill/tree/main/examples/decks/`); process.exit(1) }
135
+ Make sure the path is correct, or use a URL:
136
+ --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml
137
+ (URLs are auto-fetched — no local file needed)`); process.exit(1) }
131
138
  }
132
139
 
133
140
  const { useAgent } = await import('@lythos/test-utils/agents')
@@ -148,10 +155,31 @@ async function singleRun(args: string[]) {
148
155
  if (opts.task) {
149
156
  const taskPath = resolve(opts.task)
150
157
  if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}
151
- Create a .agent.md scenario or use --brief for inline tasks.
152
- Format: frontmatter (name, description, timeout) + Given/When/Then/Judge sections.
153
- Example: see playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
158
+ Use --brief for inline tasks, or point --task to an existing .agent.md file.
159
+ Format: name + description + Given/When/Then/Judge sections.
160
+ Example: lythoskill-arena single --brief "your task" --deck <url>`); process.exit(1) }
154
161
  scenarioOpt.scenarioPath = taskPath
162
+ // Quick validation: check frontmatter presence
163
+ const raw = readFileSync(taskPath, 'utf-8')
164
+ if (!raw.startsWith('---')) { console.error(`❌ Invalid .agent.md: missing frontmatter (must start with "---")
165
+ Correct format:
166
+ ---
167
+ name: my-scenario
168
+ description: what this tests
169
+ timeout: 120000
170
+ ---
171
+ ## Given
172
+ ...
173
+ ## When
174
+ ...
175
+ ## Then
176
+ ...
177
+ ## Judge
178
+ ...
179
+ Template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
180
+ if (!raw.includes('## When')) { console.error(`❌ Invalid .agent.md: missing "## When" section.
181
+ The ## When section defines what the agent should do.
182
+ See template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
155
183
  } else {
156
184
  scenarioOpt.scenario = {
157
185
  name: 'ad-hoc task',
@@ -746,10 +774,9 @@ async function vsRun(argv: string[]) {
746
774
  console.error(`❌ --config <arena.toml> is required.
747
775
  Usage: lythoskill-arena vs --config ./arena.toml
748
776
  lythoskill-arena vs --config ./arena.toml --dry-run
749
- Example configs:
750
- examples/arena/research-compare/arena.toml two-side A/B
751
- examples/arena/add-remove/arena.toml three-side Pareto
752
- Create one: cp examples/arena/research-compare/arena.toml ./arena.toml`)
777
+ Fetch an example:
778
+ curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/add-remove/arena.toml > arena.toml
779
+ Then edit arena.toml and run: lythoskill-arena vs --config ./arena.toml`)
753
780
  process.exit(1)
754
781
 
755
782
  const result = await runArenaProgrammatic({
package/src/runner.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
1
+ import { existsSync, mkdirSync, writeFileSync, readFileSync, rmSync } from 'node:fs'
2
2
  import { join, resolve } from 'node:path'
3
3
  import { tmpdir } from 'node:os'
4
4
  import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
@@ -57,7 +57,35 @@ export async function runArenaFromToml(opts: {
57
57
  if (configDir) return resolve(configDir, p)
58
58
  return resolve(p)
59
59
  }
60
- const taskAbs = resolvePath(taskPath)
60
+ const resolveOrCreateTask = (): { path: string; cleanup?: () => void } => {
61
+ const candidate = resolvePath(taskPath)
62
+ if (existsSync(candidate)) return { path: candidate }
63
+ // taskPath is inline text — write temp scenario file
64
+ const tmp = join(tmpdir(), `arena-task-${stamp()}.agent.md`)
65
+ writeFileSync(tmp, `---
66
+ name: arena task
67
+ description: ${taskPath.slice(0, 80)}
68
+ timeout: 120000
69
+ ---
70
+
71
+ ## Given
72
+ - Working directory with an empty project
73
+ - bun is available
74
+
75
+ ## When
76
+ ${taskPath}
77
+
78
+ ## Then
79
+ - Complete the task above
80
+ - Write a summary to output.md
81
+
82
+ ## Judge
83
+ - completeness
84
+ - correctness
85
+ `)
86
+ return { path: tmp, cleanup: () => { try { rmSync(tmp) } catch {} } }
87
+ }
88
+ const { path: taskAbs, cleanup: taskCleanup } = resolveOrCreateTask()
61
89
  const resolvedToml: ArenaToml = {
62
90
  ...toml,
63
91
  side: toml.side.map(s => ({ ...s, deck: resolvePath(s.deck) })),
@@ -78,10 +106,13 @@ export async function runArenaFromToml(opts: {
78
106
  const resolved = resolveSides(resolvedToml)
79
107
 
80
108
  // Build manifest
109
+ const taskContent = existsSync(taskAbs)
110
+ ? readFileSync(taskAbs, 'utf-8').slice(0, 200)
111
+ : taskPath // inline description, not a file path
81
112
  const manifest = ArenaManifest.parse({
82
113
  id: arenaId,
83
114
  created_at: new Date().toISOString(),
84
- task: readFileSync(taskAbs, 'utf-8').slice(0, 200),
115
+ task: taskContent,
85
116
  mode: 'decks',
86
117
  participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
87
118
  id: r.side.name,