@lythos/skill-arena 0.9.42 → 0.9.44

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +8 -8
  2. package/package.json +1 -1
  3. package/src/cli.ts +73 -39
package/README.md CHANGED
@@ -49,20 +49,20 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
49
49
  ```bash
50
50
  bun add -d @lythos/skill-arena
51
51
  # or use directly
52
- bunx @lythos/skill-arena@0.9.42 <command>
52
+ bunx @lythos/skill-arena@0.9.44 <command>
53
53
  ```
54
54
 
55
55
  ## Quick Start
56
56
 
57
57
  ```bash
58
58
  # Single: test a deck with one agent
59
- bunx @lythos/skill-arena@0.9.42 single \
59
+ bunx @lythos/skill-arena@0.9.44 single \
60
60
  --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \
61
61
  --brief "Generate auth flow diagram"
62
62
 
63
63
  # Vs: compare multiple decks side by side
64
64
  curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/research-compare/arena.toml > arena.toml
65
- bunx @lythos/skill-arena@0.9.42 vs --config ./arena.toml
65
+ bunx @lythos/skill-arena@0.9.44 vs --config ./arena.toml
66
66
  ```
67
67
 
68
68
  ## Commands
@@ -71,23 +71,23 @@ bunx @lythos/skill-arena@0.9.42 vs --config ./arena.toml
71
71
 
72
72
  ```bash
73
73
  # Print execution plan without running
74
- bunx @lythos/skill-arena@0.9.42 vs --config arena.toml --dry-run
74
+ bunx @lythos/skill-arena@0.9.44 vs --config arena.toml --dry-run
75
75
 
76
76
  # Execute with per-side runs_per_side and statistical aggregation
77
- bunx @lythos/skill-arena@0.9.42 vs --config arena.toml
77
+ bunx @lythos/skill-arena@0.9.44 vs --config arena.toml
78
78
  ```
79
79
 
80
80
  ### Scaffold mode (legacy, manual execution)
81
81
 
82
82
  ```
83
- bunx @lythos/skill-arena@0.9.42 scaffold --task "Generate auth flow diagram" \
83
+ bunx @lythos/skill-arena@0.9.44 scaffold --task "Generate auth flow diagram" \
84
84
  --decks https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml,https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/documents.toml
85
85
  ```
86
86
 
87
87
  ### Viz
88
88
 
89
89
  ```bash
90
- bunx @lythos/skill-arena@0.9.42 viz runs/arena-<id>/
90
+ bunx @lythos/skill-arena@0.9.44 viz runs/arena-<id>/
91
91
  ```
92
92
 
93
93
  ## Skill Documentation
@@ -101,7 +101,7 @@ The agent-visible **Skill** layer documentation is here:
101
101
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
102
102
 
103
103
  ```
104
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.42 ...
104
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.44 ...
105
105
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
106
106
  Output (skills/<name>/) → git commit → agent-visible skill
107
107
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.42",
3
+ "version": "0.9.44",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
package/src/cli.ts CHANGED
@@ -67,6 +67,8 @@ Examples:
67
67
  lythoskill-arena single \\
68
68
  --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
69
69
  --brief "Generate auth flow diagram" --player kimi
70
+ # If you already have a local deck file, point to it directly:
71
+ # lythoskill-arena single --deck ./examples/decks/scout.toml --brief "..."
70
72
 
71
73
  # Multi-side comparison (declarative)
72
74
  curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/add-remove/arena.toml > arena.toml
@@ -97,21 +99,71 @@ async function singleRun(args: string[]) {
97
99
  if (!opts.deck) {
98
100
  console.error(`❌ --deck <path|url> is required.
99
101
  --deck accepts local paths and http/https URLs (auto-fetched).
100
- Example: lythoskill-arena single \\
101
- --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
102
- --brief "your task"`)
102
+
103
+ Example (no local file needed — URL is auto-fetched):
104
+ lythoskill-arena single \\
105
+ --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
106
+ --brief "your task"
107
+
108
+ Or with a local deck file you already have:
109
+ lythoskill-arena single --deck ./examples/decks/scout.toml --brief "your task"`)
103
110
  process.exit(1)
104
111
  }
105
112
  if (!opts.task && (!opts.brief || !opts.brief.trim())) {
106
113
  console.error(`❌ --task <path> or --brief "<text>" is required.
107
114
  --task reads a .agent.md scenario file; --brief takes inline text.
108
- Example: lythoskill-arena single \\
109
- --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
110
- --brief "your task"`)
115
+
116
+ Example (no local file needed — URL is auto-fetched):
117
+ lythoskill-arena single \\
118
+ --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
119
+ --brief "your task"
120
+
121
+ Or with a local deck file:
122
+ lythoskill-arena single --deck ./examples/decks/scout.toml --brief "your task"`)
111
123
  process.exit(1)
112
124
  }
113
125
 
114
- const { resolve, join } = await import('node:path')
126
+ // Validate --task file early before any URL fetch — so bad path fails fast without a wasted network call.
127
+ let resolvedTaskPath: string | undefined
128
+ if (opts.task) {
129
+ resolvedTaskPath = resolve(opts.task)
130
+ if (!existsSync(resolvedTaskPath)) {
131
+ console.error(`❌ Task file not found: ${resolvedTaskPath}
132
+ Use --brief for inline tasks, or point --task to an existing .agent.md file.
133
+ Format: name + description + Given/When/Then/Judge sections.
134
+
135
+ Example (URL): lythoskill-arena single --brief "your task" --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml
136
+ Or (local): lythoskill-arena single --brief "your task" --deck ./examples/decks/scout.toml`)
137
+ process.exit(1)
138
+ }
139
+ const raw = readFileSync(resolvedTaskPath, 'utf-8')
140
+ if (!raw.startsWith('---')) {
141
+ console.error(`❌ Invalid .agent.md: missing frontmatter (must start with "---")
142
+ Correct format:
143
+ ---
144
+ name: my-scenario
145
+ description: what this tests
146
+ timeout: 120000
147
+ ---
148
+ ## Given
149
+ ...
150
+ ## When
151
+ ...
152
+ ## Then
153
+ ...
154
+ ## Judge
155
+ ...
156
+ Template: playground/arena-one-shot/TASK-arena.agent.md`)
157
+ process.exit(1)
158
+ }
159
+ if (!raw.includes('## When')) {
160
+ console.error(`❌ Invalid .agent.md: missing "## When" section.
161
+ The ## When section defines what the agent should do.
162
+ See template: playground/arena-one-shot/TASK-arena.agent.md`)
163
+ process.exit(1)
164
+ }
165
+ }
166
+
115
167
  const { existsSync: deckExists, writeFileSync: deckWrite } = await import('node:fs')
116
168
  let deckPath: string
117
169
  if (opts.deck.startsWith('http://') || opts.deck.startsWith('https://')) {
@@ -124,8 +176,17 @@ async function singleRun(args: string[]) {
124
176
  } catch { /* keep original url */ }
125
177
  const dest = resolve(process.cwd(), 'arena-deck.toml')
126
178
  console.log(`📥 Fetching arena deck: ${url}`)
127
- const res = await fetch(url, { signal: AbortSignal.timeout(30_000) })
128
- if (!res.ok) { console.error(`❌ Failed to fetch deck (HTTP ${res.status}): ${url}`); process.exit(1) }
179
+ let res: Response
180
+ try { res = await fetch(url, { signal: AbortSignal.timeout(30_000) }) } catch (e: any) {
181
+ console.error(`❌ Cannot reach ${url}
182
+ Network issue? Try a GitHub proxy mirror:
183
+ ${url.replace('https://raw.githubusercontent.com/', 'https://ghfast.top/https://raw.githubusercontent.com/')}
184
+ Or download manually and reference the local file.`)
185
+ process.exit(1)
186
+ }
187
+ if (!res.ok) { console.error(`❌ Failed to fetch deck (HTTP ${res.status}): ${url}
188
+ Try a GitHub proxy mirror:
189
+ ${url.replace('https://raw.githubusercontent.com/', 'https://ghfast.top/https://raw.githubusercontent.com/')}`); process.exit(1) }
129
190
  deckWrite(dest, await res.text())
130
191
  console.log(` → saved to ${dest}`)
131
192
  deckPath = dest
@@ -143,43 +204,16 @@ async function singleRun(args: string[]) {
143
204
  try { await import('@lythos/agent-adapter-deepseek-serve') } catch { /* package not installed */ }
144
205
  const { runAgentScenario } = await import('@lythos/test-utils/agent-bdd')
145
206
  const { resolvePlayer } = await import('./player')
146
- const { readFileSync, writeFileSync, mkdirSync } = await import('node:fs')
147
207
 
148
208
  const player = resolvePlayer(opts.player ?? 'kimi')
149
209
  const agent = useAgent(player)
150
210
  const outDir = opts.out ? resolve(opts.out) : join(process.cwd(), `agent-output-${new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19)}`)
151
211
  mkdirSync(outDir, { recursive: true })
152
212
 
153
- // Resolve task: --brief builds scenario directly, --task reads .agent.md file
213
+ // Resolve task: --brief builds scenario directly, --task uses pre-validated path
154
214
  const scenarioOpt: Record<string, unknown> = {}
155
- if (opts.task) {
156
- const taskPath = resolve(opts.task)
157
- if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}
158
- Use --brief for inline tasks, or point --task to an existing .agent.md file.
159
- Format: name + description + Given/When/Then/Judge sections.
160
- Example: lythoskill-arena single --brief "your task" --deck <url>`); process.exit(1) }
161
- scenarioOpt.scenarioPath = taskPath
162
- // Quick validation: check frontmatter presence
163
- const raw = readFileSync(taskPath, 'utf-8')
164
- if (!raw.startsWith('---')) { console.error(`❌ Invalid .agent.md: missing frontmatter (must start with "---")
165
- Correct format:
166
- ---
167
- name: my-scenario
168
- description: what this tests
169
- timeout: 120000
170
- ---
171
- ## Given
172
- ...
173
- ## When
174
- ...
175
- ## Then
176
- ...
177
- ## Judge
178
- ...
179
- Template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
180
- if (!raw.includes('## When')) { console.error(`❌ Invalid .agent.md: missing "## When" section.
181
- The ## When section defines what the agent should do.
182
- See template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
215
+ if (resolvedTaskPath) {
216
+ scenarioOpt.scenarioPath = resolvedTaskPath
183
217
  } else {
184
218
  scenarioOpt.scenario = {
185
219
  name: 'ad-hoc task',