@lythos/skill-arena 0.9.43 → 0.9.44

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +8 -8
  2. package/package.json +1 -1
  3. package/src/cli.ts +62 -37
package/README.md CHANGED
@@ -49,20 +49,20 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
49
49
  ```bash
50
50
  bun add -d @lythos/skill-arena
51
51
  # or use directly
52
- bunx @lythos/skill-arena@0.9.43 <command>
52
+ bunx @lythos/skill-arena@0.9.44 <command>
53
53
  ```
54
54
 
55
55
  ## Quick Start
56
56
 
57
57
  ```bash
58
58
  # Single: test a deck with one agent
59
- bunx @lythos/skill-arena@0.9.43 single \
59
+ bunx @lythos/skill-arena@0.9.44 single \
60
60
  --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \
61
61
  --brief "Generate auth flow diagram"
62
62
 
63
63
  # Vs: compare multiple decks side by side
64
64
  curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/research-compare/arena.toml > arena.toml
65
- bunx @lythos/skill-arena@0.9.43 vs --config ./arena.toml
65
+ bunx @lythos/skill-arena@0.9.44 vs --config ./arena.toml
66
66
  ```
67
67
 
68
68
  ## Commands
@@ -71,23 +71,23 @@ bunx @lythos/skill-arena@0.9.43 vs --config ./arena.toml
71
71
 
72
72
  ```bash
73
73
  # Print execution plan without running
74
- bunx @lythos/skill-arena@0.9.43 vs --config arena.toml --dry-run
74
+ bunx @lythos/skill-arena@0.9.44 vs --config arena.toml --dry-run
75
75
 
76
76
  # Execute with per-side runs_per_side and statistical aggregation
77
- bunx @lythos/skill-arena@0.9.43 vs --config arena.toml
77
+ bunx @lythos/skill-arena@0.9.44 vs --config arena.toml
78
78
  ```
79
79
 
80
80
  ### Scaffold mode (legacy, manual execution)
81
81
 
82
82
  ```
83
- bunx @lythos/skill-arena@0.9.43 scaffold --task "Generate auth flow diagram" \
83
+ bunx @lythos/skill-arena@0.9.44 scaffold --task "Generate auth flow diagram" \
84
84
  --decks https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml,https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/documents.toml
85
85
  ```
86
86
 
87
87
  ### Viz
88
88
 
89
89
  ```bash
90
- bunx @lythos/skill-arena@0.9.43 viz runs/arena-<id>/
90
+ bunx @lythos/skill-arena@0.9.44 viz runs/arena-<id>/
91
91
  ```
92
92
 
93
93
  ## Skill Documentation
@@ -101,7 +101,7 @@ The agent-visible **Skill** layer documentation is here:
101
101
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
102
102
 
103
103
  ```
104
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.43 ...
104
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.44 ...
105
105
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
106
106
  Output (skills/<name>/) → git commit → agent-visible skill
107
107
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.43",
3
+ "version": "0.9.44",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
package/src/cli.ts CHANGED
@@ -67,6 +67,8 @@ Examples:
67
67
  lythoskill-arena single \\
68
68
  --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
69
69
  --brief "Generate auth flow diagram" --player kimi
70
+ # If you already have a local deck file, point to it directly:
71
+ # lythoskill-arena single --deck ./examples/decks/scout.toml --brief "..."
70
72
 
71
73
  # Multi-side comparison (declarative)
72
74
  curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/add-remove/arena.toml > arena.toml
@@ -97,21 +99,71 @@ async function singleRun(args: string[]) {
97
99
  if (!opts.deck) {
98
100
  console.error(`❌ --deck <path|url> is required.
99
101
  --deck accepts local paths and http/https URLs (auto-fetched).
100
- Example: lythoskill-arena single \\
101
- --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
102
- --brief "your task"`)
102
+
103
+ Example (no local file needed — URL is auto-fetched):
104
+ lythoskill-arena single \\
105
+ --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
106
+ --brief "your task"
107
+
108
+ Or with a local deck file you already have:
109
+ lythoskill-arena single --deck ./examples/decks/scout.toml --brief "your task"`)
103
110
  process.exit(1)
104
111
  }
105
112
  if (!opts.task && (!opts.brief || !opts.brief.trim())) {
106
113
  console.error(`❌ --task <path> or --brief "<text>" is required.
107
114
  --task reads a .agent.md scenario file; --brief takes inline text.
108
- Example: lythoskill-arena single \\
109
- --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
110
- --brief "your task"`)
115
+
116
+ Example (no local file needed — URL is auto-fetched):
117
+ lythoskill-arena single \\
118
+ --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
119
+ --brief "your task"
120
+
121
+ Or with a local deck file:
122
+ lythoskill-arena single --deck ./examples/decks/scout.toml --brief "your task"`)
111
123
  process.exit(1)
112
124
  }
113
125
 
114
- const { resolve, join } = await import('node:path')
126
+ // Validate the --task file early, before any URL fetch, so a bad path fails fast without a wasted network call.
127
+ let resolvedTaskPath: string | undefined
128
+ if (opts.task) {
129
+ resolvedTaskPath = resolve(opts.task)
130
+ if (!existsSync(resolvedTaskPath)) {
131
+ console.error(`❌ Task file not found: ${resolvedTaskPath}
132
+ Use --brief for inline tasks, or point --task to an existing .agent.md file.
133
+ Format: name + description + Given/When/Then/Judge sections.
134
+
135
+ Example (URL): lythoskill-arena single --brief "your task" --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml
136
+ Or (local): lythoskill-arena single --brief "your task" --deck ./examples/decks/scout.toml`)
137
+ process.exit(1)
138
+ }
139
+ const raw = readFileSync(resolvedTaskPath, 'utf-8')
140
+ if (!raw.startsWith('---')) {
141
+ console.error(`❌ Invalid .agent.md: missing frontmatter (must start with "---")
142
+ Correct format:
143
+ ---
144
+ name: my-scenario
145
+ description: what this tests
146
+ timeout: 120000
147
+ ---
148
+ ## Given
149
+ ...
150
+ ## When
151
+ ...
152
+ ## Then
153
+ ...
154
+ ## Judge
155
+ ...
156
+ Template: playground/arena-one-shot/TASK-arena.agent.md`)
157
+ process.exit(1)
158
+ }
159
+ if (!raw.includes('## When')) {
160
+ console.error(`❌ Invalid .agent.md: missing "## When" section.
161
+ The ## When section defines what the agent should do.
162
+ See template: playground/arena-one-shot/TASK-arena.agent.md`)
163
+ process.exit(1)
164
+ }
165
+ }
166
+
115
167
  const { existsSync: deckExists, writeFileSync: deckWrite } = await import('node:fs')
116
168
  let deckPath: string
117
169
  if (opts.deck.startsWith('http://') || opts.deck.startsWith('https://')) {
@@ -152,43 +204,16 @@ async function singleRun(args: string[]) {
152
204
  try { await import('@lythos/agent-adapter-deepseek-serve') } catch { /* package not installed */ }
153
205
  const { runAgentScenario } = await import('@lythos/test-utils/agent-bdd')
154
206
  const { resolvePlayer } = await import('./player')
155
- const { readFileSync, writeFileSync, mkdirSync } = await import('node:fs')
156
207
 
157
208
  const player = resolvePlayer(opts.player ?? 'kimi')
158
209
  const agent = useAgent(player)
159
210
  const outDir = opts.out ? resolve(opts.out) : join(process.cwd(), `agent-output-${new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19)}`)
160
211
  mkdirSync(outDir, { recursive: true })
161
212
 
162
- // Resolve task: --brief builds scenario directly, --task reads .agent.md file
213
+ // Resolve task: --brief builds scenario directly, --task uses pre-validated path
163
214
  const scenarioOpt: Record<string, unknown> = {}
164
- if (opts.task) {
165
- const taskPath = resolve(opts.task)
166
- if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}
167
- Use --brief for inline tasks, or point --task to an existing .agent.md file.
168
- Format: name + description + Given/When/Then/Judge sections.
169
- Example: lythoskill-arena single --brief "your task" --deck <url>`); process.exit(1) }
170
- scenarioOpt.scenarioPath = taskPath
171
- // Quick validation: check frontmatter presence
172
- const raw = readFileSync(taskPath, 'utf-8')
173
- if (!raw.startsWith('---')) { console.error(`❌ Invalid .agent.md: missing frontmatter (must start with "---")
174
- Correct format:
175
- ---
176
- name: my-scenario
177
- description: what this tests
178
- timeout: 120000
179
- ---
180
- ## Given
181
- ...
182
- ## When
183
- ...
184
- ## Then
185
- ...
186
- ## Judge
187
- ...
188
- Template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
189
- if (!raw.includes('## When')) { console.error(`❌ Invalid .agent.md: missing "## When" section.
190
- The ## When section defines what the agent should do.
191
- See template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
215
+ if (resolvedTaskPath) {
216
+ scenarioOpt.scenarioPath = resolvedTaskPath
192
217
  } else {
193
218
  scenarioOpt.scenario = {
194
219
  name: 'ad-hoc task',