@lythos/skill-arena 0.9.43 → 0.9.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/package.json +1 -1
- package/src/cli.ts +62 -37
package/README.md
CHANGED
|
@@ -49,20 +49,20 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
|
|
|
49
49
|
```bash
|
|
50
50
|
bun add -d @lythos/skill-arena
|
|
51
51
|
# or use directly
|
|
52
|
-
bunx @lythos/skill-arena@0.9.43 <command>
|
|
52
|
+
bunx @lythos/skill-arena@0.9.44 <command>
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
## Quick Start
|
|
56
56
|
|
|
57
57
|
```bash
|
|
58
58
|
# Single: test a deck with one agent
|
|
59
|
-
bunx @lythos/skill-arena@0.9.43 single \
|
|
59
|
+
bunx @lythos/skill-arena@0.9.44 single \
|
|
60
60
|
--deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \
|
|
61
61
|
--brief "Generate auth flow diagram"
|
|
62
62
|
|
|
63
63
|
# Vs: compare multiple decks side by side
|
|
64
64
|
curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/research-compare/arena.toml > arena.toml
|
|
65
|
-
bunx @lythos/skill-arena@0.9.43 vs --config ./arena.toml
|
|
65
|
+
bunx @lythos/skill-arena@0.9.44 vs --config ./arena.toml
|
|
66
66
|
```
|
|
67
67
|
|
|
68
68
|
## Commands
|
|
@@ -71,23 +71,23 @@ bunx @lythos/skill-arena@0.9.43 vs --config ./arena.toml
|
|
|
71
71
|
|
|
72
72
|
```bash
|
|
73
73
|
# Print execution plan without running
|
|
74
|
-
bunx @lythos/skill-arena@0.9.43 vs --config arena.toml --dry-run
|
|
74
|
+
bunx @lythos/skill-arena@0.9.44 vs --config arena.toml --dry-run
|
|
75
75
|
|
|
76
76
|
# Execute with per-side runs_per_side and statistical aggregation
|
|
77
|
-
bunx @lythos/skill-arena@0.9.43 vs --config arena.toml
|
|
77
|
+
bunx @lythos/skill-arena@0.9.44 vs --config arena.toml
|
|
78
78
|
```
|
|
79
79
|
|
|
80
80
|
### Scaffold mode (legacy, manual execution)
|
|
81
81
|
|
|
82
82
|
```
|
|
83
|
-
bunx @lythos/skill-arena@0.9.43 scaffold --task "Generate auth flow diagram" \
|
|
83
|
+
bunx @lythos/skill-arena@0.9.44 scaffold --task "Generate auth flow diagram" \
|
|
84
84
|
--decks https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml,https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/documents.toml
|
|
85
85
|
```
|
|
86
86
|
|
|
87
87
|
### Viz
|
|
88
88
|
|
|
89
89
|
```bash
|
|
90
|
-
bunx @lythos/skill-arena@0.9.43 viz runs/arena-<id>/
|
|
90
|
+
bunx @lythos/skill-arena@0.9.44 viz runs/arena-<id>/
|
|
91
91
|
```
|
|
92
92
|
|
|
93
93
|
## Skill Documentation
|
|
@@ -101,7 +101,7 @@ The agent-visible **Skill** layer documentation is here:
|
|
|
101
101
|
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
|
|
102
102
|
|
|
103
103
|
```
|
|
104
|
-
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.43 ...
|
|
104
|
+
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.44 ...
|
|
105
105
|
Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
106
106
|
Output (skills/<name>/) → git commit → agent-visible skill
|
|
107
107
|
```
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -67,6 +67,8 @@ Examples:
|
|
|
67
67
|
lythoskill-arena single \\
|
|
68
68
|
--deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
|
|
69
69
|
--brief "Generate auth flow diagram" --player kimi
|
|
70
|
+
# If you already have a local deck file, point to it directly:
|
|
71
|
+
# lythoskill-arena single --deck ./examples/decks/scout.toml --brief "..."
|
|
70
72
|
|
|
71
73
|
# Multi-side comparison (declarative)
|
|
72
74
|
curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/add-remove/arena.toml > arena.toml
|
|
@@ -97,21 +99,71 @@ async function singleRun(args: string[]) {
|
|
|
97
99
|
if (!opts.deck) {
|
|
98
100
|
console.error(`❌ --deck <path|url> is required.
|
|
99
101
|
--deck accepts local paths and http/https URLs (auto-fetched).
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
102
|
+
|
|
103
|
+
Example (no local file needed — URL is auto-fetched):
|
|
104
|
+
lythoskill-arena single \\
|
|
105
|
+
--deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
|
|
106
|
+
--brief "your task"
|
|
107
|
+
|
|
108
|
+
Or with a local deck file you already have:
|
|
109
|
+
lythoskill-arena single --deck ./examples/decks/scout.toml --brief "your task"`)
|
|
103
110
|
process.exit(1)
|
|
104
111
|
}
|
|
105
112
|
if (!opts.task && (!opts.brief || !opts.brief.trim())) {
|
|
106
113
|
console.error(`❌ --task <path> or --brief "<text>" is required.
|
|
107
114
|
--task reads a .agent.md scenario file; --brief takes inline text.
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
115
|
+
|
|
116
|
+
Example (no local file needed — URL is auto-fetched):
|
|
117
|
+
lythoskill-arena single \\
|
|
118
|
+
--deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \\
|
|
119
|
+
--brief "your task"
|
|
120
|
+
|
|
121
|
+
Or with a local deck file:
|
|
122
|
+
lythoskill-arena single --deck ./examples/decks/scout.toml --brief "your task"`)
|
|
111
123
|
process.exit(1)
|
|
112
124
|
}
|
|
113
125
|
|
|
114
|
-
|
|
126
|
+
// Validate --task file early — before any URL fetch — so bad path fails fast without a wasted network call.
|
|
127
|
+
let resolvedTaskPath: string | undefined
|
|
128
|
+
if (opts.task) {
|
|
129
|
+
resolvedTaskPath = resolve(opts.task)
|
|
130
|
+
if (!existsSync(resolvedTaskPath)) {
|
|
131
|
+
console.error(`❌ Task file not found: ${resolvedTaskPath}
|
|
132
|
+
Use --brief for inline tasks, or point --task to an existing .agent.md file.
|
|
133
|
+
Format: name + description + Given/When/Then/Judge sections.
|
|
134
|
+
|
|
135
|
+
Example (URL): lythoskill-arena single --brief "your task" --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml
|
|
136
|
+
Or (local): lythoskill-arena single --brief "your task" --deck ./examples/decks/scout.toml`)
|
|
137
|
+
process.exit(1)
|
|
138
|
+
}
|
|
139
|
+
const raw = readFileSync(resolvedTaskPath, 'utf-8')
|
|
140
|
+
if (!raw.startsWith('---')) {
|
|
141
|
+
console.error(`❌ Invalid .agent.md: missing frontmatter (must start with "---")
|
|
142
|
+
Correct format:
|
|
143
|
+
---
|
|
144
|
+
name: my-scenario
|
|
145
|
+
description: what this tests
|
|
146
|
+
timeout: 120000
|
|
147
|
+
---
|
|
148
|
+
## Given
|
|
149
|
+
...
|
|
150
|
+
## When
|
|
151
|
+
...
|
|
152
|
+
## Then
|
|
153
|
+
...
|
|
154
|
+
## Judge
|
|
155
|
+
...
|
|
156
|
+
Template: playground/arena-one-shot/TASK-arena.agent.md`)
|
|
157
|
+
process.exit(1)
|
|
158
|
+
}
|
|
159
|
+
if (!raw.includes('## When')) {
|
|
160
|
+
console.error(`❌ Invalid .agent.md: missing "## When" section.
|
|
161
|
+
The ## When section defines what the agent should do.
|
|
162
|
+
See template: playground/arena-one-shot/TASK-arena.agent.md`)
|
|
163
|
+
process.exit(1)
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
115
167
|
const { existsSync: deckExists, writeFileSync: deckWrite } = await import('node:fs')
|
|
116
168
|
let deckPath: string
|
|
117
169
|
if (opts.deck.startsWith('http://') || opts.deck.startsWith('https://')) {
|
|
@@ -152,43 +204,16 @@ async function singleRun(args: string[]) {
|
|
|
152
204
|
try { await import('@lythos/agent-adapter-deepseek-serve') } catch { /* package not installed */ }
|
|
153
205
|
const { runAgentScenario } = await import('@lythos/test-utils/agent-bdd')
|
|
154
206
|
const { resolvePlayer } = await import('./player')
|
|
155
|
-
const { readFileSync, writeFileSync, mkdirSync } = await import('node:fs')
|
|
156
207
|
|
|
157
208
|
const player = resolvePlayer(opts.player ?? 'kimi')
|
|
158
209
|
const agent = useAgent(player)
|
|
159
210
|
const outDir = opts.out ? resolve(opts.out) : join(process.cwd(), `agent-output-${new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19)}`)
|
|
160
211
|
mkdirSync(outDir, { recursive: true })
|
|
161
212
|
|
|
162
|
-
// Resolve task: --brief builds scenario directly, --task
|
|
213
|
+
// Resolve task: --brief builds scenario directly, --task uses pre-validated path
|
|
163
214
|
const scenarioOpt: Record<string, unknown> = {}
|
|
164
|
-
if (
|
|
165
|
-
|
|
166
|
-
if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}
|
|
167
|
-
Use --brief for inline tasks, or point --task to an existing .agent.md file.
|
|
168
|
-
Format: name + description + Given/When/Then/Judge sections.
|
|
169
|
-
Example: lythoskill-arena single --brief "your task" --deck <url>`); process.exit(1) }
|
|
170
|
-
scenarioOpt.scenarioPath = taskPath
|
|
171
|
-
// Quick validation: check frontmatter presence
|
|
172
|
-
const raw = readFileSync(taskPath, 'utf-8')
|
|
173
|
-
if (!raw.startsWith('---')) { console.error(`❌ Invalid .agent.md: missing frontmatter (must start with "---")
|
|
174
|
-
Correct format:
|
|
175
|
-
---
|
|
176
|
-
name: my-scenario
|
|
177
|
-
description: what this tests
|
|
178
|
-
timeout: 120000
|
|
179
|
-
---
|
|
180
|
-
## Given
|
|
181
|
-
...
|
|
182
|
-
## When
|
|
183
|
-
...
|
|
184
|
-
## Then
|
|
185
|
-
...
|
|
186
|
-
## Judge
|
|
187
|
-
...
|
|
188
|
-
Template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
|
|
189
|
-
if (!raw.includes('## When')) { console.error(`❌ Invalid .agent.md: missing "## When" section.
|
|
190
|
-
The ## When section defines what the agent should do.
|
|
191
|
-
See template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
|
|
215
|
+
if (resolvedTaskPath) {
|
|
216
|
+
scenarioOpt.scenarioPath = resolvedTaskPath
|
|
192
217
|
} else {
|
|
193
218
|
scenarioOpt.scenario = {
|
|
194
219
|
name: 'ad-hoc task',
|