@lythos/skill-arena 0.13.0 → 0.13.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -10
- package/package.json +5 -5
- package/src/cli.ts +21 -6
- package/src/runner.ts +49 -2
package/README.md
CHANGED
|
@@ -49,45 +49,59 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
|
|
|
49
49
|
```bash
|
|
50
50
|
bun add -d @lythos/skill-arena
|
|
51
51
|
# or use directly
|
|
52
|
-
bunx @lythos/skill-arena@0.13.
|
|
52
|
+
bunx @lythos/skill-arena@0.13.2 <command>
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
## Quick Start
|
|
56
56
|
|
|
57
57
|
```bash
|
|
58
|
-
# Single: test a deck with one agent
|
|
59
|
-
bunx @lythos/skill-arena@
|
|
58
|
+
# Single: test a deck with one agent (most common)
|
|
59
|
+
bunx @lythos/skill-arena@latest single \
|
|
60
|
+
--deck ./examples/decks/scout.toml \
|
|
61
|
+
--brief "Generate auth flow diagram" \
|
|
62
|
+
--player kimi \
|
|
63
|
+
--timeout 300000 \
|
|
64
|
+
--out ./output
|
|
65
|
+
|
|
66
|
+
# Single with remote deck (URL auto-fetched)
|
|
67
|
+
bunx @lythos/skill-arena@latest single \
|
|
60
68
|
--deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \
|
|
61
|
-
--brief "Generate auth flow diagram"
|
|
69
|
+
--brief "Generate auth flow diagram" \
|
|
70
|
+
--out ./output
|
|
62
71
|
|
|
63
72
|
# Vs: compare multiple decks side by side
|
|
64
73
|
curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/research-compare/arena.toml > arena.toml
|
|
65
|
-
bunx @lythos/skill-arena@
|
|
74
|
+
bunx @lythos/skill-arena@latest vs --config ./arena.toml
|
|
66
75
|
```
|
|
67
76
|
|
|
77
|
+
**Default behavior:**
|
|
78
|
+
- Agent runs in an isolated temporary workdir under the OS temp directory (e.g. `/tmp`), so your workspace is not polluted
|
|
79
|
+
- All artifacts are copied to `--out` after completion
|
|
80
|
+
- The prompt template injects a fixed contract (decision-log, robustness, tool preference) and appends your brief as the task variable
|
|
81
|
+
|
|
68
82
|
## Commands
|
|
69
83
|
|
|
70
84
|
### Declarative mode (k8s-style, recommended)
|
|
71
85
|
|
|
72
86
|
```bash
|
|
73
87
|
# Print execution plan without running
|
|
74
|
-
bunx @lythos/skill-arena@0.13.
|
|
88
|
+
bunx @lythos/skill-arena@0.13.2 vs --config arena.toml --dry-run
|
|
75
89
|
|
|
76
90
|
# Execute with per-side runs_per_side and statistical aggregation
|
|
77
|
-
bunx @lythos/skill-arena@0.13.
|
|
91
|
+
bunx @lythos/skill-arena@0.13.2 vs --config arena.toml
|
|
78
92
|
```
|
|
79
93
|
|
|
80
94
|
### Scaffold mode (legacy, manual execution)
|
|
81
95
|
|
|
82
96
|
```
|
|
83
|
-
bunx @lythos/skill-arena@0.13.
|
|
97
|
+
bunx @lythos/skill-arena@0.13.2 scaffold --task "Generate auth flow diagram" \
|
|
84
98
|
--decks https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml,https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/documents.toml
|
|
85
99
|
```
|
|
86
100
|
|
|
87
101
|
### Viz
|
|
88
102
|
|
|
89
103
|
```bash
|
|
90
|
-
bunx @lythos/skill-arena@0.13.
|
|
104
|
+
bunx @lythos/skill-arena@0.13.2 viz runs/arena-<id>/
|
|
91
105
|
```
|
|
92
106
|
|
|
93
107
|
## Skill Documentation
|
|
@@ -101,7 +115,7 @@ The agent-visible **Skill** layer documentation is here:
|
|
|
101
115
|
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
|
|
102
116
|
|
|
103
117
|
```
|
|
104
|
-
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.13.
|
|
118
|
+
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.13.2 ...
|
|
105
119
|
Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
106
120
|
Output (skills/<name>/) → git commit → agent-visible skill
|
|
107
121
|
```
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lythos/skill-arena",
|
|
3
|
-
"version": "0.13.
|
|
3
|
+
"version": "0.13.2",
|
|
4
4
|
"description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-agent",
|
|
@@ -42,13 +42,13 @@
|
|
|
42
42
|
"bun": ">=1.0.0"
|
|
43
43
|
},
|
|
44
44
|
"dependencies": {
|
|
45
|
-
"@lythos/cold-pool": "^0.13.
|
|
46
|
-
"@lythos/infra": "^0.13.
|
|
47
|
-
"@lythos/test-utils": "^0.13.
|
|
45
|
+
"@lythos/cold-pool": "^0.13.2",
|
|
46
|
+
"@lythos/infra": "^0.13.2",
|
|
47
|
+
"@lythos/test-utils": "^0.13.2",
|
|
48
48
|
"zod": "^3.24.0",
|
|
49
49
|
"zod-to-json-schema": "^3.25.2"
|
|
50
50
|
},
|
|
51
51
|
"optionalDependencies": {
|
|
52
|
-
"@lythos/agent-adapter-claude-sdk": "^0.13.
|
|
52
|
+
"@lythos/agent-adapter-claude-sdk": "^0.13.2"
|
|
53
53
|
}
|
|
54
54
|
}
|
package/src/cli.ts
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
1
2
|
import { writeFileSync, readFileSync, mkdirSync, existsSync, realpathSync } from 'node:fs'
|
|
2
3
|
import { join, resolve } from 'node:path'
|
|
3
|
-
import { homedir } from 'node:os'
|
|
4
|
+
import { homedir, tmpdir } from 'node:os'
|
|
4
5
|
import { ZodError } from 'zod'
|
|
5
|
-
import { formatPlanOutput, type ArenaResult } from './runner'
|
|
6
|
+
import { formatPlanOutput, type ArenaResult, buildArenaPrompt } from './runner'
|
|
6
7
|
import { parseArenaToml, buildExecutionPlan } from './arena-toml'
|
|
7
8
|
import { buildCopyPlan, parseDeckSkills } from './preflight'
|
|
8
9
|
import { checkSkillExistence, formatSkillWarnings, resolveColdPoolDir } from './preflight'
|
|
@@ -263,9 +264,17 @@ async function singleRun(args: string[]) {
|
|
|
263
264
|
else console.log(`📋 brief: ${opts.brief!.slice(0, 60)}...`)
|
|
264
265
|
|
|
265
266
|
// Setup workdir
|
|
266
|
-
const agentWorkdir = join(
|
|
267
|
+
const agentWorkdir = join(tmpdir(), `arena-single-${Date.now()}`)
|
|
267
268
|
mkdirSync(agentWorkdir, { recursive: true })
|
|
268
269
|
writeFileSync(join(agentWorkdir, 'skill-deck.toml'), readFileSync(deckPath, 'utf-8'))
|
|
270
|
+
writeFileSync(join(agentWorkdir, 'AGENTS.md'), [
|
|
271
|
+
'# Arena Test Environment',
|
|
272
|
+
`**Mode**: single`,
|
|
273
|
+
'## How This Works',
|
|
274
|
+
'- Isolated arena test directory. Skills in skill-deck.toml, linked via deck link.',
|
|
275
|
+
'- Complete the task using available skills. Output to this directory.',
|
|
276
|
+
'- MANDATORY: write decision-log.jsonl (see prompt for schema).',
|
|
277
|
+
].join('\n'))
|
|
269
278
|
|
|
270
279
|
const deckRaw = readFileSync(join(agentWorkdir, 'skill-deck.toml'), 'utf-8')
|
|
271
280
|
let deckParsed: Record<string, any> = {}
|
|
@@ -306,10 +315,16 @@ async function singleRun(args: string[]) {
|
|
|
306
315
|
console.warn('⚠️ Could not check skill existence:', e instanceof Error ? e.message : e)
|
|
307
316
|
}
|
|
308
317
|
|
|
309
|
-
//
|
|
318
|
+
// Template injection: brief is the {task} variable, template carries fixed contract
|
|
319
|
+
const fullPrompt = buildArenaPrompt({
|
|
320
|
+
brief: taskText,
|
|
321
|
+
cwd: agentWorkdir,
|
|
322
|
+
deckPath: deckPath,
|
|
323
|
+
outputDir: agentWorkdir,
|
|
324
|
+
})
|
|
310
325
|
const agentResult = await agent.spawn({
|
|
311
326
|
cwd: agentWorkdir,
|
|
312
|
-
brief:
|
|
327
|
+
brief: fullPrompt,
|
|
313
328
|
timeoutMs: Number(opts.timeout ?? 120000),
|
|
314
329
|
})
|
|
315
330
|
|
|
@@ -320,7 +335,7 @@ async function singleRun(args: string[]) {
|
|
|
320
335
|
// Copy agent-produced files to outDir
|
|
321
336
|
const { cpSync, readdirSync, existsSync: es3 } = await import('node:fs')
|
|
322
337
|
if (es3(agentWorkdir)) {
|
|
323
|
-
const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock'])
|
|
338
|
+
const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock', 'AGENTS.md'])
|
|
324
339
|
try {
|
|
325
340
|
const entries = readdirSync(agentWorkdir)
|
|
326
341
|
const plan = buildCopyPlan(agentWorkdir, outDir, entries, skipSet)
|
package/src/runner.ts
CHANGED
|
@@ -52,6 +52,47 @@ function resolveJudgeText(toml: ArenaToml, configDir?: string): string | null {
|
|
|
52
52
|
return null
|
|
53
53
|
}
|
|
54
54
|
|
|
55
|
+
// ── Prompt template (IoC: brief = variable, template = fixed contract) ────
|
|
56
|
+
|
|
57
|
+
export function buildArenaPrompt(opts: {
|
|
58
|
+
brief: string
|
|
59
|
+
cwd: string
|
|
60
|
+
deckPath: string
|
|
61
|
+
outputDir?: string
|
|
62
|
+
preflightReport?: string
|
|
63
|
+
}): string {
|
|
64
|
+
const out = opts.outputDir ?? opts.cwd
|
|
65
|
+
const lines = [
|
|
66
|
+
'You are running an arena evaluation cell.',
|
|
67
|
+
'',
|
|
68
|
+
`CWD: ${opts.cwd}`,
|
|
69
|
+
`Deck: ${opts.deckPath}`,
|
|
70
|
+
`Produce output to: ${out}/`,
|
|
71
|
+
'',
|
|
72
|
+
'MANDATORY — write decision-log.jsonl to the output directory.',
|
|
73
|
+
'Each line is one JSON object with: t (seconds elapsed),',
|
|
74
|
+
'phase (setup/content/design/output), decision (what you chose),',
|
|
75
|
+
'reason (why). This is your decision trail — the only way the',
|
|
76
|
+
'orchestrator can understand your reasoning chain.',
|
|
77
|
+
'',
|
|
78
|
+
'Example:',
|
|
79
|
+
'{"t":0,"phase":"setup","decision":"selected Golden Hour palette","reason":"warm tones match baking theme"}',
|
|
80
|
+
'{"t":12,"phase":"content","decision":"6 science topics","reason":"requires chemistry depth"}',
|
|
81
|
+
'',
|
|
82
|
+
'ROBUSTNESS — If any command or script fails, read the error output, fix the issue, and retry.',
|
|
83
|
+
'Do not stop on the first error. Ensure all required output files exist before finishing.',
|
|
84
|
+
'',
|
|
85
|
+
'TOOLS — Use the skills already linked in .claude/skills/ (check with `ls .claude/skills/`).',
|
|
86
|
+
'They are available and tested. Only write alternative scripts if the linked skills explicitly',
|
|
87
|
+
'cannot handle the task.',
|
|
88
|
+
]
|
|
89
|
+
if (opts.preflightReport) {
|
|
90
|
+
lines.push('', 'Preflight:', opts.preflightReport)
|
|
91
|
+
}
|
|
92
|
+
lines.push('', 'TASK:', opts.brief)
|
|
93
|
+
return lines.join('\n')
|
|
94
|
+
}
|
|
95
|
+
|
|
55
96
|
// ── Plan formatting ───────────────────────────────────────────────────────
|
|
56
97
|
|
|
57
98
|
export function formatPlanOutput(plan: ExecutionPlan): string[] {
|
|
@@ -140,10 +181,10 @@ export async function runArenaFromToml(opts: {
|
|
|
140
181
|
writeFileSync(join(workDir, 'AGENTS.md'), [
|
|
141
182
|
'# Arena Test Environment',
|
|
142
183
|
`**Side**: ${cell.side}`, `**Player**: ${cell.player}`, `**Run**: ${cell.run}`,
|
|
143
|
-
'## Task', '', taskText,
|
|
144
184
|
'## How This Works',
|
|
145
185
|
'- Isolated arena test directory. Skills in skill-deck.toml, linked via deck link.',
|
|
146
186
|
'- Complete the task using available skills. Output to this directory.',
|
|
187
|
+
'- MANDATORY: write decision-log.jsonl (see prompt for schema).',
|
|
147
188
|
].join('\n'))
|
|
148
189
|
const linkProc = Bun.spawn(
|
|
149
190
|
['bunx', '@lythos/skill-deck', 'link'],
|
|
@@ -156,9 +197,15 @@ export async function runArenaFromToml(opts: {
|
|
|
156
197
|
|
|
157
198
|
// Direct agent.spawn (no parseAgentMd, no AgentScenario)
|
|
158
199
|
const agent = useAgent(resolvePlayer(cell.player))
|
|
200
|
+
const fullPrompt = buildArenaPrompt({
|
|
201
|
+
brief: taskText,
|
|
202
|
+
cwd: workDir,
|
|
203
|
+
deckPath: cell.deck,
|
|
204
|
+
outputDir: workDir,
|
|
205
|
+
})
|
|
159
206
|
const agentResult = await agent.spawn({
|
|
160
207
|
cwd: workDir,
|
|
161
|
-
brief:
|
|
208
|
+
brief: fullPrompt,
|
|
162
209
|
timeoutMs: 300000,
|
|
163
210
|
})
|
|
164
211
|
|