@lythos/skill-arena 0.13.1 → 0.13.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -10
- package/package.json +5 -5
- package/src/cli.ts +20 -6
- package/src/runner.ts +49 -2
package/README.md
CHANGED
|
@@ -49,45 +49,59 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
|
|
|
49
49
|
```bash
|
|
50
50
|
bun add -d @lythos/skill-arena
|
|
51
51
|
# or use directly
|
|
52
|
-
bunx @lythos/skill-arena@0.13.
|
|
52
|
+
bunx @lythos/skill-arena@0.13.3 <command>
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
## Quick Start
|
|
56
56
|
|
|
57
57
|
```bash
|
|
58
|
-
# Single: test a deck with one agent
|
|
59
|
-
bunx @lythos/skill-arena@
|
|
58
|
+
# Single: test a deck with one agent (most common)
|
|
59
|
+
bunx @lythos/skill-arena@latest single \
|
|
60
|
+
--deck ./examples/decks/scout.toml \
|
|
61
|
+
--brief "Generate auth flow diagram" \
|
|
62
|
+
--player kimi \
|
|
63
|
+
--timeout 300000 \
|
|
64
|
+
--out ./output
|
|
65
|
+
|
|
66
|
+
# Single with remote deck (URL auto-fetched)
|
|
67
|
+
bunx @lythos/skill-arena@latest single \
|
|
60
68
|
--deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \
|
|
61
|
-
--brief "Generate auth flow diagram"
|
|
69
|
+
--brief "Generate auth flow diagram" \
|
|
70
|
+
--out ./output
|
|
62
71
|
|
|
63
72
|
# Vs: compare multiple decks side by side
|
|
64
73
|
curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/research-compare/arena.toml > arena.toml
|
|
65
|
-
bunx @lythos/skill-arena@
|
|
74
|
+
bunx @lythos/skill-arena@latest vs --config ./arena.toml
|
|
66
75
|
```
|
|
67
76
|
|
|
77
|
+
**Default behavior:**
|
|
78
|
+
- Agent runs in an isolated `/tmp` workdir (no workspace pollution)
|
|
79
|
+
- All artifacts are copied to `--out` after completion
|
|
80
|
+
- Prompt template injects fixed contract (decision-log, robustness, tool preference) + your brief as variable
|
|
81
|
+
|
|
68
82
|
## Commands
|
|
69
83
|
|
|
70
84
|
### Declarative mode (k8s-style, recommended)
|
|
71
85
|
|
|
72
86
|
```bash
|
|
73
87
|
# Print execution plan without running
|
|
74
|
-
bunx @lythos/skill-arena@0.13.
|
|
88
|
+
bunx @lythos/skill-arena@0.13.3 vs --config arena.toml --dry-run
|
|
75
89
|
|
|
76
90
|
# Execute with per-side runs_per_side and statistical aggregation
|
|
77
|
-
bunx @lythos/skill-arena@0.13.
|
|
91
|
+
bunx @lythos/skill-arena@0.13.3 vs --config arena.toml
|
|
78
92
|
```
|
|
79
93
|
|
|
80
94
|
### Scaffold mode (legacy, manual execution)
|
|
81
95
|
|
|
82
96
|
```
|
|
83
|
-
bunx @lythos/skill-arena@0.13.
|
|
97
|
+
bunx @lythos/skill-arena@0.13.3 scaffold --task "Generate auth flow diagram" \
|
|
84
98
|
--decks https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml,https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/documents.toml
|
|
85
99
|
```
|
|
86
100
|
|
|
87
101
|
### Viz
|
|
88
102
|
|
|
89
103
|
```bash
|
|
90
|
-
bunx @lythos/skill-arena@0.13.
|
|
104
|
+
bunx @lythos/skill-arena@0.13.3 viz runs/arena-<id>/
|
|
91
105
|
```
|
|
92
106
|
|
|
93
107
|
## Skill Documentation
|
|
@@ -101,7 +115,7 @@ The agent-visible **Skill** layer documentation is here:
|
|
|
101
115
|
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
|
|
102
116
|
|
|
103
117
|
```
|
|
104
|
-
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.13.
|
|
118
|
+
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.13.3 ...
|
|
105
119
|
Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
106
120
|
Output (skills/<name>/) → git commit → agent-visible skill
|
|
107
121
|
```
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lythos/skill-arena",
|
|
3
|
-
"version": "0.13.1",
|
|
3
|
+
"version": "0.13.3",
|
|
4
4
|
"description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-agent",
|
|
@@ -42,13 +42,13 @@
|
|
|
42
42
|
"bun": ">=1.0.0"
|
|
43
43
|
},
|
|
44
44
|
"dependencies": {
|
|
45
|
-
"@lythos/cold-pool": "^0.13.
|
|
46
|
-
"@lythos/infra": "^0.13.
|
|
47
|
-
"@lythos/test-utils": "^0.13.
|
|
45
|
+
"@lythos/cold-pool": "^0.13.3",
|
|
46
|
+
"@lythos/infra": "^0.13.3",
|
|
47
|
+
"@lythos/test-utils": "^0.13.3",
|
|
48
48
|
"zod": "^3.24.0",
|
|
49
49
|
"zod-to-json-schema": "^3.25.2"
|
|
50
50
|
},
|
|
51
51
|
"optionalDependencies": {
|
|
52
|
-
"@lythos/agent-adapter-claude-sdk": "^0.13.
|
|
52
|
+
"@lythos/agent-adapter-claude-sdk": "^0.13.3"
|
|
53
53
|
}
|
|
54
54
|
}
|
package/src/cli.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#!/usr/bin/env bun
|
|
2
2
|
import { writeFileSync, readFileSync, mkdirSync, existsSync, realpathSync } from 'node:fs'
|
|
3
3
|
import { join, resolve } from 'node:path'
|
|
4
|
-
import { homedir } from 'node:os'
|
|
4
|
+
import { homedir, tmpdir } from 'node:os'
|
|
5
5
|
import { ZodError } from 'zod'
|
|
6
|
-
import { formatPlanOutput, type ArenaResult } from './runner'
|
|
6
|
+
import { formatPlanOutput, type ArenaResult, buildArenaPrompt } from './runner'
|
|
7
7
|
import { parseArenaToml, buildExecutionPlan } from './arena-toml'
|
|
8
8
|
import { buildCopyPlan, parseDeckSkills } from './preflight'
|
|
9
9
|
import { checkSkillExistence, formatSkillWarnings, resolveColdPoolDir } from './preflight'
|
|
@@ -264,9 +264,17 @@ async function singleRun(args: string[]) {
|
|
|
264
264
|
else console.log(`📋 brief: ${opts.brief!.slice(0, 60)}...`)
|
|
265
265
|
|
|
266
266
|
// Setup workdir
|
|
267
|
-
const agentWorkdir = join(
|
|
267
|
+
const agentWorkdir = join(tmpdir(), `arena-single-${Date.now()}`)
|
|
268
268
|
mkdirSync(agentWorkdir, { recursive: true })
|
|
269
269
|
writeFileSync(join(agentWorkdir, 'skill-deck.toml'), readFileSync(deckPath, 'utf-8'))
|
|
270
|
+
writeFileSync(join(agentWorkdir, 'AGENTS.md'), [
|
|
271
|
+
'# Arena Test Environment',
|
|
272
|
+
`**Mode**: single`,
|
|
273
|
+
'## How This Works',
|
|
274
|
+
'- Isolated arena test directory. Skills in skill-deck.toml, linked via deck link.',
|
|
275
|
+
'- Complete the task using available skills. Output to this directory.',
|
|
276
|
+
'- MANDATORY: write decision-log.jsonl (see prompt for schema).',
|
|
277
|
+
].join('\n'))
|
|
270
278
|
|
|
271
279
|
const deckRaw = readFileSync(join(agentWorkdir, 'skill-deck.toml'), 'utf-8')
|
|
272
280
|
let deckParsed: Record<string, any> = {}
|
|
@@ -307,10 +315,16 @@ async function singleRun(args: string[]) {
|
|
|
307
315
|
console.warn('⚠️ Could not check skill existence:', e instanceof Error ? e.message : e)
|
|
308
316
|
}
|
|
309
317
|
|
|
310
|
-
//
|
|
318
|
+
// Template injection: brief is the {task} variable, template carries fixed contract
|
|
319
|
+
const fullPrompt = buildArenaPrompt({
|
|
320
|
+
brief: taskText,
|
|
321
|
+
cwd: agentWorkdir,
|
|
322
|
+
deckPath: deckPath,
|
|
323
|
+
outputDir: agentWorkdir,
|
|
324
|
+
})
|
|
311
325
|
const agentResult = await agent.spawn({
|
|
312
326
|
cwd: agentWorkdir,
|
|
313
|
-
brief:
|
|
327
|
+
brief: fullPrompt,
|
|
314
328
|
timeoutMs: Number(opts.timeout ?? 120000),
|
|
315
329
|
})
|
|
316
330
|
|
|
@@ -321,7 +335,7 @@ async function singleRun(args: string[]) {
|
|
|
321
335
|
// Copy agent-produced files to outDir
|
|
322
336
|
const { cpSync, readdirSync, existsSync: es3 } = await import('node:fs')
|
|
323
337
|
if (es3(agentWorkdir)) {
|
|
324
|
-
const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock'])
|
|
338
|
+
const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock', 'AGENTS.md'])
|
|
325
339
|
try {
|
|
326
340
|
const entries = readdirSync(agentWorkdir)
|
|
327
341
|
const plan = buildCopyPlan(agentWorkdir, outDir, entries, skipSet)
|
package/src/runner.ts
CHANGED
|
@@ -52,6 +52,47 @@ function resolveJudgeText(toml: ArenaToml, configDir?: string): string | null {
|
|
|
52
52
|
return null
|
|
53
53
|
}
|
|
54
54
|
|
|
55
|
+
// ── Prompt template (IoC: brief = variable, template = fixed contract) ────
|
|
56
|
+
|
|
57
|
+
export function buildArenaPrompt(opts: {
|
|
58
|
+
brief: string
|
|
59
|
+
cwd: string
|
|
60
|
+
deckPath: string
|
|
61
|
+
outputDir?: string
|
|
62
|
+
preflightReport?: string
|
|
63
|
+
}): string {
|
|
64
|
+
const out = opts.outputDir ?? opts.cwd
|
|
65
|
+
const lines = [
|
|
66
|
+
'You are running an arena evaluation cell.',
|
|
67
|
+
'',
|
|
68
|
+
`CWD: ${opts.cwd}`,
|
|
69
|
+
`Deck: ${opts.deckPath}`,
|
|
70
|
+
`Produce output to: ${out}/`,
|
|
71
|
+
'',
|
|
72
|
+
'MANDATORY — write decision-log.jsonl to the output directory.',
|
|
73
|
+
'Each line is one JSON object with: t (seconds elapsed),',
|
|
74
|
+
'phase (setup/content/design/output), decision (what you chose),',
|
|
75
|
+
'reason (why). This is your decision trail — the only way the',
|
|
76
|
+
'orchestrator can understand your reasoning chain.',
|
|
77
|
+
'',
|
|
78
|
+
'Example:',
|
|
79
|
+
'{"t":0,"phase":"setup","decision":"selected Golden Hour palette","reason":"warm tones match baking theme"}',
|
|
80
|
+
'{"t":12,"phase":"content","decision":"6 science topics","reason":"requires chemistry depth"}',
|
|
81
|
+
'',
|
|
82
|
+
'ROBUSTNESS — If any command or script fails, read the error output, fix the issue, and retry.',
|
|
83
|
+
'Do not stop on the first error. Ensure all required output files exist before finishing.',
|
|
84
|
+
'',
|
|
85
|
+
'TOOLS — Use the skills already linked in .claude/skills/ (check with `ls .claude/skills/`).',
|
|
86
|
+
'They are available and tested. Only write alternative scripts if the linked skills explicitly',
|
|
87
|
+
'cannot handle the task.',
|
|
88
|
+
]
|
|
89
|
+
if (opts.preflightReport) {
|
|
90
|
+
lines.push('', 'Preflight:', opts.preflightReport)
|
|
91
|
+
}
|
|
92
|
+
lines.push('', 'TASK:', opts.brief)
|
|
93
|
+
return lines.join('\n')
|
|
94
|
+
}
|
|
95
|
+
|
|
55
96
|
// ── Plan formatting ───────────────────────────────────────────────────────
|
|
56
97
|
|
|
57
98
|
export function formatPlanOutput(plan: ExecutionPlan): string[] {
|
|
@@ -140,10 +181,10 @@ export async function runArenaFromToml(opts: {
|
|
|
140
181
|
writeFileSync(join(workDir, 'AGENTS.md'), [
|
|
141
182
|
'# Arena Test Environment',
|
|
142
183
|
`**Side**: ${cell.side}`, `**Player**: ${cell.player}`, `**Run**: ${cell.run}`,
|
|
143
|
-
'## Task', '', taskText,
|
|
144
184
|
'## How This Works',
|
|
145
185
|
'- Isolated arena test directory. Skills in skill-deck.toml, linked via deck link.',
|
|
146
186
|
'- Complete the task using available skills. Output to this directory.',
|
|
187
|
+
'- MANDATORY: write decision-log.jsonl (see prompt for schema).',
|
|
147
188
|
].join('\n'))
|
|
148
189
|
const linkProc = Bun.spawn(
|
|
149
190
|
['bunx', '@lythos/skill-deck', 'link'],
|
|
@@ -156,9 +197,15 @@ export async function runArenaFromToml(opts: {
|
|
|
156
197
|
|
|
157
198
|
// Direct agent.spawn (no parseAgentMd, no AgentScenario)
|
|
158
199
|
const agent = useAgent(resolvePlayer(cell.player))
|
|
200
|
+
const fullPrompt = buildArenaPrompt({
|
|
201
|
+
brief: taskText,
|
|
202
|
+
cwd: workDir,
|
|
203
|
+
deckPath: cell.deck,
|
|
204
|
+
outputDir: workDir,
|
|
205
|
+
})
|
|
159
206
|
const agentResult = await agent.spawn({
|
|
160
207
|
cwd: workDir,
|
|
161
|
-
brief:
|
|
208
|
+
brief: fullPrompt,
|
|
162
209
|
timeoutMs: 300000,
|
|
163
210
|
})
|
|
164
211
|
|