@lythos/skill-arena 0.13.0 → 0.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -49,45 +49,59 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
49
49
  ```bash
50
50
  bun add -d @lythos/skill-arena
51
51
  # or use directly
52
- bunx @lythos/skill-arena@0.13.0 <command>
52
+ bunx @lythos/skill-arena@0.13.2 <command>
53
53
  ```
54
54
 
55
55
  ## Quick Start
56
56
 
57
57
  ```bash
58
- # Single: test a deck with one agent
59
- bunx @lythos/skill-arena@0.13.0 single \
58
+ # Single: test a deck with one agent (most common)
59
+ bunx @lythos/skill-arena@latest single \
60
+ --deck ./examples/decks/scout.toml \
61
+ --brief "Generate auth flow diagram" \
62
+ --player kimi \
63
+ --timeout 300000 \
64
+ --out ./output
65
+
66
+ # Single with remote deck (URL auto-fetched)
67
+ bunx @lythos/skill-arena@latest single \
60
68
  --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \
61
- --brief "Generate auth flow diagram"
69
+ --brief "Generate auth flow diagram" \
70
+ --out ./output
62
71
 
63
72
  # Vs: compare multiple decks side by side
64
73
  curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/research-compare/arena.toml > arena.toml
65
- bunx @lythos/skill-arena@0.13.0 vs --config ./arena.toml
74
+ bunx @lythos/skill-arena@latest vs --config ./arena.toml
66
75
  ```
67
76
 
77
+ **Default behavior:**
78
+ - The agent runs in an isolated workdir under the OS temp directory (e.g. `/tmp`), so your workspace is not polluted
79
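+ - Agent-produced artifacts are copied to `--out` after completion (arena scaffolding such as `.claude`, `skill-deck.toml`, `skill-deck.lock`, and `AGENTS.md` is skipped)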
80
+ - The prompt template injects a fixed contract (decision-log, robustness, tool preference) and inserts your brief as the task variable
81
+
68
82
  ## Commands
69
83
 
70
84
  ### Declarative mode (k8s-style, recommended)
71
85
 
72
86
  ```bash
73
87
  # Print execution plan without running
74
- bunx @lythos/skill-arena@0.13.0 vs --config arena.toml --dry-run
88
+ bunx @lythos/skill-arena@0.13.2 vs --config arena.toml --dry-run
75
89
 
76
90
  # Execute with per-side runs_per_side and statistical aggregation
77
- bunx @lythos/skill-arena@0.13.0 vs --config arena.toml
91
+ bunx @lythos/skill-arena@0.13.2 vs --config arena.toml
78
92
  ```
79
93
 
80
94
  ### Scaffold mode (legacy, manual execution)
81
95
 
82
96
  ```
83
- bunx @lythos/skill-arena@0.13.0 scaffold --task "Generate auth flow diagram" \
97
+ bunx @lythos/skill-arena@0.13.2 scaffold --task "Generate auth flow diagram" \
84
98
  --decks https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml,https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/documents.toml
85
99
  ```
86
100
 
87
101
  ### Viz
88
102
 
89
103
  ```bash
90
- bunx @lythos/skill-arena@0.13.0 viz runs/arena-<id>/
104
+ bunx @lythos/skill-arena@0.13.2 viz runs/arena-<id>/
91
105
  ```
92
106
 
93
107
  ## Skill Documentation
@@ -101,7 +115,7 @@ The agent-visible **Skill** layer documentation is here:
101
115
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
102
116
 
103
117
  ```
104
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.13.0 ...
118
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.13.2 ...
105
119
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
106
120
  Output (skills/<name>/) → git commit → agent-visible skill
107
121
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.13.0",
3
+ "version": "0.13.2",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -42,13 +42,13 @@
42
42
  "bun": ">=1.0.0"
43
43
  },
44
44
  "dependencies": {
45
- "@lythos/cold-pool": "^0.13.0",
46
- "@lythos/infra": "^0.13.0",
47
- "@lythos/test-utils": "^0.13.0",
45
+ "@lythos/cold-pool": "^0.13.2",
46
+ "@lythos/infra": "^0.13.2",
47
+ "@lythos/test-utils": "^0.13.2",
48
48
  "zod": "^3.24.0",
49
49
  "zod-to-json-schema": "^3.25.2"
50
50
  },
51
51
  "optionalDependencies": {
52
- "@lythos/agent-adapter-claude-sdk": "^0.13.0"
52
+ "@lythos/agent-adapter-claude-sdk": "^0.13.2"
53
53
  }
54
54
  }
package/src/cli.ts CHANGED
@@ -1,8 +1,9 @@
1
+ #!/usr/bin/env bun
1
2
  import { writeFileSync, readFileSync, mkdirSync, existsSync, realpathSync } from 'node:fs'
2
3
  import { join, resolve } from 'node:path'
3
- import { homedir } from 'node:os'
4
+ import { homedir, tmpdir } from 'node:os'
4
5
  import { ZodError } from 'zod'
5
- import { formatPlanOutput, type ArenaResult } from './runner'
6
+ import { formatPlanOutput, type ArenaResult, buildArenaPrompt } from './runner'
6
7
  import { parseArenaToml, buildExecutionPlan } from './arena-toml'
7
8
  import { buildCopyPlan, parseDeckSkills } from './preflight'
8
9
  import { checkSkillExistence, formatSkillWarnings, resolveColdPoolDir } from './preflight'
@@ -263,9 +264,17 @@ async function singleRun(args: string[]) {
263
264
  else console.log(`📋 brief: ${opts.brief!.slice(0, 60)}...`)
264
265
 
265
266
  // Setup workdir
266
- const agentWorkdir = join(process.cwd(), `arena-single-${Date.now()}`)
267
+ const agentWorkdir = join(tmpdir(), `arena-single-${Date.now()}`)
267
268
  mkdirSync(agentWorkdir, { recursive: true })
268
269
  writeFileSync(join(agentWorkdir, 'skill-deck.toml'), readFileSync(deckPath, 'utf-8'))
270
+ writeFileSync(join(agentWorkdir, 'AGENTS.md'), [
271
+ '# Arena Test Environment',
272
+ `**Mode**: single`,
273
+ '## How This Works',
274
+ '- Isolated arena test directory. Skills in skill-deck.toml, linked via deck link.',
275
+ '- Complete the task using available skills. Output to this directory.',
276
+ '- MANDATORY: write decision-log.jsonl (see prompt for schema).',
277
+ ].join('\n'))
269
278
 
270
279
  const deckRaw = readFileSync(join(agentWorkdir, 'skill-deck.toml'), 'utf-8')
271
280
  let deckParsed: Record<string, any> = {}
@@ -306,10 +315,16 @@ async function singleRun(args: string[]) {
306
315
  console.warn('⚠️ Could not check skill existence:', e instanceof Error ? e.message : e)
307
316
  }
308
317
 
309
- // Direct agent.spawn natural-language task text, no parsing
318
+ // Template injection: brief is the {task} variable, template carries fixed contract
319
+ const fullPrompt = buildArenaPrompt({
320
+ brief: taskText,
321
+ cwd: agentWorkdir,
322
+ deckPath: deckPath,
323
+ outputDir: agentWorkdir,
324
+ })
310
325
  const agentResult = await agent.spawn({
311
326
  cwd: agentWorkdir,
312
- brief: taskText,
327
+ brief: fullPrompt,
313
328
  timeoutMs: Number(opts.timeout ?? 120000),
314
329
  })
315
330
 
@@ -320,7 +335,7 @@ async function singleRun(args: string[]) {
320
335
  // Copy agent-produced files to outDir
321
336
  const { cpSync, readdirSync, existsSync: es3 } = await import('node:fs')
322
337
  if (es3(agentWorkdir)) {
323
- const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock'])
338
+ const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock', 'AGENTS.md'])
324
339
  try {
325
340
  const entries = readdirSync(agentWorkdir)
326
341
  const plan = buildCopyPlan(agentWorkdir, outDir, entries, skipSet)
package/src/runner.ts CHANGED
@@ -52,6 +52,47 @@ function resolveJudgeText(toml: ArenaToml, configDir?: string): string | null {
52
52
  return null
53
53
  }
54
54
 
55
+ // ── Prompt template (IoC: brief = variable, template = fixed contract) ────
56
+
57
+ export function buildArenaPrompt(opts: {
58
+ brief: string
59
+ cwd: string
60
+ deckPath: string
61
+ outputDir?: string
62
+ preflightReport?: string
63
+ }): string {
64
+ const out = opts.outputDir ?? opts.cwd
65
+ const lines = [
66
+ 'You are running an arena evaluation cell.',
67
+ '',
68
+ `CWD: ${opts.cwd}`,
69
+ `Deck: ${opts.deckPath}`,
70
+ `Produce output to: ${out}/`,
71
+ '',
72
+ 'MANDATORY — write decision-log.jsonl to the output directory.',
73
+ 'Each line is one JSON object with: t (seconds elapsed),',
74
+ 'phase (setup/content/design/output), decision (what you chose),',
75
+ 'reason (why). This is your decision trail — the only way the',
76
+ 'orchestrator can understand your reasoning chain.',
77
+ '',
78
+ 'Example:',
79
+ '{"t":0,"phase":"setup","decision":"selected Golden Hour palette","reason":"warm tones match baking theme"}',
80
+ '{"t":12,"phase":"content","decision":"6 science topics","reason":"requires chemistry depth"}',
81
+ '',
82
+ 'ROBUSTNESS — If any command or script fails, read the error output, fix the issue, and retry.',
83
+ 'Do not stop on the first error. Ensure all required output files exist before finishing.',
84
+ '',
85
+ 'TOOLS — Use the skills already linked in .claude/skills/ (check with `ls .claude/skills/`).',
86
+ 'They are available and tested. Only write alternative scripts if the linked skills explicitly',
87
+ 'cannot handle the task.',
88
+ ]
89
+ if (opts.preflightReport) {
90
+ lines.push('', 'Preflight:', opts.preflightReport)
91
+ }
92
+ lines.push('', 'TASK:', opts.brief)
93
+ return lines.join('\n')
94
+ }
95
+
55
96
  // ── Plan formatting ───────────────────────────────────────────────────────
56
97
 
57
98
  export function formatPlanOutput(plan: ExecutionPlan): string[] {
@@ -140,10 +181,10 @@ export async function runArenaFromToml(opts: {
140
181
  writeFileSync(join(workDir, 'AGENTS.md'), [
141
182
  '# Arena Test Environment',
142
183
  `**Side**: ${cell.side}`, `**Player**: ${cell.player}`, `**Run**: ${cell.run}`,
143
- '## Task', '', taskText,
144
184
  '## How This Works',
145
185
  '- Isolated arena test directory. Skills in skill-deck.toml, linked via deck link.',
146
186
  '- Complete the task using available skills. Output to this directory.',
187
+ '- MANDATORY: write decision-log.jsonl (see prompt for schema).',
147
188
  ].join('\n'))
148
189
  const linkProc = Bun.spawn(
149
190
  ['bunx', '@lythos/skill-deck', 'link'],
@@ -156,9 +197,15 @@ export async function runArenaFromToml(opts: {
156
197
 
157
198
  // Direct agent.spawn (no parseAgentMd, no AgentScenario)
158
199
  const agent = useAgent(resolvePlayer(cell.player))
200
+ const fullPrompt = buildArenaPrompt({
201
+ brief: taskText,
202
+ cwd: workDir,
203
+ deckPath: cell.deck,
204
+ outputDir: workDir,
205
+ })
159
206
  const agentResult = await agent.spawn({
160
207
  cwd: workDir,
161
- brief: taskText,
208
+ brief: fullPrompt,
162
209
  timeoutMs: 300000,
163
210
  })
164
211