@lythos/skill-arena 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -49,26 +49,26 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
49
49
  ```bash
50
50
  bun add -d @lythos/skill-arena
51
51
  # or use directly
52
- bunx @lythos/skill-arena@0.9.21 <command>
52
+ bunx @lythos/skill-arena@0.9.23 <command>
53
53
  ```
54
54
 
55
55
  ## Quick Start
56
56
 
57
57
  ```bash
58
58
  # Mode 1: Compare two skills on the same task
59
- bunx @lythos/skill-arena@0.9.21 \
59
+ bunx @lythos/skill-arena@0.9.23 \
60
60
  --task "Generate auth flow diagram" \
61
61
  --skills "design-doc-mermaid,mermaid-tools" \
62
62
  --criteria "syntax,context,token"
63
63
 
64
64
  # Mode 2: Compare full deck configurations
65
- bunx @lythos/skill-arena@0.9.21 \
65
+ bunx @lythos/skill-arena@0.9.23 \
66
66
  --task "Generate auth flow diagram" \
67
67
  --decks "./decks/minimal.toml,./decks/rich.toml" \
68
68
  --criteria "quality,token,maintainability"
69
69
 
70
70
  # Visualize results
71
- bunx @lythos/skill-arena@0.9.21 viz tmp/arena-<id>/
71
+ bunx @lythos/skill-arena@0.9.23 viz tmp/arena-<id>/
72
72
  ```
73
73
 
74
74
  ## Commands
@@ -77,16 +77,16 @@ bunx @lythos/skill-arena@0.9.21 viz tmp/arena-<id>/
77
77
 
78
78
  ```bash
79
79
  # Print execution plan without running
80
- bunx @lythos/skill-arena@0.9.21 run --config arena.toml --dry-run
80
+ bunx @lythos/skill-arena@0.9.23 run --config arena.toml --dry-run
81
81
 
82
82
  # Execute with per-side runs_per_side and statistical aggregation
83
- bunx @lythos/skill-arena@0.9.21 run --config arena.toml
83
+ bunx @lythos/skill-arena@0.9.23 run --config arena.toml
84
84
  ```
85
85
 
86
86
  ### CLI-flag mode (backward compat)
87
87
 
88
88
  ```
89
- bunx @lythos/skill-arena@0.9.21 run \
89
+ bunx @lythos/skill-arena@0.9.23 run \
90
90
  --task ./TASK-arena.md \
91
91
  --players ./players/claude.toml \
92
92
  --decks ./decks/run-01.toml,./decks/run-02.toml \
@@ -96,13 +96,13 @@ bunx @lythos/skill-arena@0.9.21 run \
96
96
  ### Scaffold mode (legacy, manual execution)
97
97
 
98
98
  ```
99
- bunx @lythos/skill-arena@0.9.21 scaffold --task "..." --skills a,b
99
+ bunx @lythos/skill-arena@0.9.23 scaffold --task "..." --skills a,b
100
100
  ```
101
101
 
102
102
  ### Viz
103
103
 
104
104
  ```bash
105
- bunx @lythos/skill-arena@0.9.21 viz runs/arena-<id>/
105
+ bunx @lythos/skill-arena@0.9.23 viz runs/arena-<id>/
106
106
  ```
107
107
 
108
108
  ## Skill Documentation
@@ -116,7 +116,7 @@ The agent-visible **Skill** layer documentation is here:
116
116
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
117
117
 
118
118
  ```
119
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.21 ...
119
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.23 ...
120
120
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
121
121
  Output (skills/<name>/) → git commit → agent-visible skill
122
122
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.21",
3
+ "version": "0.9.23",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -40,5 +40,8 @@
40
40
  "@lythos/test-utils": "^0.9.1",
41
41
  "zod": "^3.24.0",
42
42
  "zod-to-json-schema": "^3.25.2"
43
+ },
44
+ "optionalDependencies": {
45
+ "@lythos/agent-adapter-claude-sdk": "workspace:*"
43
46
  }
44
47
  }
package/src/cli.ts CHANGED
@@ -9,6 +9,14 @@ import {
9
9
  existsSync, mkdirSync, writeFileSync, readFileSync,
10
10
  } from 'node:fs'
11
11
  import { join, resolve, basename } from 'node:path'
12
+ import {
13
+ parseDeckSkills,
14
+ checkSkillExistence,
15
+ validateLinkResult,
16
+ buildCopyPlan,
17
+ resolveColdPoolDir,
18
+ formatSkillWarnings,
19
+ } from './preflight'
12
20
 
13
21
  // ── 简单的 slugify ──────────────────────────────────────────
14
22
  function slugify(input: string): string {
@@ -88,8 +96,8 @@ async function agentRun(args: string[]) {
88
96
  console.error('❌ --deck <path> is required')
89
97
  process.exit(1)
90
98
  }
91
- if (!opts.task && !opts.brief) {
92
- console.error('❌ --task <path> or --brief "<prompt>" is required')
99
+ if (!opts.task && (!opts.brief || !opts.brief.trim())) {
100
+ console.error('❌ --task <path> or --brief "<prompt>" is required and cannot be empty')
93
101
  process.exit(1)
94
102
  }
95
103
 
@@ -97,39 +105,9 @@ async function agentRun(args: string[]) {
97
105
  const deckPath = resolve(opts.deck)
98
106
  if (!existsSync(deckPath)) { console.error(`❌ Deck file not found: ${deckPath}`); process.exit(1) }
99
107
 
100
- // Resolve task: either from file, or create temp task from --brief
101
- let taskPath: string
102
- if (opts.task) {
103
- taskPath = resolve(opts.task)
104
- if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}`); process.exit(1) }
105
- } else {
106
- const { mkdtempSync, writeFileSync } = await import('node:fs')
107
- const { tmpdir } = await import('node:os')
108
- const tmpDir = mkdtempSync(join(tmpdir(), 'arena-brief-'))
109
- taskPath = join(tmpDir, 'TASK.md')
110
- const briefTask = `---
111
- name: ad-hoc task
112
- description: ${opts.brief!.slice(0, 80)}
113
- timeout: 120000
114
- ---
115
-
116
- ## Given
117
- - You are an AI agent with the skills declared in the deck
118
-
119
- ## When
120
- ${opts.brief}
121
-
122
- ## Then
123
- - Write your output to output.md
124
- - The output should be complete and well-structured
125
-
126
- ## Judge
127
- Evaluate whether the output is complete, accurate, and well-structured.
128
- `
129
- writeFileSync(taskPath, briefTask, 'utf-8')
130
- }
131
-
132
108
  const { useAgent } = await import('@lythos/test-utils/agents')
109
+ // Optional: register claude-sdk adapter if the package is installed
110
+ try { await import('@lythos/agent-adapter-claude-sdk') } catch { /* package not installed */ }
133
111
  const { runAgentScenario } = await import('@lythos/test-utils/agent-bdd')
134
112
  const { resolvePlayer } = await import('./player')
135
113
  const { readFileSync, writeFileSync, mkdirSync } = await import('node:fs')
@@ -139,27 +117,87 @@ Evaluate whether the output is complete, accurate, and well-structured.
139
117
  const outDir = opts.out ? resolve(opts.out) : join(process.cwd(), `agent-output-${new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19)}`)
140
118
  mkdirSync(outDir, { recursive: true })
141
119
 
120
+ // Resolve task: --brief builds scenario directly, --task reads .agent.md file
121
+ const scenarioOpt: Record<string, unknown> = {}
122
+ if (opts.task) {
123
+ const taskPath = resolve(opts.task)
124
+ if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}`); process.exit(1) }
125
+ scenarioOpt.scenarioPath = taskPath
126
+ } else {
127
+ scenarioOpt.scenario = {
128
+ name: 'ad-hoc task',
129
+ description: opts.brief!.slice(0, 80),
130
+ timeout: 120000,
131
+ given: { deck: {} },
132
+ when: opts.brief!,
133
+ then: ['Write your output to output.md', 'The output should be complete and well-structured'],
134
+ judge: 'Evaluate whether the output is complete, accurate, and well-structured.',
135
+ }
136
+ }
137
+
142
138
  console.log(`🤖 agent-run: ${player} × ${deckPath}`)
143
- console.log(`📋 task: ${taskPath}`)
139
+ if (opts.task) console.log(`📋 task: ${resolve(opts.task!)}`)
140
+ else console.log(`📋 brief: ${opts.brief!.slice(0, 60)}...`)
144
141
 
145
142
  let agentWorkdir = ''
146
143
  const result = await runAgentScenario({
147
- scenarioPath: taskPath,
144
+ ...scenarioOpt,
148
145
  agent,
149
146
  async setupWorkdir(_scenario, workdir) {
150
147
  agentWorkdir = workdir
151
148
  mkdirSync(workdir, { recursive: true })
152
149
  writeFileSync(join(workdir, 'skill-deck.toml'), readFileSync(deckPath, 'utf-8'))
153
150
 
154
- const linkProc = Bun.spawn(
155
- ['bunx', '@lythos/skill-deck', 'link'],
156
- { cwd: workdir, env: { ...process.env, HOME: process.env.HOME! } },
157
- )
158
- await linkProc.exited
151
+ // ── Pre-flight: deck link (skip if deck declares no skills) ──
152
+ const deckRaw = readFileSync(join(workdir, 'skill-deck.toml'), 'utf-8')
153
+ let deckParsed: Record<string, any> = {}
154
+ try { deckParsed = Bun.TOML.parse(deckRaw) as Record<string, any> } catch {}
155
+ const hasSkills = parseDeckSkills(deckParsed).length > 0
156
+
157
+ if (hasSkills) {
158
+ // Prefer local dev CLI over bunx (bunx needs tempdir write, blocked by some sandboxes)
159
+ const { existsSync: es2 } = await import('node:fs')
160
+ const localDeckCli = join(import.meta.dir, '..', '..', 'lythoskill-deck', 'src', 'cli.ts')
161
+ const linkCmd = es2(localDeckCli)
162
+ ? ['bun', localDeckCli, 'link']
163
+ : ['bunx', '@lythos/skill-deck', 'link']
164
+ const linkProc = Bun.spawn(linkCmd,
165
+ { cwd: workdir, env: { ...process.env, HOME: process.env.HOME! } },
166
+ )
167
+ await linkProc.exited
168
+ const linkStderr = await new Response(linkProc.stderr).text()
169
+ const linkResult = validateLinkResult(linkProc.exitCode, linkStderr)
170
+ if (!linkResult.ok) {
171
+ console.error(`❌ ${linkResult.error}`)
172
+ process.exit(1)
173
+ }
174
+ } else {
175
+ console.log('ℹ️ No skills declared in deck — skipping link')
176
+ }
177
+
178
+ // ── Pre-flight: skill existence check (reuses deckParsed from above) ─
179
+ const { existsSync: es } = await import('node:fs')
180
+ const { homedir: hd } = await import('node:os')
181
+ try {
182
+ const coldPoolDefault = join(hd(), '.agents', 'skill-repos')
183
+ const coldPoolDir = resolveColdPoolDir(
184
+ deckParsed?.deck?.cold_pool,
185
+ hd(),
186
+ coldPoolDefault
187
+ )
188
+
189
+ const skills = parseDeckSkills(deckParsed)
190
+ const checks = checkSkillExistence(skills, coldPoolDir, es)
191
+ for (const warning of formatSkillWarnings(checks)) {
192
+ console.warn(`⚠️ ${warning}`)
193
+ }
194
+ } catch (e) {
195
+ console.warn('⚠️ Could not check skill existence:', e instanceof Error ? e.message : e)
196
+ }
159
197
  },
160
198
  })
161
199
 
162
- // Copy agent output to outDir
200
+ // ── Copy agent output to outDir ──────────────────────────────────
163
201
  writeFileSync(join(outDir, 'agent-stdout.txt'), result.agentResult.stdout, 'utf-8')
164
202
  if (result.agentResult.stderr) writeFileSync(join(outDir, 'agent-stderr.txt'), result.agentResult.stderr, 'utf-8')
165
203
  if (result.verdict) writeFileSync(join(outDir, 'judge-verdict.json'), JSON.stringify(result.verdict, null, 2) + '\n', 'utf-8')
@@ -167,16 +205,31 @@ Evaluate whether the output is complete, accurate, and well-structured.
167
205
  // Copy all agent-produced files from workdir (output.md, output.docx, etc.)
168
206
  // Skip .claude/ (symlink dir) and deck artifacts. Recursive so docx/pdf work.
169
207
  if (agentWorkdir) {
170
- const { cpSync, readdirSync } = await import('node:fs')
171
- const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock'])
172
- try {
173
- for (const entry of readdirSync(agentWorkdir)) {
174
- if (skipSet.has(entry)) continue
175
- const src = join(agentWorkdir, entry)
176
- const dest = join(outDir, entry)
177
- try { cpSync(src, dest, { recursive: true }) } catch {}
208
+ const { cpSync, readdirSync, existsSync: es2 } = await import('node:fs')
209
+ if (!es2(agentWorkdir)) {
210
+ console.warn(`⚠️ Agent workdir vanished before copy: ${agentWorkdir}`)
211
+ } else {
212
+ const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock'])
213
+ try {
214
+ const entries = readdirSync(agentWorkdir)
215
+ const plan = buildCopyPlan(agentWorkdir, outDir, entries, skipSet)
216
+ for (const { src, dest, name } of plan) {
217
+ try {
218
+ cpSync(src, dest, { recursive: true })
219
+ } catch (e) {
220
+ console.warn(`⚠️ Failed to copy agent output: ${name} — ${e instanceof Error ? e.message : e}`)
221
+ }
222
+ }
223
+ } catch (e) {
224
+ console.warn(`⚠️ Failed to read agent workdir for copy: ${e instanceof Error ? e.message : e}`)
178
225
  }
179
- } catch {}
226
+ }
227
+ }
228
+
229
+ // ── Post-flight: output validation ──────────────────────────────
230
+ if (!result.agentResult.stdout || result.agentResult.stdout.trim().length === 0) {
231
+ console.warn('⚠️ Agent produced empty stdout — the task may have failed silently.')
232
+ console.warn(` Agent stderr: ${(result.agentResult.stderr || '(empty)').slice(0, 200)}`)
180
233
  }
181
234
 
182
235
  console.log(`\n✅ Agent complete (${result.agentResult.durationMs}ms)`)
package/src/player.ts CHANGED
@@ -15,6 +15,7 @@ const BUILTIN_PLAYERS: Record<string, string> = {
15
15
  'claude': 'claude',
16
16
  'claude-code': 'claude',
17
17
  'kimi': 'kimi',
18
+ 'deepseek': 'deepseek',
18
19
  'cursor': 'cursor',
19
20
  'gemini': 'gemini',
20
21
  }
@@ -0,0 +1,395 @@
1
+ /**
2
+ * preflight.test.ts — TDD tests for arena agent-run pre-flight pure functions
3
+ *
4
+ * Coverage targets:
5
+ * parseDeckSkills — all TOML formats, edge cases
6
+ * checkSkillExistence — cold pool hit/miss, path resolution
7
+ * validateLinkResult — exit codes, error formatting
8
+ * buildCopyPlan — skip set, path mapping
9
+ * resolveColdPoolDir — tilde expansion, fallback
10
+ * formatSkillWarnings — warning string generation
11
+ */
12
+
13
+ import { describe, test, expect } from 'bun:test'
14
+ import {
15
+ parseDeckSkills,
16
+ checkSkillExistence,
17
+ validateLinkResult,
18
+ buildCopyPlan,
19
+ resolveColdPoolDir,
20
+ formatSkillWarnings,
21
+ } from './preflight'
22
+
23
+ // ═══════════════════════════════════════════════════════════════════════════
24
+ // parseDeckSkills
25
+ // ═══════════════════════════════════════════════════════════════════════════
26
+
27
+ describe('parseDeckSkills', () => {
28
+
29
+ test('empty deck → empty array', () => {
30
+ expect(parseDeckSkills({})).toEqual([])
31
+ })
32
+
33
+ test('deck with no skill sections → empty array', () => {
34
+ expect(parseDeckSkills({ deck: { max_cards: 10 } })).toEqual([])
35
+ })
36
+
37
+ test('inline-table format: single tool skill with path', () => {
38
+ const parsed = {
39
+ tool: {
40
+ skills: {
41
+ pdf: { path: 'github.com/anthropics/skills/skills/pdf' }
42
+ }
43
+ }
44
+ }
45
+ expect(parseDeckSkills(parsed)).toEqual([
46
+ { name: 'pdf', path: 'github.com/anthropics/skills/skills/pdf', section: 'tool' }
47
+ ])
48
+ })
49
+
50
+ test('inline-table format: multiple skills', () => {
51
+ const parsed = {
52
+ tool: {
53
+ skills: {
54
+ pdf: { path: 'github.com/anthropics/skills/skills/pdf' },
55
+ docx: { path: 'github.com/anthropics/skills/skills/docx' },
56
+ }
57
+ }
58
+ }
59
+ expect(parseDeckSkills(parsed)).toEqual([
60
+ { name: 'pdf', path: 'github.com/anthropics/skills/skills/pdf', section: 'tool' },
61
+ { name: 'docx', path: 'github.com/anthropics/skills/skills/docx', section: 'tool' },
62
+ ])
63
+ })
64
+
65
+ test('array format: skills = ["a", "b"]', () => {
66
+ const parsed = {
67
+ tool: {
68
+ skills: ['web-search', 'docx']
69
+ }
70
+ }
71
+ expect(parseDeckSkills(parsed)).toEqual([
72
+ { name: 'web-search', path: null, section: 'tool' },
73
+ { name: 'docx', path: null, section: 'tool' },
74
+ ])
75
+ })
76
+
77
+ test('innate section parsed separately', () => {
78
+ const parsed = {
79
+ innate: {
80
+ skills: {
81
+ deck: { path: 'github.com/lythos-labs/lythoskill/skills/lythoskill-deck' }
82
+ }
83
+ },
84
+ tool: {
85
+ skills: {
86
+ pdf: { path: 'github.com/anthropics/skills/skills/pdf' }
87
+ }
88
+ }
89
+ }
90
+ expect(parseDeckSkills(parsed)).toEqual([
91
+ { name: 'deck', path: 'github.com/lythos-labs/lythoskill/skills/lythoskill-deck', section: 'innate' },
92
+ { name: 'pdf', path: 'github.com/anthropics/skills/skills/pdf', section: 'tool' },
93
+ ])
94
+ })
95
+
96
+ test('transient section parsed', () => {
97
+ const parsed = {
98
+ transient: {
99
+ skills: {
100
+ experiment: { path: 'localhost/my-experiment' }
101
+ }
102
+ }
103
+ }
104
+ expect(parseDeckSkills(parsed)).toEqual([
105
+ { name: 'experiment', path: 'localhost/my-experiment', section: 'transient' }
106
+ ])
107
+ })
108
+
109
+ test('object entry without path → path=null', () => {
110
+ const parsed = {
111
+ tool: {
112
+ skills: {
113
+ bare: {} // no path field
114
+ }
115
+ }
116
+ }
117
+ expect(parseDeckSkills(parsed)).toEqual([
118
+ { name: 'bare', path: null, section: 'tool' }
119
+ ])
120
+ })
121
+
122
+ test('object entry with non-string path → path=null', () => {
123
+ const parsed = {
124
+ tool: {
125
+ skills: {
126
+ weird: { path: 42 } // number, not string
127
+ }
128
+ }
129
+ }
130
+ expect(parseDeckSkills(parsed)).toEqual([
131
+ { name: 'weird', path: null, section: 'tool' }
132
+ ])
133
+ })
134
+
135
+ test('array entry that is not a string → skipped', () => {
136
+ const parsed = {
137
+ tool: { skills: ['valid', 123, null, 'also-valid'] }
138
+ }
139
+ expect(parseDeckSkills(parsed)).toEqual([
140
+ { name: 'valid', path: null, section: 'tool' },
141
+ { name: 'also-valid', path: null, section: 'tool' },
142
+ ])
143
+ })
144
+
145
+ test('all three sections populated → ordered innate, tool, transient', () => {
146
+ const parsed = {
147
+ innate: { skills: { a: { path: '/a' } } },
148
+ tool: { skills: { b: { path: '/b' } } },
149
+ transient: { skills: { c: { path: '/c' } } },
150
+ }
151
+ expect(parseDeckSkills(parsed)).toEqual([
152
+ { name: 'a', path: '/a', section: 'innate' },
153
+ { name: 'b', path: '/b', section: 'tool' },
154
+ { name: 'c', path: '/c', section: 'transient' },
155
+ ])
156
+ })
157
+ })
158
+
159
+ // ═══════════════════════════════════════════════════════════════════════════
160
+ // checkSkillExistence
161
+ // ═══════════════════════════════════════════════════════════════════════════
162
+
163
+ describe('checkSkillExistence', () => {
164
+
165
+ test('empty skills → empty array', () => {
166
+ const exists = (_: string) => true
167
+ expect(checkSkillExistence([], '/cold', exists)).toEqual([])
168
+ })
169
+
170
+ test('skill with explicit path → resolves <coldPool>/<path>/SKILL.md', () => {
171
+ const exists = (p: string) => p === '/cold/github.com/owner/repo/skills/my-skill/SKILL.md'
172
+ const skills = [{ name: 'my-skill', path: 'github.com/owner/repo/skills/my-skill', section: 'tool' }]
173
+ const result = checkSkillExistence(skills, '/cold', exists)
174
+ expect(result).toEqual([
175
+ { name: 'my-skill', expectedPath: '/cold/github.com/owner/repo/skills/my-skill/SKILL.md', found: true, section: 'tool' }
176
+ ])
177
+ })
178
+
179
+ test('skill without path (array format) → resolves <coldPool>/<name>/SKILL.md', () => {
180
+ const exists = (p: string) => p === '/cold/web-search/SKILL.md'
181
+ const skills = [{ name: 'web-search', path: null, section: 'tool' }]
182
+ const result = checkSkillExistence(skills, '/cold', exists)
183
+ expect(result).toEqual([
184
+ { name: 'web-search', expectedPath: '/cold/web-search/SKILL.md', found: true, section: 'tool' }
185
+ ])
186
+ })
187
+
188
+ test('HTTP path → uses name as fallback for path resolution', () => {
189
+ const exists = (p: string) => p === '/cold/my-skill/SKILL.md'
190
+ const skills = [{ name: 'my-skill', path: 'https://example.com/deck.toml', section: 'tool' }]
191
+ const result = checkSkillExistence(skills, '/cold', exists)
192
+ expect(result).toEqual([
193
+ { name: 'my-skill', expectedPath: '/cold/my-skill/SKILL.md', found: true, section: 'tool' }
194
+ ])
195
+ })
196
+
197
+ test('all found → all found=true', () => {
198
+ const exists = (_: string) => true
199
+ const skills = [
200
+ { name: 'a', path: '/a', section: 'tool' },
201
+ { name: 'b', path: '/b', section: 'tool' },
202
+ ]
203
+ expect(checkSkillExistence(skills, '/cold', exists)).toEqual([
204
+ { name: 'a', expectedPath: '/cold//a/SKILL.md', found: true, section: 'tool' },
205
+ { name: 'b', expectedPath: '/cold//b/SKILL.md', found: true, section: 'tool' },
206
+ ])
207
+ })
208
+
209
+ test('some missing → mixed found/not-found', () => {
210
+ const exists = (p: string) => p.includes('a')
211
+ const skills = [
212
+ { name: 'a', path: '/a', section: 'tool' },
213
+ { name: 'b', path: '/b', section: 'tool' },
214
+ ]
215
+ expect(checkSkillExistence(skills, '/cold', exists)).toEqual([
216
+ { name: 'a', expectedPath: '/cold//a/SKILL.md', found: true, section: 'tool' },
217
+ { name: 'b', expectedPath: '/cold//b/SKILL.md', found: false, section: 'tool' },
218
+ ])
219
+ })
220
+
221
+ test('different coldPoolDir → different expectedPath prefix', () => {
222
+ const exists = (_: string) => true
223
+ const skills = [{ name: 'x', path: 'github.com/x', section: 'tool' }]
224
+ const a = checkSkillExistence(skills, '/home/user/.agents/skill-repos', exists)
225
+ const b = checkSkillExistence(skills, '/opt/cold', exists)
226
+ expect(a[0].expectedPath).toStartWith('/home/user/.agents/skill-repos/')
227
+ expect(b[0].expectedPath).toStartWith('/opt/cold/')
228
+ })
229
+ })
230
+
231
+ // ═══════════════════════════════════════════════════════════════════════════
232
+ // validateLinkResult
233
+ // ═══════════════════════════════════════════════════════════════════════════
234
+
235
+ describe('validateLinkResult', () => {
236
+
237
+ test('exitCode 0 → ok', () => {
238
+ expect(validateLinkResult(0, '')).toEqual({ ok: true })
239
+ })
240
+
241
+ test('exitCode 0 with stderr → still ok (stderr is not always errors)', () => {
242
+ expect(validateLinkResult(0, 'some warning output')).toEqual({ ok: true })
243
+ })
244
+
245
+ test('exitCode 1 → not ok, error contains snippet', () => {
246
+ const result = validateLinkResult(1, 'something went wrong')
247
+ expect(result.ok).toBe(false)
248
+ expect(result.error).toContain('exit 1')
249
+ expect(result.error).toContain('something went wrong')
250
+ })
251
+
252
+ test('exitCode null → not ok (null !== 0)', () => {
253
+ const result = validateLinkResult(null, 'process killed')
254
+ expect(result.ok).toBe(false)
255
+ expect(result.error).toContain('exit null')
256
+ })
257
+
258
+ test('stderr truncated to 300 chars in error message', () => {
259
+ const longStderr = 'x'.repeat(500)
260
+ const result = validateLinkResult(1, longStderr)
261
+ expect(result.ok).toBe(false)
262
+ expect(result.error!.length).toBeLessThan(350) // "Deck link failed (exit 1): " + 300 chars
263
+ })
264
+
265
+ test('exitCode 0, empty stderr → ok with no error field', () => {
266
+ const result = validateLinkResult(0, '')
267
+ expect(result.ok).toBe(true)
268
+ expect(result.error).toBeUndefined()
269
+ })
270
+ })
271
+
272
+ // ═══════════════════════════════════════════════════════════════════════════
273
+ // buildCopyPlan
274
+ // ═══════════════════════════════════════════════════════════════════════════
275
+
276
+ describe('buildCopyPlan', () => {
277
+
278
+ test('empty entries → empty plan', () => {
279
+ expect(buildCopyPlan('/work', '/out', [], new Set())).toEqual([])
280
+ })
281
+
282
+ test('all skipped → empty plan', () => {
283
+ const skip = new Set(['.claude', 'skill-deck.toml'])
284
+ expect(buildCopyPlan('/work', '/out', ['.claude', 'skill-deck.toml'], skip)).toEqual([])
285
+ })
286
+
287
+ test('normal entries → mapped to outDir', () => {
288
+ const skip = new Set<string>()
289
+ expect(buildCopyPlan('/work', '/out', ['output.md', 'report.docx'], skip)).toEqual([
290
+ { src: '/work/output.md', dest: '/out/output.md', name: 'output.md' },
291
+ { src: '/work/report.docx', dest: '/out/report.docx', name: 'report.docx' },
292
+ ])
293
+ })
294
+
295
+ test('mixed skip and non-skip → only non-skipped', () => {
296
+ const skip = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock'])
297
+ const entries = ['.claude', 'output.md', 'skill-deck.toml', 'report.docx', 'skill-deck.lock']
298
+ expect(buildCopyPlan('/work', '/out', entries, skip)).toEqual([
299
+ { src: '/work/output.md', dest: '/out/output.md', name: 'output.md' },
300
+ { src: '/work/report.docx', dest: '/out/report.docx', name: 'report.docx' },
301
+ ])
302
+ })
303
+
304
+ test('preserves entry order', () => {
305
+ const skip = new Set<string>()
306
+ const entries = ['c', 'a', 'b']
307
+ expect(buildCopyPlan('/w', '/o', entries, skip).map(e => e.name)).toEqual(['c', 'a', 'b'])
308
+ })
309
+
310
+ test('nested paths work (agent-produced subdirectories)', () => {
311
+ const skip = new Set<string>()
312
+ expect(buildCopyPlan('/work', '/out', ['subdir/output.pdf'], skip)).toEqual([
313
+ { src: '/work/subdir/output.pdf', dest: '/out/subdir/output.pdf', name: 'subdir/output.pdf' },
314
+ ])
315
+ })
316
+ })
317
+
318
+ // ═══════════════════════════════════════════════════════════════════════════
319
+ // resolveColdPoolDir
320
+ // ═══════════════════════════════════════════════════════════════════════════
321
+
322
+ describe('resolveColdPoolDir', () => {
323
+
324
+ test('explicit absolute path → returned as-is', () => {
325
+ expect(resolveColdPoolDir('/opt/cold', '/home/user', '/fallback')).toBe('/opt/cold')
326
+ })
327
+
328
+ test('explicit relative path → returned as-is', () => {
329
+ expect(resolveColdPoolDir('my-cold-pool', '/home/user', '/fallback')).toBe('my-cold-pool')
330
+ })
331
+
332
+ test('tilde path → expanded with homeDir', () => {
333
+ expect(resolveColdPoolDir('~/.agents/skill-repos', '/home/user', '/fallback'))
334
+ .toBe('/home/user/.agents/skill-repos')
335
+ })
336
+
337
+ test('tilde at start only → expanded; tilde elsewhere not expanded', () => {
338
+ expect(resolveColdPoolDir('path/with~/tilde', '/home/user', '/fallback'))
339
+ .toBe('path/with~/tilde')
340
+ })
341
+
342
+ test('undefined → uses fallback', () => {
343
+ expect(resolveColdPoolDir(undefined, '/home/user', '/default/cold'))
344
+ .toBe('/default/cold')
345
+ })
346
+
347
+ test('empty string → uses fallback (|| operator)', () => {
348
+ expect(resolveColdPoolDir('', '/home/user', '/default/cold'))
349
+ .toBe('/default/cold')
350
+ })
351
+ })
352
+
353
+ // ═══════════════════════════════════════════════════════════════════════════
354
+ // formatSkillWarnings
355
+ // ═══════════════════════════════════════════════════════════════════════════
356
+
357
+ describe('formatSkillWarnings', () => {
358
+
359
+ test('all found → empty array', () => {
360
+ const checks = [
361
+ { name: 'a', expectedPath: '/p/a/SKILL.md', found: true, section: 'tool' },
362
+ { name: 'b', expectedPath: '/p/b/SKILL.md', found: true, section: 'tool' },
363
+ ]
364
+ expect(formatSkillWarnings(checks)).toEqual([])
365
+ })
366
+
367
+ test('some missing → one warning per missing skill', () => {
368
+ const checks = [
369
+ { name: 'pdf', expectedPath: '/cold/pdf/SKILL.md', found: false, section: 'tool' },
370
+ { name: 'docx', expectedPath: '/cold/docx/SKILL.md', found: true, section: 'tool' },
371
+ ]
372
+ expect(formatSkillWarnings(checks)).toEqual([
373
+ 'Skill "pdf" declared in deck [tool] but SKILL.md not found at: /cold/pdf/SKILL.md',
374
+ ])
375
+ })
376
+
377
+ test('all missing → warning for each', () => {
378
+ const checks = [
379
+ { name: 'a', expectedPath: '/p/a/SKILL.md', found: false, section: 'innate' },
380
+ { name: 'b', expectedPath: '/p/b/SKILL.md', found: false, section: 'tool' },
381
+ ]
382
+ expect(formatSkillWarnings(checks)).toHaveLength(2)
383
+ })
384
+
385
+ test('empty array → empty array', () => {
386
+ expect(formatSkillWarnings([])).toEqual([])
387
+ })
388
+
389
+ test('section name appears in warning string', () => {
390
+ const checks = [
391
+ { name: 'x', expectedPath: '/p/x', found: false, section: 'transient' },
392
+ ]
393
+ expect(formatSkillWarnings(checks)[0]).toContain('[transient]')
394
+ })
395
+ })
@@ -0,0 +1,186 @@
1
+ /**
2
+ * preflight.ts — Arena agent-run pre-flight pure functions
3
+ *
4
+ * Extracted from cli.ts agentRun to enable unit testing.
5
+ * All functions are pure: no filesystem IO, no spawn, no console.
6
+ * IO is injected via function parameters (e.g., existsFn); callers pass in pre-read data such as directory entries.
7
+ */
8
+
9
+ // ── Types ─────────────────────────────────────────────────────────────────
10
+
11
+ /** A skill as declared in skill-deck.toml */
12
+ export interface SkillDecl {
13
+ name: string // TOML key (e.g., "pdf")
14
+ path: string | null // explicit path from inline-table format; null for array format
15
+ section: string // "innate" | "tool" | "transient"
16
+ }
17
+
18
+ /** Result of checking one skill against the cold pool */
19
+ export interface SkillCheck {
20
+ name: string
21
+ expectedPath: string // resolved cold pool path that was checked
22
+ found: boolean
23
+ section: string
24
+ }
25
+
26
+ /** Result of deck link validation */
27
+ export interface LinkResult {
28
+ ok: boolean
29
+ error?: string
30
+ }
31
+
32
+ /** A single file copy operation plan entry */
33
+ export interface CopyEntry {
34
+ src: string
35
+ dest: string
36
+ name: string // entry name exactly as passed in (may contain subdirectories), used for error reporting
37
+ }
38
+
39
+ // ── parseDeckSkills ──────────────────────────────────────────────────────
40
+
41
+ /**
42
+ * Extract all declared skills from an already-parsed skill-deck.toml object.
43
+ *
44
+ * Handles both TOML formats:
45
+ * [tool.skills.pdf] → { name: "pdf", path: "github.com/...", section: "tool" }
46
+ * path = "github.com/..."
47
+ *
48
+ * skills = ["a", "b"] → { name: "a", path: null, section: "tool" }
49
+ *
50
+ * Pure: parsed deck object → SkillDecl[]. No IO, no Bun.TOML dependency (caller parses first).
51
+ */
52
+ export function parseDeckSkills(
53
+ deckParsed: Record<string, any>
54
+ ): SkillDecl[] {
55
+ const results: SkillDecl[] = []
56
+ const sections = ['innate', 'tool', 'transient'] as const
57
+
58
+ for (const section of sections) {
59
+ const skills = deckParsed?.[section]?.skills
60
+ if (!skills) continue
61
+
62
+ if (Array.isArray(skills)) {
63
+ // Array format: skills = ["name1", "name2"]
64
+ for (const name of skills) {
65
+ if (typeof name === 'string') {
66
+ results.push({ name, path: null, section })
67
+ }
68
+ }
69
+ } else if (typeof skills === 'object') {
70
+ // Inline-table format: [tool.skills.name], path = "..."
71
+ for (const [name, entry] of Object.entries(skills as Record<string, any>)) {
72
+ const skillPath = typeof entry?.path === 'string' ? entry.path : null
73
+ results.push({ name, path: skillPath, section })
74
+ }
75
+ }
76
+ }
77
+
78
+ return results
79
+ }
80
+
81
+ // ── checkSkillExistence ──────────────────────────────────────────────────
82
+
83
+ /**
84
+ * Check each declared skill against the cold pool filesystem.
85
+ *
86
+ * For skills with explicit paths: resolve `<coldPoolDir>/<path>/SKILL.md`
87
+ * For skills without paths (array format): resolve `<coldPoolDir>/<name>/SKILL.md`
88
+ * Skills whose path starts with "http" are not looked up by that path; they fall back to `<coldPoolDir>/<name>/SKILL.md`.
89
+ *
90
+ * `existsFn` is the IO injection point — swap for real fs or mock.
91
+ */
92
+ export function checkSkillExistence(
93
+ skills: SkillDecl[],
94
+ coldPoolDir: string,
95
+ existsFn: (path: string) => boolean
96
+ ): SkillCheck[] {
97
+ return skills.map(skill => {
98
+ const resolvedName = skill.path && !skill.path.startsWith('http')
99
+ ? skill.path
100
+ : skill.name
101
+ const expectedPath = `${coldPoolDir}/${resolvedName}/SKILL.md`
102
+ return {
103
+ name: skill.name,
104
+ expectedPath,
105
+ found: existsFn(expectedPath),
106
+ section: skill.section,
107
+ }
108
+ })
109
+ }
110
+
111
+ // ── validateLinkResult ───────────────────────────────────────────────────
112
+
113
+ /**
114
+ * Validate the outcome of `bunx @lythos/skill-deck link`.
115
+ *
116
+ * Pure: (exitCode, stderr) → LinkResult.
117
+ * Exit code 0 = success, regardless of stderr content. Any other exit code (including null) = failure.
118
+ */
119
+ export function validateLinkResult(
120
+ exitCode: number | null,
121
+ stderr: string
122
+ ): LinkResult {
123
+ if (exitCode !== 0) {
124
+ const snippet = (stderr || '').slice(0, 300)
125
+ return {
126
+ ok: false,
127
+ error: `Deck link failed (exit ${exitCode}): ${snippet}`,
128
+ }
129
+ }
130
+ return { ok: true }
131
+ }
132
+
133
+ // ── buildCopyPlan ────────────────────────────────────────────────────────
134
+
135
+ /**
136
+ * Build a copy plan from workdir entries → outDir.
137
+ *
138
+ * Skips entries in `skipSet`. Each surviving entry maps to `<outDir>/<name>`.
139
+ * Pure: strings + set → CopyEntry[]. No filesystem access.
140
+ */
141
+ export function buildCopyPlan(
142
+ workdir: string,
143
+ outDir: string,
144
+ entries: string[],
145
+ skipSet: Set<string>
146
+ ): CopyEntry[] {
147
+ const plan: CopyEntry[] = []
148
+ for (const name of entries) {
149
+ if (skipSet.has(name)) continue
150
+ plan.push({
151
+ src: `${workdir}/${name}`,
152
+ dest: `${outDir}/${name}`,
153
+ name,
154
+ })
155
+ }
156
+ return plan
157
+ }
158
+
159
+ // ── resolveColdPoolDir ───────────────────────────────────────────────────
160
+
161
+ /**
162
+ * Resolve cold_pool root from deck config, expanding ~.
163
+ *
164
+ * Pure: string → string. No filesystem access.
165
+ */
166
+ export function resolveColdPoolDir(
167
+ coldPoolRoot: string | undefined,
168
+ homeDir: string,
169
+ fallbackDir: string
170
+ ): string {
171
+ const raw = coldPoolRoot || fallbackDir
172
+ return raw.startsWith('~') ? `${homeDir}${raw.slice(1)}` : raw
173
+ }
174
+
175
+ // ── formatSkillWarnings ──────────────────────────────────────────────────
176
+
177
+ /**
178
+ * Format skill check results into human-readable warning strings.
179
+ *
180
+ * Pure: SkillCheck[] → string[].
181
+ */
182
+ export function formatSkillWarnings(checks: SkillCheck[]): string[] {
183
+ return checks
184
+ .filter(c => !c.found)
185
+ .map(c => `Skill "${c.name}" declared in deck [${c.section}] but SKILL.md not found at: ${c.expectedPath}`)
186
+ }
package/src/runner.ts CHANGED
@@ -3,6 +3,8 @@ import { join, resolve } from 'node:path'
3
3
  import { tmpdir } from 'node:os'
4
4
  import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
5
5
  import { useAgent } from '@lythos/test-utils/agents'
6
+ // Optional: register claude-sdk adapter if the package is installed
7
+ try { await import('@lythos/agent-adapter-claude-sdk') } catch { /* package not installed */ }
6
8
  import { ArenaManifest, Player } from '@lythos/test-utils/schema'
7
9
  import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
8
10
  import { runComparativeJudge } from './comparative-judge'