@lythos/skill-arena 0.9.36 → 0.9.38

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/README.md CHANGED
@@ -49,26 +49,26 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
49
49
  ```bash
50
50
  bun add -d @lythos/skill-arena
51
51
  # or use directly
52
- bunx @lythos/skill-arena@0.9.36 <command>
52
+ bunx @lythos/skill-arena@0.9.38 <command>
53
53
  ```
54
54
 
55
55
  ## Quick Start
56
56
 
57
57
  ```bash
58
58
  # Mode 1: Compare two skills on the same task
59
- bunx @lythos/skill-arena@0.9.36 \
59
+ bunx @lythos/skill-arena@0.9.38 \
60
60
  --task "Generate auth flow diagram" \
61
61
  --skills "design-doc-mermaid,mermaid-tools" \
62
62
  --criteria "syntax,context,token"
63
63
 
64
64
  # Mode 2: Compare full deck configurations
65
- bunx @lythos/skill-arena@0.9.36 \
65
+ bunx @lythos/skill-arena@0.9.38 \
66
66
  --task "Generate auth flow diagram" \
67
67
  --decks "./decks/minimal.toml,./decks/rich.toml" \
68
68
  --criteria "quality,token,maintainability"
69
69
 
70
70
  # Visualize results
71
- bunx @lythos/skill-arena@0.9.36 viz tmp/arena-<id>/
71
+ bunx @lythos/skill-arena@0.9.38 viz tmp/arena-<id>/
72
72
  ```
73
73
 
74
74
  ## Commands
@@ -77,16 +77,16 @@ bunx @lythos/skill-arena@0.9.36 viz tmp/arena-<id>/
77
77
 
78
78
  ```bash
79
79
  # Print execution plan without running
80
- bunx @lythos/skill-arena@0.9.36 run --config arena.toml --dry-run
80
+ bunx @lythos/skill-arena@0.9.38 run --config arena.toml --dry-run
81
81
 
82
82
  # Execute with per-side runs_per_side and statistical aggregation
83
- bunx @lythos/skill-arena@0.9.36 run --config arena.toml
83
+ bunx @lythos/skill-arena@0.9.38 run --config arena.toml
84
84
  ```
85
85
 
86
86
  ### CLI-flag mode (backward compat)
87
87
 
88
88
  ```
89
- bunx @lythos/skill-arena@0.9.36 run \
89
+ bunx @lythos/skill-arena@0.9.38 run \
90
90
  --task ./TASK-arena.md \
91
91
  --players ./players/claude.toml \
92
92
  --decks ./decks/run-01.toml,./decks/run-02.toml \
@@ -96,13 +96,13 @@ bunx @lythos/skill-arena@0.9.36 run \
96
96
  ### Scaffold mode (legacy, manual execution)
97
97
 
98
98
  ```
99
- bunx @lythos/skill-arena@0.9.36 scaffold --task "..." --skills a,b
99
+ bunx @lythos/skill-arena@0.9.38 scaffold --task "..." --skills a,b
100
100
  ```
101
101
 
102
102
  ### Viz
103
103
 
104
104
  ```bash
105
- bunx @lythos/skill-arena@0.9.36 viz runs/arena-<id>/
105
+ bunx @lythos/skill-arena@0.9.38 viz runs/arena-<id>/
106
106
  ```
107
107
 
108
108
  ## Skill Documentation
@@ -116,7 +116,7 @@ The agent-visible **Skill** layer documentation is here:
116
116
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
117
117
 
118
118
  ```
119
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.36 ...
119
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.38 ...
120
120
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
121
121
  Output (skills/<name>/) → git commit → agent-visible skill
122
122
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.36",
3
+ "version": "0.9.38",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -16,6 +16,11 @@
16
16
  "access": "public"
17
17
  },
18
18
  "type": "module",
19
+ "scripts": {
20
+ "test": "bun test src/ --pass-with-no-tests",
21
+ "test:coverage": "bun test src/ --coverage --coverage-reporter=lcov --coverage-dir=coverage --pass-with-no-tests",
22
+ "test:watch": "bun test src/ --watch"
23
+ },
19
24
  "bin": {
20
25
  "lythoskill-arena": "src/cli.ts"
21
26
  },
package/src/cli.ts CHANGED
@@ -107,9 +107,12 @@ async function agentRun(args: string[]) {
107
107
  let deckPath: string
108
108
  if (opts.deck.startsWith('http://') || opts.deck.startsWith('https://')) {
109
109
  let url = opts.deck
110
- if (url.includes('github.com/') && url.includes('/blob/')) {
111
- url = url.replace('github.com/', 'raw.githubusercontent.com/').replace('/blob/', '/')
112
- }
110
+ try {
111
+ const u = new URL(url)
112
+ if (u.hostname === 'github.com' && u.pathname.includes('/blob/')) {
113
+ url = `https://raw.githubusercontent.com${u.pathname.replace('/blob/', '/')}`
114
+ }
115
+ } catch { /* keep original url */ }
113
116
  const dest = resolve(process.cwd(), 'arena-deck.toml')
114
117
  console.log(`📥 Fetching arena deck: ${url}`)
115
118
  const res = await fetch(url, { signal: AbortSignal.timeout(30_000) })
@@ -125,6 +128,7 @@ async function agentRun(args: string[]) {
125
128
  const { useAgent } = await import('@lythos/test-utils/agents')
126
129
  // Optional: register claude-sdk adapter if the package is installed
127
130
  try { await import('@lythos/agent-adapter-claude-sdk') } catch { /* package not installed */ }
131
+ try { await import('@lythos/agent-adapter-deepseek-serve') } catch { /* package not installed */ }
128
132
  const { runAgentScenario } = await import('@lythos/test-utils/agent-bdd')
129
133
  const { resolvePlayer } = await import('./player')
130
134
  const { readFileSync, writeFileSync, mkdirSync } = await import('node:fs')
package/src/runner.ts CHANGED
@@ -109,11 +109,35 @@ export async function runArenaFromToml(opts: {
109
109
  const result = await runAgentScenario({
110
110
  scenarioPath: taskAbs,
111
111
  agent,
112
- async setupWorkdir(_scenario: AgentScenario, workdir: string) {
112
+ async setupWorkdir(scenario: AgentScenario, workdir: string) {
113
113
  mkdirSync(workdir, { recursive: true })
114
114
  const deckContent = readFileSync(cell.deck, 'utf-8')
115
115
  writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
116
116
 
117
+ // Write AGENTS.md bootloader — agents read this on entry
118
+ writeFileSync(join(workdir, 'AGENTS.md'), [
119
+ '# Arena Test Environment',
120
+ '',
121
+ `**Side**: ${cell.side}`,
122
+ `**Player**: ${cell.player}`,
123
+ `**Run**: ${cell.run}`,
124
+ '',
125
+ '## Task',
126
+ '',
127
+ scenario.it ?? scenario.description ?? '(no task description)',
128
+ '',
129
+ '## How This Works',
130
+ '',
131
+ '- This is an isolated arena test directory. No parent `.claude/skills/` exists.',
132
+ '- Skills are configured in `skill-deck.toml` and symlinked by `deck link`.',
133
+ '- Complete the task above using the available skills.',
134
+ '- Output your work to this directory (or `output/` if specified).',
135
+ '',
136
+ '## Expected Output',
137
+ '',
138
+ 'After completing the task, write a brief summary of what you did.',
139
+ ].join('\n'))
140
+
117
141
  // Link skills via bunx (works both locally and when installed via bunx)
118
142
  const linkProc = Bun.spawn(
119
143
  ['bunx', '@lythos/skill-deck', 'link'],