@lythos/skill-arena 0.9.40 → 0.9.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -49,23 +49,23 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
49
49
  ```bash
50
50
  bun add -d @lythos/skill-arena
51
51
  # or use directly
52
- bunx @lythos/skill-arena@0.9.40 <command>
52
+ bunx @lythos/skill-arena@0.9.41 <command>
53
53
  ```
54
54
 
55
55
  ## Quick Start
56
56
 
57
57
  ```bash
58
58
  # Single: test a deck with one agent (exec shortcut)
59
- bunx @lythos/skill-arena@0.9.40 single \
59
+ bunx @lythos/skill-arena@0.9.41 single \
60
60
  --brief "Generate auth flow diagram" \
61
61
  --deck ./examples/decks/documents.toml
62
62
 
63
63
  # Vs: compare multiple decks side by side (declarative)
64
- bunx @lythos/skill-arena@0.9.40 vs \
64
+ bunx @lythos/skill-arena@0.9.41 vs \
65
65
  --config examples/arena/research-compare/arena.toml
66
66
 
67
67
  # Visualize results
68
- bunx @lythos/skill-arena@0.9.40 viz tmp/arena-<id>/
68
+ bunx @lythos/skill-arena@0.9.41 viz tmp/arena-<id>/
69
69
  ```
70
70
 
71
71
  ## Commands
@@ -74,16 +74,16 @@ bunx @lythos/skill-arena@0.9.40 viz tmp/arena-<id>/
74
74
 
75
75
  ```bash
76
76
  # Print execution plan without running
77
- bunx @lythos/skill-arena@0.9.40 vs --config arena.toml --dry-run
77
+ bunx @lythos/skill-arena@0.9.41 vs --config arena.toml --dry-run
78
78
 
79
79
  # Execute with per-side runs_per_side and statistical aggregation
80
- bunx @lythos/skill-arena@0.9.40 vs --config arena.toml
80
+ bunx @lythos/skill-arena@0.9.41 vs --config arena.toml
81
81
  ```
82
82
 
83
83
  ### CLI-flag mode (backward compat)
84
84
 
85
85
  ```
86
- bunx @lythos/skill-arena@0.9.40 run \
86
+ bunx @lythos/skill-arena@0.9.41 run \
87
87
  --task ./TASK-arena.md \
88
88
  --players ./players/claude.toml \
89
89
  --decks ./decks/run-01.toml,./decks/run-02.toml \
@@ -93,13 +93,13 @@ bunx @lythos/skill-arena@0.9.40 run \
93
93
  ### Scaffold mode (legacy, manual execution)
94
94
 
95
95
  ```
96
- bunx @lythos/skill-arena@0.9.40 scaffold --task "..." --decks a.toml,b.toml
96
+ bunx @lythos/skill-arena@0.9.41 scaffold --task "..." --decks a.toml,b.toml
97
97
  ```
98
98
 
99
99
  ### Viz
100
100
 
101
101
  ```bash
102
- bunx @lythos/skill-arena@0.9.40 viz runs/arena-<id>/
102
+ bunx @lythos/skill-arena@0.9.41 viz runs/arena-<id>/
103
103
  ```
104
104
 
105
105
  ## Skill Documentation
@@ -113,7 +113,7 @@ The agent-visible **Skill** layer documentation is here:
113
113
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
114
114
 
115
115
  ```
116
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.40 ...
116
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.41 ...
117
117
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
118
118
  Output (skills/<name>/) → git commit → agent-visible skill
119
119
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.40",
3
+ "version": "0.9.41",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
package/src/cli.ts CHANGED
@@ -152,6 +152,27 @@ async function singleRun(args: string[]) {
152
152
  Format: frontmatter (name, description, timeout) + Given/When/Then/Judge sections.
153
153
  Example: see playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
154
154
  scenarioOpt.scenarioPath = taskPath
155
+ // Quick validation: check frontmatter presence
156
+ const raw = readFileSync(taskPath, 'utf-8')
157
+ if (!raw.startsWith('---')) { console.error(`❌ Invalid .agent.md: missing frontmatter (must start with "---")
158
+ Correct format:
159
+ ---
160
+ name: my-scenario
161
+ description: what this tests
162
+ timeout: 120000
163
+ ---
164
+ ## Given
165
+ ...
166
+ ## When
167
+ ...
168
+ ## Then
169
+ ...
170
+ ## Judge
171
+ ...
172
+ Template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
173
+ if (!raw.includes('## When')) { console.error(`❌ Invalid .agent.md: missing "## When" section.
174
+ The ## When section defines what the agent should do.
175
+ See template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
155
176
  } else {
156
177
  scenarioOpt.scenario = {
157
178
  name: 'ad-hoc task',
package/src/runner.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
1
+ import { existsSync, mkdirSync, writeFileSync, readFileSync, rmSync } from 'node:fs'
2
2
  import { join, resolve } from 'node:path'
3
3
  import { tmpdir } from 'node:os'
4
4
  import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
@@ -57,7 +57,35 @@ export async function runArenaFromToml(opts: {
57
57
  if (configDir) return resolve(configDir, p)
58
58
  return resolve(p)
59
59
  }
60
- const taskAbs = resolvePath(taskPath)
60
+ const resolveOrCreateTask = (): { path: string; cleanup?: () => void } => {
61
+ const candidate = resolvePath(taskPath)
62
+ if (existsSync(candidate)) return { path: candidate }
63
+ // taskPath is inline text — write temp scenario file
64
+ const tmp = join(tmpdir(), `arena-task-${stamp()}.agent.md`)
65
+ writeFileSync(tmp, `---
66
+ name: arena task
67
+ description: ${taskPath.slice(0, 80)}
68
+ timeout: 120000
69
+ ---
70
+
71
+ ## Given
72
+ - Working directory with an empty project
73
+ - bun is available
74
+
75
+ ## When
76
+ ${taskPath}
77
+
78
+ ## Then
79
+ - Complete the task above
80
+ - Write a summary to output.md
81
+
82
+ ## Judge
83
+ - completeness
84
+ - correctness
85
+ `)
86
+ return { path: tmp, cleanup: () => { try { rmSync(tmp) } catch {} } }
87
+ }
88
+ const { path: taskAbs, cleanup: taskCleanup } = resolveOrCreateTask()
61
89
  const resolvedToml: ArenaToml = {
62
90
  ...toml,
63
91
  side: toml.side.map(s => ({ ...s, deck: resolvePath(s.deck) })),
@@ -78,10 +106,13 @@ export async function runArenaFromToml(opts: {
78
106
  const resolved = resolveSides(resolvedToml)
79
107
 
80
108
  // Build manifest
109
+ const taskContent = existsSync(taskAbs)
110
+ ? readFileSync(taskAbs, 'utf-8').slice(0, 200)
111
+ : taskPath // inline description, not a file path
81
112
  const manifest = ArenaManifest.parse({
82
113
  id: arenaId,
83
114
  created_at: new Date().toISOString(),
84
- task: readFileSync(taskAbs, 'utf-8').slice(0, 200),
115
+ task: taskContent,
85
116
  mode: 'decks',
86
117
  participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
87
118
  id: r.side.name,