npm - @lythos/skill-arena - Versions diffs - 0.9.40 → 0.9.41 - Mend

@lythos/skill-arena 0.9.40 → 0.9.41

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -49,23 +49,23 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
 ```bash
 bun add -d @lythos/skill-arena
 # or use directly
-bunx @lythos/skill-arena@0.9.40 <command>
+bunx @lythos/skill-arena@0.9.41 <command>
 ```
 ## Quick Start
 ```bash
 # Single: test a deck with one agent (exec shortcut)
-bunx @lythos/skill-arena@0.9.40 single \
+bunx @lythos/skill-arena@0.9.41 single \
   --brief "Generate auth flow diagram" \
   --deck ./examples/decks/documents.toml
 # Vs: compare multiple decks side by side (declarative)
-bunx @lythos/skill-arena@0.9.40 vs \
+bunx @lythos/skill-arena@0.9.41 vs \
   --config examples/arena/research-compare/arena.toml
 # Visualize results
-bunx @lythos/skill-arena@0.9.40 viz tmp/arena-<id>/
+bunx @lythos/skill-arena@0.9.41 viz tmp/arena-<id>/
 ```
 ## Commands
@@ -74,16 +74,16 @@ bunx @lythos/skill-arena@0.9.40 viz tmp/arena-<id>/
 ```bash
 # Print execution plan without running
-bunx @lythos/skill-arena@0.9.40 vs --config arena.toml --dry-run
+bunx @lythos/skill-arena@0.9.41 vs --config arena.toml --dry-run
 # Execute with per-side runs_per_side and statistical aggregation
-bunx @lythos/skill-arena@0.9.40 vs --config arena.toml
+bunx @lythos/skill-arena@0.9.41 vs --config arena.toml
 ```
 ### CLI-flag mode (backward compat)
 ```
-bunx @lythos/skill-arena@0.9.40 run \
+bunx @lythos/skill-arena@0.9.41 run \
   --task ./TASK-arena.md \
   --players ./players/claude.toml \
   --decks ./decks/run-01.toml,./decks/run-02.toml \
@@ -93,13 +93,13 @@ bunx @lythos/skill-arena@0.9.40 run \
 ### Scaffold mode (legacy, manual execution)
 ```
-bunx @lythos/skill-arena@0.9.40 scaffold --task "..." --decks a.toml,b.toml
+bunx @lythos/skill-arena@0.9.41 scaffold --task "..." --decks a.toml,b.toml
 ```
 ### Viz
 ```bash
-bunx @lythos/skill-arena@0.9.40 viz runs/arena-<id>/
+bunx @lythos/skill-arena@0.9.41 viz runs/arena-<id>/
 ```
 ## Skill Documentation
@@ -113,7 +113,7 @@ The agent-visible **Skill** layer documentation is here:
 Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
 ```
-Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.40 ...
+Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.41 ...
 Skill   (packages/<name>/skill/)     → build → SKILL.md + thin scripts
 Output  (skills/<name>/)             → git commit → agent-visible skill
 ```

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lythos/skill-arena",
-  "version": "0.9.40",
+  "version": "0.9.41",
   "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
   "keywords": [
     "ai-agent",

package/src/cli.ts CHANGED Viewed

@@ -152,6 +152,27 @@ async function singleRun(args: string[]) {
    Format: frontmatter (name, description, timeout) + Given/When/Then/Judge sections.
    Example: see playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
     scenarioOpt.scenarioPath = taskPath
+    // Quick validation: check frontmatter presence
+    const raw = readFileSync(taskPath, 'utf-8')
+    if (!raw.startsWith('---')) { console.error(`❌ Invalid .agent.md: missing frontmatter (must start with "---")
+   Correct format:
+   ---
+   name: my-scenario
+   description: what this tests
+   timeout: 120000
+   ---
+   ## Given
+   ...
+   ## When
+   ...
+   ## Then
+   ...
+   ## Judge
+   ...
+   Template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
+    if (!raw.includes('## When')) { console.error(`❌ Invalid .agent.md: missing "## When" section.
+   The ## When section defines what the agent should do.
+   See template: playground/arena-one-shot/TASK-arena.agent.md`); process.exit(1) }
   } else {
     scenarioOpt.scenario = {
       name: 'ad-hoc task',

package/src/runner.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
+import { existsSync, mkdirSync, writeFileSync, readFileSync, rmSync } from 'node:fs'
 import { join, resolve } from 'node:path'
 import { tmpdir } from 'node:os'
 import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
@@ -57,7 +57,35 @@ export async function runArenaFromToml(opts: {
     if (configDir) return resolve(configDir, p)
     return resolve(p)
   }
-  const taskAbs = resolvePath(taskPath)
+  const resolveOrCreateTask = (): { path: string; cleanup?: () => void } => {
+    const candidate = resolvePath(taskPath)
+    if (existsSync(candidate)) return { path: candidate }
+    // taskPath is inline text — write temp scenario file
+    const tmp = join(tmpdir(), `arena-task-${stamp()}.agent.md`)
+    writeFileSync(tmp, `---
+name: arena task
+description: ${taskPath.slice(0, 80)}
+timeout: 120000
+---
+## Given
+- Working directory with an empty project
+- bun is available
+## When
+${taskPath}
+## Then
+- Complete the task above
+- Write a summary to output.md
+## Judge
+- completeness
+- correctness
+`)
+    return { path: tmp, cleanup: () => { try { rmSync(tmp) } catch {} } }
+  }
+  const { path: taskAbs, cleanup: taskCleanup } = resolveOrCreateTask()
   const resolvedToml: ArenaToml = {
     ...toml,
     side: toml.side.map(s => ({ ...s, deck: resolvePath(s.deck) })),
@@ -78,10 +106,13 @@ export async function runArenaFromToml(opts: {
   const resolved = resolveSides(resolvedToml)
   // Build manifest
+  const taskContent = existsSync(taskAbs)
+    ? readFileSync(taskAbs, 'utf-8').slice(0, 200)
+    : taskPath // inline description, not a file path
   const manifest = ArenaManifest.parse({
     id: arenaId,
     created_at: new Date().toISOString(),
-    task: readFileSync(taskAbs, 'utf-8').slice(0, 200),
+    task: taskContent,
     mode: 'decks',
     participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
       id: r.side.name,