npm - @lythos/skill-arena - Version diff - 0.13.0 → 0.13.2 - Mend

@lythos/skill-arena 0.13.0 → 0.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md (changed)

@@ -49,45 +49,59 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
 ```bash
 bun add -d @lythos/skill-arena
 # or use directly
-bunx @lythos/skill-arena@0.13.0 <command>
+bunx @lythos/skill-arena@0.13.2 <command>
 ```
 ## Quick Start
 ```bash
-# Single: test a deck with one agent
-bunx @lythos/skill-arena@0.13.0 single \
+# Single: test a deck with one agent (most common)
+bunx @lythos/skill-arena@latest single \
+  --deck ./examples/decks/scout.toml \
+  --brief "Generate auth flow diagram" \
+  --player kimi \
+  --timeout 300000 \
+  --out ./output
+# Single with remote deck (URL auto-fetched)
+bunx @lythos/skill-arena@latest single \
   --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \
-  --brief "Generate auth flow diagram"
+  --brief "Generate auth flow diagram" \
+  --out ./output
 # Vs: compare multiple decks side by side
 curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/research-compare/arena.toml > arena.toml
-bunx @lythos/skill-arena@0.13.0 vs --config ./arena.toml
+bunx @lythos/skill-arena@latest vs --config ./arena.toml
 ```
+**Default behavior:**
+- Agent runs in an isolated `/tmp` workdir (no workspace pollution)
+- All artifacts are copied to `--out` after completion
+- Prompt template injects fixed contract (decision-log, robustness, tool preference) + your brief as variable
 ## Commands
 ### Declarative mode (k8s-style, recommended)
 ```bash
 # Print execution plan without running
-bunx @lythos/skill-arena@0.13.0 vs --config arena.toml --dry-run
+bunx @lythos/skill-arena@0.13.2 vs --config arena.toml --dry-run
 # Execute with per-side runs_per_side and statistical aggregation
-bunx @lythos/skill-arena@0.13.0 vs --config arena.toml
+bunx @lythos/skill-arena@0.13.2 vs --config arena.toml
 ```
 ### Scaffold mode (legacy, manual execution)
 ```
-bunx @lythos/skill-arena@0.13.0 scaffold --task "Generate auth flow diagram" \
+bunx @lythos/skill-arena@0.13.2 scaffold --task "Generate auth flow diagram" \
   --decks https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml,https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/documents.toml
 ```
 ### Viz
 ```bash
-bunx @lythos/skill-arena@0.13.0 viz runs/arena-<id>/
+bunx @lythos/skill-arena@0.13.2 viz runs/arena-<id>/
 ```
 ## Skill Documentation
@@ -101,7 +115,7 @@ The agent-visible **Skill** layer documentation is here:
 Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
 ```
-Starter (this package) → npm publish → bunx @lythos/skill-arena@0.13.0 ...
+Starter (this package) → npm publish → bunx @lythos/skill-arena@0.13.2 ...
 Skill   (packages/<name>/skill/)     → build → SKILL.md + thin scripts
 Output  (skills/<name>/)             → git commit → agent-visible skill
 ```

package/package.json (changed)

@@ -1,6 +1,6 @@
 {
   "name": "@lythos/skill-arena",
-  "version": "0.13.0",
+  "version": "0.13.2",
   "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
   "keywords": [
     "ai-agent",
@@ -42,13 +42,13 @@
     "bun": ">=1.0.0"
   },
   "dependencies": {
-    "@lythos/cold-pool": "^0.13.0",
-    "@lythos/infra": "^0.13.0",
-    "@lythos/test-utils": "^0.13.0",
+    "@lythos/cold-pool": "^0.13.2",
+    "@lythos/infra": "^0.13.2",
+    "@lythos/test-utils": "^0.13.2",
     "zod": "^3.24.0",
     "zod-to-json-schema": "^3.25.2"
   },
   "optionalDependencies": {
-    "@lythos/agent-adapter-claude-sdk": "^0.13.0"
+    "@lythos/agent-adapter-claude-sdk": "^0.13.2"
   }
 }

package/src/cli.ts (changed)

@@ -1,8 +1,9 @@
+#!/usr/bin/env bun
 import { writeFileSync, readFileSync, mkdirSync, existsSync, realpathSync } from 'node:fs'
 import { join, resolve } from 'node:path'
-import { homedir } from 'node:os'
+import { homedir, tmpdir } from 'node:os'
 import { ZodError } from 'zod'
-import { formatPlanOutput, type ArenaResult } from './runner'
+import { formatPlanOutput, type ArenaResult, buildArenaPrompt } from './runner'
 import { parseArenaToml, buildExecutionPlan } from './arena-toml'
 import { buildCopyPlan, parseDeckSkills } from './preflight'
 import { checkSkillExistence, formatSkillWarnings, resolveColdPoolDir } from './preflight'
@@ -263,9 +264,17 @@ async function singleRun(args: string[]) {
   else console.log(`📋 brief: ${opts.brief!.slice(0, 60)}...`)
   // Setup workdir
-  const agentWorkdir = join(process.cwd(), `arena-single-${Date.now()}`)
+  const agentWorkdir = join(tmpdir(), `arena-single-${Date.now()}`)
   mkdirSync(agentWorkdir, { recursive: true })
   writeFileSync(join(agentWorkdir, 'skill-deck.toml'), readFileSync(deckPath, 'utf-8'))
+  writeFileSync(join(agentWorkdir, 'AGENTS.md'), [
+    '# Arena Test Environment',
+    `**Mode**: single`,
+    '## How This Works',
+    '- Isolated arena test directory. Skills in skill-deck.toml, linked via deck link.',
+    '- Complete the task using available skills. Output to this directory.',
+    '- MANDATORY: write decision-log.jsonl (see prompt for schema).',
+  ].join('\n'))
   const deckRaw = readFileSync(join(agentWorkdir, 'skill-deck.toml'), 'utf-8')
   let deckParsed: Record<string, any> = {}
@@ -306,10 +315,16 @@ async function singleRun(args: string[]) {
     console.warn('⚠️  Could not check skill existence:', e instanceof Error ? e.message : e)
   }
-  // Direct agent.spawn — natural-language task text, no parsing
+  // Template injection: brief is the {task} variable, template carries fixed contract
+  const fullPrompt = buildArenaPrompt({
+    brief: taskText,
+    cwd: agentWorkdir,
+    deckPath: deckPath,
+    outputDir: agentWorkdir,
+  })
   const agentResult = await agent.spawn({
     cwd: agentWorkdir,
-    brief: taskText,
+    brief: fullPrompt,
     timeoutMs: Number(opts.timeout ?? 120000),
   })
@@ -320,7 +335,7 @@ async function singleRun(args: string[]) {
   // Copy agent-produced files to outDir
   const { cpSync, readdirSync, existsSync: es3 } = await import('node:fs')
   if (es3(agentWorkdir)) {
-    const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock'])
+    const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock', 'AGENTS.md'])
     try {
       const entries = readdirSync(agentWorkdir)
       const plan = buildCopyPlan(agentWorkdir, outDir, entries, skipSet)

package/src/runner.ts (changed)

@@ -52,6 +52,47 @@ function resolveJudgeText(toml: ArenaToml, configDir?: string): string | null {
   return null
 }
+// ── Prompt template (IoC: brief = variable, template = fixed contract) ────
+export function buildArenaPrompt(opts: {
+  brief: string
+  cwd: string
+  deckPath: string
+  outputDir?: string
+  preflightReport?: string
+}): string {
+  const out = opts.outputDir ?? opts.cwd
+  const lines = [
+    'You are running an arena evaluation cell.',
+    '',
+    `CWD: ${opts.cwd}`,
+    `Deck: ${opts.deckPath}`,
+    `Produce output to: ${out}/`,
+    '',
+    'MANDATORY — write decision-log.jsonl to the output directory.',
+    'Each line is one JSON object with: t (seconds elapsed),',
+    'phase (setup/content/design/output), decision (what you chose),',
+    'reason (why). This is your decision trail — the only way the',
+    'orchestrator can understand your reasoning chain.',
+    '',
+    'Example:',
+    '{"t":0,"phase":"setup","decision":"selected Golden Hour palette","reason":"warm tones match baking theme"}',
+    '{"t":12,"phase":"content","decision":"6 science topics","reason":"requires chemistry depth"}',
+    '',
+    'ROBUSTNESS — If any command or script fails, read the error output, fix the issue, and retry.',
+    'Do not stop on the first error. Ensure all required output files exist before finishing.',
+    '',
+    'TOOLS — Use the skills already linked in .claude/skills/ (check with `ls .claude/skills/`).',
+    'They are available and tested. Only write alternative scripts if the linked skills explicitly',
+    'cannot handle the task.',
+  ]
+  if (opts.preflightReport) {
+    lines.push('', 'Preflight:', opts.preflightReport)
+  }
+  lines.push('', 'TASK:', opts.brief)
+  return lines.join('\n')
+}
 // ── Plan formatting ───────────────────────────────────────────────────────
 export function formatPlanOutput(plan: ExecutionPlan): string[] {
@@ -140,10 +181,10 @@ export async function runArenaFromToml(opts: {
       writeFileSync(join(workDir, 'AGENTS.md'), [
         '# Arena Test Environment',
         `**Side**: ${cell.side}`, `**Player**: ${cell.player}`, `**Run**: ${cell.run}`,
-        '## Task', '', taskText,
         '## How This Works',
         '- Isolated arena test directory. Skills in skill-deck.toml, linked via deck link.',
         '- Complete the task using available skills. Output to this directory.',
+        '- MANDATORY: write decision-log.jsonl (see prompt for schema).',
       ].join('\n'))
       const linkProc = Bun.spawn(
         ['bunx', '@lythos/skill-deck', 'link'],
@@ -156,9 +197,15 @@ export async function runArenaFromToml(opts: {
       // Direct agent.spawn (no parseAgentMd, no AgentScenario)
       const agent = useAgent(resolvePlayer(cell.player))
+      const fullPrompt = buildArenaPrompt({
+        brief: taskText,
+        cwd: workDir,
+        deckPath: cell.deck,
+        outputDir: workDir,
+      })
       const agentResult = await agent.spawn({
         cwd: workDir,
-        brief: taskText,
+        brief: fullPrompt,
         timeoutMs: 300000,
       })