npm - @lythos/skill-arena - Versions diffs - 0.9.38 → 0.9.39 - Mend

@lythos/skill-arena 0.9.38 → 0.9.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/README.md CHANGED Viewed

@@ -49,26 +49,24 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
 ```bash
 bun add -d @lythos/skill-arena
 # or use directly
-bunx @lythos/skill-arena@0.9.38 <command>
+bunx @lythos/skill-arena@0.9.39 <command>
 ```
 ## Quick Start
 ```bash
-# Mode 1: Compare two skills on the same task
-bunx @lythos/skill-arena@0.9.38 \
-  --task "Generate auth flow diagram" \
-  --skills "design-doc-mermaid,mermaid-tools" \
-  --criteria "syntax,context,token"
+# Mode 1: Compare two decks on the same task (declarative)
+bunx @lythos/skill-arena@0.9.39 run \
+  --config examples/arena/research-compare/arena.toml
-# Mode 2: Compare full deck configurations
-bunx @lythos/skill-arena@0.9.38 \
+# Mode 2: Compare full deck configurations via CLI flags
+bunx @lythos/skill-arena@0.9.39 run \
   --task "Generate auth flow diagram" \
   --decks "./decks/minimal.toml,./decks/rich.toml" \
   --criteria "quality,token,maintainability"
 # Visualize results
-bunx @lythos/skill-arena@0.9.38 viz tmp/arena-<id>/
+bunx @lythos/skill-arena@0.9.39 viz tmp/arena-<id>/
 ```
 ## Commands
@@ -77,16 +75,16 @@ bunx @lythos/skill-arena@0.9.38 viz tmp/arena-<id>/
 ```bash
 # Print execution plan without running
-bunx @lythos/skill-arena@0.9.38 run --config arena.toml --dry-run
+bunx @lythos/skill-arena@0.9.39 run --config arena.toml --dry-run
 # Execute with per-side runs_per_side and statistical aggregation
-bunx @lythos/skill-arena@0.9.38 run --config arena.toml
+bunx @lythos/skill-arena@0.9.39 run --config arena.toml
 ```
 ### CLI-flag mode (backward compat)
 ```
-bunx @lythos/skill-arena@0.9.38 run \
+bunx @lythos/skill-arena@0.9.39 run \
   --task ./TASK-arena.md \
   --players ./players/claude.toml \
   --decks ./decks/run-01.toml,./decks/run-02.toml \
@@ -96,13 +94,13 @@ bunx @lythos/skill-arena@0.9.38 run \
 ### Scaffold mode (legacy, manual execution)
 ```
-bunx @lythos/skill-arena@0.9.38 scaffold --task "..." --skills a,b
+bunx @lythos/skill-arena@0.9.39 scaffold --task "..." --decks a.toml,b.toml
 ```
 ### Viz
 ```bash
-bunx @lythos/skill-arena@0.9.38 viz runs/arena-<id>/
+bunx @lythos/skill-arena@0.9.39 viz runs/arena-<id>/
 ```
 ## Skill Documentation
@@ -116,7 +114,7 @@ The agent-visible **Skill** layer documentation is here:
 Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
 ```
-Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.38 ...
+Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.39 ...
 Skill   (packages/<name>/skill/)     → build → SKILL.md + thin scripts
 Output  (skills/<name>/)             → git commit → agent-visible skill
 ```
@@ -137,15 +135,6 @@ arena.toml  →  ArenaToml (Zod)  →  ExecutionPlan (pure)  →  per-cell agent
 Built on `@lythos/test-utils` shared infrastructure.
-## Test Coverage
-| Layer | Count | CI | Notes |
-|-------|-------|----|-------|
-| Unit tests | 41 | ✅ | TOML parser, player resolution, Pareto, stats |
-| Agent BDD | — | ❌ | Requires `claude` CLI; run locally |
-Pareto frontier is a **deterministic algorithm** — never delegated to LLM. 8 unit tests cover dominance, cross-dominance, transitive chains, partial criteria, and empty scores.
 ## License
 MIT

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lythos/skill-arena",
-  "version": "0.9.38",
+  "version": "0.9.39",
   "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
   "keywords": [
     "ai-agent",

package/src/cli.ts CHANGED Viewed

@@ -40,7 +40,6 @@ Usage:
   lythoskill-arena agent-run --task <path> --deck <path> [--player kimi] [--out <dir>] [--timeout <ms>]
   lythoskill-arena agent-run --brief "<prompt>" --deck <path> [--out <dir>] [--timeout <ms>]
   lythoskill-arena run --task <path> --players <A.toml,B.toml> --decks <A.toml,B.toml> --criteria <c1,c2,...> [--out <dir>]
-  lythoskill-arena scaffold --task "<description>" --skills <skill1,skill2,...>
   lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
   lythoskill-arena viz <arena-dir>
@@ -51,13 +50,11 @@ Commands:
 Options:
   -t, --task <path|desc> Task description or path to TASK-arena.md
-  -s, --skills <list>    Comma-separated skill names (scaffold only)
       --decks <list>     Comma-separated deck paths
   -c, --criteria <list>  Evaluation criteria (default: syntax,context,logic,token)
       --players <list>   Comma-separated player.toml paths (CLI run only)
       --config <path>    Path to arena.toml (declarative mode, k8s-style)
       --dry-run          Print execution plan without running (with --config)
-      --control <skill>  Control skill for comparison (scaffold only)
       --out <dir>        Output directory (run: defaults to runs/arena-<id>)
   -d, --dir <dir>        Output directory (scaffold: defaults to tmp)
   -p, --project <dir>    Project directory (default: .)
@@ -75,7 +72,7 @@ Examples:
   lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
   # Legacy scaffolding
-  lythoskill-arena scaffold --task "Refactor auth module" --skills skill-a,skill-b
+  lythoskill-arena scaffold --task "Refactor auth module" --decks ./decks/a.toml,./decks/b.toml
   lythoskill-arena viz runs/arena-20260504
 `)
 }
@@ -268,10 +265,8 @@ function parseArgs(argv: string[]) {
   const options: Record<string, string | undefined> = {
     task: undefined,
-    skills: undefined,
     decks: undefined,
     criteria: 'syntax,context,logic,token',
-    control: 'lythoskill-project-scribe',
     dir: 'tmp',
     project: '.',
     config: undefined,
@@ -284,13 +279,10 @@ function parseArgs(argv: string[]) {
     const arg = argv[i]
     if (arg === '--task' || arg === '-t') {
       options.task = argv[++i]
-    } else if (arg === '--skills' || arg === '-s') {
-      options.skills = argv[++i]
     } else if (arg === '--decks') {
       options.decks = argv[++i]
     } else if (arg === '--criteria' || arg === '-c') {
       options.criteria = argv[++i]
-    } else if (arg === '--control') {
       options.control = argv[++i]
     } else if (arg === '--dir' || arg === '-d') {
       options.dir = argv[++i]
@@ -319,39 +311,13 @@ export function runArena(argv: string[]) {
     process.exit(1)
   }
-  const HAS_DECKS = !!options.decks
-  const HAS_SKILLS = !!options.skills
+  const DECK_PATHS = (options.decks || '').split(',').map(s => s.trim()).filter(Boolean)
-  if (!HAS_DECKS && !HAS_SKILLS) {
-    console.error('❌ 请提供 --skills 或 --decks')
-    process.exit(1)
-  }
-  if (HAS_DECKS && HAS_SKILLS) {
-    console.error('❌ --skills 和 --decks 不能同时使用')
-    process.exit(1)
-  }
-  const DECK_PATHS = HAS_DECKS
-    ? (options.decks || '').split(',').map(s => s.trim()).filter(Boolean)
-    : []
-  const SKILLS = HAS_SKILLS
-    ? (options.skills || '').split(',').map(s => s.trim()).filter(Boolean)
-    : []
-  if (HAS_SKILLS && SKILLS.length < 2) {
-    console.error('❌ 至少需要 2 个 skill 才能进行 arena')
-    process.exit(1)
-  }
-  if (HAS_SKILLS && SKILLS.length > 5) {
-    console.error('❌ 一次 arena 最多 5 个 skill')
-    process.exit(1)
-  }
-  if (HAS_DECKS && DECK_PATHS.length < 2) {
+  if (DECK_PATHS.length < 2) {
     console.error('❌ 至少需要 2 个 deck 才能进行 arena')
     process.exit(1)
   }
-  if (HAS_DECKS && DECK_PATHS.length > 5) {
+  if (DECK_PATHS.length > 5) {
     console.error('❌ 一次 arena 最多 5 个 deck')
     process.exit(1)
   }
@@ -359,9 +325,6 @@ export function runArena(argv: string[]) {
   const CRITERIA = (options.criteria || 'syntax,context,logic,token')
     .split(',').map(s => s.trim()).filter(Boolean)
-  const CONTROL_SKILLS = (options.control || 'lythoskill-project-scribe')
-    .split(',').map(s => s.trim()).filter(Boolean)
   const PROJECT_DIR = resolve(options.project!)
   const ARENA_SLUG = slugify(TASK)
   const ARENA_ID = `arena-${timestamp()}-${ARENA_SLUG.slice(0, 30)}`
@@ -373,37 +336,20 @@ export function runArena(argv: string[]) {
   mkdirSync(join(ARENA_DIR, 'sides'), { recursive: true })
   // ── 生成参与者与 deck ───────────────────────────────────────
-  let participants: { id: string; name: string; skill_name: string; deck_path: string }[]
-  let mode: 'single-skill' | 'full-deck'
-  if (HAS_DECKS) {
-    mode = 'full-deck'
-    participants = DECK_PATHS.map((deckPath, i) => {
-      const id = `run-${String(i + 1).padStart(2, '0')}`
-      const name = basename(deckPath).replace(/\.toml$/, '')
-      const destPath = join(ARENA_DIR, 'decks', `arena-${id}.toml`)
-      // Copy the provided deck to arena directory
-      if (existsSync(deckPath)) {
-        const content = readFileSync(deckPath, 'utf-8')
-        writeFileSync(destPath, content)
-      } else {
-        console.error(`❌ Deck 文件不存在: ${deckPath}`)
-        process.exit(1)
-      }
-      return { id, name, skill_name: name, deck_path: destPath }
-    })
-  } else {
-    mode = 'single-skill'
-    participants = SKILLS.map((skill, i) => {
-      const id = `run-${String(i + 1).padStart(2, '0')}`
-      return {
-        id,
-        name: skill,
-        skill_name: skill,
-        deck_path: join(ARENA_DIR, 'decks', `arena-${id}.toml`),
-      }
-    })
-  }
+  const participants = DECK_PATHS.map((deckPath, i) => {
+    const id = `run-${String(i + 1).padStart(2, '0')}`
+    const name = basename(deckPath).replace(/\.toml$/, '')
+    const destPath = join(ARENA_DIR, 'decks', `arena-${id}.toml`)
+    // Copy the provided deck to arena directory
+    if (existsSync(deckPath)) {
+      const content = readFileSync(deckPath, 'utf-8')
+      writeFileSync(destPath, content)
+    } else {
+      console.error(`❌ Deck 文件不存在: ${deckPath}`)
+      process.exit(1)
+    }
+    return { id, name, skill_name: name, deck_path: destPath }
+  })
   const criteria = CRITERIA.map((c) => ({
     name: c,
@@ -411,29 +357,6 @@ export function runArena(argv: string[]) {
     weight: 1,
   }))
-  if (mode === 'single-skill') {
-    for (const p of participants) {
-      const deckContent = `# ============================================================
-# Arena Deck: ${p.id} — ${p.name}
-# ============================================================
-# 变量：${p.name}
-# 控制变量：${CONTROL_SKILLS.join(', ')}
-# ============================================================
-[deck]
-working_set = ".claude/skills"
-cold_pool   = "~/.agents/skill-repos"
-max_cards   = 10
-[tool]
-skills = [
-${[...new Set([p.skill_name, ...CONTROL_SKILLS])].map(s => `  "${s}",`).join('\n')}
-]
-`
-      writeFileSync(p.deck_path, deckContent)
-    }
-  }
   // ── 为每个 side 创建隔离工作空间 ────────────────────────────
   for (const p of participants) {
     const sideDir = join(ARENA_DIR, 'sides', p.id)
@@ -481,14 +404,11 @@ ${criteria.map(c => `  - ${c.label}`).join('\n')}
 arena_decks:
 ${participants.map(p => `  - ${p.deck_path.replace(PROJECT_DIR, '.')}`).join('\n')}
 judge_persona: |
-  ${mode === 'full-deck'
-    ? `你是一个多目标优化分析师。不要选 Winner。
-  对每个 deck 配置，按 evaluation_criteria 输出评分向量（1-5 分）。
-  识别 Pareto 非支配解集——没有"最强"，只有"在不同维度上的最优权衡"。
-  对被支配的解，说明它被谁支配、在哪个维度上劣势。
-  如果发现任何涌现 combo（多个 skill 组合产生 1+1>2 的效果），单独标注。`
-    : `你是一个中立的技能评测员。对比所有 subagent 的输出，
-  按 evaluation_criteria 给出 1-5 分评分，最终给出 Winner 和选型建议。`}
+    你是一个多目标优化分析师。不要选 Winner。
+    对每个 deck 配置，按 evaluation_criteria 输出评分向量（1-5 分）。
+    识别 Pareto 非支配解集——没有"最强"，只有"在不同维度上的最优权衡"。
+    对被支配的解，说明它被谁支配、在哪个维度上劣势。
+    如果发现任何涌现 combo（多个 skill 组合产生 1+1>2 的效果），单独标注。
 acceptance:
 ${participants.map(p => `  - Subagent ${p.id} 在 sides/${p.id}/ 隔离环境完成任务并写入 runs/${p.id}.md`).join('\n')}
   - Judge 读取所有 run 文件并生成 report.md
@@ -527,9 +447,9 @@ cd "${ARENA_DIR}"
 ID:        ${ARENA_ID}
 任务:      ${TASK}
 目录:      ${ARENA_DIR}
-模式:      ${mode === 'full-deck' ? '完整 deck 配置对比' : '单 skill 对比'}
+模式:      deck 配置对比
 参与者:    ${participants.map(p => p.name).join(', ')}
-${mode === 'single-skill' ? `控制变量:  ${CONTROL_SKILLS.join(', ')}\n` : ''}评测维度:  ${CRITERIA.join(', ')}
+评测维度:  ${CRITERIA.join(', ')}
 生成文件:
   📋 ${join(ARENA_DIR, 'arena.json')}