npm - @lythos/skill-arena - Versions diffs - 0.9.3 → 0.9.6 - Mend

@lythos/skill-arena 0.9.3 → 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/README.md CHANGED Viewed

@@ -1,6 +1,8 @@
 # @lythos/skill-arena
-> Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis.
+![CI](https://img.shields.io/badge/CI-41%20unit%20tests-brightgreen) ![Intent/Plan](https://img.shields.io/badge/arch-intent%2Fplan%2Fexecute-8A2BE2)
+> Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis. Now with declarative `arena.toml` (k8s-manifest style) and deterministic Pareto frontier.
 ## Why
@@ -40,25 +42,36 @@ bunx @lythos/skill-arena viz tmp/arena-<id>/
 ## Commands
+### Declarative mode (k8s-style, recommended)
+```bash
+# Print execution plan without running
+bunx @lythos/skill-arena run --config arena.toml --dry-run
+# Execute with per-side runs_per_side and statistical aggregation
+bunx @lythos/skill-arena run --config arena.toml
+```
+### CLI-flag mode (backward compat)
+```
+bunx @lythos/skill-arena run \
+  --task ./TASK-arena.md \
+  --players ./players/claude.toml \
+  --decks ./decks/run-01.toml,./decks/run-02.toml \
+  --criteria coverage,relevance,actionability,depth
 ```
-Usage: bunx @lythos/skill-arena <options> | bunx @lythos/skill-arena viz <dir>
-Mode 1 — Single-Skill Comparison:
-  --task, -t <desc>       Task description (required)
-  --skills, -s <list>     Comma-separated skills, 2–5 (Mode 1)
-  --criteria, -c <list>   Evaluation dimensions (default: syntax,context,logic,token)
-  --control <skill>      Control skill (default: lythoskill-project-scribe)
+### Scaffold mode (legacy, manual execution)
-Mode 2 — Full-Deck Comparison:
-  --decks <paths>        Comma-separated deck toml paths, 2–5 (Mode 2)
-  --criteria, -c <list>   Evaluation dimensions
+```
+bunx @lythos/skill-arena scaffold --task "..." --skills a,b
+```
-Common:
-  --dir, -d <path>       Arena parent directory (default: tmp)
-  --project, -p <path>   Project root (default: .)
+### Viz
-Viz:
-  viz <dir>               Render ASCII charts from report.md
+```bash
+bunx @lythos/skill-arena viz runs/arena-<id>/
 ```
 ## Skill Documentation
@@ -77,6 +90,31 @@ Skill   (packages/<name>/skill/)     → build → SKILL.md + thin scripts
 Output  (skills/<name>/)             → git commit → agent-visible skill
 ```
+### Runtime architecture (intent/plan/execute)
+```
+arena.toml  →  ArenaToml (Zod)  →  ExecutionPlan (pure)  →  per-cell agent spawn (IO)
+                                    ↓
+                aggregateAllStats (pure)  ←  verdicts[]
+                                    ↓
+                runComparativeJudge (IO)  →  report.md + Pareto frontier
+```
+- **Intent**: `arena.toml` declarative config (k8s-manifest style)
+- **Plan**: `buildExecutionPlan()`, `aggregateSideStats()`, `computePareto()` — pure functions
+- **Execute**: `runAgentScenario` per cell, `runComparativeJudge` — IO via `AgentAdapter`
+Built on `@lythos/test-utils` shared infrastructure.
+## Test Coverage
+| Layer | Count | CI | Notes |
+|-------|-------|----|-------|
+| Unit tests | 41 | ✅ | TOML parser, player resolution, Pareto, stats |
+| Agent BDD | — | ❌ | Requires `claude` CLI; run locally |
+Pareto frontier is a **deterministic algorithm** — never delegated to LLM. 8 unit tests cover dominance, cross-dominance, transitive chains, partial criteria, and empty scores.
 ## License
 MIT

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lythos/skill-arena",
-  "version": "0.9.3",
+  "version": "0.9.6",
   "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
   "keywords": [
     "ai-agent",

package/src/arena-toml.test.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import { describe, test, expect } from 'bun:test'
 import { parseArenaToml, buildExecutionPlan, ArenaToml } from './arena-toml'
+import { formatPlanOutput } from './runner'
 const minimalToml = `
 [arena]
@@ -150,6 +151,32 @@ describe('buildExecutionPlan', () => {
     expect(baselineCells.every(c => c.control)).toBe(true)
   })
+  test('dry-run output format matches expected log', () => {
+    const toml = parseArenaToml(minimalToml)
+    const plan = buildExecutionPlan(toml)
+    // Simulate what --dry-run would log
+    const logs: string[] = []
+    for (const line of formatPlanOutput(plan)) {
+      logs.push(line)
+    }
+    expect(logs.some(l => l.includes('2 cells'))).toBe(true)
+    expect(logs.some(l => l.includes('runner-a'))).toBe(true)
+    expect(logs.some(l => l.includes('runner-b'))).toBe(true)
+    expect(logs.some(l => l.includes('claude-code'))).toBe(true)
+    expect(logs.every(l => !l.includes('control'))).toBe(true) // no control flags in minimal
+  })
+  test('dry-run output shows control flag for control sides', () => {
+    const toml = parseArenaToml(fullToml)
+    const plan = buildExecutionPlan(toml)
+    const lines = formatPlanOutput(plan)
+    const baselineLines = lines.filter(l => l.includes('baseline'))
+    // All baseline cells should have [control] flag
+    expect(baselineLines.every(l => l.includes('[control]'))).toBe(true)
+  })
   test('dry-run: plan is pure data, no side effects', () => {
     // The entire plan generation is a pure function — dry-run is just printing it
     const toml = parseArenaToml(fullToml)

package/src/cli.ts CHANGED Viewed

@@ -80,6 +80,9 @@ function parseArgs(argv: string[]) {
     control: 'lythoskill-project-scribe',
     dir: 'tmp',
     project: '.',
+    config: undefined,
+    out: undefined,
+    players: undefined,
   }
   const positionals: string[] = []
@@ -99,6 +102,12 @@ function parseArgs(argv: string[]) {
       options.dir = argv[++i]
     } else if (arg === '--project' || arg === '-p') {
       options.project = argv[++i]
+    } else if (arg === '--config') {
+      options.config = argv[++i]
+    } else if (arg === '--out') {
+      options.out = argv[++i]
+    } else if (arg === '--players') {
+      options.players = argv[++i]
     } else if (!arg.startsWith('-')) {
       positionals.push(arg)
     }

package/src/runner.ts CHANGED Viewed

@@ -5,7 +5,7 @@ import { useAgent } from '@lythos/test-utils/agents'
 import { ArenaManifest, Player } from '@lythos/test-utils/schema'
 import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
 import { runComparativeJudge } from './comparative-judge'
-import { parseArenaToml, buildExecutionPlan, type ArenaToml } from './arena-toml'
+import { parseArenaToml, buildExecutionPlan, type ArenaToml, type ExecutionPlan } from './arena-toml'
 import { resolvePlayer, resolveSides } from './player'
 import { aggregateAllStats } from './stats'
 import type { SideStats } from './stats'
@@ -26,18 +26,33 @@ export interface ArenaResult {
   artifactsDir: string
 }
+/** Format an execution plan as readable CLI output (pure). */
+export function formatPlanOutput(plan: ExecutionPlan): string[] {
+  const lines: string[] = []
+  const sideCount = new Set(plan.cells.map(c => c.side)).size
+  lines.push(`\n📋 Dry-run: ${plan.total_runs} cells across ${sideCount} sides × ${plan.cells.length / Math.max(1, sideCount)} runs`)
+  for (const cell of plan.cells) {
+    lines.push(`   ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${cell.control ? ' [control]' : ''}`)
+  }
+  return lines
+}
 export async function runArenaFromToml(opts: {
   toml: ArenaToml
   taskPath: string
   outDir?: string
   dryRun?: boolean
+  log?: (msg: string) => void
 }): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
-  const { toml, taskPath, outDir, dryRun } = opts
+  const { toml, taskPath, outDir, dryRun, log } = opts
   const plan = buildExecutionPlan(toml)
   // dry-run: return plan without executing
   if (dryRun) {
+    for (const line of formatPlanOutput(plan)) {
+      log?.(line)
+    }
     return { plan }
   }