npm - @lythos/skill-arena - Versions diffs - 0.9.1 → 0.9.2 - Mend

@lythos/skill-arena 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/package.json +5 -1
package/src/cli.ts +51 -10
package/src/comparative-judge.test.ts +92 -0
package/src/comparative-judge.ts +166 -0
package/src/runner.ts +187 -0

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@lythos/skill-arena",
-  "version": "0.9.1",
+  "version": "0.9.2",
   "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
   "keywords": [
     "ai-agent",
@@ -35,5 +35,9 @@
   "homepage": "https://github.com/lythos-labs/lythoskill/tree/main/packages/lythoskill-arena#readme",
   "engines": {
     "bun": ">=1.0.0"
+  },
+  "dependencies": {
+    "@lythos/test-utils": "^0.9.1",
+    "zod-to-json-schema": "^3.25.2"
   }
 }

package/src/cli.ts CHANGED

@@ -29,23 +29,31 @@ function printHelp(): void {
   console.log(`🎭 lythoskill-arena — Skill comparison runner
 Usage:
-  lythoskill-arena --task "<task description>" --skills <skill1,skill2,...>
-  lythoskill-arena --task "<task description>" --decks <deck1,deck2,...>
+  lythoskill-arena run --task <path> --players <A.toml,B.toml> --decks <A.toml,B.toml> --criteria <c1,c2,...> [--out <dir>]
+  lythoskill-arena scaffold --task "<description>" --skills <skill1,skill2,...>
+  lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
   lythoskill-arena viz <arena-dir>
+Commands:
+  run       Run arena programmatically (cartesian player × deck → judge → report)
+  scaffold  Create arena directory structure (legacy, manual subagent execution)
+  viz       Visualize arena report (ASCII charts)
 Options:
-  -t, --task <desc>      Task description (required)
-  -s, --skills <list>    Comma-separated skill names
+  -t, --task <path|desc> Task description or path to TASK-arena.md
+  -s, --skills <list>    Comma-separated skill names (scaffold only)
       --decks <list>     Comma-separated deck paths
   -c, --criteria <list>  Evaluation criteria (default: syntax,context,logic,token)
-      --control <skill>  Control skill for comparison (default: lythoskill-project-scribe)
-  -d, --dir <dir>        Output directory (default: tmp)
+      --players <list>   Comma-separated player.toml paths (run only)
+      --control <skill>  Control skill for comparison (scaffold only)
+      --out <dir>        Output directory (run: defaults to runs/arena-<id>)
+  -d, --dir <dir>        Output directory (scaffold: defaults to tmp)
   -p, --project <dir>    Project directory (default: .)
 Examples:
-  lythoskill-arena --task "Refactor auth module" --skills skill-a,skill-b
-  lythoskill-arena --task "Write tests" --decks ./decks/minimal.toml,./decks/full.toml
-  lythoskill-arena viz tmp/arena-20260430
+  lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml,./players/kimi.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
+  lythoskill-arena scaffold --task "Refactor auth module" --skills skill-a,skill-b
+  lythoskill-arena viz runs/arena-20260504
 `)
 }
@@ -551,6 +559,32 @@ function runViz(argv: string[]) {
   console.log(renderRadarChart(report))
 }
+// ── Run: programmatic arena execution ───────────────────────
+async function runProgrammaticArena(argv: string[]) {
+  const { options } = parseArgs(argv)
+  if (!options.task || !options.decks) {
+    console.error('❌ --task <path> and --decks <list> are required for "run"')
+    process.exit(1)
+  }
+  const { runArena: runArenaProgrammatic } = await import('./runner')
+  const result = await runArenaProgrammatic({
+    taskPath: options.task,
+    playerPaths: (options.players ?? 'players/claude-code.toml').split(',').map(s => s.trim()).filter(Boolean),
+    deckPaths: options.decks.split(',').map(s => s.trim()).filter(Boolean),
+    criteria: (options.criteria ?? 'syntax,context,logic,token').split(',').map(s => s.trim()).filter(Boolean),
+    outDir: options.out ?? `runs/arena-${timestamp()}`,
+    projectDir: options.project,
+  })
+  console.log(`\n🎮 Arena complete: ${result.manifest.id}`)
+  console.log(`📁 Artifacts: ${result.artifactsDir}`)
+  console.log(`📊 Report: ${result.artifactsDir}/report.md`)
+}
 // ── Main Entry ───────────────────────────────────────────────
 if (import.meta.main) {
@@ -559,7 +593,14 @@ if (import.meta.main) {
   if (cmd === 'viz') {
     runViz(args.slice(1))
+  } else if (cmd === 'run') {
+    runProgrammaticArena(args.slice(1))
+  } else if (cmd === 'scaffold' || !cmd || args[0]?.startsWith('-')) {
+    // Legacy behavior: if no subcommand or starts with flags, treat as scaffold
+    runArena(cmd === 'scaffold' ? args.slice(1) : args)
   } else {
-    runArena(args)
+    console.error(`❌ Unknown command: ${cmd}`)
+    printHelp()
+    process.exit(1)
   }
 }

package/src/comparative-judge.test.ts ADDED

@@ -0,0 +1,92 @@
+import { describe, test, expect } from 'bun:test'
+import { computePareto } from './comparative-judge'
+describe('computePareto', () => {
+  test('single participant is always non-dominated', () => {
+    const result = computePareto([
+      { participant_id: 'run-01', scores: { a: 5, b: 3 } },
+    ])
+    expect(result).toHaveLength(1)
+    expect(result[0].dominated).toBe(false)
+    expect(result[0].dominated_by).toEqual([])
+  })
+  test('clear dominance: run-01 dominates run-02 on all criteria', () => {
+    const result = computePareto([
+      { participant_id: 'run-01', scores: { coverage: 5, relevance: 5 } },
+      { participant_id: 'run-02', scores: { coverage: 3, relevance: 2 } },
+    ])
+    expect(result[0].dominated).toBe(false)
+    expect(result[1].dominated).toBe(true)
+    expect(result[1].dominated_by).toEqual(['run-01'])
+  })
+  test('equal scores: no one dominates', () => {
+    const result = computePareto([
+      { participant_id: 'run-01', scores: { a: 4, b: 4 } },
+      { participant_id: 'run-02', scores: { a: 4, b: 4 } },
+    ])
+    expect(result[0].dominated).toBe(false)
+    expect(result[1].dominated).toBe(false)
+  })
+  test('cross dominance: each wins on different criteria', () => {
+    const result = computePareto([
+      { participant_id: 'run-01', scores: { speed: 5, accuracy: 2 } },
+      { participant_id: 'run-02', scores: { speed: 2, accuracy: 5 } },
+    ])
+    // Neither dominates: run-01 better on speed but worse on accuracy
+    expect(result[0].dominated).toBe(false)
+    expect(result[1].dominated).toBe(false)
+  })
+  test('multi-participant: transitive dominance chain', () => {
+    const result = computePareto([
+      { participant_id: 'best', scores: { a: 5, b: 5, c: 5 } },
+      { participant_id: 'mid', scores: { a: 4, b: 4, c: 4 } },
+      { participant_id: 'worst', scores: { a: 2, b: 2, c: 2 } },
+    ])
+    // best dominates both, mid dominates worst
+    expect(result[0].dominated).toBe(false) // best
+    expect(result[1].dominated).toBe(true)  // mid (by best)
+    expect(result[1].dominated_by).toEqual(['best'])
+    expect(result[2].dominated).toBe(true)  // worst (by both)
+    expect(result[2].dominated_by.sort()).toEqual(['best', 'mid'].sort())
+  })
+  test('Pareto frontier from playground BDD-research: run-01 dominates run-02', () => {
+    // From playground/arena-bdd-research/report.md:
+    // Run-01: coverage=5, relevance=5, actionability=5, depth=5
+    // Run-02: coverage=3, relevance=2, actionability=2, depth=1
+    const result = computePareto([
+      { participant_id: 'run-01', scores: { coverage: 5, relevance: 5, actionability: 5, depth: 5 } },
+      { participant_id: 'run-02', scores: { coverage: 3, relevance: 2, actionability: 2, depth: 1 } },
+    ])
+    expect(result[0].dominated).toBe(false) // run-01: Pareto-optimal
+    expect(result[1].dominated).toBe(true)  // run-02: dominated by run-01
+    expect(result[1].dominated_by).toEqual(['run-01'])
+  })
+  test('empty scores object', () => {
+    const result = computePareto([
+      { participant_id: 'a', scores: {} },
+      { participant_id: 'b', scores: {} },
+    ])
+    expect(result).toHaveLength(2)
+    expect(result[0].dominated).toBe(false)
+    expect(result[1].dominated).toBe(false)
+  })
+  test('partial criteria overlap', () => {
+    const result = computePareto([
+      { participant_id: 'run-01', scores: { a: 5, b: 3 } },
+      { participant_id: 'run-02', scores: { a: 3, c: 5 } },
+    ])
+    // run-01 has a=5 vs run-02 a=3 (a wins)
+    // run-02 has b=undefined vs run-01 b=3 → treated as 0. So run-01 >= run-02 on all shared crit, > on one.
+    // But c: run-01 has 0, run-02 has 5. So run-02 > run-01 on c.
+    // Cross-dominance → neither dominates
+    expect(result[0].dominated).toBe(false)
+    expect(result[1].dominated).toBe(false)
+  })
+})

package/src/comparative-judge.ts ADDED

@@ -0,0 +1,166 @@
+import { zodToJsonSchema } from 'zod-to-json-schema'
+import { ComparativeReport, ScoreCell, ParetoEntry } from '@lythos/test-utils/schema'
+import type { AgentAdapter } from '@lythos/test-utils/agents'
+import type { ArenaManifest } from '@lythos/test-utils/schema'
+// ── Pareto Frontier (deterministic algorithm) ──────────────────────────────
+export interface ScoreVector {
+  participant_id: string
+  scores: Record<string, number>
+  dominated: boolean
+  dominated_by: string[]
+}
+/**
+ * Compute Pareto frontier from score vectors.
+ * Participant A dominates B if A >= B in all criteria AND A > B in at least one.
+ * This is deterministic — never delegated to LLM.
+ */
+export function computePareto(vectors: { participant_id: string; scores: Record<string, number> }[]): ParetoEntry[] {
+  const result: ParetoEntry[] = vectors.map(v => ({
+    participant_id: v.participant_id,
+    scores: { ...v.scores },
+    dominated: false,
+    dominated_by: [] as string[],
+  }))
+  // Union of all criteria across all participants
+  const allCriteria = [...new Set(vectors.flatMap(v => Object.keys(v.scores)))]
+  if (allCriteria.length === 0) return result
+  for (let i = 0; i < result.length; i++) {
+    for (let j = 0; j < result.length; j++) {
+      if (i === j) continue
+      const a = vectors[i].scores
+      const b = vectors[j].scores
+      const allGe = allCriteria.every(k => (a[k] ?? 0) >= (b[k] ?? 0))
+      const anyGt = allCriteria.some(k => (a[k] ?? 0) > (b[k] ?? 0))
+      if (allGe && anyGt) {
+        // i dominates j
+        result[j].dominated = true
+        if (!result[j].dominated_by.includes(result[i].participant_id)) {
+          result[j].dominated_by.push(result[i].participant_id)
+        }
+      }
+    }
+  }
+  return result
+}
+// ── Comparative Judge Prompt ──────────────────────────────────────────────
+function buildComparativePrompt(opts: {
+  manifest: ArenaManifest
+  verdicts: { participantId: string; verdict: unknown }[]
+}): string {
+  const criteriaDesc = opts.manifest.criteria.join(', ')
+  const participants = opts.manifest.participants
+    .map(p => `- ${p.id}: ${p.name} (${p.description || 'no description'})`)
+    .join('\n')
+  return `You are a comparative judge evaluating ${opts.manifest.participants.length} participants against shared criteria.
+## Task
+${opts.manifest.task}
+## Participants
+${participants}
+## Criteria
+${criteriaDesc}
+## Your Job
+For each participant, score them 1-5 on each criterion. Provide a brief rationale.
+Score meanings: 1=poor, 3=acceptable, 5=excellent.
+Use the submit_scores tool to return your structured evaluation.`
+}
+const SCORE_TOOL = {
+  name: 'submit_scores',
+  description: 'Submit per-participant scores for each criterion with rationales',
+  input_schema: zodToJsonSchema(ComparativeReport.pick({ score_matrix: true, key_findings: true, recommendations: true })) as Record<string, unknown>,
+}
+function toScoreMatrix(
+  manifest: ArenaManifest,
+  scores: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]
+): typeof ScoreCell._output[] {
+  return scores.map(s => ScoreCell.parse(s))
+}
+// ── Comparative Judge ─────────────────────────────────────────────────────
+export async function runComparativeJudge(opts: {
+  manifest: ArenaManifest
+  verdicts: { participantId: string; verdict: unknown }[]
+  judge: AgentAdapter
+  workdir: string
+}): Promise<typeof ComparativeReport._output> {
+  const { manifest, verdicts, judge, workdir } = opts
+  const prompt = buildComparativePrompt({ manifest, verdicts })
+  let raw: string
+  let parsed: unknown
+  if (judge.invokeTool) {
+    parsed = await judge.invokeTool({
+      tool: SCORE_TOOL,
+      prompt,
+      cwd: workdir,
+      timeoutMs: 120000,
+    })
+    raw = JSON.stringify(parsed)
+  } else {
+    const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
+    raw = result.stdout
+    const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
+    const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
+    parsed = JSON.parse(jsonStr)
+  }
+  // Validate LLM output
+  const llmResult = ComparativeReport.pick({
+    score_matrix: true,
+    key_findings: true,
+    recommendations: true,
+  }).parse(parsed)
+  const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
+  // Pareto: deterministic, never delegated to LLM
+  const participantScores = manifest.participants.map(p => {
+    const pScores: Record<string, number> = {}
+    for (const cell of scoreMatrix) {
+      if (cell.participant_id === p.id) {
+        pScores[cell.criterion] = cell.score
+      }
+    }
+    return { participant_id: p.id, scores: pScores }
+  })
+  const pareto = computePareto(participantScores)
+  // Weighted totals (equal weight by default)
+  const weightedTotals: Record<string, number> = {}
+  for (const p of manifest.participants) {
+    const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
+    weightedTotals[p.id] = pCells.reduce((sum, c) => sum + c.score * c.weight, 0) / (pCells.length || 1)
+  }
+  return ComparativeReport.parse({
+    arena_id: manifest.id,
+    generated_at: new Date().toISOString(),
+    score_matrix: scoreMatrix,
+    weighted_totals: weightedTotals,
+    pareto,
+    key_findings: llmResult.key_findings ?? [],
+    recommendations: llmResult.recommendations ?? [],
+  })
+}

package/src/runner.ts ADDED

@@ -0,0 +1,187 @@
+import { mkdirSync, writeFileSync, existsSync, readFileSync } from 'node:fs'
+import { join, resolve } from 'node:path'
+import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
+import { useAgent } from '@lythos/test-utils/agents'
+import { ArenaManifest, Player, type ArenaManifest as ArenaManifestType } from '@lythos/test-utils/schema'
+import { runComparativeJudge } from './comparative-judge'
+// ── Helpers ───────────────────────────────────────────────────────────────
+function stamp(): string {
+  const d = new Date()
+  return `${d.getFullYear()}${String(d.getMonth() + 1).padStart(2, '0')}${String(d.getDate()).padStart(2, '0')}-${String(d.getHours()).padStart(2, '0')}${String(d.getMinutes()).padStart(2, '0')}${String(d.getSeconds()).padStart(2, '0')}`
+}
+function cartesian<T>(arrays: T[][]): T[][] {
+  if (arrays.length === 0) return [[]]
+  const [first, ...rest] = arrays
+  const restProd = cartesian(rest)
+  return first.flatMap(a => restProd.map(r => [a, ...r]))
+}
+function slugify(input: string): string {
+  return input.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '').slice(0, 40)
+}
+// ── Runner ────────────────────────────────────────────────────────────────
+export async function runArena(opts: {
+  taskPath: string
+  playerPaths: string[]
+  deckPaths: string[]
+  criteria: string[]
+  outDir: string
+  projectDir?: string
+}): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
+  const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts
+  // Load players
+  const players = playerPaths.map(p => {
+    const content = readFileSync(resolve(p), 'utf-8')
+    const parsed = Player.parse(JSON.parse(content))
+    return { path: p, ...parsed }
+  })
+  // Load deck labels from deck paths
+  const decks = deckPaths.map(p => ({ path: resolve(p) }))
+  // Build (player × deck) variant matrix
+  const variants = cartesian([players, decks]).map(([player, deck], i) => ({
+    participant_id: `run-${String(i + 1).padStart(2, '0')}`,
+    player,
+    deck_path: deck.path,
+  }))
+  // Build arena manifest
+  const arenaId = `arena-${stamp()}`
+  const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)
+  const manifest = ArenaManifest.parse({
+    id: arenaId,
+    created_at: new Date().toISOString(),
+    task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
+    mode: 'decks',
+    participants: variants.map(v => ({
+      id: v.participant_id,
+      name: v.player.path.split('/').pop()?.replace('.toml', '') ?? v.player.platform,
+      player: v.player.platform,
+      deck: v.deck_path,
+      description: `${v.player.platform} × ${v.deck_path.split('/').pop()?.replace('.toml', '')}`,
+    })),
+    criteria,
+    status: 'running',
+  })
+  mkdirSync(artifactsDir, { recursive: true })
+  writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(manifest, null, 2) + '\n')
+  // Run each variant
+  const verdicts: { participantId: string; verdict: unknown }[] = []
+  for (const variant of variants) {
+    const cellDir = join(artifactsDir, 'runs', variant.participant_id)
+    mkdirSync(cellDir, { recursive: true })
+    try {
+      const result = await runAgentScenario({
+        scenarioPath: resolve(taskPath),
+        agent: useAgent(variant.player.platform),
+        setupWorkdir(_scenario: AgentScenario, workdir: string) {
+          mkdirSync(workdir, { recursive: true })
+          // Write deck.toml as skill-deck.toml
+          const deckContent = readFileSync(variant.deck_path, 'utf-8')
+          writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
+        },
+        baseDir: artifactsDir,
+      })
+      verdicts.push({
+        participantId: variant.participant_id,
+        verdict: result.verdict,
+      })
+    } catch (e) {
+      verdicts.push({
+        participantId: variant.participant_id,
+        verdict: {
+          verdict: 'ERROR' as const,
+          reason: `Runner exception: ${e instanceof Error ? e.message : String(e)}`,
+        },
+      })
+    }
+  }
+  // Run comparative judge
+  const judge = useAgent(players[0]?.platform ?? 'claude')
+  const report = await runComparativeJudge({
+    manifest,
+    verdicts,
+    judge,
+    workdir: artifactsDir,
+  })
+  // Write report
+  writeFileSync(join(artifactsDir, 'report.md'), `# Arena Report: ${manifest.id}
+**Task**: ${manifest.task}
+**Criteria**: ${manifest.criteria.join(', ')}
+**Date**: ${new Date().toISOString()}
+## Score Matrix
+${renderScoreMatrix(report)}
+## Pareto Frontier
+${renderPareto(report)}
+## Key Findings
+${(report.key_findings ?? []).map((f: string) => `- ${f}`).join('\n')}
+## Recommendations
+${(report.recommendations ?? []).map((r: { audience: string; recommendation: string }) => `- **${r.audience}**: ${r.recommendation}`).join('\n')}
+`)
+  // Update manifest status
+  const finalManifest = ArenaManifest.parse({ ...manifest, status: 'completed' })
+  writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(finalManifest, null, 2) + '\n')
+  return { manifest: finalManifest, report, artifactsDir }
+}
+// ── Markdown Renderers ────────────────────────────────────────────────────
+function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[] }): string {
+  if (!report.score_matrix?.length) return 'No scores available.\n'
+  // Build participant × criterion matrix
+  const participants = [...new Set(report.score_matrix.map(s => s.participant_id))]
+  const criteria = [...new Set(report.score_matrix.map(s => s.criterion))]
+  let table = `| Criterion | Weight | ${participants.map(p => `${p}`).join(' | ')} |\n`
+  table += `|${'---|'.repeat(2 + participants.length)}\n`
+  for (const c of criteria) {
+    table += `| ${c} | 25% | ${participants.map(p => {
+      const cell = report.score_matrix!.find(s => s.participant_id === p && s.criterion === c)
+      return `**${cell?.score ?? '?'}**`
+    }).join(' | ')} |\n`
+  }
+  // Weighted totals
+  table += `| **Weighted Total** | 100% | ${participants.map(p => {
+    const pScores = report.score_matrix!.filter(s => s.participant_id === p)
+    const avg = pScores.length ? pScores.reduce((sum, s) => sum + s.score, 0) / pScores.length : 0
+    return `**${avg.toFixed(1)}**`
+  }).join(' | ')} |\n`
+  return table
+}
+function renderPareto(report: unknown & { pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[] }): string {
+  if (!report.pareto?.length) return 'No Pareto analysis.\n'
+  return report.pareto.map((p: { participant_id: string; dominated: boolean; dominated_by: string[] }) => {
+    if (p.dominated) {
+      return `- **${p.participant_id}**: dominated by ${p.dominated_by.join(', ')}`
+    }
+    return `- **${p.participant_id}**: Pareto-optimal (non-dominated)`
+  }).join('\n')
+}