@lythos/skill-arena 0.9.1 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.1",
3
+ "version": "0.9.3",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -35,5 +35,9 @@
35
35
  "homepage": "https://github.com/lythos-labs/lythoskill/tree/main/packages/lythoskill-arena#readme",
36
36
  "engines": {
37
37
  "bun": ">=1.0.0"
38
+ },
39
+ "dependencies": {
40
+ "@lythos/test-utils": "^0.9.1",
41
+ "zod-to-json-schema": "^3.25.2"
38
42
  }
39
43
  }
@@ -0,0 +1,164 @@
1
+ import { describe, test, expect } from 'bun:test'
2
+ import { parseArenaToml, buildExecutionPlan, ArenaToml } from './arena-toml'
3
+
4
+ const minimalToml = `
5
+ [arena]
6
+ task = "Test task"
7
+ criteria = ["a", "b"]
8
+
9
+ [[side]]
10
+ name = "runner-a"
11
+ player = "claude-code"
12
+ deck = "./decks/a.toml"
13
+
14
+ [[side]]
15
+ name = "runner-b"
16
+ player = "claude-code"
17
+ deck = "./decks/b.toml"
18
+ `
19
+
20
+ const fullToml = `
21
+ [arena]
22
+ task = "Generate auth flow diagram"
23
+ criteria = ["syntax", "context", "logic", "token"]
24
+ runs_per_side = 3
25
+
26
+ [[side]]
27
+ name = "minimal"
28
+ player = "standard-coder"
29
+ deck = "./decks/minimal.toml"
30
+
31
+ [[side]]
32
+ name = "rich"
33
+ player = "expert-architect"
34
+ deck = "./decks/rich.toml"
35
+
36
+ [[side]]
37
+ name = "baseline"
38
+ player = "standard-coder"
39
+ deck = "./decks/baseline.toml"
40
+ control = true
41
+
42
+ [side.env]
43
+ container = "node:20-alpine"
44
+ pre_run = ["npm ci", "npm run build"]
45
+ working_dir = "/workspace"
46
+ `
47
+
48
+ // ── Schema + Parser ────────────────────────────────────────────────────────
49
+
50
+ describe('parseArenaToml', () => {
51
+ test('parses minimal two-side arena', () => {
52
+ const result = parseArenaToml(minimalToml)
53
+ expect(result.arena.task).toBe('Test task')
54
+ expect(result.arena.criteria).toEqual(['a', 'b'])
55
+ expect(result.arena.runs_per_side).toBe(1) // default
56
+ expect(result.side).toHaveLength(2)
57
+ expect(result.side[0].name).toBe('runner-a')
58
+ expect(result.side[0].player).toBe('claude-code')
59
+ expect(result.side[0].deck).toBe('./decks/a.toml')
60
+ expect(result.side[0].control).toBe(false) // default
61
+ })
62
+
63
+ test('parses full arena with runs_per_side and control', () => {
64
+ const result = parseArenaToml(fullToml)
65
+ expect(result.arena.runs_per_side).toBe(3)
66
+ expect(result.side).toHaveLength(3)
67
+ expect(result.side[2].name).toBe('baseline')
68
+ expect(result.side[2].control).toBe(true)
69
+ })
70
+
71
+ test('parses side env block', () => {
72
+ const result = parseArenaToml(fullToml)
73
+ const env = result.side[2].env
74
+ expect(env.container).toBe('node:20-alpine')
75
+ expect(env.pre_run).toEqual(['npm ci', 'npm run build'])
76
+ expect(env.working_dir).toBe('/workspace')
77
+ expect(env.env_vars).toEqual({})
78
+ })
79
+
80
+ test('rejects fewer than 2 sides', () => {
81
+ const bad = `[arena]\ntask = "x"\ncriteria = ["a"]\n\n[[side]]\nname = "only"\nplayer = "c"\ndeck = "x.toml"`
82
+ expect(() => parseArenaToml(bad)).toThrow()
83
+ })
84
+
85
+ test('rejects empty criteria', () => {
86
+ const bad = `[arena]\ntask = "x"\ncriteria = []\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
87
+ expect(() => parseArenaToml(bad)).toThrow()
88
+ })
89
+
90
+ test('rejects non-object input', () => {
91
+ expect(() => ArenaToml.parse('not valid')).toThrow()
92
+ })
93
+
94
+ test('rejects missing arena section', () => {
95
+ expect(() => parseArenaToml('[[side]]\nname = "a"')).toThrow()
96
+ })
97
+
98
+ test('rejects runs_per_side = 0', () => {
99
+ const bad = `[arena]\ntask = "x"\ncriteria = ["a"]\nruns_per_side = 0\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
100
+ expect(() => parseArenaToml(bad)).toThrow()
101
+ })
102
+
103
+ test('parses integer and boolean values correctly', () => {
104
+ const toml = `[arena]\ntask = "x"\ncriteria = ["a"]\nruns_per_side = 2\nmax_participants = 5\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
105
+ const result = parseArenaToml(toml)
106
+ expect(result.arena.runs_per_side).toBe(2)
107
+ expect(result.arena.max_participants).toBe(5)
108
+ })
109
+
110
+ test('comments are stripped', () => {
111
+ const toml = `[arena]\n# this is a comment\ntask = "x"\ncriteria = ["a"]\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
112
+ const result = parseArenaToml(toml)
113
+ expect(result.arena.task).toBe('x')
114
+ })
115
+ })
116
+
117
+ // ── Execution Plan ─────────────────────────────────────────────────────────
118
+
119
+ describe('buildExecutionPlan', () => {
120
+ test('generates plan: 2 sides × 1 run = 2 cells', () => {
121
+ const toml = parseArenaToml(minimalToml)
122
+ const plan = buildExecutionPlan(toml)
123
+ expect(plan.task).toBe('Test task')
124
+ expect(plan.criteria).toEqual(['a', 'b'])
125
+ expect(plan.cells).toHaveLength(2)
126
+ expect(plan.total_runs).toBe(2)
127
+ expect(plan.cells[0]).toEqual({ side: 'runner-a', player: 'claude-code', deck: './decks/a.toml', run: 1, control: false })
128
+ expect(plan.cells[1]).toEqual({ side: 'runner-b', player: 'claude-code', deck: './decks/b.toml', run: 1, control: false })
129
+ })
130
+
131
+ test('generates plan: 3 sides × 3 runs = 9 cells', () => {
132
+ const toml = parseArenaToml(fullToml)
133
+ const plan = buildExecutionPlan(toml)
134
+ expect(plan.cells).toHaveLength(9)
135
+ expect(plan.total_runs).toBe(9)
136
+
137
+ // Cells are ordered: side 0 run 1, side 0 run 2, side 0 run 3, side 1 run 1, ...
138
+ expect(plan.cells[0]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 1, control: false })
139
+ expect(plan.cells[1]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 2, control: false })
140
+ expect(plan.cells[2]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 3, control: false })
141
+ expect(plan.cells[3]).toEqual({ side: 'rich', player: 'expert-architect', deck: './decks/rich.toml', run: 1, control: false })
142
+ expect(plan.cells[8]).toEqual({ side: 'baseline', player: 'standard-coder', deck: './decks/baseline.toml', run: 3, control: true })
143
+ })
144
+
145
+ test('control flag preserved in plan cells', () => {
146
+ const toml = parseArenaToml(fullToml)
147
+ const plan = buildExecutionPlan(toml)
148
+ const baselineCells = plan.cells.filter(c => c.side === 'baseline')
149
+ expect(baselineCells).toHaveLength(3)
150
+ expect(baselineCells.every(c => c.control)).toBe(true)
151
+ })
152
+
153
+ test('dry-run: plan is pure data, no side effects', () => {
154
+ // The entire plan generation is a pure function — dry-run is just printing it
155
+ const toml = parseArenaToml(fullToml)
156
+ const plan = buildExecutionPlan(toml)
157
+ // Verify plan is self-describing for a --dry-run output
158
+ expect(plan.total_runs).toBeGreaterThan(0)
159
+ expect(plan.cells.every(c => typeof c.side === 'string')).toBe(true)
160
+ expect(plan.cells.every(c => typeof c.player === 'string')).toBe(true)
161
+ expect(plan.cells.every(c => typeof c.deck === 'string')).toBe(true)
162
+ expect(plan.cells.every(c => typeof c.run === 'number')).toBe(true)
163
+ })
164
+ })
@@ -0,0 +1,172 @@
1
+ import { z } from 'zod'
2
+ import type { ArenaManifest } from '@lythos/test-utils/schema'
3
+
4
+ // ── arena.toml Zod schema (declarative input, k8s-manifest style) ──────────
5
+ // Anchored on: ADR-20260502110308316
6
+
7
// Bounds shared by the `side` array length and the `max_participants` setting.
const MIN_SIDES = 2
const MAX_SIDES = 5

/** Optional per-side execution environment (`[side.env]`). */
export const SideEnv = z.object({
  container: z.string().optional(),
  pre_run: z.array(z.string()).default([]),
  working_dir: z.string().optional(),
  env_vars: z.record(z.string()).default({}),
})
export type SideEnv = z.infer<typeof SideEnv>

/** One `[[side]]` entry of arena.toml. */
export const Side = z.object({
  name: z.string(),
  // reference to player config (useAgent resolves)
  player: z.string(),
  // path to deck.toml
  deck: z.string(),
  control: z.boolean().default(false),
  env: SideEnv.default({}),
})
export type Side = z.infer<typeof Side>

// The `[arena]` section: what to run and how to judge it.
const ArenaSection = z.object({
  // task description or path to TASK-arena.md
  task: z.string(),
  criteria: z.array(z.string()).min(1),
  runs_per_side: z.number().int().positive().default(1),
  max_participants: z.number().int().min(MIN_SIDES).max(MAX_SIDES).default(MAX_SIDES),
})

/** Whole arena.toml document: one `[arena]` section plus 2–5 `[[side]]` entries. */
export const ArenaToml = z.object({
  arena: ArenaSection,
  side: z.array(Side).min(MIN_SIDES).max(MAX_SIDES),
})
export type ArenaToml = z.infer<typeof ArenaToml>
34
+
35
+ // ── Parser ─────────────────────────────────────────────────────────────────
36
+
37
/**
 * Parse the text of an arena.toml file and validate it against the
 * `ArenaToml` schema.
 *
 * @param content Raw arena.toml text.
 * @returns The validated document, with schema defaults applied.
 * @throws Whatever `ArenaToml.parse` throws when the document does not match the schema.
 */
export function parseArenaToml(content: string): ArenaToml {
  // The in-file TOML subset parser is used here, so no external TOML dependency is needed.
  return ArenaToml.parse(parseToml(content))
}
42
+
43
+ // ── Plan generation (pure function, dry-run visible) ───────────────────────
44
+
45
/** One scheduled run: a single side executed once. */
export interface ExecutionCell {
  /** Side name. */
  side: string
  /** Player reference. */
  player: string
  /** Deck path. */
  deck: string
  /** 1-indexed run number. */
  run: number
  /** Whether the side is marked as the control. */
  control: boolean
}

/** Flat, printable description of everything an arena run will execute. */
export interface ExecutionPlan {
  /** Task copied from the `[arena]` section. */
  task: string
  /** Criteria copied from the `[arena]` section. */
  criteria: string[]
  /** One cell per side per run. */
  cells: ExecutionCell[]
  /** Number of cells in the plan. */
  total_runs: number
}
59
+
60
/**
 * Expand a validated arena document into a flat execution plan.
 *
 * Cells are emitted side by side in declaration order, and within a side in
 * ascending run number (1..runs_per_side). The function only reads its input
 * and builds new objects, so it can be printed as a dry run.
 *
 * @param toml Validated arena.toml document.
 * @returns The plan; `total_runs` equals `cells.length`.
 */
export function buildExecutionPlan(toml: ArenaToml): ExecutionPlan {
  const { task, criteria, runs_per_side: runsPerSide } = toml.arena

  const cells = toml.side.flatMap(({ name, player, deck, control }): ExecutionCell[] =>
    Array.from({ length: runsPerSide }, (_unused, index) => ({
      side: name,
      player,
      deck,
      run: index + 1,
      control,
    })),
  )

  return { task, criteria, cells, total_runs: cells.length }
}
80
+
81
+ // ── Minimal TOML parser (handles the arena.toml subset without external dep) ──
82
+
83
/**
 * Index of the first `target` character in `text` at or after `from` that is
 * not inside a quoted string, or -1 when there is none.
 * Inside double quotes a backslash protects the next character; single-quoted
 * strings are taken literally.
 */
function indexOfUnquoted(text: string, target: string, from = 0): number {
  let quote = ''
  for (let i = from; i < text.length; i++) {
    const ch = text[i]
    if (quote) {
      if (ch === '\\' && quote === '"') i++ // skip the escaped character
      else if (ch === quote) quote = ''
    } else if (ch === '"' || ch === "'") {
      quote = ch
    } else if (ch === target) {
      return i
    }
  }
  return -1
}

/** True when `text` is wrapped in a matching pair of double or single quotes. */
function isQuotedToml(text: string): boolean {
  if (text.length < 2) return false
  return (text.startsWith('"') && text.endsWith('"')) || (text.startsWith("'") && text.endsWith("'"))
}

/**
 * Parse the text between `[` and `]` of an inline array into strings.
 * Items are split on commas outside quotes; quoted items lose their quotes,
 * bare items are kept as their trimmed text. A single trailing comma is ignored.
 */
function parseTomlArray(inner: string): string[] {
  const body = inner.trim()
  if (!body) return []

  const items: string[] = []
  let start = 0
  for (;;) {
    const comma = indexOfUnquoted(body, ',', start)
    if (comma === -1) {
      items.push(body.slice(start).trim())
      break
    }
    items.push(body.slice(start, comma).trim())
    start = comma + 1
  }
  // `["a", "b",]` has a trailing comma, not a third empty item.
  if (items.length > 1 && items[items.length - 1] === '') items.pop()

  return items.map(item => (isQuotedToml(item) ? item.slice(1, -1) : item))
}

/**
 * Convert the right-hand side of `key = value` into a JS value.
 * Quoted text always stays a string, so `"true"`, `"5"` and `"[x]"` are not
 * re-interpreted as boolean, number or array.
 */
function parseTomlValue(raw: string): unknown {
  if (isQuotedToml(raw)) return raw.slice(1, -1)
  if (raw.startsWith('[') && raw.endsWith(']')) return parseTomlArray(raw.slice(1, -1))
  if (raw === 'true') return true
  if (raw === 'false') return false
  if (/^-?\d+(\.\d+)?$/.test(raw)) return Number(raw)
  return raw
}

/**
 * Minimal TOML parser covering the subset arena.toml uses: `[section]`,
 * `[[array-table]]`, two-level dotted headers such as `[side.env]`, and
 * single-line `key = value` pairs with string, boolean, number and
 * string-array values. `#` starts a comment unless it is inside quotes.
 *
 * A dotted header attaches to the most recent entry of the matching array
 * table when one exists, otherwise to the plain table of that name (created
 * if missing). Only the first two dotted segments are used.
 *
 * @param text Raw TOML text.
 * @returns Plain nested objects; array tables appear as arrays under their key.
 */
function parseToml(text: string): Record<string, unknown> {
  const result: Record<string, unknown> = {}
  const arrayTables = new Map<string, Record<string, unknown>[]>()
  let currentTable: Record<string, unknown> = result

  for (const rawLine of text.split('\n')) {
    const commentAt = indexOfUnquoted(rawLine, '#')
    const line = (commentAt === -1 ? rawLine : rawLine.slice(0, commentAt)).trim()
    if (!line) continue

    // [[array]]
    const arrayMatch = line.match(/^\[\[(.+?)\]\]$/)
    if (arrayMatch) {
      const key = arrayMatch[1] // e.g. "side"
      const entries = arrayTables.get(key) ?? []
      arrayTables.set(key, entries)
      currentTable = {}
      entries.push(currentTable)
      continue
    }

    // [section]
    const sectionMatch = line.match(/^\[(.+?)\]$/)
    if (sectionMatch) {
      const key = sectionMatch[1]
      const table: Record<string, unknown> = {}
      if (key.includes('.')) {
        // nested key like "side.env"
        const [parent, child] = key.split('.')
        const parentEntries = arrayTables.get(parent)
        if (parentEntries && parentEntries.length > 0) {
          parentEntries[parentEntries.length - 1][child] = table
        } else {
          // No array table to attach to: nest under the plain parent table so
          // the following keys cannot leak into whichever table was active before.
          const existing = result[parent]
          const parentTable =
            typeof existing === 'object' && existing !== null && !Array.isArray(existing)
              ? (existing as Record<string, unknown>)
              : {}
          result[parent] = parentTable
          parentTable[child] = table
        }
      } else {
        result[key] = table
      }
      currentTable = table
      continue
    }

    // key = value
    const eqIdx = line.indexOf('=')
    if (eqIdx === -1) continue
    const key = line.slice(0, eqIdx).trim()
    currentTable[key] = parseTomlValue(line.slice(eqIdx + 1).trim())
  }

  // Materialize array tables into result
  for (const [key, entries] of arrayTables) {
    result[key] = entries
  }

  return result
}
package/src/cli.ts CHANGED
@@ -29,23 +29,40 @@ function printHelp(): void {
29
29
  console.log(`🎭 lythoskill-arena — Skill comparison runner
30
30
 
31
31
  Usage:
32
- lythoskill-arena --task "<task description>" --skills <skill1,skill2,...>
33
- lythoskill-arena --task "<task description>" --decks <deck1,deck2,...>
32
+ lythoskill-arena run --task <path> --players <A.toml,B.toml> --decks <A.toml,B.toml> --criteria <c1,c2,...> [--out <dir>]
33
+ lythoskill-arena scaffold --task "<description>" --skills <skill1,skill2,...>
34
+ lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
34
35
  lythoskill-arena viz <arena-dir>
35
36
 
37
+ Commands:
38
+ run Run arena programmatically (declarative arena.toml or CLI flags)
39
+ scaffold Create arena directory structure (legacy, manual subagent execution)
40
+ viz Visualize arena report (ASCII charts)
41
+
36
42
  Options:
37
- -t, --task <desc> Task description (required)
38
- -s, --skills <list> Comma-separated skill names
43
+ -t, --task <path|desc> Task description or path to TASK-arena.md
44
+ -s, --skills <list> Comma-separated skill names (scaffold only)
39
45
  --decks <list> Comma-separated deck paths
40
46
  -c, --criteria <list> Evaluation criteria (default: syntax,context,logic,token)
41
- --control <skill> Control skill for comparison (default: lythoskill-project-scribe)
42
- -d, --dir <dir> Output directory (default: tmp)
47
+ --players <list> Comma-separated player.toml paths (CLI run only)
48
+ --config <path> Path to arena.toml (declarative mode, k8s-style)
49
+ --dry-run Print execution plan without running (with --config)
50
+ --control <skill> Control skill for comparison (scaffold only)
51
+ --out <dir> Output directory (run: defaults to runs/arena-<id>)
52
+ -d, --dir <dir> Output directory (scaffold: defaults to tmp)
43
53
  -p, --project <dir> Project directory (default: .)
44
54
 
45
55
  Examples:
46
- lythoskill-arena --task "Refactor auth module" --skills skill-a,skill-b
47
- lythoskill-arena --task "Write tests" --decks ./decks/minimal.toml,./decks/full.toml
48
- lythoskill-arena viz tmp/arena-20260430
56
+ # Declarative mode (k8s-style)
57
+ lythoskill-arena run --config ./arena.toml
58
+ lythoskill-arena run --config ./arena.toml --dry-run
59
+
60
+ # CLI-flag mode (backward compat)
61
+ lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
62
+
63
+ # Legacy scaffolding
64
+ lythoskill-arena scaffold --task "Refactor auth module" --skills skill-a,skill-b
65
+ lythoskill-arena viz runs/arena-20260504
49
66
  `)
50
67
  }
51
68
 
@@ -551,6 +568,67 @@ function runViz(argv: string[]) {
551
568
  console.log(renderRadarChart(report))
552
569
  }
553
570
 
571
+ // ── Run: programmatic arena execution ───────────────────────
572
+
573
/**
 * `run` subcommand: execute an arena either from a declarative arena.toml
 * (`--config`, optionally with `--dry-run`) or from individual CLI flags.
 *
 * In config mode with a dry-run result the execution plan is printed and
 * nothing else happens; otherwise the manifest id and artifact locations are
 * printed once the runner resolves. In CLI-flag mode `--task` and `--decks`
 * are mandatory and the process exits with code 1 when either is missing.
 *
 * @param argv Arguments that follow the `run` subcommand.
 */
async function runProgrammaticArena(argv: string[]) {
  const { options } = parseArgs(argv)
  const { readFileSync } = await import('node:fs')

  // `config`, `task` and `out` are read through a loose string-map view of the options.
  const flags = options as Record<string, string | undefined>
  const configPath = flags.config
  const dryRun = argv.includes('--dry-run')

  if (!configPath) {
    // CLI-flag mode (backward compat)
    if (!options.task || !options.decks) {
      console.error('❌ --task <path> and --decks <list> are required for "run" (or use --config <arena.toml>)')
      process.exit(1)
    }

    const { runArena: runArenaProgrammatic } = await import('./runner')
    const toList = (csv: string) => csv.split(',').map(s => s.trim()).filter(Boolean)

    const outcome = await runArenaProgrammatic({
      taskPath: options.task,
      playerPaths: toList(options.players ?? 'players/claude-code.toml'),
      deckPaths: toList(options.decks),
      criteria: toList(options.criteria ?? 'syntax,context,logic,token'),
      outDir: options.out ?? `runs/arena-${timestamp()}`,
    })

    console.log(`\n🎮 Arena complete: ${outcome.manifest.id}`)
    console.log(`📁 Artifacts: ${outcome.artifactsDir}`)
    console.log(`📊 Report: ${outcome.artifactsDir}/report.md`)
    return
  }

  // arena.toml declarative mode
  const { parseArenaToml } = await import('./arena-toml')
  const { runArenaFromToml } = await import('./runner')

  const toml = parseArenaToml(readFileSync(configPath, 'utf-8'))
  const declaredTask = toml.arena.task
  // A task starting with "/" or "./" is used as-is; otherwise --task, when given, takes precedence.
  const taskIsPath = declaredTask.startsWith('/') || declaredTask.startsWith('./')

  const result = await runArenaFromToml({
    toml,
    taskPath: taskIsPath ? declaredTask : (flags.task ?? declaredTask),
    outDir: flags.out,
    dryRun,
  })

  if ('plan' in result) {
    // dry-run
    const { plan } = result
    const runsPerSide = toml.arena.runs_per_side
    console.log(`\n📋 Dry-run: ${plan.total_runs} cells across ${plan.cells.length / Math.max(1, runsPerSide)} sides × ${runsPerSide} runs`)
    for (const cell of plan.cells) {
      console.log(` ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${cell.control ? ' [control]' : ''}`)
    }
    return
  }

  console.log(`\n🎮 Arena complete: ${result.manifest.id}`)
  console.log(`📁 Artifacts: ${result.artifactsDir}`)
  console.log(`📊 Report: ${result.artifactsDir}/report.md`)
}
631
+
554
632
  // ── Main Entry ───────────────────────────────────────────────
555
633
 
556
634
  if (import.meta.main) {
@@ -559,7 +637,14 @@ if (import.meta.main) {
559
637
 
560
638
  if (cmd === 'viz') {
561
639
  runViz(args.slice(1))
640
+ } else if (cmd === 'run') {
641
+ runProgrammaticArena(args.slice(1))
642
+ } else if (cmd === 'scaffold' || !cmd || args[0]?.startsWith('-')) {
643
+ // Legacy behavior: if no subcommand or starts with flags, treat as scaffold
644
+ runArena(cmd === 'scaffold' ? args.slice(1) : args)
562
645
  } else {
563
- runArena(args)
646
+ console.error(`❌ Unknown command: ${cmd}`)
647
+ printHelp()
648
+ process.exit(1)
564
649
  }
565
650
  }
@@ -0,0 +1,92 @@
1
+ import { describe, test, expect } from 'bun:test'
2
+ import { computePareto } from './comparative-judge'
3
+
4
+ describe('computePareto', () => {
5
+ test('single participant is always non-dominated', () => {
6
+ const result = computePareto([
7
+ { participant_id: 'run-01', scores: { a: 5, b: 3 } },
8
+ ])
9
+ expect(result).toHaveLength(1)
10
+ expect(result[0].dominated).toBe(false)
11
+ expect(result[0].dominated_by).toEqual([])
12
+ })
13
+
14
+ test('clear dominance: run-01 dominates run-02 on all criteria', () => {
15
+ const result = computePareto([
16
+ { participant_id: 'run-01', scores: { coverage: 5, relevance: 5 } },
17
+ { participant_id: 'run-02', scores: { coverage: 3, relevance: 2 } },
18
+ ])
19
+ expect(result[0].dominated).toBe(false)
20
+ expect(result[1].dominated).toBe(true)
21
+ expect(result[1].dominated_by).toEqual(['run-01'])
22
+ })
23
+
24
+ test('equal scores: no one dominates', () => {
25
+ const result = computePareto([
26
+ { participant_id: 'run-01', scores: { a: 4, b: 4 } },
27
+ { participant_id: 'run-02', scores: { a: 4, b: 4 } },
28
+ ])
29
+ expect(result[0].dominated).toBe(false)
30
+ expect(result[1].dominated).toBe(false)
31
+ })
32
+
33
+ test('cross dominance: each wins on different criteria', () => {
34
+ const result = computePareto([
35
+ { participant_id: 'run-01', scores: { speed: 5, accuracy: 2 } },
36
+ { participant_id: 'run-02', scores: { speed: 2, accuracy: 5 } },
37
+ ])
38
+ // Neither dominates: run-01 better on speed but worse on accuracy
39
+ expect(result[0].dominated).toBe(false)
40
+ expect(result[1].dominated).toBe(false)
41
+ })
42
+
43
+ test('multi-participant: transitive dominance chain', () => {
44
+ const result = computePareto([
45
+ { participant_id: 'best', scores: { a: 5, b: 5, c: 5 } },
46
+ { participant_id: 'mid', scores: { a: 4, b: 4, c: 4 } },
47
+ { participant_id: 'worst', scores: { a: 2, b: 2, c: 2 } },
48
+ ])
49
+ // best dominates both, mid dominates worst
50
+ expect(result[0].dominated).toBe(false) // best
51
+ expect(result[1].dominated).toBe(true) // mid (by best)
52
+ expect(result[1].dominated_by).toEqual(['best'])
53
+ expect(result[2].dominated).toBe(true) // worst (by both)
54
+ expect(result[2].dominated_by.sort()).toEqual(['best', 'mid'].sort())
55
+ })
56
+
57
+ test('Pareto frontier from playground BDD-research: run-01 dominates run-02', () => {
58
+ // From playground/arena-bdd-research/report.md:
59
+ // Run-01: coverage=5, relevance=5, actionability=5, depth=5
60
+ // Run-02: coverage=3, relevance=2, actionability=2, depth=1
61
+ const result = computePareto([
62
+ { participant_id: 'run-01', scores: { coverage: 5, relevance: 5, actionability: 5, depth: 5 } },
63
+ { participant_id: 'run-02', scores: { coverage: 3, relevance: 2, actionability: 2, depth: 1 } },
64
+ ])
65
+ expect(result[0].dominated).toBe(false) // run-01: Pareto-optimal
66
+ expect(result[1].dominated).toBe(true) // run-02: dominated by run-01
67
+ expect(result[1].dominated_by).toEqual(['run-01'])
68
+ })
69
+
70
+ test('empty scores object', () => {
71
+ const result = computePareto([
72
+ { participant_id: 'a', scores: {} },
73
+ { participant_id: 'b', scores: {} },
74
+ ])
75
+ expect(result).toHaveLength(2)
76
+ expect(result[0].dominated).toBe(false)
77
+ expect(result[1].dominated).toBe(false)
78
+ })
79
+
80
+ test('partial criteria overlap', () => {
81
+ const result = computePareto([
82
+ { participant_id: 'run-01', scores: { a: 5, b: 3 } },
83
+ { participant_id: 'run-02', scores: { a: 3, c: 5 } },
84
+ ])
85
+ // run-01 has a=5 vs run-02 a=3 (a wins)
86
+ // run-02 has b=undefined vs run-01 b=3 → treated as 0. So run-01 >= run-02 on all shared crit, > on one.
87
+ // But c: run-01 has 0, run-02 has 5. So run-02 > run-01 on c.
88
+ // Cross-dominance → neither dominates
89
+ expect(result[0].dominated).toBe(false)
90
+ expect(result[1].dominated).toBe(false)
91
+ })
92
+ })