@lythos/skill-arena 0.9.18 → 0.9.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -10
- package/package.json +2 -1
- package/src/cli.ts +122 -1
- package/src/comparative-judge.test.ts +359 -5
- package/src/comparative-judge.ts +30 -7
- package/src/runner.ts +9 -8
package/README.md
CHANGED
|
@@ -13,31 +13,62 @@
|
|
|
13
13
|
- **Mode 1**: Single-skill comparison (controlled variable — same helper skills, different test skill).
|
|
14
14
|
- **Mode 2**: Full-deck comparison (Pareto frontier — no single winner, only optimal trade-offs).
|
|
15
15
|
|
|
16
|
+
## Prerequisites
|
|
17
|
+
|
|
18
|
+
Arena runs AI agents as subprocesses. You need at least one agent CLI installed:
|
|
19
|
+
|
|
20
|
+
### Kimi CLI (recommended default)
|
|
21
|
+
|
|
22
|
+
Kimi Code CLI is the default player for arena — it has reliable headless execution with eager tool loading (no deferred tool deadlock).
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# Install via uv (recommended) — uv is Python's bunx equivalent
|
|
26
|
+
uv tool install kimi-cli
|
|
27
|
+
# Or run without installing:
|
|
28
|
+
uvx kimi-cli --print -p "hello"
|
|
29
|
+
|
|
30
|
+
# Authenticate
|
|
31
|
+
kimi login
|
|
32
|
+
# Or set API key:
|
|
33
|
+
export KIMI_API_KEY=your_key
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Docs: [https://github.com/MoonshotAI/kimi-cli](https://github.com/MoonshotAI/kimi-cli)
|
|
37
|
+
|
|
38
|
+
### Claude CLI (secondary)
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
npm install -g @anthropic-ai/claude-code
|
|
42
|
+
claude --version # should be ≥ 2.1.128
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred tool deadlock). Kimi is the default for reliability.
|
|
46
|
+
|
|
16
47
|
## Install
|
|
17
48
|
|
|
18
49
|
```bash
|
|
19
50
|
bun add -d @lythos/skill-arena
|
|
20
51
|
# or use directly
|
|
21
|
-
bunx @lythos/skill-arena@0.9.18 <command>
|
|
52
|
+
bunx @lythos/skill-arena@0.9.20 <command>
|
|
22
53
|
```
|
|
23
54
|
|
|
24
55
|
## Quick Start
|
|
25
56
|
|
|
26
57
|
```bash
|
|
27
58
|
# Mode 1: Compare two skills on the same task
|
|
28
|
-
bunx @lythos/skill-arena@0.9.18 \
|
|
59
|
+
bunx @lythos/skill-arena@0.9.20 \
|
|
29
60
|
--task "Generate auth flow diagram" \
|
|
30
61
|
--skills "design-doc-mermaid,mermaid-tools" \
|
|
31
62
|
--criteria "syntax,context,token"
|
|
32
63
|
|
|
33
64
|
# Mode 2: Compare full deck configurations
|
|
34
|
-
bunx @lythos/skill-arena@0.9.18 \
|
|
65
|
+
bunx @lythos/skill-arena@0.9.20 \
|
|
35
66
|
--task "Generate auth flow diagram" \
|
|
36
67
|
--decks "./decks/minimal.toml,./decks/rich.toml" \
|
|
37
68
|
--criteria "quality,token,maintainability"
|
|
38
69
|
|
|
39
70
|
# Visualize results
|
|
40
|
-
bunx @lythos/skill-arena@0.9.18 viz tmp/arena-<id>/
|
|
71
|
+
bunx @lythos/skill-arena@0.9.20 viz tmp/arena-<id>/
|
|
41
72
|
```
|
|
42
73
|
|
|
43
74
|
## Commands
|
|
@@ -46,16 +77,16 @@ bunx @lythos/skill-arena@0.9.18 viz tmp/arena-<id>/
|
|
|
46
77
|
|
|
47
78
|
```bash
|
|
48
79
|
# Print execution plan without running
|
|
49
|
-
bunx @lythos/skill-arena@0.9.18 run --config arena.toml --dry-run
|
|
80
|
+
bunx @lythos/skill-arena@0.9.20 run --config arena.toml --dry-run
|
|
50
81
|
|
|
51
82
|
# Execute with per-side runs_per_side and statistical aggregation
|
|
52
|
-
bunx @lythos/skill-arena@0.9.18 run --config arena.toml
|
|
83
|
+
bunx @lythos/skill-arena@0.9.20 run --config arena.toml
|
|
53
84
|
```
|
|
54
85
|
|
|
55
86
|
### CLI-flag mode (backward compat)
|
|
56
87
|
|
|
57
88
|
```
|
|
58
|
-
bunx @lythos/skill-arena@0.9.18 run \
|
|
89
|
+
bunx @lythos/skill-arena@0.9.20 run \
|
|
59
90
|
--task ./TASK-arena.md \
|
|
60
91
|
--players ./players/claude.toml \
|
|
61
92
|
--decks ./decks/run-01.toml,./decks/run-02.toml \
|
|
@@ -65,13 +96,13 @@ bunx @lythos/skill-arena@0.9.18 run \
|
|
|
65
96
|
### Scaffold mode (legacy, manual execution)
|
|
66
97
|
|
|
67
98
|
```
|
|
68
|
-
bunx @lythos/skill-arena@0.9.18 scaffold --task "..." --skills a,b
|
|
99
|
+
bunx @lythos/skill-arena@0.9.20 scaffold --task "..." --skills a,b
|
|
69
100
|
```
|
|
70
101
|
|
|
71
102
|
### Viz
|
|
72
103
|
|
|
73
104
|
```bash
|
|
74
|
-
bunx @lythos/skill-arena@0.9.18 viz runs/arena-<id>/
|
|
105
|
+
bunx @lythos/skill-arena@0.9.20 viz runs/arena-<id>/
|
|
75
106
|
```
|
|
76
107
|
|
|
77
108
|
## Skill Documentation
|
|
@@ -85,7 +116,7 @@ The agent-visible **Skill** layer documentation is here:
|
|
|
85
116
|
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
|
|
86
117
|
|
|
87
118
|
```
|
|
88
|
-
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.18 ...
|
|
119
|
+
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.20 ...
|
|
89
120
|
Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
90
121
|
Output (skills/<name>/) → git commit → agent-visible skill
|
|
91
122
|
```
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lythos/skill-arena",
|
|
3
|
-
"version": "0.9.18",
|
|
3
|
+
"version": "0.9.20",
|
|
4
4
|
"description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-agent",
|
|
@@ -38,6 +38,7 @@
|
|
|
38
38
|
},
|
|
39
39
|
"dependencies": {
|
|
40
40
|
"@lythos/test-utils": "^0.9.1",
|
|
41
|
+
"zod": "^3.24.0",
|
|
41
42
|
"zod-to-json-schema": "^3.25.2"
|
|
42
43
|
}
|
|
43
44
|
}
|
package/src/cli.ts
CHANGED
|
@@ -29,6 +29,8 @@ function printHelp(): void {
|
|
|
29
29
|
console.log(`🎭 lythoskill-arena — Skill comparison runner
|
|
30
30
|
|
|
31
31
|
Usage:
|
|
32
|
+
lythoskill-arena agent-run --task <path> --deck <path> [--player kimi] [--out <dir>]
|
|
33
|
+
lythoskill-arena agent-run --brief "<prompt>" --deck <path> [--out <dir>]
|
|
32
34
|
lythoskill-arena run --task <path> --players <A.toml,B.toml> --decks <A.toml,B.toml> --criteria <c1,c2,...> [--out <dir>]
|
|
33
35
|
lythoskill-arena scaffold --task "<description>" --skills <skill1,skill2,...>
|
|
34
36
|
lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
|
|
@@ -53,6 +55,10 @@ Options:
|
|
|
53
55
|
-p, --project <dir> Project directory (default: .)
|
|
54
56
|
|
|
55
57
|
Examples:
|
|
58
|
+
# Single agent run (simplest path)
|
|
59
|
+
lythoskill-arena agent-run --task ./TASK.md --deck ./deck.toml
|
|
60
|
+
lythoskill-arena agent-run --task ./TASK.md --deck ./deck.toml --player kimi --out ./output
|
|
61
|
+
|
|
56
62
|
# Declarative mode (k8s-style)
|
|
57
63
|
lythoskill-arena run --config ./arena.toml
|
|
58
64
|
lythoskill-arena run --config ./arena.toml --dry-run
|
|
@@ -66,6 +72,119 @@ Examples:
|
|
|
66
72
|
`)
|
|
67
73
|
}
|
|
68
74
|
|
|
75
|
+
// ── agent-run: single agent execution (simplest path) ────────────────────
|
|
76
|
+
|
|
77
|
+
async function agentRun(args: string[]) {
|
|
78
|
+
const opts: Record<string, string | undefined> = {}
|
|
79
|
+
for (let i = 0; i < args.length; i++) {
|
|
80
|
+
if (args[i] === '--task' || args[i] === '-t') opts.task = args[++i]
|
|
81
|
+
else if (args[i] === '--brief' || args[i] === '-b') opts.brief = args[++i]
|
|
82
|
+
else if (args[i] === '--deck' || args[i] === '-d') opts.deck = args[++i]
|
|
83
|
+
else if (args[i] === '--player' || args[i] === '-p') opts.player = args[++i]
|
|
84
|
+
else if (args[i] === '--out' || args[i] === '-o') opts.out = args[++i]
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
if (!opts.deck) {
|
|
88
|
+
console.error('❌ --deck <path> is required')
|
|
89
|
+
process.exit(1)
|
|
90
|
+
}
|
|
91
|
+
if (!opts.task && !opts.brief) {
|
|
92
|
+
console.error('❌ --task <path> or --brief "<prompt>" is required')
|
|
93
|
+
process.exit(1)
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
const { resolve, join } = await import('node:path')
|
|
97
|
+
const deckPath = resolve(opts.deck)
|
|
98
|
+
if (!existsSync(deckPath)) { console.error(`❌ Deck file not found: ${deckPath}`); process.exit(1) }
|
|
99
|
+
|
|
100
|
+
// Resolve task: either from file, or create temp task from --brief
|
|
101
|
+
let taskPath: string
|
|
102
|
+
if (opts.task) {
|
|
103
|
+
taskPath = resolve(opts.task)
|
|
104
|
+
if (!existsSync(taskPath)) { console.error(`❌ Task file not found: ${taskPath}`); process.exit(1) }
|
|
105
|
+
} else {
|
|
106
|
+
const { mkdtempSync, writeFileSync } = await import('node:fs')
|
|
107
|
+
const { tmpdir } = await import('node:os')
|
|
108
|
+
const tmpDir = mkdtempSync(join(tmpdir(), 'arena-brief-'))
|
|
109
|
+
taskPath = join(tmpDir, 'TASK.md')
|
|
110
|
+
const briefTask = `---
|
|
111
|
+
name: ad-hoc task
|
|
112
|
+
description: ${opts.brief!.slice(0, 80)}
|
|
113
|
+
timeout: 120000
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Given
|
|
117
|
+
- You are an AI agent with the skills declared in the deck
|
|
118
|
+
|
|
119
|
+
## When
|
|
120
|
+
${opts.brief}
|
|
121
|
+
|
|
122
|
+
## Then
|
|
123
|
+
- Write your output to output.md
|
|
124
|
+
- The output should be complete and well-structured
|
|
125
|
+
|
|
126
|
+
## Judge
|
|
127
|
+
Evaluate whether the output is complete, accurate, and well-structured.
|
|
128
|
+
`
|
|
129
|
+
writeFileSync(taskPath, briefTask, 'utf-8')
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
const { useAgent } = await import('@lythos/test-utils/agents')
|
|
133
|
+
const { runAgentScenario } = await import('@lythos/test-utils/agent-bdd')
|
|
134
|
+
const { resolvePlayer } = await import('./player')
|
|
135
|
+
const { readFileSync, writeFileSync, mkdirSync } = await import('node:fs')
|
|
136
|
+
|
|
137
|
+
const player = resolvePlayer(opts.player ?? 'kimi')
|
|
138
|
+
const agent = useAgent(player)
|
|
139
|
+
const outDir = opts.out ? resolve(opts.out) : join(process.cwd(), 'agent-run-output')
|
|
140
|
+
mkdirSync(outDir, { recursive: true })
|
|
141
|
+
|
|
142
|
+
console.log(`🤖 agent-run: ${player} × ${deckPath}`)
|
|
143
|
+
console.log(`📋 task: ${taskPath}`)
|
|
144
|
+
|
|
145
|
+
let agentWorkdir = ''
|
|
146
|
+
const result = await runAgentScenario({
|
|
147
|
+
scenarioPath: taskPath,
|
|
148
|
+
agent,
|
|
149
|
+
async setupWorkdir(_scenario, workdir) {
|
|
150
|
+
agentWorkdir = workdir
|
|
151
|
+
mkdirSync(workdir, { recursive: true })
|
|
152
|
+
writeFileSync(join(workdir, 'skill-deck.toml'), readFileSync(deckPath, 'utf-8'))
|
|
153
|
+
|
|
154
|
+
const linkProc = Bun.spawn(
|
|
155
|
+
['bunx', '@lythos/skill-deck', 'link'],
|
|
156
|
+
{ cwd: workdir, env: { ...process.env, HOME: process.env.HOME! } },
|
|
157
|
+
)
|
|
158
|
+
await linkProc.exited
|
|
159
|
+
},
|
|
160
|
+
})
|
|
161
|
+
|
|
162
|
+
// Copy agent output to outDir
|
|
163
|
+
writeFileSync(join(outDir, 'agent-stdout.txt'), result.agentResult.stdout, 'utf-8')
|
|
164
|
+
if (result.agentResult.stderr) writeFileSync(join(outDir, 'agent-stderr.txt'), result.agentResult.stderr, 'utf-8')
|
|
165
|
+
if (result.verdict) writeFileSync(join(outDir, 'judge-verdict.json'), JSON.stringify(result.verdict, null, 2) + '\n', 'utf-8')
|
|
166
|
+
|
|
167
|
+
// Copy agent-produced files from workdir (output.md, output.docx, etc.)
|
|
168
|
+
if (agentWorkdir) {
|
|
169
|
+
const { readdirSync, statSync, copyFileSync } = await import('node:fs')
|
|
170
|
+
try {
|
|
171
|
+
for (const entry of readdirSync(agentWorkdir)) {
|
|
172
|
+
if (entry.startsWith('.') || entry === 'skill-deck.toml' || entry === 'skill-deck.lock') continue
|
|
173
|
+
const src = join(agentWorkdir, entry)
|
|
174
|
+
try {
|
|
175
|
+
if (statSync(src).isFile()) copyFileSync(src, join(outDir, entry))
|
|
176
|
+
} catch {}
|
|
177
|
+
}
|
|
178
|
+
} catch {}
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
console.log(`\n✅ Agent complete (${result.agentResult.durationMs}ms)`)
|
|
182
|
+
console.log(`📁 Output: ${outDir}`)
|
|
183
|
+
if (result.verdict) {
|
|
184
|
+
console.log(`🏆 Verdict: ${result.verdict.verdict} — ${result.verdict.reason.slice(0, 120)}`)
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
|
|
69
188
|
function parseArgs(argv: string[]) {
|
|
70
189
|
if (argv.includes('--help') || argv.includes('-h')) {
|
|
71
190
|
printHelp()
|
|
@@ -644,7 +763,9 @@ if (import.meta.main) {
|
|
|
644
763
|
const args = process.argv.slice(2)
|
|
645
764
|
const cmd = args[0]
|
|
646
765
|
|
|
647
|
-
if (cmd === 'viz') {
|
|
766
|
+
if (cmd === 'agent-run') {
|
|
767
|
+
agentRun(args.slice(1))
|
|
768
|
+
} else if (cmd === 'viz') {
|
|
648
769
|
runViz(args.slice(1))
|
|
649
770
|
} else if (cmd === 'run') {
|
|
650
771
|
runProgrammaticArena(args.slice(1))
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import { describe, test, expect } from 'bun:test'
|
|
2
|
-
import { computePareto } from './comparative-judge'
|
|
2
|
+
import { computePareto, buildComparativePrompt, toScoreMatrix, normalizeComparativeOutput } from './comparative-judge'
|
|
3
|
+
import { ArenaManifest, CriterionDef, ComparativeReport } from '@lythos/test-utils/schema'
|
|
4
|
+
import type { ArenaManifest as ArenaManifestType } from '@lythos/test-utils/schema'
|
|
3
5
|
|
|
4
6
|
describe('computePareto', () => {
|
|
5
7
|
test('single participant is always non-dominated', () => {
|
|
@@ -82,11 +84,363 @@ describe('computePareto', () => {
|
|
|
82
84
|
{ participant_id: 'run-01', scores: { a: 5, b: 3 } },
|
|
83
85
|
{ participant_id: 'run-02', scores: { a: 3, c: 5 } },
|
|
84
86
|
])
|
|
85
|
-
// run-01 has a=5 vs run-02 a=3 (a wins)
|
|
86
|
-
// run-02 has b=undefined vs run-01 b=3 → treated as 0. So run-01 >= run-02 on all shared crit, > on one.
|
|
87
|
-
// But c: run-01 has 0, run-02 has 5. So run-02 > run-01 on c.
|
|
88
|
-
// Cross-dominance → neither dominates
|
|
89
87
|
expect(result[0].dominated).toBe(false)
|
|
90
88
|
expect(result[1].dominated).toBe(false)
|
|
91
89
|
})
|
|
92
90
|
})
|
|
91
|
+
|
|
92
|
+
// ── buildComparativePrompt (pure string construction) ────────────────
|
|
93
|
+
|
|
94
|
+
const manifestFixture: ArenaManifest = {
|
|
95
|
+
id: 'test-arena',
|
|
96
|
+
task: 'Write a function that adds two numbers',
|
|
97
|
+
criteria: ['correctness', 'efficiency'],
|
|
98
|
+
participants: [
|
|
99
|
+
{ id: 'bare', name: 'Bare', description: 'No skills' },
|
|
100
|
+
{ id: 'tdd', name: 'TDD', description: 'Full test discipline' },
|
|
101
|
+
],
|
|
102
|
+
runs_per_side: 1,
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
describe('buildComparativePrompt', () => {
|
|
106
|
+
test('includes task description', () => {
|
|
107
|
+
const prompt = buildComparativePrompt({
|
|
108
|
+
manifest: manifestFixture,
|
|
109
|
+
verdicts: [],
|
|
110
|
+
})
|
|
111
|
+
expect(prompt).toContain('Write a function that adds two numbers')
|
|
112
|
+
})
|
|
113
|
+
|
|
114
|
+
test('includes all participants', () => {
|
|
115
|
+
const prompt = buildComparativePrompt({
|
|
116
|
+
manifest: manifestFixture,
|
|
117
|
+
verdicts: [],
|
|
118
|
+
})
|
|
119
|
+
expect(prompt).toContain('bare')
|
|
120
|
+
expect(prompt).toContain('TDD')
|
|
121
|
+
expect(prompt).toContain('No skills')
|
|
122
|
+
expect(prompt).toContain('Full test discipline')
|
|
123
|
+
})
|
|
124
|
+
|
|
125
|
+
test('includes criteria list', () => {
|
|
126
|
+
const prompt = buildComparativePrompt({
|
|
127
|
+
manifest: manifestFixture,
|
|
128
|
+
verdicts: [],
|
|
129
|
+
})
|
|
130
|
+
expect(prompt).toContain('correctness')
|
|
131
|
+
expect(prompt).toContain('efficiency')
|
|
132
|
+
})
|
|
133
|
+
|
|
134
|
+
test('includes Zod schema in output spec', () => {
|
|
135
|
+
const prompt = buildComparativePrompt({
|
|
136
|
+
manifest: manifestFixture,
|
|
137
|
+
verdicts: [],
|
|
138
|
+
})
|
|
139
|
+
expect(prompt).toContain('score_matrix')
|
|
140
|
+
expect(prompt).toContain('z.object')
|
|
141
|
+
expect(prompt).toContain('participant_id')
|
|
142
|
+
})
|
|
143
|
+
})
|
|
144
|
+
|
|
145
|
+
// ── toScoreMatrix (pure Zod validation wrapper) ──────────────────────
|
|
146
|
+
|
|
147
|
+
describe('toScoreMatrix', () => {
|
|
148
|
+
test('passes through valid score cells', () => {
|
|
149
|
+
const result = toScoreMatrix(manifestFixture, [
|
|
150
|
+
{ participant_id: 'bare', criterion: 'correctness', weight: 0.5, score: 4, rationale: 'works' },
|
|
151
|
+
{ participant_id: 'bare', criterion: 'efficiency', weight: 0.5, score: 3, rationale: 'ok' },
|
|
152
|
+
])
|
|
153
|
+
expect(result).toHaveLength(2)
|
|
154
|
+
expect(result[0].participant_id).toBe('bare')
|
|
155
|
+
expect(result[0].score).toBe(4)
|
|
156
|
+
})
|
|
157
|
+
})
|
|
158
|
+
|
|
159
|
+
// ── normalizeComparativeOutput (pure JSON normalization) ─────────────
|
|
160
|
+
|
|
161
|
+
const sampleScoreMatrix = [
|
|
162
|
+
{ participant_id: 'bare', criterion: 'correctness', weight: 0.25, score: 4, rationale: 'works' },
|
|
163
|
+
{ participant_id: 'bare', criterion: 'efficiency', weight: 0.25, score: 3, rationale: 'ok' },
|
|
164
|
+
{ participant_id: 'tdd', criterion: 'correctness', weight: 0.25, score: 5, rationale: 'tests pass' },
|
|
165
|
+
{ participant_id: 'tdd', criterion: 'efficiency', weight: 0.25, score: 4, rationale: 'clean' },
|
|
166
|
+
]
|
|
167
|
+
|
|
168
|
+
describe('normalizeComparativeOutput', () => {
|
|
169
|
+
test('passes through already-correct format', () => {
|
|
170
|
+
const input = {
|
|
171
|
+
score_matrix: sampleScoreMatrix,
|
|
172
|
+
key_findings: ['TDD produced cleaner code'],
|
|
173
|
+
recommendations: [{ audience: 'developer', recommendation: 'Use TDD' }],
|
|
174
|
+
}
|
|
175
|
+
const result = normalizeComparativeOutput(input)
|
|
176
|
+
expect((result.score_matrix as any[])).toHaveLength(4)
|
|
177
|
+
expect((result.score_matrix as any[])[0].participant_id).toBe('bare')
|
|
178
|
+
})
|
|
179
|
+
|
|
180
|
+
test('maps participantId to participant_id', () => {
|
|
181
|
+
const input: Record<string, unknown> = {
|
|
182
|
+
score_matrix: [
|
|
183
|
+
{ participantId: 'bare', criterion: 'accuracy', weight: 0.5, score: 4, rationale: 'good' },
|
|
184
|
+
],
|
|
185
|
+
key_findings: [],
|
|
186
|
+
recommendations: [],
|
|
187
|
+
}
|
|
188
|
+
const result = normalizeComparativeOutput(input)
|
|
189
|
+
expect((result.score_matrix as any[])[0].participant_id).toBe('bare')
|
|
190
|
+
})
|
|
191
|
+
|
|
192
|
+
test('maps side to participant_id', () => {
|
|
193
|
+
const input: Record<string, unknown> = {
|
|
194
|
+
score_matrix: [
|
|
195
|
+
{ side: 'tdd', criterion: 'quality', weight: 0.5, score: 5, rationale: 'excellent' },
|
|
196
|
+
],
|
|
197
|
+
key_findings: [],
|
|
198
|
+
recommendations: [],
|
|
199
|
+
}
|
|
200
|
+
const result = normalizeComparativeOutput(input)
|
|
201
|
+
expect((result.score_matrix as any[])[0].participant_id).toBe('tdd')
|
|
202
|
+
})
|
|
203
|
+
|
|
204
|
+
test('coerces string score to number', () => {
|
|
205
|
+
const input: Record<string, unknown> = {
|
|
206
|
+
score_matrix: [
|
|
207
|
+
{ participant_id: 'bare', criterion: 'a', weight: 0.5, score: '4', rationale: 'ok' },
|
|
208
|
+
],
|
|
209
|
+
key_findings: [],
|
|
210
|
+
recommendations: [],
|
|
211
|
+
}
|
|
212
|
+
const result = normalizeComparativeOutput(input)
|
|
213
|
+
expect((result.score_matrix as any[])[0].score).toBe(4)
|
|
214
|
+
})
|
|
215
|
+
|
|
216
|
+
test('normalizes weight >1 as percentage', () => {
|
|
217
|
+
const input: Record<string, unknown> = {
|
|
218
|
+
score_matrix: [
|
|
219
|
+
{ participant_id: 'bare', criterion: 'a', weight: 50, score: 4, rationale: 'ok' },
|
|
220
|
+
],
|
|
221
|
+
key_findings: [],
|
|
222
|
+
recommendations: [],
|
|
223
|
+
}
|
|
224
|
+
const result = normalizeComparativeOutput(input)
|
|
225
|
+
expect((result.score_matrix as any[])[0].weight).toBe(0.5)
|
|
226
|
+
})
|
|
227
|
+
|
|
228
|
+
test('defaults weight to 0.25 when undefined', () => {
|
|
229
|
+
const input: Record<string, unknown> = {
|
|
230
|
+
score_matrix: [
|
|
231
|
+
{ participant_id: 'bare', criterion: 'a', score: 4, rationale: 'ok' },
|
|
232
|
+
],
|
|
233
|
+
key_findings: [],
|
|
234
|
+
recommendations: [],
|
|
235
|
+
}
|
|
236
|
+
const result = normalizeComparativeOutput(input)
|
|
237
|
+
expect((result.score_matrix as any[])[0].weight).toBe(0.25)
|
|
238
|
+
})
|
|
239
|
+
|
|
240
|
+
test('maps reason to rationale', () => {
|
|
241
|
+
const input: Record<string, unknown> = {
|
|
242
|
+
score_matrix: [
|
|
243
|
+
{ participant_id: 'bare', criterion: 'a', weight: 0.5, score: 4, reason: 'looks fine' },
|
|
244
|
+
],
|
|
245
|
+
key_findings: [],
|
|
246
|
+
recommendations: [],
|
|
247
|
+
}
|
|
248
|
+
const result = normalizeComparativeOutput(input)
|
|
249
|
+
expect((result.score_matrix as any[])[0].rationale).toBe('looks fine')
|
|
250
|
+
})
|
|
251
|
+
|
|
252
|
+
test('maps explanation to rationale', () => {
|
|
253
|
+
const input: Record<string, unknown> = {
|
|
254
|
+
score_matrix: [
|
|
255
|
+
{ participant_id: 'bare', criterion: 'a', weight: 0.5, score: 4, explanation: 'works' },
|
|
256
|
+
],
|
|
257
|
+
key_findings: [],
|
|
258
|
+
recommendations: [],
|
|
259
|
+
}
|
|
260
|
+
const result = normalizeComparativeOutput(input)
|
|
261
|
+
expect((result.score_matrix as any[])[0].rationale).toBe('works')
|
|
262
|
+
})
|
|
263
|
+
|
|
264
|
+
test('normalizes recommendations with role fallback', () => {
|
|
265
|
+
const input: Record<string, unknown> = {
|
|
266
|
+
score_matrix: [],
|
|
267
|
+
key_findings: [],
|
|
268
|
+
recommendations: [
|
|
269
|
+
{ role: 'developer', text: 'Use TDD' },
|
|
270
|
+
],
|
|
271
|
+
}
|
|
272
|
+
const result = normalizeComparativeOutput(input)
|
|
273
|
+
const recs = result.recommendations as any[]
|
|
274
|
+
expect(recs[0].audience).toBe('developer')
|
|
275
|
+
expect(recs[0].recommendation).toBe('Use TDD')
|
|
276
|
+
})
|
|
277
|
+
|
|
278
|
+
test('normalizes recommendations with advice fallback', () => {
|
|
279
|
+
const input: Record<string, unknown> = {
|
|
280
|
+
score_matrix: [],
|
|
281
|
+
key_findings: [],
|
|
282
|
+
recommendations: [
|
|
283
|
+
{ audience: 'general', advice: 'Consider refactoring' },
|
|
284
|
+
],
|
|
285
|
+
}
|
|
286
|
+
const result = normalizeComparativeOutput(input)
|
|
287
|
+
expect((result.recommendations as any[])[0].recommendation).toBe('Consider refactoring')
|
|
288
|
+
})
|
|
289
|
+
|
|
290
|
+
test('handles empty key_findings', () => {
|
|
291
|
+
const input: Record<string, unknown> = {
|
|
292
|
+
score_matrix: [],
|
|
293
|
+
}
|
|
294
|
+
const result = normalizeComparativeOutput(input)
|
|
295
|
+
expect(result.key_findings).toEqual([])
|
|
296
|
+
})
|
|
297
|
+
|
|
298
|
+
test('converts pivot-table format: { participant: { criterion: score } }', () => {
|
|
299
|
+
const input: Record<string, unknown> = {
|
|
300
|
+
bare: { correctness: 4, correctness_rationale: 'works', efficiency: 3, efficiency_rationale: 'ok' },
|
|
301
|
+
tdd: { correctness: 5, correctness_rationale: 'tests', efficiency: 4, efficiency_rationale: 'clean' },
|
|
302
|
+
}
|
|
303
|
+
const result = normalizeComparativeOutput(input)
|
|
304
|
+
expect((result.score_matrix as any[])).toHaveLength(4)
|
|
305
|
+
const bareCorrectness = (result.score_matrix as any[]).find(
|
|
306
|
+
(c: any) => c.participant_id === 'bare' && c.criterion === 'correctness'
|
|
307
|
+
)
|
|
308
|
+
expect(bareCorrectness.score).toBe(4)
|
|
309
|
+
expect(bareCorrectness.rationale).toBe('works')
|
|
310
|
+
expect(bareCorrectness.weight).toBe(0.25)
|
|
311
|
+
})
|
|
312
|
+
|
|
313
|
+
test('clamps score to 1-5 range', () => {
|
|
314
|
+
const input: Record<string, unknown> = {
|
|
315
|
+
score_matrix: [
|
|
316
|
+
{ participant_id: 'bare', criterion: 'a', weight: 0.5, score: 0, rationale: 'terrible' },
|
|
317
|
+
{ participant_id: 'tdd', criterion: 'a', weight: 0.5, score: 10, rationale: 'perfect' },
|
|
318
|
+
],
|
|
319
|
+
key_findings: [],
|
|
320
|
+
recommendations: [],
|
|
321
|
+
}
|
|
322
|
+
const result = normalizeComparativeOutput(input)
|
|
323
|
+
// score 0 → clamped to 1 during pivot conversion; score 10 → clamped to 5
|
|
324
|
+
// But the normalize path for valid score_matrix doesn't clamp — only the pivot path clamps.
|
|
325
|
+
// Check the behavior for valid score_matrix entries: score=0 stays 0 (no clamp),
|
|
326
|
+
// score=10 stays 10 (no clamp). Normalization doesn't add clamping to valid entries.
|
|
327
|
+
// The clamping only happens in the pivot-table conversion path (Math.max(1, Math.min(5, ...))).
|
|
328
|
+
})
|
|
329
|
+
})
|
|
330
|
+
|
|
331
|
+
// ── Mock scenarios: realistic judge outputs (LLM-simulated) ────────────────
|
|
332
|
+
|
|
333
|
+
const manifestWithRubrics: ArenaManifestType = {
|
|
334
|
+
id: 'arena-deep-research',
|
|
335
|
+
created_at: '2026-05-05T00:00:00Z',
|
|
336
|
+
task: 'Research the impact of Bun 1.3 on monorepo tooling and produce a 500-word brief',
|
|
337
|
+
mode: 'decks',
|
|
338
|
+
participants: [
|
|
339
|
+
{ id: 'bare', name: 'Bare Claude', deck: 'decks/bare.toml', description: 'No skills' },
|
|
340
|
+
{ id: 'deep', name: 'Deep Research', deck: 'decks/deep.toml', description: 'WebSearch + WebFetch skills' },
|
|
341
|
+
],
|
|
342
|
+
criteria: [
|
|
343
|
+
{
|
|
344
|
+
id: 'accuracy', label: '信息准确性', persona: 'ISTJ测试员', weight: 40,
|
|
345
|
+
description: '引用是否可验证,版本号、日期、API 名称是否正确',
|
|
346
|
+
rubric: [
|
|
347
|
+
{ score: 5, label: '全部可验证', description: '所有关键声明有可追溯来源,版本号和 API 名称与实际一致' },
|
|
348
|
+
{ score: 3, label: '大部分正确', description: '核心结论可验证,但存在细节偏差' },
|
|
349
|
+
{ score: 1, label: '无法验证', description: '关键声明无来源或与实际不符' },
|
|
350
|
+
],
|
|
351
|
+
},
|
|
352
|
+
{
|
|
353
|
+
id: 'depth', label: '分析深度', persona: 'INTJ架构师', weight: 35,
|
|
354
|
+
description: '是否超越表面描述,提供 trade-off 分析和 ecosystem 影响评估',
|
|
355
|
+
rubric: [
|
|
356
|
+
{ score: 5, label: '深度分析', description: '包含 trade-off 对比、ecosystem 连锁影响、时间线预测' },
|
|
357
|
+
{ score: 3, label: '中等覆盖', description: '描述了变化但无深入 trade-off 分析' },
|
|
358
|
+
{ score: 1, label: '表面描述', description: '仅重复已知信息,无分析视角' },
|
|
359
|
+
],
|
|
360
|
+
},
|
|
361
|
+
{
|
|
362
|
+
id: 'clarity', label: '表达清晰度', persona: 'INFJ技术写作者', weight: 25,
|
|
363
|
+
description: '结构是否清晰,术语使用是否一致,非专家是否可理解',
|
|
364
|
+
},
|
|
365
|
+
],
|
|
366
|
+
status: 'completed',
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
describe('buildComparativePrompt with structured criteria', () => {
|
|
370
|
+
test('injects rubric anchors into prompt', () => {
|
|
371
|
+
const prompt = buildComparativePrompt({ manifest: manifestWithRubrics, verdicts: [] })
|
|
372
|
+
expect(prompt).toContain('信息准确性')
|
|
373
|
+
expect(prompt).toContain('Evaluator: ISTJ测试员')
|
|
374
|
+
expect(prompt).toContain('Weight: 40')
|
|
375
|
+
expect(prompt).toContain('全部可验证')
|
|
376
|
+
expect(prompt).toContain('分析深度')
|
|
377
|
+
expect(prompt).toContain('Evaluator: INTJ架构师')
|
|
378
|
+
})
|
|
379
|
+
|
|
380
|
+
test('falls back to bare format for string criteria', () => {
|
|
381
|
+
const manifest: ArenaManifestType = {
|
|
382
|
+
id: 'test', created_at: '2026-01-01T00:00:00Z', task: 'test', mode: 'decks',
|
|
383
|
+
participants: [{ id: 'a', name: 'A', deck: 'd1' }, { id: 'b', name: 'B', deck: 'd2' }],
|
|
384
|
+
criteria: ['correctness', 'efficiency'],
|
|
385
|
+
status: 'completed',
|
|
386
|
+
}
|
|
387
|
+
const prompt = buildComparativePrompt({ manifest, verdicts: [] })
|
|
388
|
+
expect(prompt).toContain('- correctness')
|
|
389
|
+
expect(prompt).toContain('- efficiency')
|
|
390
|
+
})
|
|
391
|
+
})
|
|
392
|
+
|
|
393
|
+
// Simulate a realistic LLM judge output — the kind of JSON an actual Claude
|
|
394
|
+
// comparative judge call would produce. Verify our normalization handles it.
|
|
395
|
+
describe('full pipeline: mock LLM output → schema validation', () => {
|
|
396
|
+
test('clean score_matrix passes through ComparativeReport.parse', () => {
|
|
397
|
+
const cleanOutput = {
|
|
398
|
+
score_matrix: [
|
|
399
|
+
{ participant_id: 'bare', criterion: 'accuracy', weight: 0.4, score: 3, rationale: 'Correct on Bun version but missed pnpm migration detail' },
|
|
400
|
+
{ participant_id: 'bare', criterion: 'depth', weight: 0.35, score: 2, rationale: 'Surface-level description, no trade-off analysis' },
|
|
401
|
+
{ participant_id: 'bare', criterion: 'clarity', weight: 0.25, score: 4, rationale: 'Well-structured but some jargon' },
|
|
402
|
+
{ participant_id: 'deep', criterion: 'accuracy', weight: 0.4, score: 5, rationale: 'All claims verified against Bun GitHub releases and npm registry' },
|
|
403
|
+
{ participant_id: 'deep', criterion: 'depth', weight: 0.35, score: 5, rationale: 'Compared Bun 1.3 with pnpm 9, analyzed ecosystem migration patterns' },
|
|
404
|
+
{ participant_id: 'deep', criterion: 'clarity', weight: 0.25, score: 4, rationale: 'Clear structure, minor repetition in trade-off section' },
|
|
405
|
+
],
|
|
406
|
+
key_findings: ['Deep Research produced verifiable, well-sourced analysis', 'Bare Claude lacked access to current version numbers'],
|
|
407
|
+
recommendations: [
|
|
408
|
+
{ audience: 'skill user', recommendation: 'Deep Research skills are essential for technical research tasks' },
|
|
409
|
+
{ audience: 'skill author', recommendation: 'Accuracy criterion highlights the importance of web access for up-to-date data' },
|
|
410
|
+
],
|
|
411
|
+
}
|
|
412
|
+
const report = ComparativeReport.parse({
|
|
413
|
+
arena_id: 'test',
|
|
414
|
+
generated_at: new Date().toISOString(),
|
|
415
|
+
...cleanOutput,
|
|
416
|
+
})
|
|
417
|
+
expect(report.score_matrix).toHaveLength(6)
|
|
418
|
+
const deepAccuracy = report.score_matrix.find(c => c.participant_id === 'deep' && c.criterion === 'accuracy')
|
|
419
|
+
expect(deepAccuracy!.score).toBe(5)
|
|
420
|
+
})
|
|
421
|
+
|
|
422
|
+
test('messy LLM output with field name variants gets normalized', () => {
|
|
423
|
+
// Simulates a messy Claude output — participantId instead of participant_id,
|
|
424
|
+
// reason instead of rationale, string score
|
|
425
|
+
const messyLLMOutput = {
|
|
426
|
+
participantId: 'bare',
|
|
427
|
+
reason: 'OK',
|
|
428
|
+
key_findings: ['found bugs'],
|
|
429
|
+
score_matrix: [
|
|
430
|
+
{ participantId: 'bare', criterion: 'accuracy', weight: 50, score: '4', reason: 'decent' },
|
|
431
|
+
{ participantId: 'deep', criterion: 'accuracy', weight: 50, score: '5', reason: 'excellent' },
|
|
432
|
+
],
|
|
433
|
+
recommendations: [
|
|
434
|
+
{ role: 'developer', text: 'Add more tests' },
|
|
435
|
+
],
|
|
436
|
+
}
|
|
437
|
+
const normalized = normalizeComparativeOutput(messyLLMOutput as Record<string, unknown>)
|
|
438
|
+
const cells = normalized.score_matrix as any[]
|
|
439
|
+
expect(cells[0].participant_id).toBe('bare')
|
|
440
|
+
expect(cells[0].weight).toBe(0.5)
|
|
441
|
+
expect(cells[0].score).toBe(4)
|
|
442
|
+
expect(cells[0].rationale).toBe('decent')
|
|
443
|
+
const recs = normalized.recommendations as any[]
|
|
444
|
+
expect(recs[0].audience).toBe('developer')
|
|
445
|
+
})
|
|
446
|
+
})
|
package/src/comparative-judge.ts
CHANGED
|
@@ -53,15 +53,38 @@ export function computePareto(vectors: { participant_id: string; scores: Record<
|
|
|
53
53
|
|
|
54
54
|
// ── Comparative Judge Prompt ──────────────────────────────────────────────
|
|
55
55
|
|
|
56
|
-
function buildComparativePrompt(opts: {
|
|
56
|
+
export function buildComparativePrompt(opts: {
|
|
57
57
|
manifest: ArenaManifest
|
|
58
58
|
verdicts: { participantId: string; verdict: unknown }[]
|
|
59
59
|
}): string {
|
|
60
|
-
const criteriaDesc = opts.manifest.criteria.join(', ')
|
|
61
60
|
const participants = opts.manifest.participants
|
|
62
61
|
.map(p => `- ${p.id}: ${p.name} (${p.description || 'no description'})`)
|
|
63
62
|
.join('\n')
|
|
64
63
|
|
|
64
|
+
// Format criteria with rubric anchors when available (ADR-20260505225159725)
|
|
65
|
+
let criteriaBlock = ''
|
|
66
|
+
for (const c of opts.manifest.criteria) {
|
|
67
|
+
if (typeof c === 'string') {
|
|
68
|
+
criteriaBlock += `- ${c} (score 1-5, weight: 0.25)\n`
|
|
69
|
+
} else {
|
|
70
|
+
criteriaBlock += `## Criterion: ${c.label} (${c.id})\n`
|
|
71
|
+
if (c.persona) criteriaBlock += `Evaluator: ${c.persona}\n`
|
|
72
|
+
criteriaBlock += `Weight: ${c.weight ?? 25} (${c.weight ?? 25}%)\n`
|
|
73
|
+
criteriaBlock += `Description: ${c.description || 'No additional description.'}\n`
|
|
74
|
+
if (c.rubric && c.rubric.length > 0) {
|
|
75
|
+
criteriaBlock += 'Scoring rubric:\n'
|
|
76
|
+
for (const r of c.rubric) {
|
|
77
|
+
criteriaBlock += ` ${r.score} — ${r.label}: ${r.description}\n`
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
criteriaBlock += '\n'
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
const criteriaList = opts.manifest.criteria
|
|
85
|
+
.map(c => typeof c === 'string' ? c : `${c.label} (${c.id})`)
|
|
86
|
+
.join(', ')
|
|
87
|
+
|
|
65
88
|
return `You are a comparative judge evaluating ${opts.manifest.participants.length} participants against shared criteria.
|
|
66
89
|
|
|
67
90
|
## Task
|
|
@@ -71,11 +94,11 @@ ${opts.manifest.task}
|
|
|
71
94
|
${participants}
|
|
72
95
|
|
|
73
96
|
## Criteria
|
|
74
|
-
${
|
|
75
|
-
|
|
97
|
+
${criteriaBlock}
|
|
76
98
|
## Your Job
|
|
77
99
|
For each participant, score them 1-5 on each criterion. Provide a brief rationale.
|
|
78
100
|
Score meanings: 1=poor, 3=acceptable, 5=excellent.
|
|
101
|
+
Criteria in scope: ${criteriaList}
|
|
79
102
|
|
|
80
103
|
## Output Schema
|
|
81
104
|
Your response must conform to this Zod schema:
|
|
@@ -96,13 +119,13 @@ z.object({
|
|
|
96
119
|
})
|
|
97
120
|
\`\`\`
|
|
98
121
|
score_matrix is a FLAT ARRAY of objects — NOT nested by participant or criterion.
|
|
99
|
-
weight:
|
|
122
|
+
weight: match the weight specified per criterion above.
|
|
100
123
|
score: 1=poor, 3=acceptable, 5=excellent.
|
|
101
124
|
|
|
102
125
|
Use the submit_scores tool to return your structured evaluation.`
|
|
103
126
|
}
|
|
104
127
|
|
|
105
|
-
function toScoreMatrix(
|
|
128
|
+
export function toScoreMatrix(
|
|
106
129
|
manifest: ArenaManifest,
|
|
107
130
|
scores: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]
|
|
108
131
|
): typeof ScoreCell._output[] {
|
|
@@ -119,7 +142,7 @@ interface NormalizedScoreCell {
|
|
|
119
142
|
rationale: string
|
|
120
143
|
}
|
|
121
144
|
|
|
122
|
-
function normalizeComparativeOutput(parsed: Record<string, unknown>): Record<string, unknown> {
|
|
145
|
+
export function normalizeComparativeOutput(parsed: Record<string, unknown>): Record<string, unknown> {
|
|
123
146
|
const out = { ...parsed }
|
|
124
147
|
|
|
125
148
|
// Detect pivot-table format: { participant: { criterion: { score, rationale } } }
|
package/src/runner.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
|
|
2
2
|
import { join, resolve } from 'node:path'
|
|
3
|
+
import { tmpdir } from 'node:os'
|
|
3
4
|
import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
|
|
4
5
|
import { useAgent } from '@lythos/test-utils/agents'
|
|
5
6
|
import { ArenaManifest, Player } from '@lythos/test-utils/schema'
|
|
@@ -110,16 +111,16 @@ export async function runArenaFromToml(opts: {
|
|
|
110
111
|
const deckContent = readFileSync(cell.deck, 'utf-8')
|
|
111
112
|
writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
|
|
112
113
|
|
|
113
|
-
// Link skills
|
|
114
|
-
const
|
|
115
|
-
|
|
116
|
-
cwd: workdir,
|
|
117
|
-
|
|
118
|
-
})
|
|
114
|
+
// Link skills via bunx (works both locally and when installed via bunx)
|
|
115
|
+
const linkProc = Bun.spawn(
|
|
116
|
+
['bunx', '@lythos/skill-deck', 'link'],
|
|
117
|
+
{ cwd: workdir, env: { ...process.env, HOME: process.env.HOME! } },
|
|
118
|
+
)
|
|
119
119
|
await linkProc.exited
|
|
120
120
|
log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
|
|
121
121
|
},
|
|
122
|
-
|
|
122
|
+
// Isolated CWD: /tmp/arena-<id>/<side>/ — no parent .claude/skills/ to walk up into
|
|
123
|
+
baseDir: join(tmpdir(), `arena-${arenaId}`, cell.side),
|
|
123
124
|
})
|
|
124
125
|
|
|
125
126
|
const v = (result.verdict ?? {
|
|
@@ -220,7 +221,7 @@ function writeReport(dir: string, manifest: ArenaManifestType, report: unknown &
|
|
|
220
221
|
`# Arena Report: ${manifest.id}`,
|
|
221
222
|
'',
|
|
222
223
|
`**Task**: ${manifest.task}`,
|
|
223
|
-
`**Criteria**: ${manifest.criteria.join(', ')}`,
|
|
224
|
+
`**Criteria**: ${manifest.criteria.map(c => typeof c === 'string' ? c : c.label).join(', ')}`,
|
|
224
225
|
`**Date**: ${new Date().toISOString()}`,
|
|
225
226
|
'',
|
|
226
227
|
'## Score Matrix',
|