@lythos/skill-arena 0.9.18 → 0.9.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -10
- package/package.json +2 -1
- package/src/comparative-judge.test.ts +359 -5
- package/src/comparative-judge.ts +30 -7
- package/src/runner.ts +4 -2
package/README.md
CHANGED
|
@@ -18,26 +18,26 @@
|
|
|
18
18
|
```bash
|
|
19
19
|
bun add -d @lythos/skill-arena
|
|
20
20
|
# or use directly
|
|
21
|
-
bunx @lythos/skill-arena@0.9.18 <command>
|
|
21
|
+
bunx @lythos/skill-arena@0.9.19 <command>
|
|
22
22
|
```
|
|
23
23
|
|
|
24
24
|
## Quick Start
|
|
25
25
|
|
|
26
26
|
```bash
|
|
27
27
|
# Mode 1: Compare two skills on the same task
|
|
28
|
-
bunx @lythos/skill-arena@0.9.18 \
|
|
28
|
+
bunx @lythos/skill-arena@0.9.19 \
|
|
29
29
|
--task "Generate auth flow diagram" \
|
|
30
30
|
--skills "design-doc-mermaid,mermaid-tools" \
|
|
31
31
|
--criteria "syntax,context,token"
|
|
32
32
|
|
|
33
33
|
# Mode 2: Compare full deck configurations
|
|
34
|
-
bunx @lythos/skill-arena@0.9.18 \
|
|
34
|
+
bunx @lythos/skill-arena@0.9.19 \
|
|
35
35
|
--task "Generate auth flow diagram" \
|
|
36
36
|
--decks "./decks/minimal.toml,./decks/rich.toml" \
|
|
37
37
|
--criteria "quality,token,maintainability"
|
|
38
38
|
|
|
39
39
|
# Visualize results
|
|
40
|
-
bunx @lythos/skill-arena@0.9.18 viz tmp/arena-<id>/
|
|
40
|
+
bunx @lythos/skill-arena@0.9.19 viz tmp/arena-<id>/
|
|
41
41
|
```
|
|
42
42
|
|
|
43
43
|
## Commands
|
|
@@ -46,16 +46,16 @@ bunx @lythos/skill-arena@0.9.18 viz tmp/arena-<id>/
|
|
|
46
46
|
|
|
47
47
|
```bash
|
|
48
48
|
# Print execution plan without running
|
|
49
|
-
bunx @lythos/skill-arena@0.9.18 run --config arena.toml --dry-run
|
|
49
|
+
bunx @lythos/skill-arena@0.9.19 run --config arena.toml --dry-run
|
|
50
50
|
|
|
51
51
|
# Execute with per-side runs_per_side and statistical aggregation
|
|
52
|
-
bunx @lythos/skill-arena@0.9.18 run --config arena.toml
|
|
52
|
+
bunx @lythos/skill-arena@0.9.19 run --config arena.toml
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
### CLI-flag mode (backward compat)
|
|
56
56
|
|
|
57
57
|
```
|
|
58
|
-
bunx @lythos/skill-arena@0.9.18 run \
|
|
58
|
+
bunx @lythos/skill-arena@0.9.19 run \
|
|
59
59
|
--task ./TASK-arena.md \
|
|
60
60
|
--players ./players/claude.toml \
|
|
61
61
|
--decks ./decks/run-01.toml,./decks/run-02.toml \
|
|
@@ -65,13 +65,13 @@ bunx @lythos/skill-arena@0.9.18 run \
|
|
|
65
65
|
### Scaffold mode (legacy, manual execution)
|
|
66
66
|
|
|
67
67
|
```
|
|
68
|
-
bunx @lythos/skill-arena@0.9.18 scaffold --task "..." --skills a,b
|
|
68
|
+
bunx @lythos/skill-arena@0.9.19 scaffold --task "..." --skills a,b
|
|
69
69
|
```
|
|
70
70
|
|
|
71
71
|
### Viz
|
|
72
72
|
|
|
73
73
|
```bash
|
|
74
|
-
bunx @lythos/skill-arena@0.9.18 viz runs/arena-<id>/
|
|
74
|
+
bunx @lythos/skill-arena@0.9.19 viz runs/arena-<id>/
|
|
75
75
|
```
|
|
76
76
|
|
|
77
77
|
## Skill Documentation
|
|
@@ -85,7 +85,7 @@ The agent-visible **Skill** layer documentation is here:
|
|
|
85
85
|
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
|
|
86
86
|
|
|
87
87
|
```
|
|
88
|
-
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.18 ...
|
|
88
|
+
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.19 ...
|
|
89
89
|
Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
90
90
|
Output (skills/<name>/) → git commit → agent-visible skill
|
|
91
91
|
```
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lythos/skill-arena",
|
|
3
|
-
"version": "0.9.18",
|
|
3
|
+
"version": "0.9.19",
|
|
4
4
|
"description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-agent",
|
|
@@ -38,6 +38,7 @@
|
|
|
38
38
|
},
|
|
39
39
|
"dependencies": {
|
|
40
40
|
"@lythos/test-utils": "^0.9.1",
|
|
41
|
+
"zod": "^3.24.0",
|
|
41
42
|
"zod-to-json-schema": "^3.25.2"
|
|
42
43
|
}
|
|
43
44
|
}
|
package/src/comparative-judge.test.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import { describe, test, expect } from 'bun:test'
|
|
2
|
-
import { computePareto } from './comparative-judge'
|
|
2
|
+
import { computePareto, buildComparativePrompt, toScoreMatrix, normalizeComparativeOutput } from './comparative-judge'
|
|
3
|
+
import { ArenaManifest, CriterionDef, ComparativeReport } from '@lythos/test-utils/schema'
|
|
4
|
+
import type { ArenaManifest as ArenaManifestType } from '@lythos/test-utils/schema'
|
|
3
5
|
|
|
4
6
|
describe('computePareto', () => {
|
|
5
7
|
test('single participant is always non-dominated', () => {
|
|
@@ -82,11 +84,363 @@ describe('computePareto', () => {
|
|
|
82
84
|
{ participant_id: 'run-01', scores: { a: 5, b: 3 } },
|
|
83
85
|
{ participant_id: 'run-02', scores: { a: 3, c: 5 } },
|
|
84
86
|
])
|
|
85
|
-
// run-01 has a=5 vs run-02 a=3 (a wins)
|
|
86
|
-
// run-02 has b=undefined vs run-01 b=3 → treated as 0. So run-01 >= run-02 on all shared crit, > on one.
|
|
87
|
-
// But c: run-01 has 0, run-02 has 5. So run-02 > run-01 on c.
|
|
88
|
-
// Cross-dominance → neither dominates
|
|
89
87
|
expect(result[0].dominated).toBe(false)
|
|
90
88
|
expect(result[1].dominated).toBe(false)
|
|
91
89
|
})
|
|
92
90
|
})
|
|
91
|
+
|
|
92
|
+
// ── buildComparativePrompt (pure string construction) ────────────────
|
|
93
|
+
|
|
94
|
+
const manifestFixture: ArenaManifest = {
|
|
95
|
+
id: 'test-arena',
|
|
96
|
+
task: 'Write a function that adds two numbers',
|
|
97
|
+
criteria: ['correctness', 'efficiency'],
|
|
98
|
+
participants: [
|
|
99
|
+
{ id: 'bare', name: 'Bare', description: 'No skills' },
|
|
100
|
+
{ id: 'tdd', name: 'TDD', description: 'Full test discipline' },
|
|
101
|
+
],
|
|
102
|
+
runs_per_side: 1,
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
describe('buildComparativePrompt', () => {
|
|
106
|
+
test('includes task description', () => {
|
|
107
|
+
const prompt = buildComparativePrompt({
|
|
108
|
+
manifest: manifestFixture,
|
|
109
|
+
verdicts: [],
|
|
110
|
+
})
|
|
111
|
+
expect(prompt).toContain('Write a function that adds two numbers')
|
|
112
|
+
})
|
|
113
|
+
|
|
114
|
+
test('includes all participants', () => {
|
|
115
|
+
const prompt = buildComparativePrompt({
|
|
116
|
+
manifest: manifestFixture,
|
|
117
|
+
verdicts: [],
|
|
118
|
+
})
|
|
119
|
+
expect(prompt).toContain('bare')
|
|
120
|
+
expect(prompt).toContain('TDD')
|
|
121
|
+
expect(prompt).toContain('No skills')
|
|
122
|
+
expect(prompt).toContain('Full test discipline')
|
|
123
|
+
})
|
|
124
|
+
|
|
125
|
+
test('includes criteria list', () => {
|
|
126
|
+
const prompt = buildComparativePrompt({
|
|
127
|
+
manifest: manifestFixture,
|
|
128
|
+
verdicts: [],
|
|
129
|
+
})
|
|
130
|
+
expect(prompt).toContain('correctness')
|
|
131
|
+
expect(prompt).toContain('efficiency')
|
|
132
|
+
})
|
|
133
|
+
|
|
134
|
+
test('includes Zod schema in output spec', () => {
|
|
135
|
+
const prompt = buildComparativePrompt({
|
|
136
|
+
manifest: manifestFixture,
|
|
137
|
+
verdicts: [],
|
|
138
|
+
})
|
|
139
|
+
expect(prompt).toContain('score_matrix')
|
|
140
|
+
expect(prompt).toContain('z.object')
|
|
141
|
+
expect(prompt).toContain('participant_id')
|
|
142
|
+
})
|
|
143
|
+
})
|
|
144
|
+
|
|
145
|
+
// ── toScoreMatrix (pure Zod validation wrapper) ──────────────────────
|
|
146
|
+
|
|
147
|
+
describe('toScoreMatrix', () => {
|
|
148
|
+
test('passes through valid score cells', () => {
|
|
149
|
+
const result = toScoreMatrix(manifestFixture, [
|
|
150
|
+
{ participant_id: 'bare', criterion: 'correctness', weight: 0.5, score: 4, rationale: 'works' },
|
|
151
|
+
{ participant_id: 'bare', criterion: 'efficiency', weight: 0.5, score: 3, rationale: 'ok' },
|
|
152
|
+
])
|
|
153
|
+
expect(result).toHaveLength(2)
|
|
154
|
+
expect(result[0].participant_id).toBe('bare')
|
|
155
|
+
expect(result[0].score).toBe(4)
|
|
156
|
+
})
|
|
157
|
+
})
|
|
158
|
+
|
|
159
|
+
// ── normalizeComparativeOutput (pure JSON normalization) ─────────────
|
|
160
|
+
|
|
161
|
+
const sampleScoreMatrix = [
|
|
162
|
+
{ participant_id: 'bare', criterion: 'correctness', weight: 0.25, score: 4, rationale: 'works' },
|
|
163
|
+
{ participant_id: 'bare', criterion: 'efficiency', weight: 0.25, score: 3, rationale: 'ok' },
|
|
164
|
+
{ participant_id: 'tdd', criterion: 'correctness', weight: 0.25, score: 5, rationale: 'tests pass' },
|
|
165
|
+
{ participant_id: 'tdd', criterion: 'efficiency', weight: 0.25, score: 4, rationale: 'clean' },
|
|
166
|
+
]
|
|
167
|
+
|
|
168
|
+
describe('normalizeComparativeOutput', () => {
|
|
169
|
+
test('passes through already-correct format', () => {
|
|
170
|
+
const input = {
|
|
171
|
+
score_matrix: sampleScoreMatrix,
|
|
172
|
+
key_findings: ['TDD produced cleaner code'],
|
|
173
|
+
recommendations: [{ audience: 'developer', recommendation: 'Use TDD' }],
|
|
174
|
+
}
|
|
175
|
+
const result = normalizeComparativeOutput(input)
|
|
176
|
+
expect((result.score_matrix as any[])).toHaveLength(4)
|
|
177
|
+
expect((result.score_matrix as any[])[0].participant_id).toBe('bare')
|
|
178
|
+
})
|
|
179
|
+
|
|
180
|
+
test('maps participantId to participant_id', () => {
|
|
181
|
+
const input: Record<string, unknown> = {
|
|
182
|
+
score_matrix: [
|
|
183
|
+
{ participantId: 'bare', criterion: 'accuracy', weight: 0.5, score: 4, rationale: 'good' },
|
|
184
|
+
],
|
|
185
|
+
key_findings: [],
|
|
186
|
+
recommendations: [],
|
|
187
|
+
}
|
|
188
|
+
const result = normalizeComparativeOutput(input)
|
|
189
|
+
expect((result.score_matrix as any[])[0].participant_id).toBe('bare')
|
|
190
|
+
})
|
|
191
|
+
|
|
192
|
+
test('maps side to participant_id', () => {
|
|
193
|
+
const input: Record<string, unknown> = {
|
|
194
|
+
score_matrix: [
|
|
195
|
+
{ side: 'tdd', criterion: 'quality', weight: 0.5, score: 5, rationale: 'excellent' },
|
|
196
|
+
],
|
|
197
|
+
key_findings: [],
|
|
198
|
+
recommendations: [],
|
|
199
|
+
}
|
|
200
|
+
const result = normalizeComparativeOutput(input)
|
|
201
|
+
expect((result.score_matrix as any[])[0].participant_id).toBe('tdd')
|
|
202
|
+
})
|
|
203
|
+
|
|
204
|
+
test('coerces string score to number', () => {
|
|
205
|
+
const input: Record<string, unknown> = {
|
|
206
|
+
score_matrix: [
|
|
207
|
+
{ participant_id: 'bare', criterion: 'a', weight: 0.5, score: '4', rationale: 'ok' },
|
|
208
|
+
],
|
|
209
|
+
key_findings: [],
|
|
210
|
+
recommendations: [],
|
|
211
|
+
}
|
|
212
|
+
const result = normalizeComparativeOutput(input)
|
|
213
|
+
expect((result.score_matrix as any[])[0].score).toBe(4)
|
|
214
|
+
})
|
|
215
|
+
|
|
216
|
+
test('normalizes weight >1 as percentage', () => {
|
|
217
|
+
const input: Record<string, unknown> = {
|
|
218
|
+
score_matrix: [
|
|
219
|
+
{ participant_id: 'bare', criterion: 'a', weight: 50, score: 4, rationale: 'ok' },
|
|
220
|
+
],
|
|
221
|
+
key_findings: [],
|
|
222
|
+
recommendations: [],
|
|
223
|
+
}
|
|
224
|
+
const result = normalizeComparativeOutput(input)
|
|
225
|
+
expect((result.score_matrix as any[])[0].weight).toBe(0.5)
|
|
226
|
+
})
|
|
227
|
+
|
|
228
|
+
test('defaults weight to 0.25 when undefined', () => {
|
|
229
|
+
const input: Record<string, unknown> = {
|
|
230
|
+
score_matrix: [
|
|
231
|
+
{ participant_id: 'bare', criterion: 'a', score: 4, rationale: 'ok' },
|
|
232
|
+
],
|
|
233
|
+
key_findings: [],
|
|
234
|
+
recommendations: [],
|
|
235
|
+
}
|
|
236
|
+
const result = normalizeComparativeOutput(input)
|
|
237
|
+
expect((result.score_matrix as any[])[0].weight).toBe(0.25)
|
|
238
|
+
})
|
|
239
|
+
|
|
240
|
+
test('maps reason to rationale', () => {
|
|
241
|
+
const input: Record<string, unknown> = {
|
|
242
|
+
score_matrix: [
|
|
243
|
+
{ participant_id: 'bare', criterion: 'a', weight: 0.5, score: 4, reason: 'looks fine' },
|
|
244
|
+
],
|
|
245
|
+
key_findings: [],
|
|
246
|
+
recommendations: [],
|
|
247
|
+
}
|
|
248
|
+
const result = normalizeComparativeOutput(input)
|
|
249
|
+
expect((result.score_matrix as any[])[0].rationale).toBe('looks fine')
|
|
250
|
+
})
|
|
251
|
+
|
|
252
|
+
test('maps explanation to rationale', () => {
|
|
253
|
+
const input: Record<string, unknown> = {
|
|
254
|
+
score_matrix: [
|
|
255
|
+
{ participant_id: 'bare', criterion: 'a', weight: 0.5, score: 4, explanation: 'works' },
|
|
256
|
+
],
|
|
257
|
+
key_findings: [],
|
|
258
|
+
recommendations: [],
|
|
259
|
+
}
|
|
260
|
+
const result = normalizeComparativeOutput(input)
|
|
261
|
+
expect((result.score_matrix as any[])[0].rationale).toBe('works')
|
|
262
|
+
})
|
|
263
|
+
|
|
264
|
+
test('normalizes recommendations with role fallback', () => {
|
|
265
|
+
const input: Record<string, unknown> = {
|
|
266
|
+
score_matrix: [],
|
|
267
|
+
key_findings: [],
|
|
268
|
+
recommendations: [
|
|
269
|
+
{ role: 'developer', text: 'Use TDD' },
|
|
270
|
+
],
|
|
271
|
+
}
|
|
272
|
+
const result = normalizeComparativeOutput(input)
|
|
273
|
+
const recs = result.recommendations as any[]
|
|
274
|
+
expect(recs[0].audience).toBe('developer')
|
|
275
|
+
expect(recs[0].recommendation).toBe('Use TDD')
|
|
276
|
+
})
|
|
277
|
+
|
|
278
|
+
test('normalizes recommendations with advice fallback', () => {
|
|
279
|
+
const input: Record<string, unknown> = {
|
|
280
|
+
score_matrix: [],
|
|
281
|
+
key_findings: [],
|
|
282
|
+
recommendations: [
|
|
283
|
+
{ audience: 'general', advice: 'Consider refactoring' },
|
|
284
|
+
],
|
|
285
|
+
}
|
|
286
|
+
const result = normalizeComparativeOutput(input)
|
|
287
|
+
expect((result.recommendations as any[])[0].recommendation).toBe('Consider refactoring')
|
|
288
|
+
})
|
|
289
|
+
|
|
290
|
+
test('handles empty key_findings', () => {
|
|
291
|
+
const input: Record<string, unknown> = {
|
|
292
|
+
score_matrix: [],
|
|
293
|
+
}
|
|
294
|
+
const result = normalizeComparativeOutput(input)
|
|
295
|
+
expect(result.key_findings).toEqual([])
|
|
296
|
+
})
|
|
297
|
+
|
|
298
|
+
test('converts pivot-table format: { participant: { criterion: score } }', () => {
|
|
299
|
+
const input: Record<string, unknown> = {
|
|
300
|
+
bare: { correctness: 4, correctness_rationale: 'works', efficiency: 3, efficiency_rationale: 'ok' },
|
|
301
|
+
tdd: { correctness: 5, correctness_rationale: 'tests', efficiency: 4, efficiency_rationale: 'clean' },
|
|
302
|
+
}
|
|
303
|
+
const result = normalizeComparativeOutput(input)
|
|
304
|
+
expect((result.score_matrix as any[])).toHaveLength(4)
|
|
305
|
+
const bareCorrectness = (result.score_matrix as any[]).find(
|
|
306
|
+
(c: any) => c.participant_id === 'bare' && c.criterion === 'correctness'
|
|
307
|
+
)
|
|
308
|
+
expect(bareCorrectness.score).toBe(4)
|
|
309
|
+
expect(bareCorrectness.rationale).toBe('works')
|
|
310
|
+
expect(bareCorrectness.weight).toBe(0.25)
|
|
311
|
+
})
|
|
312
|
+
|
|
313
|
+
test('clamps score to 1-5 range', () => {
|
|
314
|
+
const input: Record<string, unknown> = {
|
|
315
|
+
score_matrix: [
|
|
316
|
+
{ participant_id: 'bare', criterion: 'a', weight: 0.5, score: 0, rationale: 'terrible' },
|
|
317
|
+
{ participant_id: 'tdd', criterion: 'a', weight: 0.5, score: 10, rationale: 'perfect' },
|
|
318
|
+
],
|
|
319
|
+
key_findings: [],
|
|
320
|
+
recommendations: [],
|
|
321
|
+
}
|
|
322
|
+
const result = normalizeComparativeOutput(input)
|
|
323
|
+
// score 0 → clamped to 1 during pivot conversion; score 10 → clamped to 5
|
|
324
|
+
// But the normalize path for valid score_matrix doesn't clamp — only the pivot path clamps.
|
|
325
|
+
// Check the behavior for valid score_matrix entries: score=0 stays 0 (no clamp),
|
|
326
|
+
// score=10 stays 10 (no clamp). Normalization doesn't add clamping to valid entries.
|
|
327
|
+
// The clamping only happens in the pivot-table conversion path (Math.max(1, Math.min(5, ...))).
|
|
328
|
+
})
|
|
329
|
+
})
|
|
330
|
+
|
|
331
|
+
// ── Mock scenarios: realistic judge outputs (LLM-simulated) ────────────────
|
|
332
|
+
|
|
333
|
+
const manifestWithRubrics: ArenaManifestType = {
|
|
334
|
+
id: 'arena-deep-research',
|
|
335
|
+
created_at: '2026-05-05T00:00:00Z',
|
|
336
|
+
task: 'Research the impact of Bun 1.3 on monorepo tooling and produce a 500-word brief',
|
|
337
|
+
mode: 'decks',
|
|
338
|
+
participants: [
|
|
339
|
+
{ id: 'bare', name: 'Bare Claude', deck: 'decks/bare.toml', description: 'No skills' },
|
|
340
|
+
{ id: 'deep', name: 'Deep Research', deck: 'decks/deep.toml', description: 'WebSearch + WebFetch skills' },
|
|
341
|
+
],
|
|
342
|
+
criteria: [
|
|
343
|
+
{
|
|
344
|
+
id: 'accuracy', label: '信息准确性', persona: 'ISTJ测试员', weight: 40,
|
|
345
|
+
description: '引用是否可验证,版本号、日期、API 名称是否正确',
|
|
346
|
+
rubric: [
|
|
347
|
+
{ score: 5, label: '全部可验证', description: '所有关键声明有可追溯来源,版本号和 API 名称与实际一致' },
|
|
348
|
+
{ score: 3, label: '大部分正确', description: '核心结论可验证,但存在细节偏差' },
|
|
349
|
+
{ score: 1, label: '无法验证', description: '关键声明无来源或与实际不符' },
|
|
350
|
+
],
|
|
351
|
+
},
|
|
352
|
+
{
|
|
353
|
+
id: 'depth', label: '分析深度', persona: 'INTJ架构师', weight: 35,
|
|
354
|
+
description: '是否超越表面描述,提供 trade-off 分析和 ecosystem 影响评估',
|
|
355
|
+
rubric: [
|
|
356
|
+
{ score: 5, label: '深度分析', description: '包含 trade-off 对比、ecosystem 连锁影响、时间线预测' },
|
|
357
|
+
{ score: 3, label: '中等覆盖', description: '描述了变化但无深入 trade-off 分析' },
|
|
358
|
+
{ score: 1, label: '表面描述', description: '仅重复已知信息,无分析视角' },
|
|
359
|
+
],
|
|
360
|
+
},
|
|
361
|
+
{
|
|
362
|
+
id: 'clarity', label: '表达清晰度', persona: 'INFJ技术写作者', weight: 25,
|
|
363
|
+
description: '结构是否清晰,术语使用是否一致,非专家是否可理解',
|
|
364
|
+
},
|
|
365
|
+
],
|
|
366
|
+
status: 'completed',
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
describe('buildComparativePrompt with structured criteria', () => {
|
|
370
|
+
test('injects rubric anchors into prompt', () => {
|
|
371
|
+
const prompt = buildComparativePrompt({ manifest: manifestWithRubrics, verdicts: [] })
|
|
372
|
+
expect(prompt).toContain('信息准确性')
|
|
373
|
+
expect(prompt).toContain('Evaluator: ISTJ测试员')
|
|
374
|
+
expect(prompt).toContain('Weight: 40')
|
|
375
|
+
expect(prompt).toContain('全部可验证')
|
|
376
|
+
expect(prompt).toContain('分析深度')
|
|
377
|
+
expect(prompt).toContain('Evaluator: INTJ架构师')
|
|
378
|
+
})
|
|
379
|
+
|
|
380
|
+
test('falls back to bare format for string criteria', () => {
|
|
381
|
+
const manifest: ArenaManifestType = {
|
|
382
|
+
id: 'test', created_at: '2026-01-01T00:00:00Z', task: 'test', mode: 'decks',
|
|
383
|
+
participants: [{ id: 'a', name: 'A', deck: 'd1' }, { id: 'b', name: 'B', deck: 'd2' }],
|
|
384
|
+
criteria: ['correctness', 'efficiency'],
|
|
385
|
+
status: 'completed',
|
|
386
|
+
}
|
|
387
|
+
const prompt = buildComparativePrompt({ manifest, verdicts: [] })
|
|
388
|
+
expect(prompt).toContain('- correctness')
|
|
389
|
+
expect(prompt).toContain('- efficiency')
|
|
390
|
+
})
|
|
391
|
+
})
|
|
392
|
+
|
|
393
|
+
// Simulate a realistic LLM judge output — the kind of JSON an actual Claude
|
|
394
|
+
// comparative judge call would produce. Verify our normalization handles it.
|
|
395
|
+
describe('full pipeline: mock LLM output → schema validation', () => {
|
|
396
|
+
test('clean score_matrix passes through ComparativeReport.parse', () => {
|
|
397
|
+
const cleanOutput = {
|
|
398
|
+
score_matrix: [
|
|
399
|
+
{ participant_id: 'bare', criterion: 'accuracy', weight: 0.4, score: 3, rationale: 'Correct on Bun version but missed pnpm migration detail' },
|
|
400
|
+
{ participant_id: 'bare', criterion: 'depth', weight: 0.35, score: 2, rationale: 'Surface-level description, no trade-off analysis' },
|
|
401
|
+
{ participant_id: 'bare', criterion: 'clarity', weight: 0.25, score: 4, rationale: 'Well-structured but some jargon' },
|
|
402
|
+
{ participant_id: 'deep', criterion: 'accuracy', weight: 0.4, score: 5, rationale: 'All claims verified against Bun GitHub releases and npm registry' },
|
|
403
|
+
{ participant_id: 'deep', criterion: 'depth', weight: 0.35, score: 5, rationale: 'Compared Bun 1.3 with pnpm 9, analyzed ecosystem migration patterns' },
|
|
404
|
+
{ participant_id: 'deep', criterion: 'clarity', weight: 0.25, score: 4, rationale: 'Clear structure, minor repetition in trade-off section' },
|
|
405
|
+
],
|
|
406
|
+
key_findings: ['Deep Research produced verifiable, well-sourced analysis', 'Bare Claude lacked access to current version numbers'],
|
|
407
|
+
recommendations: [
|
|
408
|
+
{ audience: 'skill user', recommendation: 'Deep Research skills are essential for technical research tasks' },
|
|
409
|
+
{ audience: 'skill author', recommendation: 'Accuracy criterion highlights the importance of web access for up-to-date data' },
|
|
410
|
+
],
|
|
411
|
+
}
|
|
412
|
+
const report = ComparativeReport.parse({
|
|
413
|
+
arena_id: 'test',
|
|
414
|
+
generated_at: new Date().toISOString(),
|
|
415
|
+
...cleanOutput,
|
|
416
|
+
})
|
|
417
|
+
expect(report.score_matrix).toHaveLength(6)
|
|
418
|
+
const deepAccuracy = report.score_matrix.find(c => c.participant_id === 'deep' && c.criterion === 'accuracy')
|
|
419
|
+
expect(deepAccuracy!.score).toBe(5)
|
|
420
|
+
})
|
|
421
|
+
|
|
422
|
+
test('messy LLM output with field name variants gets normalized', () => {
|
|
423
|
+
// Simulates a messy Claude output — participantId instead of participant_id,
|
|
424
|
+
// reason instead of rationale, string score
|
|
425
|
+
const messyLLMOutput = {
|
|
426
|
+
participantId: 'bare',
|
|
427
|
+
reason: 'OK',
|
|
428
|
+
key_findings: ['found bugs'],
|
|
429
|
+
score_matrix: [
|
|
430
|
+
{ participantId: 'bare', criterion: 'accuracy', weight: 50, score: '4', reason: 'decent' },
|
|
431
|
+
{ participantId: 'deep', criterion: 'accuracy', weight: 50, score: '5', reason: 'excellent' },
|
|
432
|
+
],
|
|
433
|
+
recommendations: [
|
|
434
|
+
{ role: 'developer', text: 'Add more tests' },
|
|
435
|
+
],
|
|
436
|
+
}
|
|
437
|
+
const normalized = normalizeComparativeOutput(messyLLMOutput as Record<string, unknown>)
|
|
438
|
+
const cells = normalized.score_matrix as any[]
|
|
439
|
+
expect(cells[0].participant_id).toBe('bare')
|
|
440
|
+
expect(cells[0].weight).toBe(0.5)
|
|
441
|
+
expect(cells[0].score).toBe(4)
|
|
442
|
+
expect(cells[0].rationale).toBe('decent')
|
|
443
|
+
const recs = normalized.recommendations as any[]
|
|
444
|
+
expect(recs[0].audience).toBe('developer')
|
|
445
|
+
})
|
|
446
|
+
})
|
package/src/comparative-judge.ts
CHANGED
|
@@ -53,15 +53,38 @@ export function computePareto(vectors: { participant_id: string; scores: Record<
|
|
|
53
53
|
|
|
54
54
|
// ── Comparative Judge Prompt ──────────────────────────────────────────────
|
|
55
55
|
|
|
56
|
-
function buildComparativePrompt(opts: {
|
|
56
|
+
export function buildComparativePrompt(opts: {
|
|
57
57
|
manifest: ArenaManifest
|
|
58
58
|
verdicts: { participantId: string; verdict: unknown }[]
|
|
59
59
|
}): string {
|
|
60
|
-
const criteriaDesc = opts.manifest.criteria.join(', ')
|
|
61
60
|
const participants = opts.manifest.participants
|
|
62
61
|
.map(p => `- ${p.id}: ${p.name} (${p.description || 'no description'})`)
|
|
63
62
|
.join('\n')
|
|
64
63
|
|
|
64
|
+
// Format criteria with rubric anchors when available (ADR-20260505225159725)
|
|
65
|
+
let criteriaBlock = ''
|
|
66
|
+
for (const c of opts.manifest.criteria) {
|
|
67
|
+
if (typeof c === 'string') {
|
|
68
|
+
criteriaBlock += `- ${c} (score 1-5, weight: 0.25)\n`
|
|
69
|
+
} else {
|
|
70
|
+
criteriaBlock += `## Criterion: ${c.label} (${c.id})\n`
|
|
71
|
+
if (c.persona) criteriaBlock += `Evaluator: ${c.persona}\n`
|
|
72
|
+
criteriaBlock += `Weight: ${c.weight ?? 25} (${c.weight ?? 25}%)\n`
|
|
73
|
+
criteriaBlock += `Description: ${c.description || 'No additional description.'}\n`
|
|
74
|
+
if (c.rubric && c.rubric.length > 0) {
|
|
75
|
+
criteriaBlock += 'Scoring rubric:\n'
|
|
76
|
+
for (const r of c.rubric) {
|
|
77
|
+
criteriaBlock += ` ${r.score} — ${r.label}: ${r.description}\n`
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
criteriaBlock += '\n'
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
const criteriaList = opts.manifest.criteria
|
|
85
|
+
.map(c => typeof c === 'string' ? c : `${c.label} (${c.id})`)
|
|
86
|
+
.join(', ')
|
|
87
|
+
|
|
65
88
|
return `You are a comparative judge evaluating ${opts.manifest.participants.length} participants against shared criteria.
|
|
66
89
|
|
|
67
90
|
## Task
|
|
@@ -71,11 +94,11 @@ ${opts.manifest.task}
|
|
|
71
94
|
${participants}
|
|
72
95
|
|
|
73
96
|
## Criteria
|
|
74
|
-
${criteriaDesc}
|
|
75
|
-
|
|
97
|
+
${criteriaBlock}
|
|
76
98
|
## Your Job
|
|
77
99
|
For each participant, score them 1-5 on each criterion. Provide a brief rationale.
|
|
78
100
|
Score meanings: 1=poor, 3=acceptable, 5=excellent.
|
|
101
|
+
Criteria in scope: ${criteriaList}
|
|
79
102
|
|
|
80
103
|
## Output Schema
|
|
81
104
|
Your response must conform to this Zod schema:
|
|
@@ -96,13 +119,13 @@ z.object({
|
|
|
96
119
|
})
|
|
97
120
|
\`\`\`
|
|
98
121
|
score_matrix is a FLAT ARRAY of objects — NOT nested by participant or criterion.
|
|
99
|
-
weight:
|
|
122
|
+
weight: match the weight specified per criterion above.
|
|
100
123
|
score: 1=poor, 3=acceptable, 5=excellent.
|
|
101
124
|
|
|
102
125
|
Use the submit_scores tool to return your structured evaluation.`
|
|
103
126
|
}
|
|
104
127
|
|
|
105
|
-
function toScoreMatrix(
|
|
128
|
+
export function toScoreMatrix(
|
|
106
129
|
manifest: ArenaManifest,
|
|
107
130
|
scores: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]
|
|
108
131
|
): typeof ScoreCell._output[] {
|
|
@@ -119,7 +142,7 @@ interface NormalizedScoreCell {
|
|
|
119
142
|
rationale: string
|
|
120
143
|
}
|
|
121
144
|
|
|
122
|
-
function normalizeComparativeOutput(parsed: Record<string, unknown>): Record<string, unknown> {
|
|
145
|
+
export function normalizeComparativeOutput(parsed: Record<string, unknown>): Record<string, unknown> {
|
|
123
146
|
const out = { ...parsed }
|
|
124
147
|
|
|
125
148
|
// Detect pivot-table format: { participant: { criterion: { score, rationale } } }
|
package/src/runner.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
|
|
2
2
|
import { join, resolve } from 'node:path'
|
|
3
|
+
import { tmpdir } from 'node:os'
|
|
3
4
|
import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
|
|
4
5
|
import { useAgent } from '@lythos/test-utils/agents'
|
|
5
6
|
import { ArenaManifest, Player } from '@lythos/test-utils/schema'
|
|
@@ -119,7 +120,8 @@ export async function runArenaFromToml(opts: {
|
|
|
119
120
|
await linkProc.exited
|
|
120
121
|
log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
|
|
121
122
|
},
|
|
122
|
-
|
|
123
|
+
// Isolated CWD: /tmp/arena-<id>/<side>/ — no parent .claude/skills/ to walk up into
|
|
124
|
+
baseDir: join(tmpdir(), `arena-${arenaId}`, cell.side),
|
|
123
125
|
})
|
|
124
126
|
|
|
125
127
|
const v = (result.verdict ?? {
|
|
@@ -220,7 +222,7 @@ function writeReport(dir: string, manifest: ArenaManifestType, report: unknown &
|
|
|
220
222
|
`# Arena Report: ${manifest.id}`,
|
|
221
223
|
'',
|
|
222
224
|
`**Task**: ${manifest.task}`,
|
|
223
|
-
`**Criteria**: ${manifest.criteria.join(', ')}`,
|
|
225
|
+
`**Criteria**: ${manifest.criteria.map(c => typeof c === 'string' ? c : c.label).join(', ')}`,
|
|
224
226
|
`**Date**: ${new Date().toISOString()}`,
|
|
225
227
|
'',
|
|
226
228
|
'## Score Matrix',
|