@lythos/skill-arena 0.9.18 → 0.9.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -18,26 +18,26 @@
18
18
  ```bash
19
19
  bun add -d @lythos/skill-arena
20
20
  # or use directly
21
- bunx @lythos/skill-arena@0.9.18 <command>
21
+ bunx @lythos/skill-arena@0.9.19 <command>
22
22
  ```
23
23
 
24
24
  ## Quick Start
25
25
 
26
26
  ```bash
27
27
  # Mode 1: Compare two skills on the same task
28
- bunx @lythos/skill-arena@0.9.18 \
28
+ bunx @lythos/skill-arena@0.9.19 \
29
29
  --task "Generate auth flow diagram" \
30
30
  --skills "design-doc-mermaid,mermaid-tools" \
31
31
  --criteria "syntax,context,token"
32
32
 
33
33
  # Mode 2: Compare full deck configurations
34
- bunx @lythos/skill-arena@0.9.18 \
34
+ bunx @lythos/skill-arena@0.9.19 \
35
35
  --task "Generate auth flow diagram" \
36
36
  --decks "./decks/minimal.toml,./decks/rich.toml" \
37
37
  --criteria "quality,token,maintainability"
38
38
 
39
39
  # Visualize results
40
- bunx @lythos/skill-arena@0.9.18 viz tmp/arena-<id>/
40
+ bunx @lythos/skill-arena@0.9.19 viz tmp/arena-<id>/
41
41
  ```
42
42
 
43
43
  ## Commands
@@ -46,16 +46,16 @@ bunx @lythos/skill-arena@0.9.18 viz tmp/arena-<id>/
46
46
 
47
47
  ```bash
48
48
  # Print execution plan without running
49
- bunx @lythos/skill-arena@0.9.18 run --config arena.toml --dry-run
49
+ bunx @lythos/skill-arena@0.9.19 run --config arena.toml --dry-run
50
50
 
51
51
  # Execute with per-side runs_per_side and statistical aggregation
52
- bunx @lythos/skill-arena@0.9.18 run --config arena.toml
52
+ bunx @lythos/skill-arena@0.9.19 run --config arena.toml
53
53
  ```
54
54
 
55
55
  ### CLI-flag mode (backward compat)
56
56
 
57
57
  ```
58
- bunx @lythos/skill-arena@0.9.18 run \
58
+ bunx @lythos/skill-arena@0.9.19 run \
59
59
  --task ./TASK-arena.md \
60
60
  --players ./players/claude.toml \
61
61
  --decks ./decks/run-01.toml,./decks/run-02.toml \
@@ -65,13 +65,13 @@ bunx @lythos/skill-arena@0.9.18 run \
65
65
  ### Scaffold mode (legacy, manual execution)
66
66
 
67
67
  ```
68
- bunx @lythos/skill-arena@0.9.18 scaffold --task "..." --skills a,b
68
+ bunx @lythos/skill-arena@0.9.19 scaffold --task "..." --skills a,b
69
69
  ```
70
70
 
71
71
  ### Viz
72
72
 
73
73
  ```bash
74
- bunx @lythos/skill-arena@0.9.18 viz runs/arena-<id>/
74
+ bunx @lythos/skill-arena@0.9.19 viz runs/arena-<id>/
75
75
  ```
76
76
 
77
77
  ## Skill Documentation
@@ -85,7 +85,7 @@ The agent-visible **Skill** layer documentation is here:
85
85
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
86
86
 
87
87
  ```
88
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.18 ...
88
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.9.19 ...
89
89
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
90
90
  Output (skills/<name>/) → git commit → agent-visible skill
91
91
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.18",
3
+ "version": "0.9.19",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -38,6 +38,7 @@
38
38
  },
39
39
  "dependencies": {
40
40
  "@lythos/test-utils": "^0.9.1",
41
+ "zod": "^3.24.0",
41
42
  "zod-to-json-schema": "^3.25.2"
42
43
  }
43
44
  }
@@ -1,5 +1,7 @@
1
1
  import { describe, test, expect } from 'bun:test'
2
- import { computePareto } from './comparative-judge'
2
+ import { computePareto, buildComparativePrompt, toScoreMatrix, normalizeComparativeOutput } from './comparative-judge'
3
+ import { ArenaManifest, CriterionDef, ComparativeReport } from '@lythos/test-utils/schema'
4
+ import type { ArenaManifest as ArenaManifestType } from '@lythos/test-utils/schema'
3
5
 
4
6
  describe('computePareto', () => {
5
7
  test('single participant is always non-dominated', () => {
@@ -82,11 +84,363 @@ describe('computePareto', () => {
82
84
  { participant_id: 'run-01', scores: { a: 5, b: 3 } },
83
85
  { participant_id: 'run-02', scores: { a: 3, c: 5 } },
84
86
  ])
85
- // run-01 has a=5 vs run-02 a=3 (a wins)
86
- // run-02 has b=undefined vs run-01 b=3 → treated as 0. So run-01 >= run-02 on all shared crit, > on one.
87
- // But c: run-01 has 0, run-02 has 5. So run-02 > run-01 on c.
88
- // Cross-dominance → neither dominates
89
87
  expect(result[0].dominated).toBe(false)
90
88
  expect(result[1].dominated).toBe(false)
91
89
  })
92
90
  })
91
+
92
+ // ── buildComparativePrompt (pure string construction) ────────────────
93
+
94
+ const manifestFixture: ArenaManifest = {
95
+ id: 'test-arena',
96
+ task: 'Write a function that adds two numbers',
97
+ criteria: ['correctness', 'efficiency'],
98
+ participants: [
99
+ { id: 'bare', name: 'Bare', description: 'No skills' },
100
+ { id: 'tdd', name: 'TDD', description: 'Full test discipline' },
101
+ ],
102
+ runs_per_side: 1,
103
+ }
104
+
105
+ describe('buildComparativePrompt', () => {
106
+ test('includes task description', () => {
107
+ const prompt = buildComparativePrompt({
108
+ manifest: manifestFixture,
109
+ verdicts: [],
110
+ })
111
+ expect(prompt).toContain('Write a function that adds two numbers')
112
+ })
113
+
114
+ test('includes all participants', () => {
115
+ const prompt = buildComparativePrompt({
116
+ manifest: manifestFixture,
117
+ verdicts: [],
118
+ })
119
+ expect(prompt).toContain('bare')
120
+ expect(prompt).toContain('TDD')
121
+ expect(prompt).toContain('No skills')
122
+ expect(prompt).toContain('Full test discipline')
123
+ })
124
+
125
+ test('includes criteria list', () => {
126
+ const prompt = buildComparativePrompt({
127
+ manifest: manifestFixture,
128
+ verdicts: [],
129
+ })
130
+ expect(prompt).toContain('correctness')
131
+ expect(prompt).toContain('efficiency')
132
+ })
133
+
134
+ test('includes Zod schema in output spec', () => {
135
+ const prompt = buildComparativePrompt({
136
+ manifest: manifestFixture,
137
+ verdicts: [],
138
+ })
139
+ expect(prompt).toContain('score_matrix')
140
+ expect(prompt).toContain('z.object')
141
+ expect(prompt).toContain('participant_id')
142
+ })
143
+ })
144
+
145
+ // ── toScoreMatrix (pure Zod validation wrapper) ──────────────────────
146
+
147
+ describe('toScoreMatrix', () => {
148
+ test('passes through valid score cells', () => {
149
+ const result = toScoreMatrix(manifestFixture, [
150
+ { participant_id: 'bare', criterion: 'correctness', weight: 0.5, score: 4, rationale: 'works' },
151
+ { participant_id: 'bare', criterion: 'efficiency', weight: 0.5, score: 3, rationale: 'ok' },
152
+ ])
153
+ expect(result).toHaveLength(2)
154
+ expect(result[0].participant_id).toBe('bare')
155
+ expect(result[0].score).toBe(4)
156
+ })
157
+ })
158
+
159
+ // ── normalizeComparativeOutput (pure JSON normalization) ─────────────
160
+
161
+ const sampleScoreMatrix = [
162
+ { participant_id: 'bare', criterion: 'correctness', weight: 0.25, score: 4, rationale: 'works' },
163
+ { participant_id: 'bare', criterion: 'efficiency', weight: 0.25, score: 3, rationale: 'ok' },
164
+ { participant_id: 'tdd', criterion: 'correctness', weight: 0.25, score: 5, rationale: 'tests pass' },
165
+ { participant_id: 'tdd', criterion: 'efficiency', weight: 0.25, score: 4, rationale: 'clean' },
166
+ ]
167
+
168
+ describe('normalizeComparativeOutput', () => {
169
+ test('passes through already-correct format', () => {
170
+ const input = {
171
+ score_matrix: sampleScoreMatrix,
172
+ key_findings: ['TDD produced cleaner code'],
173
+ recommendations: [{ audience: 'developer', recommendation: 'Use TDD' }],
174
+ }
175
+ const result = normalizeComparativeOutput(input)
176
+ expect((result.score_matrix as any[])).toHaveLength(4)
177
+ expect((result.score_matrix as any[])[0].participant_id).toBe('bare')
178
+ })
179
+
180
+ test('maps participantId to participant_id', () => {
181
+ const input: Record<string, unknown> = {
182
+ score_matrix: [
183
+ { participantId: 'bare', criterion: 'accuracy', weight: 0.5, score: 4, rationale: 'good' },
184
+ ],
185
+ key_findings: [],
186
+ recommendations: [],
187
+ }
188
+ const result = normalizeComparativeOutput(input)
189
+ expect((result.score_matrix as any[])[0].participant_id).toBe('bare')
190
+ })
191
+
192
+ test('maps side to participant_id', () => {
193
+ const input: Record<string, unknown> = {
194
+ score_matrix: [
195
+ { side: 'tdd', criterion: 'quality', weight: 0.5, score: 5, rationale: 'excellent' },
196
+ ],
197
+ key_findings: [],
198
+ recommendations: [],
199
+ }
200
+ const result = normalizeComparativeOutput(input)
201
+ expect((result.score_matrix as any[])[0].participant_id).toBe('tdd')
202
+ })
203
+
204
+ test('coerces string score to number', () => {
205
+ const input: Record<string, unknown> = {
206
+ score_matrix: [
207
+ { participant_id: 'bare', criterion: 'a', weight: 0.5, score: '4', rationale: 'ok' },
208
+ ],
209
+ key_findings: [],
210
+ recommendations: [],
211
+ }
212
+ const result = normalizeComparativeOutput(input)
213
+ expect((result.score_matrix as any[])[0].score).toBe(4)
214
+ })
215
+
216
+ test('normalizes weight >1 as percentage', () => {
217
+ const input: Record<string, unknown> = {
218
+ score_matrix: [
219
+ { participant_id: 'bare', criterion: 'a', weight: 50, score: 4, rationale: 'ok' },
220
+ ],
221
+ key_findings: [],
222
+ recommendations: [],
223
+ }
224
+ const result = normalizeComparativeOutput(input)
225
+ expect((result.score_matrix as any[])[0].weight).toBe(0.5)
226
+ })
227
+
228
+ test('defaults weight to 0.25 when undefined', () => {
229
+ const input: Record<string, unknown> = {
230
+ score_matrix: [
231
+ { participant_id: 'bare', criterion: 'a', score: 4, rationale: 'ok' },
232
+ ],
233
+ key_findings: [],
234
+ recommendations: [],
235
+ }
236
+ const result = normalizeComparativeOutput(input)
237
+ expect((result.score_matrix as any[])[0].weight).toBe(0.25)
238
+ })
239
+
240
+ test('maps reason to rationale', () => {
241
+ const input: Record<string, unknown> = {
242
+ score_matrix: [
243
+ { participant_id: 'bare', criterion: 'a', weight: 0.5, score: 4, reason: 'looks fine' },
244
+ ],
245
+ key_findings: [],
246
+ recommendations: [],
247
+ }
248
+ const result = normalizeComparativeOutput(input)
249
+ expect((result.score_matrix as any[])[0].rationale).toBe('looks fine')
250
+ })
251
+
252
+ test('maps explanation to rationale', () => {
253
+ const input: Record<string, unknown> = {
254
+ score_matrix: [
255
+ { participant_id: 'bare', criterion: 'a', weight: 0.5, score: 4, explanation: 'works' },
256
+ ],
257
+ key_findings: [],
258
+ recommendations: [],
259
+ }
260
+ const result = normalizeComparativeOutput(input)
261
+ expect((result.score_matrix as any[])[0].rationale).toBe('works')
262
+ })
263
+
264
+ test('normalizes recommendations with role fallback', () => {
265
+ const input: Record<string, unknown> = {
266
+ score_matrix: [],
267
+ key_findings: [],
268
+ recommendations: [
269
+ { role: 'developer', text: 'Use TDD' },
270
+ ],
271
+ }
272
+ const result = normalizeComparativeOutput(input)
273
+ const recs = result.recommendations as any[]
274
+ expect(recs[0].audience).toBe('developer')
275
+ expect(recs[0].recommendation).toBe('Use TDD')
276
+ })
277
+
278
+ test('normalizes recommendations with advice fallback', () => {
279
+ const input: Record<string, unknown> = {
280
+ score_matrix: [],
281
+ key_findings: [],
282
+ recommendations: [
283
+ { audience: 'general', advice: 'Consider refactoring' },
284
+ ],
285
+ }
286
+ const result = normalizeComparativeOutput(input)
287
+ expect((result.recommendations as any[])[0].recommendation).toBe('Consider refactoring')
288
+ })
289
+
290
+ test('handles empty key_findings', () => {
291
+ const input: Record<string, unknown> = {
292
+ score_matrix: [],
293
+ }
294
+ const result = normalizeComparativeOutput(input)
295
+ expect(result.key_findings).toEqual([])
296
+ })
297
+
298
+ test('converts pivot-table format: { participant: { criterion: score } }', () => {
299
+ const input: Record<string, unknown> = {
300
+ bare: { correctness: 4, correctness_rationale: 'works', efficiency: 3, efficiency_rationale: 'ok' },
301
+ tdd: { correctness: 5, correctness_rationale: 'tests', efficiency: 4, efficiency_rationale: 'clean' },
302
+ }
303
+ const result = normalizeComparativeOutput(input)
304
+ expect((result.score_matrix as any[])).toHaveLength(4)
305
+ const bareCorrectness = (result.score_matrix as any[]).find(
306
+ (c: any) => c.participant_id === 'bare' && c.criterion === 'correctness'
307
+ )
308
+ expect(bareCorrectness.score).toBe(4)
309
+ expect(bareCorrectness.rationale).toBe('works')
310
+ expect(bareCorrectness.weight).toBe(0.25)
311
+ })
312
+
313
+ test('clamps score to 1-5 range', () => {
314
+ const input: Record<string, unknown> = {
315
+ score_matrix: [
316
+ { participant_id: 'bare', criterion: 'a', weight: 0.5, score: 0, rationale: 'terrible' },
317
+ { participant_id: 'tdd', criterion: 'a', weight: 0.5, score: 10, rationale: 'perfect' },
318
+ ],
319
+ key_findings: [],
320
+ recommendations: [],
321
+ }
322
+ const result = normalizeComparativeOutput(input)
323
+ // NOTE: this test makes no assertions yet — it only documents current behavior.
324
+ // Clamping to the 1-5 range happens only in the pivot-table conversion path
325
+ // (Math.max(1, Math.min(5, ...))). The path for an already-valid score_matrix does not clamp:
326
+ // score=0 stays 0 and score=10 stays 10. Normalization doesn't add clamping to valid entries.
327
+ // TODO: assert this explicitly, or rename the test so it no longer claims flat entries are clamped.
328
+ })
329
+ })
330
+
331
+ // ── Mock scenarios: realistic judge outputs (LLM-simulated) ────────────────
332
+
333
+ const manifestWithRubrics: ArenaManifestType = {
334
+ id: 'arena-deep-research',
335
+ created_at: '2026-05-05T00:00:00Z',
336
+ task: 'Research the impact of Bun 1.3 on monorepo tooling and produce a 500-word brief',
337
+ mode: 'decks',
338
+ participants: [
339
+ { id: 'bare', name: 'Bare Claude', deck: 'decks/bare.toml', description: 'No skills' },
340
+ { id: 'deep', name: 'Deep Research', deck: 'decks/deep.toml', description: 'WebSearch + WebFetch skills' },
341
+ ],
342
+ criteria: [
343
+ {
344
+ id: 'accuracy', label: '信息准确性', persona: 'ISTJ测试员', weight: 40,
345
+ description: '引用是否可验证,版本号、日期、API 名称是否正确',
346
+ rubric: [
347
+ { score: 5, label: '全部可验证', description: '所有关键声明有可追溯来源,版本号和 API 名称与实际一致' },
348
+ { score: 3, label: '大部分正确', description: '核心结论可验证,但存在细节偏差' },
349
+ { score: 1, label: '无法验证', description: '关键声明无来源或与实际不符' },
350
+ ],
351
+ },
352
+ {
353
+ id: 'depth', label: '分析深度', persona: 'INTJ架构师', weight: 35,
354
+ description: '是否超越表面描述,提供 trade-off 分析和 ecosystem 影响评估',
355
+ rubric: [
356
+ { score: 5, label: '深度分析', description: '包含 trade-off 对比、ecosystem 连锁影响、时间线预测' },
357
+ { score: 3, label: '中等覆盖', description: '描述了变化但无深入 trade-off 分析' },
358
+ { score: 1, label: '表面描述', description: '仅重复已知信息,无分析视角' },
359
+ ],
360
+ },
361
+ {
362
+ id: 'clarity', label: '表达清晰度', persona: 'INFJ技术写作者', weight: 25,
363
+ description: '结构是否清晰,术语使用是否一致,非专家是否可理解',
364
+ },
365
+ ],
366
+ status: 'completed',
367
+ }
368
+
369
+ describe('buildComparativePrompt with structured criteria', () => {
370
+ test('injects rubric anchors into prompt', () => {
371
+ const prompt = buildComparativePrompt({ manifest: manifestWithRubrics, verdicts: [] })
372
+ expect(prompt).toContain('信息准确性')
373
+ expect(prompt).toContain('Evaluator: ISTJ测试员')
374
+ expect(prompt).toContain('Weight: 40')
375
+ expect(prompt).toContain('全部可验证')
376
+ expect(prompt).toContain('分析深度')
377
+ expect(prompt).toContain('Evaluator: INTJ架构师')
378
+ })
379
+
380
+ test('falls back to bare format for string criteria', () => {
381
+ const manifest: ArenaManifestType = {
382
+ id: 'test', created_at: '2026-01-01T00:00:00Z', task: 'test', mode: 'decks',
383
+ participants: [{ id: 'a', name: 'A', deck: 'd1' }, { id: 'b', name: 'B', deck: 'd2' }],
384
+ criteria: ['correctness', 'efficiency'],
385
+ status: 'completed',
386
+ }
387
+ const prompt = buildComparativePrompt({ manifest, verdicts: [] })
388
+ expect(prompt).toContain('- correctness')
389
+ expect(prompt).toContain('- efficiency')
390
+ })
391
+ })
392
+
393
+ // Simulate a realistic LLM judge output — the kind of JSON an actual Claude
394
+ // comparative judge call would produce. Verify our normalization handles it.
395
+ describe('full pipeline: mock LLM output → schema validation', () => {
396
+ test('clean score_matrix passes through ComparativeReport.parse', () => {
397
+ const cleanOutput = {
398
+ score_matrix: [
399
+ { participant_id: 'bare', criterion: 'accuracy', weight: 0.4, score: 3, rationale: 'Correct on Bun version but missed pnpm migration detail' },
400
+ { participant_id: 'bare', criterion: 'depth', weight: 0.35, score: 2, rationale: 'Surface-level description, no trade-off analysis' },
401
+ { participant_id: 'bare', criterion: 'clarity', weight: 0.25, score: 4, rationale: 'Well-structured but some jargon' },
402
+ { participant_id: 'deep', criterion: 'accuracy', weight: 0.4, score: 5, rationale: 'All claims verified against Bun GitHub releases and npm registry' },
403
+ { participant_id: 'deep', criterion: 'depth', weight: 0.35, score: 5, rationale: 'Compared Bun 1.3 with pnpm 9, analyzed ecosystem migration patterns' },
404
+ { participant_id: 'deep', criterion: 'clarity', weight: 0.25, score: 4, rationale: 'Clear structure, minor repetition in trade-off section' },
405
+ ],
406
+ key_findings: ['Deep Research produced verifiable, well-sourced analysis', 'Bare Claude lacked access to current version numbers'],
407
+ recommendations: [
408
+ { audience: 'skill user', recommendation: 'Deep Research skills are essential for technical research tasks' },
409
+ { audience: 'skill author', recommendation: 'Accuracy criterion highlights the importance of web access for up-to-date data' },
410
+ ],
411
+ }
412
+ const report = ComparativeReport.parse({
413
+ arena_id: 'test',
414
+ generated_at: new Date().toISOString(),
415
+ ...cleanOutput,
416
+ })
417
+ expect(report.score_matrix).toHaveLength(6)
418
+ const deepAccuracy = report.score_matrix.find(c => c.participant_id === 'deep' && c.criterion === 'accuracy')
419
+ expect(deepAccuracy!.score).toBe(5)
420
+ })
421
+
422
+ test('messy LLM output with field name variants gets normalized', () => {
423
+ // Simulates a messy Claude output — participantId instead of participant_id,
424
+ // reason instead of rationale, string scores, percentage weights, role/text recommendations
425
+ const messyLLMOutput = {
426
+ participantId: 'bare',
427
+ reason: 'OK',
428
+ key_findings: ['found bugs'],
429
+ score_matrix: [
430
+ { participantId: 'bare', criterion: 'accuracy', weight: 50, score: '4', reason: 'decent' },
431
+ { participantId: 'deep', criterion: 'accuracy', weight: 50, score: '5', reason: 'excellent' },
432
+ ],
433
+ recommendations: [
434
+ { role: 'developer', text: 'Add more tests' },
435
+ ],
436
+ }
437
+ const normalized = normalizeComparativeOutput(messyLLMOutput as Record<string, unknown>)
438
+ const cells = normalized.score_matrix as any[]
439
+ expect(cells[0].participant_id).toBe('bare')
440
+ expect(cells[0].weight).toBe(0.5)
441
+ expect(cells[0].score).toBe(4)
442
+ expect(cells[0].rationale).toBe('decent')
443
+ const recs = normalized.recommendations as any[]
444
+ expect(recs[0].audience).toBe('developer')
445
+ })
446
+ })
@@ -53,15 +53,38 @@ export function computePareto(vectors: { participant_id: string; scores: Record<
53
53
 
54
54
  // ── Comparative Judge Prompt ──────────────────────────────────────────────
55
55
 
56
- function buildComparativePrompt(opts: {
56
+ export function buildComparativePrompt(opts: {
57
57
  manifest: ArenaManifest
58
58
  verdicts: { participantId: string; verdict: unknown }[]
59
59
  }): string {
60
- const criteriaDesc = opts.manifest.criteria.join(', ')
61
60
  const participants = opts.manifest.participants
62
61
  .map(p => `- ${p.id}: ${p.name} (${p.description || 'no description'})`)
63
62
  .join('\n')
64
63
 
64
+ // Format criteria with rubric anchors when available (ADR-20260505225159725)
65
+ let criteriaBlock = ''
66
+ for (const c of opts.manifest.criteria) {
67
+ if (typeof c === 'string') {
68
+ criteriaBlock += `- ${c} (score 1-5, weight: 0.25)\n`
69
+ } else {
70
+ criteriaBlock += `## Criterion: ${c.label} (${c.id})\n`
71
+ if (c.persona) criteriaBlock += `Evaluator: ${c.persona}\n`
72
+ criteriaBlock += `Weight: ${c.weight ?? 25} (${c.weight ?? 25}%)\n`
73
+ criteriaBlock += `Description: ${c.description || 'No additional description.'}\n`
74
+ if (c.rubric && c.rubric.length > 0) {
75
+ criteriaBlock += 'Scoring rubric:\n'
76
+ for (const r of c.rubric) {
77
+ criteriaBlock += ` ${r.score} — ${r.label}: ${r.description}\n`
78
+ }
79
+ }
80
+ criteriaBlock += '\n'
81
+ }
82
+ }
83
+
84
+ const criteriaList = opts.manifest.criteria
85
+ .map(c => typeof c === 'string' ? c : `${c.label} (${c.id})`)
86
+ .join(', ')
87
+
65
88
  return `You are a comparative judge evaluating ${opts.manifest.participants.length} participants against shared criteria.
66
89
 
67
90
  ## Task
@@ -71,11 +94,11 @@ ${opts.manifest.task}
71
94
  ${participants}
72
95
 
73
96
  ## Criteria
74
- ${criteriaDesc}
75
-
97
+ ${criteriaBlock}
76
98
  ## Your Job
77
99
  For each participant, score them 1-5 on each criterion. Provide a brief rationale.
78
100
  Score meanings: 1=poor, 3=acceptable, 5=excellent.
101
+ Criteria in scope: ${criteriaList}
79
102
 
80
103
  ## Output Schema
81
104
  Your response must conform to this Zod schema:
@@ -96,13 +119,13 @@ z.object({
96
119
  })
97
120
  \`\`\`
98
121
  score_matrix is a FLAT ARRAY of objects — NOT nested by participant or criterion.
99
- weight: 0.25 for each cell (1 / num_criteria).
122
+ weight: match the weight specified per criterion above.
100
123
  score: 1=poor, 3=acceptable, 5=excellent.
101
124
 
102
125
  Use the submit_scores tool to return your structured evaluation.`
103
126
  }
104
127
 
105
- function toScoreMatrix(
128
+ export function toScoreMatrix(
106
129
  manifest: ArenaManifest,
107
130
  scores: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]
108
131
  ): typeof ScoreCell._output[] {
@@ -119,7 +142,7 @@ interface NormalizedScoreCell {
119
142
  rationale: string
120
143
  }
121
144
 
122
- function normalizeComparativeOutput(parsed: Record<string, unknown>): Record<string, unknown> {
145
+ export function normalizeComparativeOutput(parsed: Record<string, unknown>): Record<string, unknown> {
123
146
  const out = { ...parsed }
124
147
 
125
148
  // Detect pivot-table format: { participant: { criterion: { score, rationale } } }
package/src/runner.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
2
2
  import { join, resolve } from 'node:path'
3
+ import { tmpdir } from 'node:os'
3
4
  import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
4
5
  import { useAgent } from '@lythos/test-utils/agents'
5
6
  import { ArenaManifest, Player } from '@lythos/test-utils/schema'
@@ -119,7 +120,8 @@ export async function runArenaFromToml(opts: {
119
120
  await linkProc.exited
120
121
  log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
121
122
  },
122
- baseDir: join(artifactsDir, 'runs', cell.side),
123
+ // Isolated CWD: <os.tmpdir()>/arena-<id>/<side>/ — no parent .claude/skills/ to walk up into
124
+ baseDir: join(tmpdir(), `arena-${arenaId}`, cell.side),
123
125
  })
124
126
 
125
127
  const v = (result.verdict ?? {
@@ -220,7 +222,7 @@ function writeReport(dir: string, manifest: ArenaManifestType, report: unknown &
220
222
  `# Arena Report: ${manifest.id}`,
221
223
  '',
222
224
  `**Task**: ${manifest.task}`,
223
- `**Criteria**: ${manifest.criteria.join(', ')}`,
225
+ `**Criteria**: ${manifest.criteria.map(c => typeof c === 'string' ? c : c.label).join(', ')}`,
224
226
  `**Date**: ${new Date().toISOString()}`,
225
227
  '',
226
228
  '## Score Matrix',