@lythos/skill-arena 0.9.0 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -1
- package/src/cli.ts +51 -10
- package/src/comparative-judge.test.ts +92 -0
- package/src/comparative-judge.ts +166 -0
- package/src/runner.ts +187 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lythos/skill-arena",
|
|
3
|
-
"version": "0.9.
|
|
3
|
+
"version": "0.9.2",
|
|
4
4
|
"description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-agent",
|
|
@@ -35,5 +35,9 @@
|
|
|
35
35
|
"homepage": "https://github.com/lythos-labs/lythoskill/tree/main/packages/lythoskill-arena#readme",
|
|
36
36
|
"engines": {
|
|
37
37
|
"bun": ">=1.0.0"
|
|
38
|
+
},
|
|
39
|
+
"dependencies": {
|
|
40
|
+
"@lythos/test-utils": "^0.9.1",
|
|
41
|
+
"zod-to-json-schema": "^3.25.2"
|
|
38
42
|
}
|
|
39
43
|
}
|
package/src/cli.ts
CHANGED
|
@@ -29,23 +29,31 @@ function printHelp(): void {
|
|
|
29
29
|
console.log(`š lythoskill-arena ā Skill comparison runner
|
|
30
30
|
|
|
31
31
|
Usage:
|
|
32
|
-
lythoskill-arena --task
|
|
33
|
-
lythoskill-arena --task "<
|
|
32
|
+
lythoskill-arena run --task <path> --players <A.toml,B.toml> --decks <A.toml,B.toml> --criteria <c1,c2,...> [--out <dir>]
|
|
33
|
+
lythoskill-arena scaffold --task "<description>" --skills <skill1,skill2,...>
|
|
34
|
+
lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
|
|
34
35
|
lythoskill-arena viz <arena-dir>
|
|
35
36
|
|
|
37
|
+
Commands:
|
|
38
|
+
run Run arena programmatically (cartesian player Ć deck ā judge ā report)
|
|
39
|
+
scaffold Create arena directory structure (legacy, manual subagent execution)
|
|
40
|
+
viz Visualize arena report (ASCII charts)
|
|
41
|
+
|
|
36
42
|
Options:
|
|
37
|
-
-t, --task <desc>
|
|
38
|
-
-s, --skills <list> Comma-separated skill names
|
|
43
|
+
-t, --task <path|desc> Task description or path to TASK-arena.md
|
|
44
|
+
-s, --skills <list> Comma-separated skill names (scaffold only)
|
|
39
45
|
--decks <list> Comma-separated deck paths
|
|
40
46
|
-c, --criteria <list> Evaluation criteria (default: syntax,context,logic,token)
|
|
41
|
-
--
|
|
42
|
-
|
|
47
|
+
--players <list> Comma-separated player.toml paths (run only)
|
|
48
|
+
--control <skill> Control skill for comparison (scaffold only)
|
|
49
|
+
--out <dir> Output directory (run: defaults to runs/arena-<id>)
|
|
50
|
+
-d, --dir <dir> Output directory (scaffold: defaults to tmp)
|
|
43
51
|
-p, --project <dir> Project directory (default: .)
|
|
44
52
|
|
|
45
53
|
Examples:
|
|
46
|
-
lythoskill-arena --task
|
|
47
|
-
lythoskill-arena --task "
|
|
48
|
-
lythoskill-arena viz
|
|
54
|
+
lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml,./players/kimi.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
|
|
55
|
+
lythoskill-arena scaffold --task "Refactor auth module" --skills skill-a,skill-b
|
|
56
|
+
lythoskill-arena viz runs/arena-20260504
|
|
49
57
|
`)
|
|
50
58
|
}
|
|
51
59
|
|
|
@@ -551,6 +559,32 @@ function runViz(argv: string[]) {
|
|
|
551
559
|
console.log(renderRadarChart(report))
|
|
552
560
|
}
|
|
553
561
|
|
|
562
|
+
// ── Run: programmatic arena execution ───────────────────────
|
|
563
|
+
|
|
564
|
+
/**
 * Entry point for the `run` subcommand.
 *
 * Parses the CLI flags, forwards them to `runArena` from `./runner`, and
 * prints the arena id plus the location of the generated artifacts.
 * Terminates the process with exit code 1 when `--task` or `--decks` is absent.
 *
 * @param argv Arguments following the `run` subcommand.
 */
async function runProgrammaticArena(argv: string[]) {
  const parsedArgs = parseArgs(argv)
  const flags = parsedArgs.options

  if (!(flags.task && flags.decks)) {
    console.error('ā --task <path> and --decks <list> are required for "run"')
    process.exit(1)
  }

  // "a, b,,c" -> ["a", "b", "c"]: trims every entry and drops empty ones.
  const toList = (csv: string): string[] =>
    csv
      .split(',')
      .map(item => item.trim())
      .filter(Boolean)

  // Dynamic import: the runner module is only loaded once `run` is actually invoked.
  const runnerModule = await import('./runner')

  const outcome = await runnerModule.runArena({
    taskPath: flags.task,
    playerPaths: toList(flags.players ?? 'players/claude-code.toml'),
    deckPaths: toList(flags.decks),
    criteria: toList(flags.criteria ?? 'syntax,context,logic,token'),
    outDir: flags.out ?? `runs/arena-${timestamp()}`,
    projectDir: flags.project,
  })

  const { manifest, artifactsDir } = outcome
  console.log(`\nš® Arena complete: ${manifest.id}`)
  console.log(`š Artifacts: ${artifactsDir}`)
  console.log(`š Report: ${artifactsDir}/report.md`)
}
|
|
587
|
+
|
|
554
588
|
// ── Main Entry ──────────────────────────────────────────────
|
|
555
589
|
|
|
556
590
|
if (import.meta.main) {
|
|
@@ -559,7 +593,14 @@ if (import.meta.main) {
|
|
|
559
593
|
|
|
560
594
|
if (cmd === 'viz') {
|
|
561
595
|
runViz(args.slice(1))
|
|
596
|
+
} else if (cmd === 'run') {
|
|
597
|
+
runProgrammaticArena(args.slice(1))
|
|
598
|
+
} else if (cmd === 'scaffold' || !cmd || args[0]?.startsWith('-')) {
|
|
599
|
+
// Legacy behavior: if no subcommand or starts with flags, treat as scaffold
|
|
600
|
+
runArena(cmd === 'scaffold' ? args.slice(1) : args)
|
|
562
601
|
} else {
|
|
563
|
-
|
|
602
|
+
console.error(`ā Unknown command: ${cmd}`)
|
|
603
|
+
printHelp()
|
|
604
|
+
process.exit(1)
|
|
564
605
|
}
|
|
565
606
|
}
|
package/src/comparative-judge.test.ts
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test'
|
|
2
|
+
import { computePareto } from './comparative-judge'
|
|
3
|
+
|
|
4
|
+
// Behavioural spec for computePareto: each case feeds a small set of score
// vectors and checks which participants end up flagged as dominated.
describe('computePareto', () => {
  test('single participant is always non-dominated', () => {
    const entries = computePareto([{ participant_id: 'run-01', scores: { a: 5, b: 3 } }])

    expect(entries).toHaveLength(1)
    const [only] = entries
    expect(only.dominated).toBe(false)
    expect(only.dominated_by).toEqual([])
  })

  test('clear dominance: run-01 dominates run-02 on all criteria', () => {
    const [winner, loser] = computePareto([
      { participant_id: 'run-01', scores: { coverage: 5, relevance: 5 } },
      { participant_id: 'run-02', scores: { coverage: 3, relevance: 2 } },
    ])

    expect(winner.dominated).toBe(false)
    expect(loser.dominated).toBe(true)
    expect(loser.dominated_by).toEqual(['run-01'])
  })

  test('equal scores: no one dominates', () => {
    const tied = { a: 4, b: 4 }
    const [first, second] = computePareto([
      { participant_id: 'run-01', scores: { ...tied } },
      { participant_id: 'run-02', scores: { ...tied } },
    ])

    expect(first.dominated).toBe(false)
    expect(second.dominated).toBe(false)
  })

  test('cross dominance: each wins on different criteria', () => {
    const [fast, accurate] = computePareto([
      { participant_id: 'run-01', scores: { speed: 5, accuracy: 2 } },
      { participant_id: 'run-02', scores: { speed: 2, accuracy: 5 } },
    ])

    // A trade-off: run-01 leads on speed, run-02 leads on accuracy, so neither dominates.
    expect(fast.dominated).toBe(false)
    expect(accurate.dominated).toBe(false)
  })

  test('multi-participant: transitive dominance chain', () => {
    const uniform = (value: number) => ({ a: value, b: value, c: value })
    const [best, mid, worst] = computePareto([
      { participant_id: 'best', scores: uniform(5) },
      { participant_id: 'mid', scores: uniform(4) },
      { participant_id: 'worst', scores: uniform(2) },
    ])

    // best beats both others; mid additionally beats worst.
    expect(best.dominated).toBe(false)
    expect(mid.dominated).toBe(true)
    expect(mid.dominated_by).toEqual(['best'])
    expect(worst.dominated).toBe(true)
    expect(worst.dominated_by.sort()).toEqual(['best', 'mid'].sort())
  })

  test('Pareto frontier from playground BDD-research: run-01 dominates run-02', () => {
    // Scores taken from playground/arena-bdd-research/report.md:
    //   Run-01: coverage=5, relevance=5, actionability=5, depth=5
    //   Run-02: coverage=3, relevance=2, actionability=2, depth=1
    const [runOne, runTwo] = computePareto([
      { participant_id: 'run-01', scores: { coverage: 5, relevance: 5, actionability: 5, depth: 5 } },
      { participant_id: 'run-02', scores: { coverage: 3, relevance: 2, actionability: 2, depth: 1 } },
    ])

    expect(runOne.dominated).toBe(false) // Pareto-optimal
    expect(runTwo.dominated).toBe(true) // dominated by run-01
    expect(runTwo.dominated_by).toEqual(['run-01'])
  })

  test('empty scores object', () => {
    const entries = computePareto([
      { participant_id: 'a', scores: {} },
      { participant_id: 'b', scores: {} },
    ])

    expect(entries).toHaveLength(2)
    const [first, second] = entries
    expect(first.dominated).toBe(false)
    expect(second.dominated).toBe(false)
  })

  test('partial criteria overlap', () => {
    const [runOne, runTwo] = computePareto([
      { participant_id: 'run-01', scores: { a: 5, b: 3 } },
      { participant_id: 'run-02', scores: { a: 3, c: 5 } },
    ])

    // Missing criteria count as 0, so the effective vectors are
    //   run-01: a=5, b=3, c=0   and   run-02: a=3, b=0, c=5.
    // run-01 leads on a and b, run-02 leads on c -> neither dominates.
    expect(runOne.dominated).toBe(false)
    expect(runTwo.dominated).toBe(false)
  })
})
|
package/src/comparative-judge.ts
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import { zodToJsonSchema } from 'zod-to-json-schema'
|
|
2
|
+
import { ComparativeReport, ScoreCell, ParetoEntry } from '@lythos/test-utils/schema'
|
|
3
|
+
import type { AgentAdapter } from '@lythos/test-utils/agents'
|
|
4
|
+
import type { ArenaManifest } from '@lythos/test-utils/schema'
|
|
5
|
+
|
|
6
|
+
// ── Pareto Frontier (deterministic algorithm) ──────────────────────────────
|
|
7
|
+
|
|
8
|
+
/**
 * A participant's per-criterion scores together with its Pareto status.
 */
export interface ScoreVector {
  /** Identifier of the participant the scores belong to. */
  participant_id: string

  /** Numeric score per criterion, keyed by criterion name. */
  scores: Record<string, number>

  /** Whether at least one other participant dominates this one. */
  dominated: boolean

  /** Identifiers of the participants that dominate this one. */
  dominated_by: string[]
}
|
|
14
|
+
|
|
15
|
+
/**
 * Flags which participants are Pareto-dominated.
 *
 * `x` dominates `y` when `x` scores at least as high as `y` on every criterion
 * and strictly higher on at least one. The criteria compared are the union of
 * the score keys seen across all participants; a criterion a participant has no
 * score for counts as 0. The computation is plain code, never an LLM call.
 *
 * @param vectors Participants with their per-criterion scores.
 * @returns One entry per input, in input order, holding a copy of the scores,
 *   the `dominated` flag and the ids of every participant that dominates it.
 */
export function computePareto(vectors: { participant_id: string; scores: Record<string, number> }[]): ParetoEntry[] {
  const entries: ParetoEntry[] = []
  for (const { participant_id, scores } of vectors) {
    entries.push({ participant_id, scores: { ...scores }, dominated: false, dominated_by: [] as string[] })
  }

  // Every criterion mentioned by any participant.
  const criteria = Array.from(new Set(vectors.flatMap(({ scores }) => Object.keys(scores))))
  if (criteria.length === 0) {
    return entries
  }

  const scoreOf = (scores: Record<string, number>, key: string): number => scores[key] ?? 0

  vectors.forEach((challenger, challengerIdx) => {
    vectors.forEach((target, targetIdx) => {
      if (challengerIdx === targetIdx) return

      let strictlyBetterSomewhere = false
      for (const key of criteria) {
        const mine = scoreOf(challenger.scores, key)
        const theirs = scoreOf(target.scores, key)
        // Losing (or being incomparable) on a single criterion rules out dominance.
        if (!(mine >= theirs)) return
        if (mine > theirs) strictlyBetterSomewhere = true
      }
      if (!strictlyBetterSomewhere) return

      // challenger dominates target
      const dominatedEntry = entries[targetIdx]
      const dominatorId = entries[challengerIdx].participant_id
      dominatedEntry.dominated = true
      if (!dominatedEntry.dominated_by.includes(dominatorId)) {
        dominatedEntry.dominated_by.push(dominatorId)
      }
    })
  })

  return entries
}
|
|
54
|
+
|
|
55
|
+
// ── Comparative Judge Prompt ───────────────────────────────────────────────
|
|
56
|
+
|
|
57
|
+
/**
 * Builds the prompt handed to the comparative judge.
 *
 * The prompt lists the task, the participants, the criteria and the verdict
 * recorded for each participant, then asks for a 1-5 score per participant and
 * criterion via the `submit_scores` tool.
 *
 * @param opts.manifest Arena manifest supplying the task, participants and criteria.
 * @param opts.verdicts Per-participant run results; each is embedded as JSON so
 *   the judge has evidence to score against.
 * @returns The complete prompt text.
 */
function buildComparativePrompt(opts: {
  manifest: ArenaManifest
  verdicts: { participantId: string; verdict: unknown }[]
}): string {
  const { manifest, verdicts } = opts

  const criteriaDesc = manifest.criteria.join(', ')
  const participants = manifest.participants
    .map(p => `- ${p.id}: ${p.name} (${p.description || 'no description'})`)
    .join('\n')

  // A single odd verdict (undefined, cyclic, BigInt) must not break the whole prompt.
  const renderVerdict = (verdict: unknown): string => {
    try {
      return JSON.stringify(verdict, null, 2) ?? String(verdict)
    } catch {
      return String(verdict)
    }
  }

  const results =
    verdicts.length > 0
      ? verdicts.map(v => `### ${v.participantId}\n${renderVerdict(v.verdict)}`).join('\n\n')
      : 'No per-participant results were recorded.'

  return [
    `You are a comparative judge evaluating ${manifest.participants.length} participants against shared criteria.`,
    '',
    '## Task',
    `${manifest.task}`,
    '',
    '## Participants',
    participants,
    '',
    '## Criteria',
    criteriaDesc,
    '',
    '## Results',
    results,
    '',
    '## Your Job',
    'For each participant, score them 1-5 on each criterion. Provide a brief rationale.',
    'Score meanings: 1=poor, 3=acceptable, 5=excellent.',
    '',
    'Use the submit_scores tool to return your structured evaluation.',
  ].join('\n')
}
|
|
83
|
+
|
|
84
|
+
/**
 * Tool definition passed to the judge's `invokeTool`. Its input schema is the
 * JSON Schema generated from the `score_matrix`, `key_findings` and
 * `recommendations` fields of `ComparativeReport`.
 */
const SCORE_TOOL = {
  name: 'submit_scores',
  description: 'Submit per-participant scores for each criterion with rationales',
  input_schema: zodToJsonSchema(
    ComparativeReport.pick({
      score_matrix: true,
      key_findings: true,
      recommendations: true,
    }),
  ) as Record<string, unknown>,
}
|
|
89
|
+
|
|
90
|
+
/**
 * Runs every raw score row through `ScoreCell.parse`.
 *
 * @param manifest Arena manifest; not consulted by the current implementation.
 * @param scores Raw score rows, e.g. as returned by the judge.
 * @returns The parsed cells, in input order. Whatever `ScoreCell.parse` throws
 *   for a bad row propagates to the caller.
 */
function toScoreMatrix(
  manifest: ArenaManifest,
  scores: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]
): typeof ScoreCell._output[] {
  const cells: typeof ScoreCell._output[] = []
  for (const row of scores) {
    cells.push(ScoreCell.parse(row))
  }
  return cells
}
|
|
96
|
+
|
|
97
|
+
// ── Comparative Judge ──────────────────────────────────────────────────────
|
|
98
|
+
|
|
99
|
+
/**
 * Asks a judge agent to score all participants and assembles the final report.
 *
 * The judge is called through `invokeTool` when the adapter provides it;
 * otherwise it is spawned and its stdout is read as JSON (optionally wrapped in
 * a fenced code block). The judge output supplies the score matrix, key
 * findings and recommendations; the Pareto analysis and per-participant totals
 * are computed here.
 *
 * @param opts.manifest Arena manifest (id, participants, criteria, task).
 * @param opts.verdicts Per-participant run results forwarded to the prompt builder.
 * @param opts.judge Agent adapter used as the judge.
 * @param opts.workdir Working directory the judge runs in.
 * @returns The report after validation by `ComparativeReport.parse`.
 * @throws SyntaxError when a spawned judge's output is not valid JSON; schema
 *   validation failures from the `parse` calls also propagate.
 */
export async function runComparativeJudge(opts: {
  manifest: ArenaManifest
  verdicts: { participantId: string; verdict: unknown }[]
  judge: AgentAdapter
  workdir: string
}): Promise<typeof ComparativeReport._output> {
  const { manifest, verdicts, judge, workdir } = opts

  const prompt = buildComparativePrompt({ manifest, verdicts })

  let parsed: unknown

  if (judge.invokeTool) {
    parsed = await judge.invokeTool({
      tool: SCORE_TOOL,
      prompt,
      cwd: workdir,
      timeoutMs: 120000,
    })
  } else {
    const { stdout } = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
    // Prefer the contents of the first fenced block; fall back to the whole output.
    const fenceMatch = stdout.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
    const jsonStr = fenceMatch ? fenceMatch[1].trim() : stdout.trim()
    try {
      parsed = JSON.parse(jsonStr)
    } catch (e) {
      // Same error type as before, but say where the bad JSON came from.
      const detail = e instanceof Error ? e.message : String(e)
      throw new SyntaxError(`Comparative judge did not return valid JSON: ${detail}`)
    }
  }

  // Validate LLM output
  const llmResult = ComparativeReport.pick({
    score_matrix: true,
    key_findings: true,
    recommendations: true,
  }).parse(parsed)

  const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)

  // Pareto: computed locally from the validated scores, never taken from the judge.
  const participantScores = manifest.participants.map(p => {
    const pScores: Record<string, number> = {}
    for (const cell of scoreMatrix) {
      if (cell.participant_id === p.id) {
        pScores[cell.criterion] = cell.score
      }
    }
    return { participant_id: p.id, scores: pScores }
  })

  const pareto = computePareto(participantScores)

  // Per-participant total: sum of score * weight divided by the number of cells
  // (divisor 1 when the participant has no cells, giving 0).
  const weightedTotals: Record<string, number> = {}
  for (const p of manifest.participants) {
    const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
    weightedTotals[p.id] = pCells.reduce((sum, c) => sum + c.score * c.weight, 0) / (pCells.length || 1)
  }

  return ComparativeReport.parse({
    arena_id: manifest.id,
    generated_at: new Date().toISOString(),
    score_matrix: scoreMatrix,
    weighted_totals: weightedTotals,
    pareto,
    key_findings: llmResult.key_findings ?? [],
    recommendations: llmResult.recommendations ?? [],
  })
}
|
package/src/runner.ts
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync, existsSync, readFileSync } from 'node:fs'
|
|
2
|
+
import { join, resolve } from 'node:path'
|
|
3
|
+
import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
|
|
4
|
+
import { useAgent } from '@lythos/test-utils/agents'
|
|
5
|
+
import { ArenaManifest, Player, type ArenaManifest as ArenaManifestType } from '@lythos/test-utils/schema'
|
|
6
|
+
import { runComparativeJudge } from './comparative-judge'
|
|
7
|
+
|
|
8
|
+
// ── Helpers ────────────────────────────────────────────────────────────────
|
|
9
|
+
|
|
10
|
+
/**
 * Formats the current local time as `YYYYMMDD-HHMMSS`.
 *
 * @returns The timestamp with every component zero-padded to two digits
 *   (the year is written in full).
 */
function stamp(): string {
  const now = new Date()
  const twoDigits = (value: number): string => String(value).padStart(2, '0')

  const datePart = `${now.getFullYear()}${twoDigits(now.getMonth() + 1)}${twoDigits(now.getDate())}`
  const timePart = `${twoDigits(now.getHours())}${twoDigits(now.getMinutes())}${twoDigits(now.getSeconds())}`

  return `${datePart}-${timePart}`
}
|
|
14
|
+
|
|
15
|
+
/**
 * Cartesian product of a list of arrays.
 *
 * @param arrays The arrays to combine; position `k` of every tuple is drawn from `arrays[k]`.
 * @returns Every combination, ordered so that the last array varies fastest.
 *   No input arrays yield a single empty tuple; any empty input array yields no tuples.
 */
function cartesian<T>(arrays: T[][]): T[][] {
  // Start from the single empty tuple and extend every tuple with each array in turn.
  let combos: T[][] = [[]]
  for (const choices of arrays) {
    const extended: T[][] = []
    for (const prefix of combos) {
      for (const choice of choices) {
        extended.push([...prefix, choice])
      }
    }
    combos = extended
  }
  return combos
}
|
|
21
|
+
|
|
22
|
+
/**
 * Converts arbitrary text into a lowercase, hyphen-separated slug.
 *
 * @param input Text to convert.
 * @returns The slug, cut to at most 40 characters. Truncation happens last, so
 *   a long slug may end in a hyphen.
 */
function slugify(input: string): string {
  const lowered = input.toLowerCase()
  // Collapse every run of characters outside a-z / 0-9 into one hyphen.
  const hyphenated = lowered.replace(/[^a-z0-9]+/g, '-')
  // Drop hyphens left at either end.
  const trimmed = hyphenated.replace(/^-+|-+$/g, '')
  return trimmed.slice(0, 40)
}
|
|
25
|
+
|
|
26
|
+
// ── Runner ─────────────────────────────────────────────────────────────────
|
|
27
|
+
|
|
28
|
+
/**
 * Parses the text of a player definition file.
 *
 * JSON is tried first, so any file that loaded before still loads the same
 * way. When the text is not JSON and the Bun runtime's TOML parser is
 * available, the text is parsed as TOML instead; the CLI documents player
 * files as `.toml`.
 */
function parsePlayerSource(content: string, path: string): unknown {
  try {
    return JSON.parse(content)
  } catch (jsonError) {
    // Looked up on globalThis so this module still loads where `Bun` is not defined.
    const toml = (globalThis as { Bun?: { TOML?: { parse(text: string): unknown } } }).Bun?.TOML
    if (!toml) throw jsonError
    try {
      return toml.parse(content)
    } catch (tomlError) {
      const detail = tomlError instanceof Error ? tomlError.message : String(tomlError)
      throw new Error(`Cannot parse player file ${path} as JSON or TOML: ${detail}`)
    }
  }
}

/**
 * Runs an arena end to end: every player is paired with every deck, each pair
 * is executed as an agent scenario, a judge compares the results, and the
 * manifest plus a Markdown report are written to the artifacts directory.
 *
 * @param opts.taskPath Path of the task file; its first 200 characters become
 *   the manifest's task text and the path itself is used as the scenario path.
 * @param opts.playerPaths Player definition files (JSON or TOML).
 * @param opts.deckPaths Deck files; each is copied into a run's workdir as `skill-deck.toml`.
 * @param opts.criteria Criteria names recorded in the manifest.
 * @param opts.outDir Artifacts directory; when empty, `runs/<arena id>` under the
 *   current working directory is used.
 * @param opts.projectDir Accepted for callers; not read by this function.
 * @returns The completed manifest, the judge's report and the artifacts directory.
 * @throws When a player, deck or task file cannot be read or parsed, or when
 *   the judge step fails. A failure inside a single run is recorded as an
 *   `ERROR` verdict instead of being thrown.
 */
export async function runArena(opts: {
  taskPath: string
  playerPaths: string[]
  deckPaths: string[]
  criteria: string[]
  outDir: string
  projectDir?: string
}): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
  const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts

  // Load players
  const players = playerPaths.map(p => {
    const content = readFileSync(resolve(p), 'utf-8')
    const parsed = Player.parse(parsePlayerSource(content, p))
    return { path: p, ...parsed }
  })

  // Decks are only referenced by absolute path here; their content is read per run.
  const decks = deckPaths.map(p => ({ path: resolve(p) }))

  // Build (player × deck) variant matrix
  const variants = cartesian([players, decks]).map(([player, deck], i) => ({
    participant_id: `run-${String(i + 1).padStart(2, '0')}`,
    player,
    deck_path: deck.path,
  }))

  // Build arena manifest
  const arenaId = `arena-${stamp()}`
  const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)

  const manifest = ArenaManifest.parse({
    id: arenaId,
    created_at: new Date().toISOString(),
    task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
    mode: 'decks',
    participants: variants.map(v => ({
      id: v.participant_id,
      name: v.player.path.split('/').pop()?.replace('.toml', '') ?? v.player.platform,
      player: v.player.platform,
      deck: v.deck_path,
      description: `${v.player.platform} × ${v.deck_path.split('/').pop()?.replace('.toml', '')}`,
    })),
    criteria,
    status: 'running',
  })

  mkdirSync(artifactsDir, { recursive: true })
  writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(manifest, null, 2) + '\n')

  // Run each variant sequentially
  const verdicts: { participantId: string; verdict: unknown }[] = []

  for (const variant of variants) {
    const cellDir = join(artifactsDir, 'runs', variant.participant_id)
    mkdirSync(cellDir, { recursive: true })

    try {
      const result = await runAgentScenario({
        scenarioPath: resolve(taskPath),
        agent: useAgent(variant.player.platform),
        setupWorkdir(_scenario: AgentScenario, workdir: string) {
          mkdirSync(workdir, { recursive: true })
          // The variant's deck is placed in the workdir under the fixed name skill-deck.toml.
          const deckContent = readFileSync(variant.deck_path, 'utf-8')
          writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
        },
        baseDir: artifactsDir,
      })

      verdicts.push({
        participantId: variant.participant_id,
        verdict: result.verdict,
      })
    } catch (e) {
      // One failing run must not abort the arena; it is reported to the judge as an ERROR verdict.
      verdicts.push({
        participantId: variant.participant_id,
        verdict: {
          verdict: 'ERROR' as const,
          reason: `Runner exception: ${e instanceof Error ? e.message : String(e)}`,
        },
      })
    }
  }

  // Run comparative judge on the first player's platform ('claude' when there are no players)
  const judge = useAgent(players[0]?.platform ?? 'claude')
  const report = await runComparativeJudge({
    manifest,
    verdicts,
    judge,
    workdir: artifactsDir,
  })

  // Write report
  writeFileSync(join(artifactsDir, 'report.md'), `# Arena Report: ${manifest.id}

**Task**: ${manifest.task}
**Criteria**: ${manifest.criteria.join(', ')}
**Date**: ${new Date().toISOString()}

## Score Matrix
${renderScoreMatrix(report)}

## Pareto Frontier
${renderPareto(report)}

## Key Findings
${(report.key_findings ?? []).map((f: string) => `- ${f}`).join('\n')}

## Recommendations
${(report.recommendations ?? []).map((r: { audience: string; recommendation: string }) => `- **${r.audience}**: ${r.recommendation}`).join('\n')}
`)

  // Update manifest status
  const finalManifest = ArenaManifest.parse({ ...manifest, status: 'completed' })
  writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(finalManifest, null, 2) + '\n')

  return { manifest: finalManifest, report, artifactsDir }
}
|
|
148
|
+
|
|
149
|
+
// ── Markdown Renderers ─────────────────────────────────────────────────────
|
|
150
|
+
|
|
151
|
+
/**
 * Renders the score matrix as a Markdown table: one row per criterion, one
 * column per participant, and a closing "Weighted Total" row.
 *
 * The Weight column shows each criterion's share of the total weight, using
 * the weight of the first cell recorded for that criterion. If no usable
 * (positive, finite) weights exist, all criteria are shown with equal shares.
 * The total row is the weight-weighted mean of a participant's scores, falling
 * back to the plain mean when that participant's cells carry no usable weight.
 *
 * @param report Object that may carry a `score_matrix` array.
 * @returns The Markdown table, or `'No scores available.\n'` when there are no cells.
 */
function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[] }): string {
  const cells = report.score_matrix
  if (!cells?.length) return 'No scores available.\n'

  // Build participant × criterion matrix, both in order of first appearance
  const participants = [...new Set(cells.map(s => s.participant_id))]
  const criteria = [...new Set(cells.map(s => s.criterion))]

  // Weights that are missing, non-finite or non-positive contribute nothing.
  const usable = (weight: number): number =>
    typeof weight === 'number' && Number.isFinite(weight) && weight > 0 ? weight : 0

  const criterionWeight = (criterion: string): number =>
    usable(cells.find(s => s.criterion === criterion)?.weight ?? 0)

  const totalWeight = criteria.reduce((sum, c) => sum + criterionWeight(c), 0)
  const share = (criterion: string): number =>
    totalWeight > 0 ? criterionWeight(criterion) / totalWeight : 1 / criteria.length

  // One decimal at most, without a trailing ".0" (25 -> "25%", 33.33 -> "33.3%").
  const percent = (fraction: number): string => `${Number((fraction * 100).toFixed(1))}%`

  let table = `| Criterion | Weight | ${participants.map(p => `${p}`).join(' | ')} |\n`
  table += `|${'---|'.repeat(2 + participants.length)}\n`

  for (const c of criteria) {
    table += `| ${c} | ${percent(share(c))} | ${participants.map(p => {
      const cell = cells.find(s => s.participant_id === p && s.criterion === c)
      return `**${cell?.score ?? '?'}**`
    }).join(' | ')} |\n`
  }

  // Weighted totals
  table += `| **Weighted Total** | 100% | ${participants.map(p => {
    const own = cells.filter(s => s.participant_id === p)
    const weightSum = own.reduce((sum, s) => sum + usable(s.weight), 0)
    const total = weightSum > 0
      ? own.reduce((sum, s) => sum + s.score * usable(s.weight), 0) / weightSum
      : own.length ? own.reduce((sum, s) => sum + s.score, 0) / own.length : 0
    return `**${total.toFixed(1)}**`
  }).join(' | ')} |\n`

  return table
}
|
|
177
|
+
|
|
178
|
+
/**
 * Renders the Pareto analysis as a Markdown bullet list, one bullet per participant.
 *
 * @param report Object that may carry a `pareto` array.
 * @returns The bullets joined by newlines (no trailing newline), or
 *   `'No Pareto analysis.\n'` when the array is missing or empty.
 */
function renderPareto(report: unknown & { pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[] }): string {
  const entries = report.pareto
  if (!entries?.length) {
    return 'No Pareto analysis.\n'
  }

  const bullets: string[] = []
  for (const entry of entries) {
    const status = entry.dominated
      ? `dominated by ${entry.dominated_by.join(', ')}`
      : 'Pareto-optimal (non-dominated)'
    bullets.push(`- **${entry.participant_id}**: ${status}`)
  }
  return bullets.join('\n')
}
|