@lythos/skill-arena 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -1
- package/src/arena-toml.test.ts +164 -0
- package/src/arena-toml.ts +172 -0
- package/src/cli.ts +95 -10
- package/src/comparative-judge.test.ts +92 -0
- package/src/comparative-judge.ts +166 -0
- package/src/player.test.ts +95 -0
- package/src/player.ts +71 -0
- package/src/runner.ts +250 -0
- package/src/stats.test.ts +111 -0
- package/src/stats.ts +117 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import { zodToJsonSchema } from 'zod-to-json-schema'
|
|
2
|
+
import { ComparativeReport, ScoreCell, ParetoEntry } from '@lythos/test-utils/schema'
|
|
3
|
+
import type { AgentAdapter } from '@lythos/test-utils/agents'
|
|
4
|
+
import type { ArenaManifest } from '@lythos/test-utils/schema'
|
|
5
|
+
|
|
6
|
+
// ── Pareto Frontier (deterministic algorithm) ──────────────────────────────
|
|
7
|
+
|
|
8
|
+
/**
 * One participant's per-criterion scores together with its dominance status.
 * The four fields match the shape of the entries built by `computePareto`.
 */
export interface ScoreVector {
  /** Identifier of the participant the scores belong to. */
  participant_id: string
  /** Numeric score keyed by criterion name. */
  scores: Record<string, number>
  /** True when at least one other participant dominates this one. */
  dominated: boolean
  /** Ids of the participants that dominate this one (empty when not dominated). */
  dominated_by: Array<string>
}
|
|
14
|
+
|
|
15
|
+
/**
 * Flag every participant that is Pareto-dominated by another participant.
 *
 * A dominates B when A scores at least as high as B on every criterion and
 * strictly higher on at least one. The criteria set is the union of all score
 * keys across participants; a score missing for a criterion counts as 0.
 * This is a deterministic computation — it is never delegated to the LLM.
 *
 * @param vectors - Participants with their per-criterion scores.
 * @returns One entry per input vector, in input order, carrying a copy of the
 *   scores, the `dominated` flag and the ids of all dominating participants.
 */
export function computePareto(vectors: { participant_id: string; scores: Record<string, number> }[]): ParetoEntry[] {
  const entries: ParetoEntry[] = []
  for (const { participant_id, scores } of vectors) {
    entries.push({ participant_id, scores: { ...scores }, dominated: false, dominated_by: [] })
  }

  // Union of all criteria across all participants
  const criteria = Array.from(new Set(vectors.flatMap(v => Object.keys(v.scores))))
  if (criteria.length === 0) return entries

  // True when `winner` is >= `loser` everywhere and > somewhere
  const dominates = (winner: Record<string, number>, loser: Record<string, number>): boolean => {
    let strictlyBetter = false
    for (const key of criteria) {
      const w = winner[key] ?? 0
      const l = loser[key] ?? 0
      if (!(w >= l)) return false
      if (w > l) strictlyBetter = true
    }
    return strictlyBetter
  }

  vectors.forEach((challenger, ci) => {
    vectors.forEach((target, ti) => {
      if (ci === ti || !dominates(challenger.scores, target.scores)) return
      const loser = entries[ti]
      const winnerId = entries[ci].participant_id
      loser.dominated = true
      if (loser.dominated_by.indexOf(winnerId) === -1) {
        loser.dominated_by.push(winnerId)
      }
    })
  })

  return entries
}
|
|
54
|
+
|
|
55
|
+
// ── Comparative Judge Prompt ──────────────────────────────────────────────
|
|
56
|
+
|
|
57
|
+
/**
 * Build the prompt sent to the comparative judge.
 *
 * The prompt lists the task, the participants (id, name, description), the
 * shared criteria and — when any are supplied — each participant's run verdict
 * serialized as JSON, so the judge has actual run evidence to score against
 * rather than only names and descriptions.
 *
 * @param opts.manifest - Arena manifest providing task, participants and criteria.
 * @param opts.verdicts - Per-participant verdicts to show to the judge. When
 *   empty, no "Run Verdicts" section is emitted.
 * @returns The complete prompt text.
 */
function buildComparativePrompt(opts: {
  manifest: ArenaManifest
  verdicts: { participantId: string; verdict: unknown }[]
}): string {
  const { manifest, verdicts } = opts
  const criteriaDesc = manifest.criteria.join(', ')
  const participants = manifest.participants
    .map(p => `- ${p.id}: ${p.name} (${p.description || 'no description'})`)
    .join('\n')

  // JSON.stringify yields undefined for an undefined verdict; show it as null instead
  const evidence = verdicts
    .map(v => `### ${v.participantId}\n${JSON.stringify(v.verdict, null, 2) ?? 'null'}`)
    .join('\n\n')
  const evidenceSection = verdicts.length > 0 ? `## Run Verdicts\n${evidence}\n\n` : ''

  return `You are a comparative judge evaluating ${manifest.participants.length} participants against shared criteria.

## Task
${manifest.task}

## Participants
${participants}

## Criteria
${criteriaDesc}

${evidenceSection}## Your Job
For each participant, score them 1-5 on each criterion. Provide a brief rationale.
Score meanings: 1=poor, 3=acceptable, 5=excellent.

Use the submit_scores tool to return your structured evaluation.`
}
|
|
83
|
+
|
|
84
|
+
/**
 * Tool definition passed to `judge.invokeTool`. Its input schema is the JSON
 * Schema of the three ComparativeReport fields the judge is asked to supply.
 */
const SCORE_TOOL = {
  name: 'submit_scores',
  description: 'Submit per-participant scores for each criterion with rationales',
  input_schema: zodToJsonSchema(
    ComparativeReport.pick({
      score_matrix: true,
      key_findings: true,
      recommendations: true,
    })
  ) as Record<string, unknown>,
}
|
|
89
|
+
|
|
90
|
+
/**
 * Validate raw score rows against the ScoreCell schema, preserving order.
 *
 * @param manifest - Accepted for signature compatibility; not read here.
 * @param scores - Raw rows to validate.
 * @returns The parsed cells, one per input row.
 * @throws Whatever `ScoreCell.parse` throws for a row that fails validation.
 */
function toScoreMatrix(
  manifest: ArenaManifest,
  scores: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]
): typeof ScoreCell._output[] {
  const cells: typeof ScoreCell._output[] = []
  for (const row of scores) {
    cells.push(ScoreCell.parse(row))
  }
  return cells
}
|
|
96
|
+
|
|
97
|
+
// ── Comparative Judge ─────────────────────────────────────────────────────
|
|
98
|
+
|
|
99
|
+
/**
 * Extract and parse the JSON payload from a judge's free-text stdout.
 * Uses the contents of the first fenced code block (optionally tagged `json`)
 * when one is present, otherwise the whole trimmed output.
 *
 * @throws SyntaxError naming the parse failure and quoting the start of the
 *   offending text, so a malformed judge reply is diagnosable.
 */
function parseJudgeStdout(stdout: string): unknown {
  const fenceMatch = stdout.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
  const jsonStr = fenceMatch ? fenceMatch[1].trim() : stdout.trim()
  try {
    return JSON.parse(jsonStr)
  } catch (e) {
    const detail = e instanceof Error ? e.message : String(e)
    throw new SyntaxError(`Comparative judge did not return valid JSON (${detail}): ${jsonStr.slice(0, 200)}`)
  }
}

/**
 * Run the comparative judge over all participants and assemble the report.
 *
 * The judge supplies the score matrix, key findings and recommendations —
 * through `judge.invokeTool` when the adapter offers it, otherwise by spawning
 * the judge and parsing JSON from its stdout. The Pareto analysis and the
 * weighted totals are computed locally from the validated score matrix.
 *
 * @param opts.manifest - Arena manifest (participants, criteria, id).
 * @param opts.verdicts - Per-participant verdicts forwarded to the prompt builder.
 * @param opts.judge - Agent adapter acting as judge.
 * @param opts.workdir - Working directory passed to the judge.
 * @returns The validated comparative report.
 * @throws SyntaxError when spawned judge output is not valid JSON; schema
 *   validation errors when the judge output or final report is malformed.
 */
export async function runComparativeJudge(opts: {
  manifest: ArenaManifest
  verdicts: { participantId: string; verdict: unknown }[]
  judge: AgentAdapter
  workdir: string
}): Promise<typeof ComparativeReport._output> {
  const { manifest, verdicts, judge, workdir } = opts

  const prompt = buildComparativePrompt({ manifest, verdicts })
  const timeoutMs = 120000

  let parsed: unknown
  if (judge.invokeTool) {
    parsed = await judge.invokeTool({ tool: SCORE_TOOL, prompt, cwd: workdir, timeoutMs })
  } else {
    const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs })
    parsed = parseJudgeStdout(result.stdout)
  }

  // Validate LLM output
  const llmResult = ComparativeReport.pick({
    score_matrix: true,
    key_findings: true,
    recommendations: true,
  }).parse(parsed)

  const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)

  // Pareto: deterministic, never delegated to LLM.
  // Only cells whose participant_id matches a manifest participant are used.
  const participantScores = manifest.participants.map(p => {
    const pScores: Record<string, number> = {}
    for (const cell of scoreMatrix) {
      if (cell.participant_id === p.id) {
        pScores[cell.criterion] = cell.score
      }
    }
    return { participant_id: p.id, scores: pScores }
  })

  const pareto = computePareto(participantScores)

  // Weighted totals: sum of score × weight divided by the participant's cell
  // count (0 when the participant has no cells)
  const weightedTotals: Record<string, number> = {}
  for (const p of manifest.participants) {
    const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
    weightedTotals[p.id] = pCells.reduce((sum, c) => sum + c.score * c.weight, 0) / (pCells.length || 1)
  }

  return ComparativeReport.parse({
    arena_id: manifest.id,
    generated_at: new Date().toISOString(),
    score_matrix: scoreMatrix,
    weighted_totals: weightedTotals,
    pareto,
    key_findings: llmResult.key_findings ?? [],
    recommendations: llmResult.recommendations ?? [],
  })
}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test'
|
|
2
|
+
import { resolvePlayer, resolveSides, groupBySide, totalRuns } from './player'
|
|
3
|
+
import { parseArenaToml } from './arena-toml'
|
|
4
|
+
|
|
5
|
+
// Shared fixture: two sides ("minimal" and "rich") with three runs per side.
const ARENA_FIXTURE_SOURCE = `
[arena]
task = "Test task"
criteria = ["a", "b"]
runs_per_side = 3

[[side]]
name = "minimal"
player = "claude-code"
deck = "./decks/minimal.toml"

[[side]]
name = "rich"
player = "expert-architect"
deck = "./decks/rich.toml"
`

const toml = parseArenaToml(ARENA_FIXTURE_SOURCE)
|
|
21
|
+
|
|
22
|
+
describe('resolvePlayer', () => {
  // Each row: test title, player reference passed in, expected platform
  const cases: [title: string, input: string, platform: string][] = [
    ['maps claude-code → claude', 'claude-code', 'claude'],
    ['maps Claude → claude (case insensitive)', 'Claude', 'claude'],
    ['maps kimi → kimi', 'kimi', 'kimi'],
    ['passes through unknown player names', 'expert-architect', 'expert-architect'],
    ['trims whitespace', ' claude-code ', 'claude'],
  ]

  for (const [title, input, platform] of cases) {
    test(title, () => {
      expect(resolvePlayer(input)).toBe(platform)
    })
  }
})
|
|
43
|
+
|
|
44
|
+
describe('resolveSides', () => {
  test('resolves all sides in arena.toml', () => {
    const resolved = resolveSides(toml)
    expect(resolved).toHaveLength(2)

    const [minimal, rich] = resolved
    expect(minimal.platform).toBe('claude')
    expect(rich.platform).toBe('expert-architect')
    // The original player reference is kept next to the resolved platform
    expect(minimal.playerName).toBe('claude-code')
  })

  test('preserves side config', () => {
    const firstSide = resolveSides(toml)[0].side
    expect(firstSide.name).toBe('minimal')
    expect(firstSide.deck).toBe('./decks/minimal.toml')
  })
})
|
|
59
|
+
|
|
60
|
+
describe('groupBySide', () => {
  test('groups by side name with run count', () => {
    const grouped = groupBySide(toml)
    expect(grouped).toHaveLength(2)

    const [first, second] = grouped
    expect(first.runs).toBe(3) // runs_per_side
    expect(second.runs).toBe(3)
    expect(first.platform).toBe('claude')
  })

  test('control flag preserved', () => {
    // Second side is explicitly marked as the control
    const controlSource = `
[arena]
task = "x"
criteria = ["a"]

[[side]]
name = "test"
player = "claude-code"
deck = "a.toml"

[[side]]
name = "baseline"
player = "claude-code"
deck = "b.toml"
control = true
`
    const grouped = groupBySide(parseArenaToml(controlSource))
    expect(grouped[1].control).toBe(true)
  })
})
|
|
90
|
+
|
|
91
|
+
describe('totalRuns', () => {
  test('calculates sides × runs_per_side', () => {
    // 2 sides × 3 runs
    const expectedRuns = 6
    expect(totalRuns(toml)).toBe(expectedRuns)
  })
})
|
package/src/player.ts
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import type { Side, ArenaToml } from './arena-toml'
|
|
2
|
+
|
|
3
|
+
// ── Player reference resolution (pure function) ────────────────────────────
|
|
4
|
+
// Maps arena.toml player names → platform identifiers.
|
|
5
|
+
// AgentAdapter creation is the IO layer's job (T4), not ours.
|
|
6
|
+
|
|
7
|
+
/** A side from arena.toml paired with the platform its player resolves to. */
export interface ResolvedSide {
  /** The side configuration as it appears in the parsed arena.toml. */
  side: Side
  /** Resolved platform for useAgent(). */
  platform: string
  /** Original player reference, before resolution. */
  playerName: string
}
|
|
12
|
+
|
|
13
|
+
/** Built-in player registry. Player names that map directly to useAgent platforms. */
const BUILTIN_PLAYERS: Record<string, string> = {
  'claude': 'claude',
  'claude-code': 'claude',
  'kimi': 'kimi',
  'cursor': 'cursor',
  'gemini': 'gemini',
}

/**
 * Resolve a player reference to its platform identifier.
 * - The name is lower-cased and trimmed before lookup
 * - Built-in names (claude, claude-code, kimi, cursor, gemini) map to their platform
 * - Unknown names are passed through in normalized form (assumed to be useAgent-compatible)
 * - Future: custom player.toml files will override built-in mappings
 *
 * @param name - Player reference as written in arena.toml.
 * @returns The platform identifier; always a string.
 */
export function resolvePlayer(name: string): string {
  const normalized = name.toLowerCase().trim()
  // Own-property check: a bare index lookup would also find members inherited
  // from Object.prototype ("constructor", "__proto__", …) and return a non-string.
  return Object.prototype.hasOwnProperty.call(BUILTIN_PLAYERS, normalized)
    ? BUILTIN_PLAYERS[normalized]
    : normalized
}
|
|
32
|
+
|
|
33
|
+
/**
 * Pair every side of an arena.toml with its resolved platform.
 * Pure function — no IO, no agent creation.
 *
 * @param toml - Parsed arena.toml.
 * @returns One resolved entry per side, in declaration order.
 */
export function resolveSides(toml: ArenaToml): ResolvedSide[] {
  const resolved: ResolvedSide[] = []
  for (const side of toml.side) {
    const playerName = side.player
    resolved.push({ side, platform: resolvePlayer(playerName), playerName })
  }
  return resolved
}
|
|
44
|
+
|
|
45
|
+
// ── Side grouping (for per-side aggregation in T3) ─────────────────────────
|
|
46
|
+
|
|
47
|
+
/** Flat per-side summary used for per-side aggregation. */
export interface SideGroup {
  /** Name of the side. */
  sideName: string
  /** Player reference as written in the side config. */
  player: string
  /** Deck path as written in the side config. */
  deck: string
  /** The side's control flag. */
  control: boolean
  /** Number of runs for this side. */
  runs: number
  /** Platform the player reference resolved to. */
  platform: string
}
|
|
55
|
+
|
|
56
|
+
/**
 * Build one summary per side for per-side statistical aggregation.
 * Every side gets the arena-wide `runs_per_side` as its run count.
 */
export function groupBySide(toml: ArenaToml): SideGroup[] {
  return resolveSides(toml).map(({ side, platform, playerName }) => {
    const group: SideGroup = {
      sideName: side.name,
      player: playerName,
      deck: side.deck,
      control: side.control,
      runs: toml.arena.runs_per_side,
      platform,
    }
    return group
  })
}
|
|
67
|
+
|
|
68
|
+
/** Total number of runs an arena.toml describes: number of sides × runs_per_side. */
export function totalRuns(toml: ArenaToml): number {
  const { side, arena } = toml
  const sideCount = side.length
  return sideCount * arena.runs_per_side
}
|
package/src/runner.ts
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
|
|
2
|
+
import { join, resolve } from 'node:path'
|
|
3
|
+
import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
|
|
4
|
+
import { useAgent } from '@lythos/test-utils/agents'
|
|
5
|
+
import { ArenaManifest, Player } from '@lythos/test-utils/schema'
|
|
6
|
+
import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
|
|
7
|
+
import { runComparativeJudge } from './comparative-judge'
|
|
8
|
+
import { parseArenaToml, buildExecutionPlan, type ArenaToml } from './arena-toml'
|
|
9
|
+
import { resolvePlayer, resolveSides } from './player'
|
|
10
|
+
import { aggregateAllStats } from './stats'
|
|
11
|
+
import type { SideStats } from './stats'
|
|
12
|
+
|
|
13
|
+
// ── Helpers ───────────────────────────────────────────────────────────────
|
|
14
|
+
|
|
15
|
+
/** Current local time formatted as `YYYYMMDD-HHMMSS`; used to build arena ids. */
function stamp(): string {
  const now = new Date()
  const two = (n: number): string => String(n).padStart(2, '0')

  const datePart = `${now.getFullYear()}${two(now.getMonth() + 1)}${two(now.getDate())}`
  const timePart = [now.getHours(), now.getMinutes(), now.getSeconds()].map(n => two(n)).join('')
  return `${datePart}-${timePart}`
}
|
|
19
|
+
|
|
20
|
+
// ── Declarative runner (arena.toml → execute) ─────────────────────────────
|
|
21
|
+
|
|
22
|
+
/** Outcome of a full (non-dry-run) arena execution. */
export interface ArenaResult {
  /** Final arena manifest, as also written to arena.json. */
  manifest: ArenaManifestType
  /** Comparative judge report; left untyped at this boundary. */
  report: unknown
  /** Aggregated per-side statistics. */
  stats: Array<SideStats>
  /** Directory that received arena.json, report.md and the run folders. */
  artifactsDir: string
}
|
|
28
|
+
|
|
29
|
+
/**
 * Execute an arena described by a parsed arena.toml.
 *
 * Steps: build the execution plan (returned as-is for a dry run), write an
 * initial `arena.json` with status "running", run every plan cell through
 * `runAgentScenario`, aggregate per-side statistics, run the comparative
 * judge, write `report.md`, then rewrite `arena.json` with status "completed".
 *
 * A cell that throws, or returns no verdict, is recorded as an ERROR verdict
 * instead of aborting the arena.
 *
 * @param opts.toml - Parsed arena configuration.
 * @param opts.taskPath - Path to the task/scenario file.
 * @param opts.outDir - Artifacts directory; defaults to `<cwd>/runs/<arena id>`.
 * @param opts.dryRun - When truthy, only the execution plan is returned.
 */
export async function runArenaFromToml(opts: {
  toml: ArenaToml
  taskPath: string
  outDir?: string
  dryRun?: boolean
}): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
  const { toml, taskPath, outDir, dryRun } = opts

  const plan = buildExecutionPlan(toml)

  // dry-run: return plan without executing
  if (dryRun) return { plan }

  const arenaId = `arena-${stamp()}`
  const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)
  const resolved = resolveSides(toml)

  // One participant per distinct side name; a later side with the same name
  // replaces the earlier one but keeps its position
  const bySideName = new Map<string, (typeof resolved)[number]>()
  for (const r of resolved) bySideName.set(r.side.name, r)
  const participants = Array.from(bySideName.values(), r => ({
    id: r.side.name,
    name: r.side.name,
    player: r.platform,
    deck: r.side.deck,
    description: `${r.playerName} × ${r.side.deck}`,
  }))

  // Build manifest (only the first 200 characters of the task file are kept)
  const manifest = ArenaManifest.parse({
    id: arenaId,
    created_at: new Date().toISOString(),
    task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
    mode: 'decks',
    participants,
    criteria: toml.arena.criteria,
    status: 'running',
  })

  const manifestPath = join(artifactsDir, 'arena.json')
  mkdirSync(artifactsDir, { recursive: true })
  writeFileSync(manifestPath, JSON.stringify(manifest, null, 2) + '\n')

  // Execute plan: per-cell agent run
  const verdictsBySide = new Map<string, JudgeVerdict[]>()
  const record = (side: string, verdict: JudgeVerdict): void => {
    const bucket = verdictsBySide.get(side)
    if (bucket) {
      bucket.push(verdict)
    } else {
      verdictsBySide.set(side, [verdict])
    }
  }

  for (const cell of plan.cells) {
    const sideDir = join(artifactsDir, 'runs', cell.side)
    mkdirSync(join(sideDir, `run-${cell.run}`), { recursive: true })

    try {
      const agent = useAgent(resolvePlayer(cell.player))
      const outcome = await runAgentScenario({
        scenarioPath: resolve(taskPath),
        agent,
        // Copy the side's deck into the scenario workdir as skill-deck.toml
        setupWorkdir(_scenario: AgentScenario, workdir: string) {
          mkdirSync(workdir, { recursive: true })
          const deckContent = readFileSync(resolve(cell.deck), 'utf-8')
          writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
        },
        baseDir: sideDir,
      })

      const fallback = {
        verdict: 'ERROR' as const,
        reason: 'No verdict returned',
        criteria: [],
      }
      record(cell.side, (outcome.verdict ?? fallback) as JudgeVerdict)
    } catch (e) {
      record(cell.side, {
        verdict: 'ERROR' as const,
        reason: `Runner exception: ${e instanceof Error ? e.message : String(e)}`,
        criteria: [],
      })
    }
  }

  // Aggregate stats
  const stats = aggregateAllStats(verdictsBySide)

  // Comparative judge: only the first run's verdict of each side is forwarded
  const flatVerdicts: { participantId: string; verdict: unknown }[] = []
  verdictsBySide.forEach((verdicts, side) => {
    if (verdicts.length > 0) {
      flatVerdicts.push({ participantId: side, verdict: verdicts[0] })
    }
  })

  // The judge runs on the first side's platform, or 'claude' when there are no sides
  const judge = useAgent(resolved[0]?.platform ?? 'claude')
  const report = await runComparativeJudge({
    manifest,
    verdicts: flatVerdicts,
    judge,
    workdir: artifactsDir,
  })

  // Write report
  writeReport(artifactsDir, manifest, report, stats)

  // Update manifest
  const finalManifest = ArenaManifest.parse({ ...manifest, status: 'completed' })
  writeFileSync(manifestPath, JSON.stringify(finalManifest, null, 2) + '\n')

  return { manifest: finalManifest, report, stats, artifactsDir }
}
|
|
135
|
+
|
|
136
|
+
// ── Backward compat: CLI-flag style runner ─────────────────────────────────
|
|
137
|
+
|
|
138
|
+
/**
 * Backward-compatible entry point for the CLI-flag style invocation.
 *
 * Builds an in-memory ArenaToml with one side per (player, deck) pair — named
 * `run-01`, `run-02`, … in player-major order — and a single run per side,
 * then delegates to `runArenaFromToml`.
 *
 * @param opts.taskPath - Path to the task file.
 * @param opts.playerPaths - Paths to player JSON files; each is validated with
 *   the Player schema and contributes its `platform`.
 * @param opts.deckPaths - Paths to deck files.
 * @param opts.criteria - Judging criteria.
 * @param opts.outDir - Artifacts directory.
 * @returns Manifest, report and artifacts directory of the completed arena.
 * @throws When the task or a player file cannot be read or parsed, or when the
 *   delegated run unexpectedly yields a dry-run plan.
 */
export async function runArena(opts: {
  taskPath: string
  playerPaths: string[]
  deckPaths: string[]
  criteria: string[]
  outDir: string
}): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
  const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts

  const task = readFileSync(resolve(taskPath), 'utf-8').slice(0, 200)

  const side = playerPaths.flatMap((playerPath, pi) => {
    // No decks → no sides, and no reason to touch the player file
    if (deckPaths.length === 0) return []
    // Read and validate each player file once, not once per deck
    const platform = Player.parse(JSON.parse(readFileSync(resolve(playerPath), 'utf-8'))).platform
    return deckPaths.map((deckPath, di) => ({
      name: `run-${String(pi * deckPaths.length + di + 1).padStart(2, '0')}`,
      player: platform,
      deck: deckPath,
    }))
  })

  // Convert CLI flags to ArenaToml internally
  const toml: ArenaToml = {
    arena: {
      task,
      criteria,
      runs_per_side: 1,
      // NOTE(review): sides are players × decks, yet this is min(players, decks) — confirm intent
      max_participants: Math.min(playerPaths.length, deckPaths.length),
    },
    side,
  }

  const result = await runArenaFromToml({ toml, taskPath, outDir })
  // dryRun is not requested above, so a plan-only result indicates a programming error
  if ('plan' in result) {
    throw new Error('runArenaFromToml returned a dry-run plan although dryRun was not requested')
  }
  const { manifest, report, artifactsDir } = result
  return { manifest, report, artifactsDir }
}
|
|
168
|
+
|
|
169
|
+
// ── Report renderer ────────────────────────────────────────────────────────
|
|
170
|
+
|
|
171
|
+
/**
 * Render the arena report as Markdown and write it to `<dir>/report.md`.
 *
 * Sections, in order: header (task, criteria, date), score matrix, per-side
 * statistics, Pareto frontier, key findings, recommendations.
 */
function writeReport(dir: string, manifest: ArenaManifestType, report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]; pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[]; key_findings?: string[]; recommendations?: { audience: string; recommendation: string }[] }, stats: SideStats[]): void {
  // A titled section followed by a blank separator line
  const section = (title: string, ...body: string[]): string[] => [title, '', ...body, '']

  const header: string[] = [
    `# Arena Report: ${manifest.id}`,
    '',
    `**Task**: ${manifest.task}`,
    `**Criteria**: ${manifest.criteria.join(', ')}`,
    `**Date**: ${new Date().toISOString()}`,
    '',
  ]
  const findings = (report.key_findings ?? []).map((f: string) => `- ${f}`)
  const recommendations = (report.recommendations ?? []).map(
    (r: { audience: string; recommendation: string }) => `- **${r.audience}**: ${r.recommendation}`
  )

  const lines: string[] = [
    ...header,
    ...section('## Score Matrix', renderScoreMatrix(report)),
    ...section('## Per-Side Statistics', renderStatsTable(stats)),
    ...section('## Pareto Frontier', renderPareto(report)),
    ...section('## Key Findings', ...findings),
    // The last section has no trailing blank line
    '## Recommendations',
    '',
    ...recommendations,
  ]

  writeFileSync(join(dir, 'report.md'), lines.join('\n') + '\n')
}
|
|
202
|
+
|
|
203
|
+
/**
 * Render per-side statistics as a Markdown table.
 *
 * Pass rate and per-criterion means are ratios shown as whole percentages;
 * mean confidence is printed as-is with a `%` suffix, or `-` when absent.
 *
 * @returns The table (every row newline-terminated), or a placeholder line
 *   when there are no statistics.
 */
function renderStatsTable(stats: SideStats[]): string {
  if (stats.length === 0) return 'No statistics available.\n'

  const asPercent = (ratio: number): string => `${(ratio * 100).toFixed(0)}%`

  const header =
    `| Side | Runs | Pass Rate | Mean Confidence | Criteria |\n` +
    `|------|------|-----------|-----------------|----------|\n`

  const rows = stats.map(s => {
    const confidence = s.meanConfidence != null ? `${s.meanConfidence.toFixed(0)}%` : '-'
    const criteria = s.criteria.map(c => `${c.name}: ${asPercent(c.mean)}`).join(', ')
    return `| ${s.sideName} | ${s.runs} | ${asPercent(s.passRate)} | ${confidence} | ${criteria} |\n`
  })

  return header + rows.join('')
}
|
|
217
|
+
|
|
218
|
+
/**
 * Render the score matrix as a Markdown table: one row per criterion, one
 * column per participant, plus a closing total row.
 *
 * The total row is the plain mean of each participant's scores, so the Weight
 * column shows each criterion's equal share (100 / number of criteria) rather
 * than a fixed value.
 * NOTE(review): the per-cell `weight` field is not used by this renderer —
 * confirm whether the table should reflect it.
 *
 * @returns The table (every row newline-terminated), or a placeholder line
 *   when there are no scores.
 */
function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[] }): string {
  const cells = report.score_matrix
  if (!cells?.length) return 'No scores available.\n'

  const participants = [...new Set(cells.map(s => s.participant_id))]
  const criteria = [...new Set(cells.map(s => s.criterion))]

  // Single pass: index the first score per (participant, criterion) and
  // accumulate each participant's sum and cell count for the total row
  const scoreAt = new Map<string, Map<string, number>>()
  const totals = new Map<string, { sum: number; count: number }>()
  for (const cell of cells) {
    let row = scoreAt.get(cell.participant_id)
    if (!row) {
      row = new Map<string, number>()
      scoreAt.set(cell.participant_id, row)
    }
    if (!row.has(cell.criterion)) row.set(cell.criterion, cell.score)

    const running = totals.get(cell.participant_id) ?? { sum: 0, count: 0 }
    running.sum += cell.score
    running.count += 1
    totals.set(cell.participant_id, running)
  }

  const share = `${(100 / criteria.length).toFixed(0)}%`

  let table = `| Criterion | Weight | ${participants.join(' | ')} |\n`
  table += `|${'---|'.repeat(2 + participants.length)}\n`

  for (const c of criteria) {
    const scores = participants.map(p => `**${scoreAt.get(p)?.get(c) ?? '?'}**`)
    table += `| ${c} | ${share} | ${scores.join(' | ')} |\n`
  }

  const averages = participants.map(p => {
    const t = totals.get(p)
    const avg = t && t.count ? t.sum / t.count : 0
    return `**${avg.toFixed(1)}**`
  })
  table += `| **Weighted Total** | 100% | ${averages.join(' | ')} |\n`

  return table
}
|
|
242
|
+
|
|
243
|
+
/**
 * Render the Pareto analysis as a Markdown bullet list, one bullet per
 * participant: either who dominates it or that it is Pareto-optimal.
 *
 * @returns The bullets joined by newlines (no trailing newline), or a
 *   placeholder line when there is no Pareto data.
 */
function renderPareto(report: unknown & { pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[] }): string {
  const entries = report.pareto
  if (!entries || entries.length === 0) return 'No Pareto analysis.\n'

  const bullets: string[] = []
  for (const entry of entries) {
    if (entry.dominated) {
      bullets.push(`- **${entry.participant_id}**: dominated by ${entry.dominated_by.join(', ')}`)
    } else {
      bullets.push(`- **${entry.participant_id}**: Pareto-optimal (non-dominated)`)
    }
  }
  return bullets.join('\n')
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test'
|
|
2
|
+
import { aggregateSideStats, aggregateAllStats } from './stats'
|
|
3
|
+
import type { JudgeVerdict } from '@lythos/test-utils/schema'
|
|
4
|
+
|
|
5
|
+
/**
 * Test helper: build a passing verdict with a single passed "correctness"
 * criterion, with any supplied fields overriding the defaults.
 */
function makeVerdict(overrides?: Partial<JudgeVerdict>): JudgeVerdict {
  const passing: JudgeVerdict = {
    verdict: 'PASS',
    reason: 'OK',
    criteria: [{ name: 'correctness', passed: true }],
  }
  // A fresh default object is created per call, so mutating it here is safe
  return Object.assign(passing, overrides)
}
|
|
13
|
+
|
|
14
|
+
// ── aggregateSideStats ─────────────────────────────────────────────────────
|
|
15
|
+
|
|
16
|
+
describe('aggregateSideStats', () => {
  test('single run: passRate=1, no variance', () => {
    const result = aggregateSideStats('test', [makeVerdict()])
    expect(result.sideName).toBe('test')
    expect(result.runs).toBe(1)
    expect(result.passRate).toBe(1)
    expect(result.failRate).toBe(0)
    expect(result.errorRate).toBe(0)
  })

  test('3 runs: 2 PASS, 1 FAIL', () => {
    const failing = makeVerdict({ verdict: 'FAIL', reason: 'bad' })
    const result = aggregateSideStats('test', [makeVerdict(), makeVerdict(), failing])
    expect(result.passRate).toBeCloseTo(2 / 3)
    expect(result.failRate).toBeCloseTo(1 / 3)
  })

  test('confidence: mean across runs', () => {
    const runs = [90, 80, 70].map(confidence => makeVerdict({ confidence }))
    const result = aggregateSideStats('test', runs)
    expect(result.meanConfidence).toBeCloseTo(80)
    expect(result.confidenceVariance).toBeCloseTo(100) // (100+0+100)/2 = 100
  })

  test('confidence: null when no verdict has it', () => {
    const result = aggregateSideStats('test', [makeVerdict(), makeVerdict()])
    expect(result.meanConfidence).toBeNull()
    expect(result.confidenceVariance).toBeNull()
  })

  test('per-criterion pass rate', () => {
    const runs = [true, false, true].map(passed =>
      makeVerdict({ criteria: [{ name: 'accuracy', passed }] })
    )
    const { criteria } = aggregateSideStats('test', runs)
    expect(criteria).toHaveLength(1)
    expect(criteria[0].name).toBe('accuracy')
    expect(criteria[0].mean).toBeCloseTo(2 / 3)
  })

  test('per-criterion scores: mean and variance', () => {
    const runs = [5, 3, 4].map(coverage => makeVerdict({ scores: { coverage, relevance: 4 } }))
    const { scoreByCriterion } = aggregateSideStats('test', runs)
    expect(scoreByCriterion.coverage.mean).toBeCloseTo(4)
    expect(scoreByCriterion.relevance.mean).toBeCloseTo(4)
    expect(scoreByCriterion.relevance.variance).toBe(0) // all 4s
  })

  test('zero runs: all zeros', () => {
    const result = aggregateSideStats('empty', [])
    expect(result.runs).toBe(0)
    expect(result.passRate).toBe(0)
    expect(result.meanConfidence).toBeNull()
  })

  test('handles ERROR verdicts correctly', () => {
    const errored = makeVerdict({ verdict: 'ERROR', reason: 'parse failed' })
    const result = aggregateSideStats('test', [makeVerdict(), errored])
    expect(result.passRate).toBe(0.5)
    expect(result.errorRate).toBe(0.5)
  })
})
|
|
95
|
+
|
|
96
|
+
// ── aggregateAllStats ──────────────────────────────────────────────────────
|
|
97
|
+
|
|
98
|
+
describe('aggregateAllStats', () => {
  test('aggregates multiple sides', () => {
    // side-a: two passing runs; side-b: a single failing run
    const verdictsBySide = new Map<string, JudgeVerdict[]>([
      ['side-a', [makeVerdict(), makeVerdict()]],
      ['side-b', [makeVerdict({ verdict: 'FAIL', reason: 'nope' })]],
    ])

    const all = aggregateAllStats(verdictsBySide)
    expect(all).toHaveLength(2)

    const [sideA, sideB] = all
    expect(sideA.sideName).toBe('side-a')
    expect(sideA.passRate).toBe(1)
    expect(sideB.sideName).toBe('side-b')
    expect(sideB.passRate).toBe(0)
  })
})
|