@lythos/skill-arena 0.11.2 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/package.json +5 -5
- package/src/arena-toml.test.ts +44 -46
- package/src/arena-toml.ts +12 -13
- package/src/cli.ts +238 -667
- package/src/runner.ts +152 -183
package/src/runner.ts
CHANGED
|
@@ -1,18 +1,20 @@
|
|
|
1
|
-
import { existsSync, mkdirSync, writeFileSync, readFileSync,
|
|
1
|
+
import { existsSync, mkdirSync, writeFileSync, readFileSync, cpSync, readdirSync } from 'node:fs'
|
|
2
2
|
import { join, resolve } from 'node:path'
|
|
3
|
-
import {
|
|
4
|
-
import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
|
|
3
|
+
import { homedir } from 'node:os'
|
|
5
4
|
import { useAgent } from '@lythos/test-utils/agents'
|
|
6
|
-
|
|
5
|
+
import { createSanitizer } from '@lythos/test-utils/sanitize'
|
|
6
|
+
import { runLLMJudge } from '@lythos/test-utils/judge'
|
|
7
|
+
import { readCheckpoints } from '@lythos/test-utils/bdd-runner'
|
|
8
|
+
import { ArenaManifest, Player, type JudgeInput, type Evidence, type JudgeVerdict } from '@lythos/test-utils/schema'
|
|
9
|
+
import type { ArenaManifest as ArenaManifestType } from '@lythos/test-utils/schema'
|
|
7
10
|
try { await import('@lythos/agent-adapter-claude-sdk') } catch { /* package not installed */ }
|
|
8
11
|
try { await import('@lythos/agent-adapter-deepseek-serve') } catch { /* package not installed */ }
|
|
9
|
-
import { ArenaManifest, Player } from '@lythos/test-utils/schema'
|
|
10
|
-
import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
|
|
11
12
|
import { runComparativeJudge } from './comparative-judge'
|
|
12
13
|
import { parseArenaToml, buildExecutionPlan, type ArenaToml, type ExecutionPlan } from './arena-toml'
|
|
13
14
|
import { resolvePlayer, resolveSides } from './player'
|
|
14
15
|
import { aggregateAllStats } from './stats'
|
|
15
16
|
import type { SideStats } from './stats'
|
|
17
|
+
import { buildCopyPlan } from './preflight'
|
|
16
18
|
|
|
17
19
|
// ── Helpers ───────────────────────────────────────────────────────────────
|
|
18
20
|
|
|
@@ -21,8 +23,6 @@ function stamp(): string {
|
|
|
21
23
|
return `${d.getFullYear()}${String(d.getMonth() + 1).padStart(2, '0')}${String(d.getDate()).padStart(2, '0')}-${String(d.getHours()).padStart(2, '0')}${String(d.getMinutes()).padStart(2, '0')}${String(d.getSeconds()).padStart(2, '0')}`
|
|
22
24
|
}
|
|
23
25
|
|
|
24
|
-
// ── Declarative runner (arena.toml → execute) ─────────────────────────────
|
|
25
|
-
|
|
26
26
|
export interface ArenaResult {
|
|
27
27
|
manifest: ArenaManifestType
|
|
28
28
|
report: unknown
|
|
@@ -30,7 +30,30 @@ export interface ArenaResult {
|
|
|
30
30
|
artifactsDir: string
|
|
31
31
|
}
|
|
32
32
|
|
|
33
|
-
|
|
33
|
+
// ── Task + judge text resolution (no parsing — natural language) ──────────
|
|
34
|
+
|
|
35
|
+
/**
 * Resolve the arena task to its natural-language text.
 *
 * `toml.arena.task` is either a path to a file holding the task text or the
 * task text itself. When it resolves (against `configDir` if given, otherwise
 * the current working directory) to an existing file, the file content is
 * returned; otherwise the value is returned verbatim as inline text.
 *
 * @param toml - Parsed arena configuration.
 * @param configDir - Directory that relative task paths are resolved against.
 * @returns The task text.
 * @throws Re-throws any read error other than "path is a directory".
 */
function resolveTaskText(toml: ArenaToml, configDir?: string): string {
  const raw = toml.arena.task
  const candidate = configDir ? resolve(configDir, raw) : resolve(raw)
  if (!existsSync(candidate)) return raw

  try {
    return readFileSync(candidate, 'utf-8')
  } catch (err) {
    // existsSync() is also true for directories: an empty task, "." or a word
    // that happens to match a folder name must stay inline text, not crash.
    if ((err as { code?: string } | null)?.code === 'EISDIR') return raw
    throw err
  }
}
|
|
41
|
+
|
|
42
|
+
/**
 * Resolve the judge instructions for the arena, if any.
 *
 * Precedence:
 * 1. `toml.arena.judge` — a path to a file with judge instructions (resolved
 *    against `configDir` if given, otherwise the current working directory)
 *    or the instructions themselves as inline text.
 * 2. `toml.arena.criteria` — rendered as a markdown bullet list.
 *
 * @param toml - Parsed arena configuration.
 * @param configDir - Directory that relative judge paths are resolved against.
 * @returns The judge text, or `null` when neither source is configured.
 * @throws Re-throws any read error other than "path is a directory".
 */
function resolveJudgeText(toml: ArenaToml, configDir?: string): string | null {
  const judge = toml.arena.judge
  if (judge) {
    const candidate = configDir ? resolve(configDir, judge) : resolve(judge)
    if (!existsSync(candidate)) return judge
    try {
      return readFileSync(candidate, 'utf-8')
    } catch (err) {
      // existsSync() is also true for directories; inline judge text that
      // happens to name a folder must be used as-is instead of crashing.
      if ((err as { code?: string } | null)?.code === 'EISDIR') return judge
      throw err
    }
  }

  const criteria = toml.arena.criteria
  if (criteria && criteria.length > 0) {
    return criteria.map(c => `- ${c}`).join('\n')
  }
  return null
}
|
|
54
|
+
|
|
55
|
+
// ── Plan formatting ───────────────────────────────────────────────────────
|
|
56
|
+
|
|
34
57
|
export function formatPlanOutput(plan: ExecutionPlan): string[] {
|
|
35
58
|
const lines: string[] = []
|
|
36
59
|
const sideCount = new Set(plan.cells.map(c => c.side)).size
|
|
@@ -41,51 +64,25 @@ export function formatPlanOutput(plan: ExecutionPlan): string[] {
|
|
|
41
64
|
return lines
|
|
42
65
|
}
|
|
43
66
|
|
|
67
|
+
// ── Main ──────────────────────────────────────────────────────────────────
|
|
68
|
+
|
|
44
69
|
export async function runArenaFromToml(opts: {
|
|
45
70
|
toml: ArenaToml
|
|
46
71
|
taskPath: string
|
|
47
72
|
outDir?: string
|
|
48
73
|
dryRun?: boolean
|
|
49
74
|
log?: (msg: string) => void
|
|
50
|
-
configDir?: string
|
|
75
|
+
configDir?: string
|
|
51
76
|
}): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
|
|
52
77
|
const { toml, taskPath, outDir, dryRun, log, configDir } = opts
|
|
53
78
|
|
|
54
|
-
// Resolve relative paths against config dir (anti-footgun: cwd may differ)
|
|
55
79
|
const resolvePath = (p: string) => {
|
|
56
80
|
if (p.startsWith('/')) return p
|
|
57
81
|
if (configDir) return resolve(configDir, p)
|
|
58
82
|
return resolve(p)
|
|
59
83
|
}
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
if (existsSync(candidate)) return { path: candidate }
|
|
63
|
-
// taskPath is inline text — write temp scenario file
|
|
64
|
-
const tmp = join(tmpdir(), `arena-task-${stamp()}.agent.md`)
|
|
65
|
-
writeFileSync(tmp, `---
|
|
66
|
-
name: arena task
|
|
67
|
-
description: ${taskPath.slice(0, 80)}
|
|
68
|
-
timeout: 120000
|
|
69
|
-
---
|
|
70
|
-
|
|
71
|
-
## Given
|
|
72
|
-
- Working directory with an empty project
|
|
73
|
-
- bun is available
|
|
74
|
-
|
|
75
|
-
## When
|
|
76
|
-
${taskPath}
|
|
77
|
-
|
|
78
|
-
## Then
|
|
79
|
-
- Complete the task above
|
|
80
|
-
- Write a summary to output.md
|
|
81
|
-
|
|
82
|
-
## Judge
|
|
83
|
-
- completeness
|
|
84
|
-
- correctness
|
|
85
|
-
`)
|
|
86
|
-
return { path: tmp, cleanup: () => { try { rmSync(tmp) } catch {} } }
|
|
87
|
-
}
|
|
88
|
-
const { path: taskAbs, cleanup: taskCleanup } = resolveOrCreateTask()
|
|
84
|
+
|
|
85
|
+
const taskText = resolveTaskText(toml, configDir)
|
|
89
86
|
const resolvedToml: ArenaToml = {
|
|
90
87
|
...toml,
|
|
91
88
|
side: toml.side.map(s => ({ ...s, deck: resolvePath(s.deck) })),
|
|
@@ -93,11 +90,8 @@ ${taskPath}
|
|
|
93
90
|
|
|
94
91
|
const plan = buildExecutionPlan(resolvedToml)
|
|
95
92
|
|
|
96
|
-
// dry-run: return plan without executing
|
|
97
93
|
if (dryRun) {
|
|
98
|
-
for (const line of formatPlanOutput(plan))
|
|
99
|
-
log?.(line)
|
|
100
|
-
}
|
|
94
|
+
for (const line of formatPlanOutput(plan)) log?.(line)
|
|
101
95
|
return { plan }
|
|
102
96
|
}
|
|
103
97
|
|
|
@@ -105,14 +99,10 @@ ${taskPath}
|
|
|
105
99
|
const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)
|
|
106
100
|
const resolved = resolveSides(resolvedToml)
|
|
107
101
|
|
|
108
|
-
// Build manifest
|
|
109
|
-
const taskContent = existsSync(taskAbs)
|
|
110
|
-
? readFileSync(taskAbs, 'utf-8').slice(0, 200)
|
|
111
|
-
: taskPath // inline description, not a file path
|
|
112
102
|
const manifest = ArenaManifest.parse({
|
|
113
103
|
id: arenaId,
|
|
114
104
|
created_at: new Date().toISOString(),
|
|
115
|
-
task:
|
|
105
|
+
task: taskText.slice(0, 200),
|
|
116
106
|
mode: 'decks',
|
|
117
107
|
participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
|
|
118
108
|
id: r.side.name,
|
|
@@ -121,78 +111,110 @@ ${taskPath}
|
|
|
121
111
|
deck: r.side.deck,
|
|
122
112
|
description: `${r.playerName} × ${r.side.deck}`,
|
|
123
113
|
})),
|
|
124
|
-
criteria: resolvedToml.arena.criteria,
|
|
114
|
+
criteria: resolvedToml.arena.criteria ?? [resolvedToml.arena.judge ?? 'completeness'],
|
|
125
115
|
status: 'running',
|
|
126
116
|
})
|
|
127
117
|
|
|
128
118
|
mkdirSync(artifactsDir, { recursive: true })
|
|
129
119
|
writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(manifest, null, 2) + '\n')
|
|
130
120
|
|
|
131
|
-
|
|
121
|
+
const judgeText = resolveJudgeText(resolvedToml, configDir)
|
|
122
|
+
const judgeInput: JudgeInput | undefined = judgeText
|
|
123
|
+
? { criteria: judgeText, task_context: taskText.slice(0, 500) }
|
|
124
|
+
: undefined
|
|
125
|
+
|
|
126
|
+
// ── Per-cell: agent.spawn directly, no AgentScenario/parseAgentMd ────
|
|
132
127
|
const verdictsBySide = new Map<string, JudgeVerdict[]>()
|
|
133
128
|
|
|
134
129
|
for (const cell of plan.cells) {
|
|
135
130
|
const cellDir = join(artifactsDir, 'runs', cell.side, `run-${cell.run}`)
|
|
136
131
|
mkdirSync(cellDir, { recursive: true })
|
|
137
132
|
|
|
133
|
+
const workDir = join(artifactsDir, 'work', cell.side)
|
|
134
|
+
mkdirSync(workDir, { recursive: true })
|
|
135
|
+
const originalCwd = process.cwd()
|
|
136
|
+
|
|
138
137
|
try {
|
|
138
|
+
// Setup: deck + AGENTS.md + link
|
|
139
|
+
writeFileSync(join(workDir, 'skill-deck.toml'), readFileSync(cell.deck, 'utf-8'))
|
|
140
|
+
writeFileSync(join(workDir, 'AGENTS.md'), [
|
|
141
|
+
'# Arena Test Environment',
|
|
142
|
+
`**Side**: ${cell.side}`, `**Player**: ${cell.player}`, `**Run**: ${cell.run}`,
|
|
143
|
+
'## Task', '', taskText,
|
|
144
|
+
'## How This Works',
|
|
145
|
+
'- Isolated arena test directory. Skills in skill-deck.toml, linked via deck link.',
|
|
146
|
+
'- Complete the task using available skills. Output to this directory.',
|
|
147
|
+
].join('\n'))
|
|
148
|
+
const linkProc = Bun.spawn(
|
|
149
|
+
['bunx', '@lythos/skill-deck', 'link'],
|
|
150
|
+
{ cwd: workDir, env: { ...process.env, HOME: process.env.HOME! } },
|
|
151
|
+
)
|
|
152
|
+
await linkProc.exited
|
|
153
|
+
log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
|
|
154
|
+
|
|
155
|
+
process.chdir(workDir)
|
|
156
|
+
|
|
157
|
+
// Direct agent.spawn (no parseAgentMd, no AgentScenario)
|
|
139
158
|
const agent = useAgent(resolvePlayer(cell.player))
|
|
140
|
-
const
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
mkdirSync(workdir, { recursive: true })
|
|
145
|
-
const deckContent = readFileSync(cell.deck, 'utf-8')
|
|
146
|
-
writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
|
|
147
|
-
|
|
148
|
-
// Write AGENTS.md bootloader — agents read this on entry
|
|
149
|
-
writeFileSync(join(workdir, 'AGENTS.md'), [
|
|
150
|
-
'# Arena Test Environment',
|
|
151
|
-
'',
|
|
152
|
-
`**Side**: ${cell.side}`,
|
|
153
|
-
`**Player**: ${cell.player}`,
|
|
154
|
-
`**Run**: ${cell.run}`,
|
|
155
|
-
'',
|
|
156
|
-
'## Task',
|
|
157
|
-
'',
|
|
158
|
-
scenario.it ?? scenario.description ?? '(no task description)',
|
|
159
|
-
'',
|
|
160
|
-
'## How This Works',
|
|
161
|
-
'',
|
|
162
|
-
'- This is an isolated arena test directory. No parent `.claude/skills/` exists.',
|
|
163
|
-
'- Skills are configured in `skill-deck.toml` and symlinked by `deck link`.',
|
|
164
|
-
'- Complete the task above using the available skills.',
|
|
165
|
-
'- Output your work to this directory (or `output/` if specified).',
|
|
166
|
-
'',
|
|
167
|
-
'## Expected Output',
|
|
168
|
-
'',
|
|
169
|
-
'After completing the task, write a brief summary of what you did.',
|
|
170
|
-
].join('\n'))
|
|
171
|
-
|
|
172
|
-
// Link skills via bunx (works both locally and when installed via bunx)
|
|
173
|
-
const linkProc = Bun.spawn(
|
|
174
|
-
['bunx', '@lythos/skill-deck', 'link'],
|
|
175
|
-
{ cwd: workdir, env: { ...process.env, HOME: process.env.HOME! } },
|
|
176
|
-
)
|
|
177
|
-
await linkProc.exited
|
|
178
|
-
log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
|
|
179
|
-
},
|
|
180
|
-
// Isolated CWD: /tmp/arena-<id>/<side>/ — no parent .claude/skills/ to walk up into
|
|
181
|
-
baseDir: join(tmpdir(), `arena-${arenaId}`, cell.side),
|
|
159
|
+
const agentResult = await agent.spawn({
|
|
160
|
+
cwd: workDir,
|
|
161
|
+
brief: taskText,
|
|
162
|
+
timeoutMs: 300000,
|
|
182
163
|
})
|
|
183
164
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
165
|
+
process.chdir(originalCwd)
|
|
166
|
+
|
|
167
|
+
// Persist agent output
|
|
168
|
+
const sanitizer = createSanitizer({ projectRoot: process.cwd(), homeDir: homedir(), workDir })
|
|
169
|
+
writeFileSync(join(cellDir, 'agent-stdout.txt'), sanitizer.sanitize(agentResult.stdout), 'utf-8')
|
|
170
|
+
if (agentResult.stderr) writeFileSync(join(cellDir, 'agent-stderr.txt'), sanitizer.sanitize(agentResult.stderr), 'utf-8')
|
|
171
|
+
|
|
172
|
+
// Copy artifacts
|
|
173
|
+
const skipSet = new Set(['.claude', 'skill-deck.toml', 'skill-deck.lock', 'AGENTS.md'])
|
|
174
|
+
try {
|
|
175
|
+
const entries = readdirSync(workDir)
|
|
176
|
+
const copyPlan = buildCopyPlan(workDir, cellDir, entries, skipSet)
|
|
177
|
+
for (const { src, dest, name } of copyPlan) {
|
|
178
|
+
try { cpSync(src, dest, { recursive: true }) } catch (e) {
|
|
179
|
+
log?.(`⚠️ Failed to copy agent output: ${name} — ${e instanceof Error ? e.message : e}`)
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
} catch (e) {
|
|
183
|
+
log?.(`⚠️ Failed to read agent workdir for copy: ${e instanceof Error ? e.message : e}`)
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
// Evidence
|
|
187
|
+
const checkpoints = readCheckpoints(workDir)
|
|
188
|
+
let artifactFiles: string[] = []
|
|
189
|
+
try {
|
|
190
|
+
for (const e of readdirSync(workDir)) {
|
|
191
|
+
if (!e.startsWith('.') && !skipSet.has(e) && e !== 'agent-stdout.txt' && e !== 'agent-stderr.txt' && e !== 'judge-verdict.json' && e !== '_checkpoints') {
|
|
192
|
+
artifactFiles.push(e)
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
} catch {}
|
|
196
|
+
|
|
197
|
+
// Per-cell judge — runLLMJudge as toolbox function, no intermediate pipeline
|
|
198
|
+
let v: JudgeVerdict
|
|
199
|
+
if (judgeInput) {
|
|
200
|
+
const evidence: Evidence = {
|
|
201
|
+
sandbox_cwd: workDir,
|
|
202
|
+
stdout: agentResult.stdout,
|
|
203
|
+
stderr: agentResult.stderr,
|
|
204
|
+
artifact_files: artifactFiles,
|
|
205
|
+
}
|
|
206
|
+
const judgeAgent = useAgent(resolvePlayer(resolved[0]?.platform ?? 'claude'))
|
|
207
|
+
const judgeResult = await runLLMJudge(judgeInput, evidence, checkpoints, judgeAgent)
|
|
208
|
+
v = judgeResult.verdict ?? { verdict: 'ERROR' as const, reason: 'No verdict returned', criteria: [] }
|
|
209
|
+
} else {
|
|
210
|
+
v = { verdict: 'ERROR' as const, reason: 'No judge criteria provided', criteria: [] }
|
|
211
|
+
}
|
|
189
212
|
|
|
190
|
-
// Persist per-cell verdict + agent output for auditability
|
|
191
213
|
writeFileSync(join(cellDir, 'judge-verdict.json'), JSON.stringify({
|
|
192
214
|
...v,
|
|
193
|
-
agent_stdout:
|
|
194
|
-
agent_stderr:
|
|
195
|
-
duration_ms:
|
|
215
|
+
agent_stdout: agentResult.stdout.slice(0, 5000),
|
|
216
|
+
agent_stderr: agentResult.stderr.slice(0, 1000),
|
|
217
|
+
duration_ms: agentResult.durationMs,
|
|
196
218
|
}, null, 2) + '\n')
|
|
197
219
|
|
|
198
220
|
if (!verdictsBySide.has(cell.side)) verdictsBySide.set(cell.side, [])
|
|
@@ -209,55 +231,30 @@ ${taskPath}
|
|
|
209
231
|
}
|
|
210
232
|
}
|
|
211
233
|
|
|
212
|
-
// Aggregate
|
|
234
|
+
// Aggregate + comparative
|
|
213
235
|
const stats = aggregateAllStats(verdictsBySide)
|
|
214
|
-
|
|
215
|
-
// Comparative judge
|
|
216
236
|
const flatVerdicts: { participantId: string; verdict: unknown }[] = []
|
|
217
237
|
for (const [side, verdicts] of verdictsBySide) {
|
|
218
|
-
|
|
219
|
-
if (verdicts.length > 0) {
|
|
220
|
-
flatVerdicts.push({ participantId: side, verdict: verdicts[0] })
|
|
221
|
-
}
|
|
238
|
+
if (verdicts.length > 0) flatVerdicts.push({ participantId: side, verdict: verdicts[0] })
|
|
222
239
|
}
|
|
223
|
-
|
|
224
240
|
const judge = useAgent(resolved[0]?.platform ?? 'claude')
|
|
225
|
-
const report = await runComparativeJudge({
|
|
226
|
-
manifest,
|
|
227
|
-
verdicts: flatVerdicts,
|
|
228
|
-
judge,
|
|
229
|
-
workdir: artifactsDir,
|
|
230
|
-
})
|
|
231
|
-
|
|
232
|
-
// Write report
|
|
241
|
+
const report = await runComparativeJudge({ manifest, verdicts: flatVerdicts, judge, workdir: artifactsDir })
|
|
233
242
|
writeReport(artifactsDir, manifest, report, stats)
|
|
234
243
|
|
|
235
|
-
// Update manifest
|
|
236
244
|
const finalManifest = ArenaManifest.parse({ ...manifest, status: 'completed' })
|
|
237
245
|
writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(finalManifest, null, 2) + '\n')
|
|
238
246
|
|
|
239
247
|
return { manifest: finalManifest, report, stats, artifactsDir }
|
|
240
248
|
}
|
|
241
249
|
|
|
242
|
-
// ── Backward compat
|
|
250
|
+
// ── Backward compat ──────────────────────────────────────────────────────
|
|
243
251
|
|
|
244
252
|
export async function runArena(opts: {
|
|
245
|
-
taskPath: string
|
|
246
|
-
playerPaths: string[]
|
|
247
|
-
deckPaths: string[]
|
|
248
|
-
criteria: string[]
|
|
249
|
-
outDir: string
|
|
253
|
+
taskPath: string; playerPaths: string[]; deckPaths: string[]; criteria: string[]; outDir: string
|
|
250
254
|
}): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
|
|
251
255
|
const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts
|
|
252
|
-
|
|
253
|
-
// Convert CLI flags to ArenaToml internally
|
|
254
256
|
const toml: ArenaToml = {
|
|
255
|
-
arena: {
|
|
256
|
-
task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
|
|
257
|
-
criteria,
|
|
258
|
-
runs_per_side: 1,
|
|
259
|
-
max_participants: Math.min(playerPaths.length, deckPaths.length),
|
|
260
|
-
},
|
|
257
|
+
arena: { task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200), criteria, runs_per_side: 1, max_participants: Math.min(playerPaths.length, deckPaths.length) } as any,
|
|
261
258
|
side: playerPaths.flatMap((playerPath, pi) =>
|
|
262
259
|
deckPaths.map((deckPath, di) => ({
|
|
263
260
|
name: `run-${String(pi * deckPaths.length + di + 1).padStart(2, '0')}`,
|
|
@@ -266,89 +263,61 @@ export async function runArena(opts: {
|
|
|
266
263
|
}))
|
|
267
264
|
),
|
|
268
265
|
}
|
|
269
|
-
|
|
270
266
|
const result = await runArenaFromToml({ toml, taskPath, outDir })
|
|
271
267
|
const { manifest, report, artifactsDir } = result as ArenaResult
|
|
272
268
|
return { manifest, report, artifactsDir }
|
|
273
269
|
}
|
|
274
270
|
|
|
275
|
-
// ── Report
|
|
271
|
+
// ── Report ────────────────────────────────────────────────────────────────
|
|
276
272
|
|
|
277
|
-
function writeReport(dir: string, manifest: ArenaManifestType, report:
|
|
273
|
+
/**
 * Render the arena report as markdown and write it to `<dir>/report.md`.
 *
 * Sections, in order: header (id, task, criteria, date), score matrix,
 * per-side statistics, Pareto frontier, key findings, recommendations.
 *
 * @param dir - Directory the report file is written into.
 * @param manifest - Arena manifest providing id, task and criteria.
 * @param report - Comparative-judge report; `key_findings` and
 *   `recommendations` are optional and default to empty lists.
 * @param stats - Aggregated per-side statistics.
 */
function writeReport(dir: string, manifest: ArenaManifestType, report: any, stats: SideStats[]): void {
  const out: string[] = []

  // Header
  out.push(`# Arena Report: ${manifest.id}`, '')
  out.push(`**Task**: ${manifest.task}`)
  const criteriaLabels = manifest.criteria
    .map((criterion: any) => (typeof criterion === 'string' ? criterion : criterion.label))
    .join(', ')
  out.push(`**Criteria**: ${criteriaLabels}`)
  out.push(`**Date**: ${new Date().toISOString()}`, '')

  // Rendered tables
  out.push('## Score Matrix', '', renderScoreMatrix(report), '')
  out.push('## Per-Side Statistics', '', renderStatsTable(stats), '')
  out.push('## Pareto Frontier', '', renderPareto(report), '')

  // Free-form judge output
  const findings: string[] = report.key_findings ?? []
  out.push('## Key Findings', '', ...findings.map(finding => `- ${finding}`), '')

  const recommendations: any[] = report.recommendations ?? []
  out.push(
    '## Recommendations',
    '',
    ...recommendations.map(rec => `- **${rec.audience}**: ${rec.recommendation}`),
  )

  writeFileSync(join(dir, 'report.md'), `${out.join('\n')}\n`)
}
|
|
308
287
|
|
|
309
288
|
/**
 * Render per-side statistics as a markdown table.
 *
 * One row per side: name, run count, pass rate, mean confidence (or `-` when
 * absent) and a comma-separated list of per-criterion means. Pass rate and
 * criterion means are ratios shown as whole percentages.
 *
 * @param stats - Aggregated statistics, one entry per side.
 * @returns The table (every line newline-terminated), or a placeholder line
 *   when `stats` is empty.
 */
function renderStatsTable(stats: SideStats[]): string {
  if (!stats.length) return 'No statistics available.\n'

  const asPercent = (ratio: number): string => `${(ratio * 100).toFixed(0)}%`

  const header = [
    '| Side | Runs | Pass Rate | Mean Confidence | Criteria |',
    '|------|------|-----------|-----------------|----------|',
  ]

  const rows = stats.map(side => {
    // meanConfidence is already on a 0-100 scale, unlike the ratios below.
    const confidence = side.meanConfidence == null ? '-' : `${side.meanConfidence.toFixed(0)}%`
    const perCriterion = side.criteria
      .map(entry => `${entry.name}: ${asPercent(entry.mean)}`)
      .join(', ')
    return `| ${side.sideName} | ${side.runs} | ${asPercent(side.passRate)} | ${confidence} | ${perCriterion} |`
  })

  return [...header, ...rows].map(line => `${line}\n`).join('')
}
|
|
323
298
|
|
|
324
|
-
function renderScoreMatrix(report:
|
|
299
|
+
/**
 * Render the comparative judge's score matrix as a markdown table.
 *
 * Rows are criteria, columns are participants, both in order of first
 * appearance in `report.score_matrix`. Each cell shows the first score found
 * for that participant/criterion pair, or `?` when there is none. Criteria are
 * weighted equally, so the weight column shows `100 / criteriaCount` percent
 * and the total row is the mean of the participant's numeric scores.
 *
 * @param report - Comparative-judge report; reads `score_matrix`, a list of
 *   `{ participant_id, criterion, score }` entries.
 * @returns The table (every line newline-terminated), or a placeholder line
 *   when the report has no score matrix.
 */
function renderScoreMatrix(report: any): string {
  const matrix: any[] = Array.isArray(report?.score_matrix) ? report.score_matrix : []
  if (matrix.length === 0) return 'No scores available.\n'

  // Index the first entry per (participant, criterion) so each cell is a
  // lookup instead of a scan of the whole matrix.
  const firstByParticipant = new Map<unknown, Map<unknown, any>>()
  for (const entry of matrix) {
    let row = firstByParticipant.get(entry.participant_id)
    if (!row) {
      row = new Map<unknown, any>()
      firstByParticipant.set(entry.participant_id, row)
    }
    if (!row.has(entry.criterion)) row.set(entry.criterion, entry)
  }

  const participants = [...firstByParticipant.keys()]
  const criteria = [...new Set(matrix.map(entry => entry.criterion))]

  // Equal weighting: derive the share from the actual number of criteria
  // rather than assuming there are always four.
  const weight = `${(100 / criteria.length).toFixed(0)}%`

  let table = `| Criterion | Weight | ${participants.join(' | ')} |\n|${'---|'.repeat(2 + participants.length)}\n`

  for (const criterion of criteria) {
    const cells = participants.map(p => {
      const cell = firstByParticipant.get(p)?.get(criterion)
      return `**${cell?.score ?? '?'}**`
    })
    table += `| ${criterion} | ${weight} | ${cells.join(' | ')} |\n`
  }

  const totals = participants.map(p => {
    // Only finite numbers take part in the average; a missing or textual
    // score would otherwise produce NaN or break toFixed().
    const scores: number[] = matrix
      .filter(entry => entry.participant_id === p)
      .map(entry => entry.score)
      .filter((score): score is number => typeof score === 'number' && Number.isFinite(score))
    const avg = scores.length ? scores.reduce((sum, score) => sum + score, 0) / scores.length : 0
    return `**${avg.toFixed(1)}**`
  })
  table += `| **Weighted Total** | 100% | ${totals.join(' | ')} |\n`

  return table
}
|
|
348
317
|
|
|
349
|
-
function renderPareto(report:
|
|
318
|
+
function renderPareto(report: any): string {
|
|
350
319
|
if (!report.pareto?.length) return 'No Pareto analysis.\n'
|
|
351
|
-
return report.pareto.map(p =>
|
|
320
|
+
return report.pareto.map((p: any) =>
|
|
352
321
|
p.dominated
|
|
353
322
|
? `- **${p.participant_id}**: dominated by ${p.dominated_by.join(', ')}`
|
|
354
323
|
: `- **${p.participant_id}**: Pareto-optimal (non-dominated)`
|