@lythos/skill-arena 0.9.7 → 0.9.9
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/arena-toml.ts +3 -0
- package/src/comparative-judge.ts +63 -49
- package/src/runner.ts +8 -0
package/package.json
CHANGED
package/src/arena-toml.ts
CHANGED
|
@@ -27,6 +27,9 @@ export const ArenaToml = z.object({
|
|
|
27
27
|
criteria: z.array(z.string()).min(1),
|
|
28
28
|
runs_per_side: z.number().int().positive().default(1),
|
|
29
29
|
max_participants: z.number().int().min(2).max(5).default(5),
|
|
30
|
+
model: z.string().optional(), // e.g. "claude-sonnet-4-6"
|
|
31
|
+
endpoint: z.string().optional(), // e.g. "api.anthropic.com"
|
|
32
|
+
notes: z.string().optional(), // freeform reproducibility notes
|
|
30
33
|
}),
|
|
31
34
|
side: z.array(Side).min(2).max(5),
|
|
32
35
|
})
|
package/src/comparative-judge.ts
CHANGED
|
@@ -106,61 +106,75 @@ export async function runComparativeJudge(opts: {
|
|
|
106
106
|
|
|
107
107
|
const prompt = buildComparativePrompt({ manifest, verdicts })
|
|
108
108
|
|
|
109
|
-
let raw
|
|
109
|
+
let raw = ''
|
|
110
110
|
let parsed: unknown
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
const llmResult = ComparativeReport.pick({
|
|
130
|
-
score_matrix: true,
|
|
131
|
-
key_findings: true,
|
|
132
|
-
recommendations: true,
|
|
133
|
-
}).parse(parsed)
|
|
134
|
-
|
|
135
|
-
const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
|
|
136
|
-
|
|
137
|
-
// Pareto: deterministic, never delegated to LLM
|
|
138
|
-
const participantScores = manifest.participants.map(p => {
|
|
139
|
-
const pScores: Record<string, number> = {}
|
|
140
|
-
for (const cell of scoreMatrix) {
|
|
141
|
-
if (cell.participant_id === p.id) {
|
|
142
|
-
pScores[cell.criterion] = cell.score
|
|
111
|
+
let lastError: string | undefined
|
|
112
|
+
|
|
113
|
+
for (let attempt = 0; attempt <= 2; attempt++) {
|
|
114
|
+
try {
|
|
115
|
+
if (judge.invokeTool) {
|
|
116
|
+
parsed = await judge.invokeTool({
|
|
117
|
+
tool: SCORE_TOOL,
|
|
118
|
+
prompt,
|
|
119
|
+
cwd: workdir,
|
|
120
|
+
timeoutMs: 120000,
|
|
121
|
+
})
|
|
122
|
+
raw = JSON.stringify(parsed)
|
|
123
|
+
} else {
|
|
124
|
+
const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
|
|
125
|
+
raw = result.stdout
|
|
126
|
+
const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
|
|
127
|
+
const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
|
|
128
|
+
parsed = JSON.parse(jsonStr)
|
|
143
129
|
}
|
|
144
|
-
}
|
|
145
|
-
return { participant_id: p.id, scores: pScores }
|
|
146
|
-
})
|
|
147
130
|
|
|
148
|
-
|
|
131
|
+
// Validate LLM output through Zod
|
|
132
|
+
const llmResult = ComparativeReport.pick({
|
|
133
|
+
score_matrix: true,
|
|
134
|
+
key_findings: true,
|
|
135
|
+
recommendations: true,
|
|
136
|
+
}).parse(parsed)
|
|
137
|
+
|
|
138
|
+
// Success — proceed to Pareto computation
|
|
139
|
+
const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
|
|
140
|
+
const participantScores = manifest.participants.map(p => {
|
|
141
|
+
const pScores: Record<string, number> = {}
|
|
142
|
+
for (const cell of scoreMatrix) {
|
|
143
|
+
if (cell.participant_id === p.id) pScores[cell.criterion] = cell.score
|
|
144
|
+
}
|
|
145
|
+
return { participant_id: p.id, scores: pScores }
|
|
146
|
+
})
|
|
147
|
+
const pareto = computePareto(participantScores)
|
|
148
|
+
const weightedTotals: Record<string, number> = {}
|
|
149
|
+
for (const p of manifest.participants) {
|
|
150
|
+
const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
|
|
151
|
+
weightedTotals[p.id] = pCells.reduce((sum, c) => sum + c.score * c.weight, 0) / (pCells.length || 1)
|
|
152
|
+
}
|
|
149
153
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
154
|
+
return ComparativeReport.parse({
|
|
155
|
+
arena_id: manifest.id,
|
|
156
|
+
generated_at: new Date().toISOString(),
|
|
157
|
+
score_matrix: scoreMatrix,
|
|
158
|
+
weighted_totals: weightedTotals,
|
|
159
|
+
pareto,
|
|
160
|
+
key_findings: llmResult.key_findings ?? [],
|
|
161
|
+
recommendations: llmResult.recommendations ?? [],
|
|
162
|
+
})
|
|
163
|
+
} catch (e) {
|
|
164
|
+
lastError = e instanceof Error ? e.message : String(e)
|
|
165
|
+
if (attempt < 2) continue // retry
|
|
166
|
+
}
|
|
155
167
|
}
|
|
156
168
|
|
|
157
|
-
return
|
|
169
|
+
// All retries exhausted: return fallback report
|
|
170
|
+
const empty: typeof ComparativeReport._output = {
|
|
158
171
|
arena_id: manifest.id,
|
|
159
172
|
generated_at: new Date().toISOString(),
|
|
160
|
-
score_matrix:
|
|
161
|
-
weighted_totals:
|
|
162
|
-
pareto,
|
|
163
|
-
key_findings:
|
|
164
|
-
recommendations:
|
|
165
|
-
}
|
|
173
|
+
score_matrix: [],
|
|
174
|
+
weighted_totals: {},
|
|
175
|
+
pareto: [],
|
|
176
|
+
key_findings: [`Comparative judge failed after 3 attempts: ${lastError}`],
|
|
177
|
+
recommendations: [],
|
|
178
|
+
}
|
|
179
|
+
return empty
|
|
166
180
|
}
|
package/src/runner.ts
CHANGED
|
@@ -119,6 +119,14 @@ export async function runArenaFromToml(opts: {
|
|
|
119
119
|
criteria: [],
|
|
120
120
|
}) as JudgeVerdict
|
|
121
121
|
|
|
122
|
+
// Persist per-cell verdict + agent output for auditability
|
|
123
|
+
writeFileSync(join(cellDir, 'judge-verdict.json'), JSON.stringify({
|
|
124
|
+
...v,
|
|
125
|
+
agent_stdout: result.agentResult.stdout.slice(0, 5000),
|
|
126
|
+
agent_stderr: result.agentResult.stderr.slice(0, 1000),
|
|
127
|
+
duration_ms: result.agentResult.durationMs,
|
|
128
|
+
}, null, 2) + '\n')
|
|
129
|
+
|
|
122
130
|
if (!verdictsBySide.has(cell.side)) verdictsBySide.set(cell.side, [])
|
|
123
131
|
verdictsBySide.get(cell.side)!.push(v)
|
|
124
132
|
} catch (e) {
|