@lythos/skill-arena 0.9.11 → 0.9.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/comparative-judge.ts +169 -30
- package/src/runner.ts +12 -3
package/package.json
CHANGED
package/src/comparative-judge.ts
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { zodToJsonSchema } from 'zod-to-json-schema'
|
|
2
1
|
import { ComparativeReport, ScoreCell, ParetoEntry } from '@lythos/test-utils/schema'
|
|
3
2
|
import type { AgentAdapter } from '@lythos/test-utils/agents'
|
|
4
3
|
import type { ArenaManifest } from '@lythos/test-utils/schema'
|
|
@@ -78,13 +77,29 @@ ${criteriaDesc}
|
|
|
78
77
|
For each participant, score them 1-5 on each criterion. Provide a brief rationale.
|
|
79
78
|
Score meanings: 1=poor, 3=acceptable, 5=excellent.
|
|
80
79
|
|
|
81
|
-
|
|
82
|
-
|
|
80
|
+
## Output Schema
|
|
81
|
+
Your response must conform to this Zod schema:
|
|
82
|
+
\`\`\`ts
|
|
83
|
+
z.object({
|
|
84
|
+
score_matrix: z.array(z.object({
|
|
85
|
+
participant_id: z.string(),
|
|
86
|
+
criterion: z.string(),
|
|
87
|
+
weight: z.number().min(0).max(1),
|
|
88
|
+
score: z.number().int().min(1).max(5),
|
|
89
|
+
rationale: z.string(),
|
|
90
|
+
})),
|
|
91
|
+
key_findings: z.array(z.string()),
|
|
92
|
+
recommendations: z.array(z.object({
|
|
93
|
+
audience: z.string(),
|
|
94
|
+
recommendation: z.string(),
|
|
95
|
+
})),
|
|
96
|
+
})
|
|
97
|
+
\`\`\`
|
|
98
|
+
score_matrix is a FLAT ARRAY of objects — NOT nested by participant or criterion.
|
|
99
|
+
weight: 0.25 for each cell (1 / num_criteria).
|
|
100
|
+
score: 1=poor, 3=acceptable, 5=excellent.
|
|
83
101
|
|
|
84
|
-
|
|
85
|
-
name: 'submit_scores',
|
|
86
|
-
description: 'Submit per-participant scores for each criterion with rationales',
|
|
87
|
-
input_schema: zodToJsonSchema(ComparativeReport.pick({ score_matrix: true, key_findings: true, recommendations: true })) as Record<string, unknown>,
|
|
102
|
+
Use the submit_scores tool to return your structured evaluation.`
|
|
88
103
|
}
|
|
89
104
|
|
|
90
105
|
function toScoreMatrix(
|
|
@@ -94,6 +109,112 @@ function toScoreMatrix(
|
|
|
94
109
|
return scores.map(s => ScoreCell.parse(s))
|
|
95
110
|
}
|
|
96
111
|
|
|
112
|
+
// ── LLM Output Normalization (handle common schema mismatches) ─────────────
|
|
113
|
+
|
|
114
|
+
/** One `score_matrix` entry in the flat shape the judge prompt asks for. */
interface NormalizedScoreCell {
  participant_id: string
  criterion: string
  weight: number
  score: number
  rationale: string
}

/** Score substituted when the judge supplied text that cannot be read as a number. */
const NEUTRAL_SCORE = 3

/** Top-level report fields that must never be mistaken for participant ids. */
const REPORT_FIELD_KEYS = new Set(['score_matrix', 'key_findings', 'recommendations', 'weighted_totals', 'pareto'])

/** Suffixes marking a sibling key as the rationale of a criterion rather than a criterion itself. */
const RATIONALE_KEY_SUFFIXES = ['_rationale', '_reason', '_note', '_notes']

/** Field names accepted as the rationale inside a cell object, in precedence order. */
const RATIONALE_FIELD_NAMES = ['rationale', 'reason', 'notes', 'explanation', 'note']

/** True for non-null, non-array objects. */
function isPlainRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/** Reads a finite number from a number or a non-blank numeric string; `undefined` otherwise. */
function toFiniteNumber(value: unknown): number | undefined {
  if (typeof value === 'number') return Number.isFinite(value) ? value : undefined
  if (typeof value === 'string' && value.trim() !== '') {
    const n = Number(value)
    return Number.isFinite(n) ? n : undefined
  }
  return undefined
}

/** Rounds to an integer and clamps into the 1-5 scoring scale. */
function clampScore(value: number): number {
  return Math.max(1, Math.min(5, Math.round(value)))
}

/** Returns the first truthy value found under `keys`, or `undefined`. */
function firstTruthyField(record: Record<string, unknown>, keys: readonly string[]): unknown {
  for (const key of keys) {
    const value = record[key]
    if (value) return value
  }
  return undefined
}

/** Converts a loose value to text; objects are JSON-encoded instead of becoming "[object Object]". */
function looseText(value: unknown): string {
  if (value === undefined || value === null) return ''
  if (typeof value === 'string') return value
  return typeof value === 'object' ? JSON.stringify(value) : String(value)
}

/**
 * Flattens `{ participant: { criterion: ... } }` into score cells.
 *
 * A criterion value may be a number, a numeric string, a descriptive string
 * (kept as the rationale when no other rationale exists) or an object carrying
 * `score` plus a rationale field. Each participant's cells share the weight
 * `1 / <number of criteria for that participant>`. The input is not mutated.
 */
function pivotToScoreCells(byParticipant: Record<string, unknown>, participantIds: readonly string[]): NormalizedScoreCell[] {
  const cells: NormalizedScoreCell[] = []
  for (const participantId of participantIds) {
    const entry = byParticipant[participantId]
    if (!isPlainRecord(entry)) continue
    const criterionKeys = Object.keys(entry).filter(
      key => !RATIONALE_KEY_SUFFIXES.some(suffix => key.endsWith(suffix)),
    )
    for (const criterion of criterionKeys) {
      const raw = entry[criterion]
      // Sibling keys such as `<criterion>_rationale` take precedence.
      let rationale = firstTruthyField(entry, RATIONALE_KEY_SUFFIXES.map(suffix => `${criterion}${suffix}`))
      let score = NEUTRAL_SCORE
      if (isPlainRecord(raw)) {
        score = toFiniteNumber(raw.score) ?? NEUTRAL_SCORE
        if (!rationale) rationale = firstTruthyField(raw, RATIONALE_FIELD_NAMES)
      } else {
        const numeric = toFiniteNumber(raw)
        if (numeric !== undefined) score = numeric
        // A non-numeric string in the score slot is most likely the rationale itself.
        else if (typeof raw === 'string' && !rationale) rationale = raw
      }
      cells.push({
        participant_id: participantId,
        criterion,
        weight: 1 / criterionKeys.length,
        score: clampScore(score),
        rationale: looseText(rationale).slice(0, 300),
      })
    }
  }
  return cells
}

/**
 * Repairs common shape mismatches in the judge's parsed JSON before schema validation.
 *
 * Works on a shallow copy; neither `parsed` nor its nested objects are modified.
 *
 * - `score_matrix` given as an object keyed by participant, or participants given as
 *   top-level keys (at least two object-valued keys that are not report fields), are
 *   flattened into an array of cells.
 * - Array cells: `participantId` / `side` map to `participant_id`; `reason` / `notes` /
 *   `explanation` / `note` map to `rationale`; numeric scores are rounded and clamped to
 *   1-5; non-numeric string scores become 3; a missing score stays `NaN` so validation
 *   still rejects it; weights above 1 are read as percentages; a missing or unreadable
 *   weight becomes `1 / <number of distinct criteria>`.
 * - `recommendations` defaults to `[]`; string entries become
 *   `{ audience: 'general', recommendation }`.
 * - `key_findings` defaults to `[]`; a single string is wrapped in an array.
 *
 * @param parsed Object produced by `JSON.parse` on the judge output.
 * @returns A new object with the repairs applied; unrelated keys are carried over.
 */
function normalizeComparativeOutput(parsed: Record<string, unknown>): Record<string, unknown> {
  const out: Record<string, unknown> = { ...parsed }

  const rawMatrix = out.score_matrix
  if (!Array.isArray(rawMatrix)) {
    if (isPlainRecord(rawMatrix)) {
      // The matrix was nested by participant under `score_matrix` itself.
      const ids = Object.keys(rawMatrix).filter(key => isPlainRecord(rawMatrix[key]))
      const cells = pivotToScoreCells(rawMatrix, ids)
      if (cells.length > 0) out.score_matrix = cells
    } else {
      const ids = Object.keys(out).filter(key => !REPORT_FIELD_KEYS.has(key) && isPlainRecord(out[key]))
      // Require two participants so a lone unrelated object is not misread as one.
      if (ids.length >= 2) {
        const cells = pivotToScoreCells(out, ids)
        if (cells.length > 0) {
          out.score_matrix = cells
          for (const id of ids) delete out[id]
        }
      }
    }
  }

  if (Array.isArray(out.score_matrix)) {
    const records = (out.score_matrix as unknown[]).map(cell => (isPlainRecord(cell) ? cell : {}))
    const distinctCriteria = new Set(records.map(cell => String(cell.criterion ?? 'unknown')))
    const defaultWeight = 1 / (distinctCriteria.size || 1)

    out.score_matrix = records.map((cell): NormalizedScoreCell => {
      const participantId =
        firstTruthyField(cell, ['participant_id', 'participantId', 'side']) ?? cell.participant_id ?? 'unknown'

      const numericScore = toFiniteNumber(cell.score)
      let score = Number.NaN // missing / non-scalar score: leave invalid so validation rejects it
      if (numericScore !== undefined) score = clampScore(numericScore)
      else if (typeof cell.score === 'string') score = NEUTRAL_SCORE

      let weight = toFiniteNumber(cell.weight)
      if (weight === undefined) weight = defaultWeight
      else if (weight > 1) weight = weight / 100 // assume percentage scale

      return {
        participant_id: String(participantId),
        criterion: String(cell.criterion ?? 'unknown'),
        weight,
        score,
        rationale: looseText(firstTruthyField(cell, RATIONALE_FIELD_NAMES)),
      }
    })
  }

  if (out.recommendations === undefined || out.recommendations === null) out.recommendations = []
  if (Array.isArray(out.recommendations)) {
    out.recommendations = (out.recommendations as unknown[]).map(entry =>
      isPlainRecord(entry)
        ? {
            audience: String(entry.audience ?? entry.role ?? 'general'),
            recommendation: String(entry.recommendation ?? entry.text ?? entry.advice ?? ''),
          }
        : { audience: 'general', recommendation: looseText(entry) },
    )
  }

  if (!out.key_findings) out.key_findings = []
  else if (typeof out.key_findings === 'string') out.key_findings = [out.key_findings]
  if (Array.isArray(out.key_findings)) {
    out.key_findings = (out.key_findings as unknown[]).map(finding => looseText(finding))
  }

  return out
}
|
|
217
|
+
|
|
97
218
|
// ── Comparative Judge ─────────────────────────────────────────────────────
|
|
98
219
|
|
|
99
220
|
export async function runComparativeJudge(opts: {
|
|
@@ -112,28 +233,24 @@ export async function runComparativeJudge(opts: {
|
|
|
112
233
|
|
|
113
234
|
for (let attempt = 0; attempt <= 2; attempt++) {
|
|
114
235
|
try {
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
|
|
127
|
-
const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
|
|
128
|
-
parsed = JSON.parse(jsonStr)
|
|
129
|
-
}
|
|
236
|
+
// Use spawn directly — prompt already includes Zod schema, invokeTool's
|
|
237
|
+
// redundant JSON Schema wrapper confuses the LLM.
|
|
238
|
+
const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
|
|
239
|
+
raw = result.stdout
|
|
240
|
+
const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
|
|
241
|
+
const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
|
|
242
|
+
if (!jsonStr) throw new Error('Empty LLM output')
|
|
243
|
+
parsed = JSON.parse(jsonStr)
|
|
244
|
+
|
|
245
|
+
// Normalize LLM output before Zod validation
|
|
246
|
+
const normalizedParsed = normalizeComparativeOutput(parsed as Record<string, unknown>)
|
|
130
247
|
|
|
131
248
|
// Validate LLM output through Zod
|
|
132
249
|
const llmResult = ComparativeReport.pick({
|
|
133
250
|
score_matrix: true,
|
|
134
251
|
key_findings: true,
|
|
135
252
|
recommendations: true,
|
|
136
|
-
}).parse(
|
|
253
|
+
}).parse(normalizedParsed)
|
|
137
254
|
|
|
138
255
|
// Success — proceed to Pareto computation
|
|
139
256
|
const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
|
|
@@ -166,15 +283,37 @@ export async function runComparativeJudge(opts: {
|
|
|
166
283
|
}
|
|
167
284
|
}
|
|
168
285
|
|
|
169
|
-
// All retries exhausted:
|
|
170
|
-
const
|
|
286
|
+
// All retries exhausted: build fallback report from per-cell verdicts
|
|
287
|
+
const scoreMatrix: typeof ScoreCell._output[] = []
|
|
288
|
+
for (const v of verdicts) {
|
|
289
|
+
const jv = v.verdict as Record<string, unknown> | null
|
|
290
|
+
const criteria = (Array.isArray(jv?.criteria) ? jv!.criteria : []) as { name?: string; passed?: boolean; note?: string }[]
|
|
291
|
+
for (const c of criteria) {
|
|
292
|
+
scoreMatrix.push(ScoreCell.parse({
|
|
293
|
+
participant_id: v.participantId,
|
|
294
|
+
criterion: c.name ?? 'unknown',
|
|
295
|
+
weight: 1 / (manifest.criteria.length || 1),
|
|
296
|
+
score: c.passed ? 5 : 1,
|
|
297
|
+
rationale: c.note ?? (c.passed ? 'PASS' : 'FAIL'),
|
|
298
|
+
}))
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
const participantScores = manifest.participants.map(p => {
|
|
302
|
+
const pScores: Record<string, number> = {}
|
|
303
|
+
for (const cell of scoreMatrix) {
|
|
304
|
+
if (cell.participant_id === p.id) pScores[cell.criterion] = cell.score
|
|
305
|
+
}
|
|
306
|
+
return { participant_id: p.id, scores: pScores }
|
|
307
|
+
})
|
|
308
|
+
const pareto = computePareto(participantScores)
|
|
309
|
+
|
|
310
|
+
return ComparativeReport.parse({
|
|
171
311
|
arena_id: manifest.id,
|
|
172
312
|
generated_at: new Date().toISOString(),
|
|
173
|
-
score_matrix:
|
|
313
|
+
score_matrix: scoreMatrix,
|
|
174
314
|
weighted_totals: {},
|
|
175
|
-
pareto
|
|
176
|
-
key_findings: [`Comparative judge
|
|
315
|
+
pareto,
|
|
316
|
+
key_findings: [`Comparative judge unavailable; scores derived from per-cell verdicts. Last error: ${lastError}`],
|
|
177
317
|
recommendations: [],
|
|
178
|
-
}
|
|
179
|
-
return empty
|
|
318
|
+
})
|
|
180
319
|
}
|
package/src/runner.ts
CHANGED
|
@@ -103,12 +103,21 @@ export async function runArenaFromToml(opts: {
|
|
|
103
103
|
try {
|
|
104
104
|
const agent = useAgent(resolvePlayer(cell.player))
|
|
105
105
|
const result = await runAgentScenario({
|
|
106
|
-
scenarioPath:
|
|
106
|
+
scenarioPath: taskAbs,
|
|
107
107
|
agent,
|
|
108
|
-
setupWorkdir(_scenario: AgentScenario, workdir: string) {
|
|
108
|
+
async setupWorkdir(_scenario: AgentScenario, workdir: string) {
|
|
109
109
|
mkdirSync(workdir, { recursive: true })
|
|
110
|
-
const deckContent = readFileSync(
|
|
110
|
+
const deckContent = readFileSync(cell.deck, 'utf-8')
|
|
111
111
|
writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
|
|
112
|
+
|
|
113
|
+
// Link skills into .claude/skills/ so claude -p can discover them
|
|
114
|
+
const deckCli = resolve(import.meta.dir, '..', '..', 'lythoskill-deck', 'src', 'cli.ts')
|
|
115
|
+
const linkProc = Bun.spawn(['bun', 'run', deckCli, 'link'], {
|
|
116
|
+
cwd: workdir,
|
|
117
|
+
env: { ...process.env, HOME: process.env.HOME },
|
|
118
|
+
})
|
|
119
|
+
await linkProc.exited
|
|
120
|
+
log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
|
|
112
121
|
},
|
|
113
122
|
baseDir: join(artifactsDir, 'runs', cell.side),
|
|
114
123
|
})
|