@lythos/skill-arena 0.9.12 → 0.9.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/comparative-judge.ts +37 -29
- package/src/runner.ts +10 -1
package/package.json
CHANGED
package/src/comparative-judge.ts
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { zodToJsonSchema } from 'zod-to-json-schema'
|
|
2
1
|
import { ComparativeReport, ScoreCell, ParetoEntry } from '@lythos/test-utils/schema'
|
|
3
2
|
import type { AgentAdapter } from '@lythos/test-utils/agents'
|
|
4
3
|
import type { ArenaManifest } from '@lythos/test-utils/schema'
|
|
@@ -103,12 +102,6 @@ score: 1=poor, 3=acceptable, 5=excellent.
|
|
|
103
102
|
Use the submit_scores tool to return your structured evaluation.`
|
|
104
103
|
}
|
|
105
104
|
|
|
106
|
-
const SCORE_TOOL = {
|
|
107
|
-
name: 'submit_scores',
|
|
108
|
-
description: 'Submit per-participant scores for each criterion with rationales',
|
|
109
|
-
input_schema: zodToJsonSchema(ComparativeReport.pick({ score_matrix: true, key_findings: true, recommendations: true })) as Record<string, unknown>,
|
|
110
|
-
}
|
|
111
|
-
|
|
112
105
|
function toScoreMatrix(
|
|
113
106
|
manifest: ArenaManifest,
|
|
114
107
|
scores: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]
|
|
@@ -240,21 +233,14 @@ export async function runComparativeJudge(opts: {
|
|
|
240
233
|
|
|
241
234
|
for (let attempt = 0; attempt <= 2; attempt++) {
|
|
242
235
|
try {
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
} else {
|
|
252
|
-
const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
|
|
253
|
-
raw = result.stdout
|
|
254
|
-
const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
|
|
255
|
-
const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
|
|
256
|
-
parsed = JSON.parse(jsonStr)
|
|
257
|
-
}
|
|
236
|
+
// Use spawn directly — prompt already includes Zod schema, invokeTool's
|
|
237
|
+
// redundant JSON Schema wrapper confuses the LLM.
|
|
238
|
+
const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
|
|
239
|
+
raw = result.stdout
|
|
240
|
+
const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
|
|
241
|
+
const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
|
|
242
|
+
if (!jsonStr) throw new Error('Empty LLM output')
|
|
243
|
+
parsed = JSON.parse(jsonStr)
|
|
258
244
|
|
|
259
245
|
// Normalize LLM output before Zod validation
|
|
260
246
|
const normalizedParsed = normalizeComparativeOutput(parsed as Record<string, unknown>)
|
|
@@ -297,15 +283,37 @@ export async function runComparativeJudge(opts: {
|
|
|
297
283
|
}
|
|
298
284
|
}
|
|
299
285
|
|
|
300
|
-
// All retries exhausted:
|
|
301
|
-
const
|
|
286
|
+
// All retries exhausted: build fallback report from per-cell verdicts
|
|
287
|
+
const scoreMatrix: typeof ScoreCell._output[] = []
|
|
288
|
+
for (const v of verdicts) {
|
|
289
|
+
const jv = v.verdict as Record<string, unknown> | null
|
|
290
|
+
const criteria = (Array.isArray(jv?.criteria) ? jv!.criteria : []) as { name?: string; passed?: boolean; note?: string }[]
|
|
291
|
+
for (const c of criteria) {
|
|
292
|
+
scoreMatrix.push(ScoreCell.parse({
|
|
293
|
+
participant_id: v.participantId,
|
|
294
|
+
criterion: c.name ?? 'unknown',
|
|
295
|
+
weight: 1 / (manifest.criteria.length || 1),
|
|
296
|
+
score: c.passed ? 5 : 1,
|
|
297
|
+
rationale: c.note ?? (c.passed ? 'PASS' : 'FAIL'),
|
|
298
|
+
}))
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
const participantScores = manifest.participants.map(p => {
|
|
302
|
+
const pScores: Record<string, number> = {}
|
|
303
|
+
for (const cell of scoreMatrix) {
|
|
304
|
+
if (cell.participant_id === p.id) pScores[cell.criterion] = cell.score
|
|
305
|
+
}
|
|
306
|
+
return { participant_id: p.id, scores: pScores }
|
|
307
|
+
})
|
|
308
|
+
const pareto = computePareto(participantScores)
|
|
309
|
+
|
|
310
|
+
return ComparativeReport.parse({
|
|
302
311
|
arena_id: manifest.id,
|
|
303
312
|
generated_at: new Date().toISOString(),
|
|
304
|
-
score_matrix:
|
|
313
|
+
score_matrix: scoreMatrix,
|
|
305
314
|
weighted_totals: {},
|
|
306
|
-
pareto
|
|
307
|
-
key_findings: [`Comparative judge
|
|
315
|
+
pareto,
|
|
316
|
+
key_findings: [`Comparative judge unavailable; scores derived from per-cell verdicts. Last error: ${lastError}`],
|
|
308
317
|
recommendations: [],
|
|
309
|
-
}
|
|
310
|
-
return empty
|
|
318
|
+
})
|
|
311
319
|
}
|
package/src/runner.ts
CHANGED
|
@@ -105,10 +105,19 @@ export async function runArenaFromToml(opts: {
|
|
|
105
105
|
const result = await runAgentScenario({
|
|
106
106
|
scenarioPath: taskAbs,
|
|
107
107
|
agent,
|
|
108
|
-
setupWorkdir(_scenario: AgentScenario, workdir: string) {
|
|
108
|
+
async setupWorkdir(_scenario: AgentScenario, workdir: string) {
|
|
109
109
|
mkdirSync(workdir, { recursive: true })
|
|
110
110
|
const deckContent = readFileSync(cell.deck, 'utf-8')
|
|
111
111
|
writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
|
|
112
|
+
|
|
113
|
+
// Link skills into .claude/skills/ so claude -p can discover them
|
|
114
|
+
const deckCli = resolve(import.meta.dir, '..', '..', 'lythoskill-deck', 'src', 'cli.ts')
|
|
115
|
+
const linkProc = Bun.spawn(['bun', 'run', deckCli, 'link'], {
|
|
116
|
+
cwd: workdir,
|
|
117
|
+
env: { ...process.env, HOME: process.env.HOME },
|
|
118
|
+
})
|
|
119
|
+
await linkProc.exited
|
|
120
|
+
log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
|
|
112
121
|
},
|
|
113
122
|
baseDir: join(artifactsDir, 'runs', cell.side),
|
|
114
123
|
})
|