@lythos/skill-arena 0.9.11 → 0.9.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.11",
3
+ "version": "0.9.13",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -1,4 +1,3 @@
1
- import { zodToJsonSchema } from 'zod-to-json-schema'
2
1
  import { ComparativeReport, ScoreCell, ParetoEntry } from '@lythos/test-utils/schema'
3
2
  import type { AgentAdapter } from '@lythos/test-utils/agents'
4
3
  import type { ArenaManifest } from '@lythos/test-utils/schema'
@@ -78,13 +77,29 @@ ${criteriaDesc}
78
77
  For each participant, score them 1-5 on each criterion. Provide a brief rationale.
79
78
  Score meanings: 1=poor, 3=acceptable, 5=excellent.
80
79
 
81
- Use the submit_scores tool to return your structured evaluation.`
82
- }
80
+ ## Output Schema
81
+ Your response must conform to this Zod schema:
82
+ \`\`\`ts
83
+ z.object({
84
+ score_matrix: z.array(z.object({
85
+ participant_id: z.string(),
86
+ criterion: z.string(),
87
+ weight: z.number().min(0).max(1),
88
+ score: z.number().int().min(1).max(5),
89
+ rationale: z.string(),
90
+ })),
91
+ key_findings: z.array(z.string()),
92
+ recommendations: z.array(z.object({
93
+ audience: z.string(),
94
+ recommendation: z.string(),
95
+ })),
96
+ })
97
+ \`\`\`
98
+ score_matrix is a FLAT ARRAY of objects — NOT nested by participant or criterion.
99
+ weight: 0.25 for each cell (1 / num_criteria).
100
+ score: 1=poor, 3=acceptable, 5=excellent.
83
101
 
84
- const SCORE_TOOL = {
85
- name: 'submit_scores',
86
- description: 'Submit per-participant scores for each criterion with rationales',
87
- input_schema: zodToJsonSchema(ComparativeReport.pick({ score_matrix: true, key_findings: true, recommendations: true })) as Record<string, unknown>,
102
+ Use the submit_scores tool to return your structured evaluation.`
88
103
  }
89
104
 
90
105
  function toScoreMatrix(
@@ -94,6 +109,112 @@ function toScoreMatrix(
94
109
  return scores.map(s => ScoreCell.parse(s))
95
110
  }
96
111
 
112
+ // ── LLM Output Normalization (handle common schema mismatches) ─────────────
113
+
114
/** One `score_matrix` entry in the flat shape the judge schema expects. */
interface NormalizedScoreCell {
  participant_id: string
  criterion: string
  weight: number
  score: number
  rationale: string
}

/** Key suffixes that mark a sibling rationale entry in pivot-table output (`clarity_reason`, ...). */
const RATIONALE_KEY_SUFFIXES: readonly string[] = ['_rationale', '_reason', '_note', '_notes']
/** Field names LLMs commonly use in place of `rationale`. */
const RATIONALE_FIELD_NAMES: readonly string[] = ['rationale', 'reason', 'note', 'notes', 'explanation']
/** Top-level keys that must never be mistaken for participant ids. */
const NON_PARTICIPANT_KEYS: readonly string[] = ['score_matrix', 'key_findings', 'recommendations']

/** True for non-null, non-array objects. */
function isPlainRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/** Returns the first value among `keys` that is neither null/undefined nor an empty string. */
function firstPresent(record: Record<string, unknown>, keys: readonly string[]): unknown {
  for (const key of keys) {
    const value = record[key]
    if (value !== undefined && value !== null && value !== '') return value
  }
  return undefined
}

/** Stringifies free-form LLM text; objects become JSON instead of "[object Object]". */
function toText(value: unknown): string {
  if (value === undefined || value === null) return ''
  if (typeof value === 'string') return value
  if (typeof value === 'object') return JSON.stringify(value) ?? String(value)
  return String(value)
}

/** Parses a finite number from a number or a non-blank numeric string; otherwise `undefined`. */
function parseFiniteNumber(raw: unknown): number | undefined {
  if (typeof raw === 'number') return Number.isFinite(raw) ? raw : undefined
  if (typeof raw === 'string' && raw.trim() !== '') {
    const n = Number(raw)
    return Number.isFinite(n) ? n : undefined
  }
  return undefined
}

/** Coerces a pivot-table score to an integer in [1, 5]; unparseable input yields the neutral score 3. */
function coercePivotScore(raw: unknown): number {
  const n = parseFiniteNumber(raw)
  return n === undefined ? 3 : Math.max(1, Math.min(5, Math.round(n)))
}

/**
 * Coerces a weight to a fraction. Values above 1 (including strings such as
 * "25" or "25%") are treated as percentages; missing, null or unparseable
 * values fall back to `fallback`.
 */
function coerceWeight(raw: unknown, fallback: number): number {
  const candidate = typeof raw === 'string' ? raw.trim().replace(/%$/, '') : raw
  const n = parseFiniteNumber(candidate)
  if (n === undefined) return fallback
  return n > 1 ? n / 100 : n
}

/**
 * Flattens one participant's pivot-table entry into score cells.
 * Supports `{ criterion: { score, rationale } }`, `{ criterion: <score>,
 * criterion_rationale: "..." }`, and a descriptive string in the score slot
 * (kept as the rationale with a neutral score). Never mutates `criteria`.
 */
function pivotToCells(
  participantId: string,
  criteria: Record<string, unknown>,
  weight: number,
): NormalizedScoreCell[] {
  return Object.keys(criteria)
    .filter(key => !RATIONALE_KEY_SUFFIXES.some(suffix => key.endsWith(suffix)))
    .map((criterion): NormalizedScoreCell => {
      const raw = criteria[criterion]
      let rationale = firstPresent(criteria, RATIONALE_KEY_SUFFIXES.map(suffix => `${criterion}${suffix}`))
      let score: number
      if (isPlainRecord(raw)) {
        score = coercePivotScore(raw.score)
        // Nested form: the rationale lives next to the score inside the object.
        if (rationale === undefined) rationale = firstPresent(raw, RATIONALE_FIELD_NAMES)
      } else {
        score = coercePivotScore(raw)
        // A non-numeric string in the score slot is most likely the rationale itself.
        if (
          rationale === undefined &&
          typeof raw === 'string' &&
          raw.trim() !== '' &&
          parseFiniteNumber(raw) === undefined
        ) {
          rationale = raw
        }
      }
      return {
        participant_id: participantId,
        criterion,
        weight,
        score,
        rationale: toText(rationale).slice(0, 300),
      }
    })
}

/**
 * Normalizes raw judge output into the shape expected by schema validation.
 *
 * - Pivot-table output (top-level `{ participant: { criterion: ... } }` with at
 *   least two participants, or the same structure nested under `score_matrix`)
 *   is flattened into a `score_matrix` array.
 * - `score_matrix` cells get field-name variants mapped (`participantId`,
 *   `side`, `reason`, `notes`, ...), string scores parsed, and weights coerced.
 *   A missing or out-of-range score is left as-is so validation still rejects it.
 * - `recommendations` entries are mapped to `{ audience, recommendation }`;
 *   bare strings are kept as the recommendation text.
 * - `key_findings` becomes an array of strings (a single string is wrapped).
 *
 * The input object and its nested values are not mutated.
 *
 * @param parsed - JSON-decoded judge output.
 * @param defaultWeight - Weight for cells that carry none (typically
 *   `1 / number_of_criteria`). Defaults to 0.25.
 * @returns A shallow copy of `parsed` with the fields above normalized.
 */
function normalizeComparativeOutput(
  parsed: Record<string, unknown>,
  defaultWeight = 0.25,
): Record<string, unknown> {
  const out: Record<string, unknown> = { ...parsed }

  // Convert pivot-table output to the flat score_matrix array.
  if (!Array.isArray(out.score_matrix)) {
    const nested = isPlainRecord(out.score_matrix) ? out.score_matrix : undefined
    const source = nested ?? out
    const participants = Object.keys(source).filter(
      key => isPlainRecord(source[key]) && (nested !== undefined || !NON_PARTICIPANT_KEYS.includes(key)),
    )
    // At top level require two participants to avoid treating a stray object
    // (e.g. metadata) as a pivot table; under score_matrix the intent is explicit.
    const minParticipants = nested !== undefined ? 1 : 2
    if (participants.length >= minParticipants) {
      const matrix = participants.flatMap(p =>
        pivotToCells(p, source[p] as Record<string, unknown>, defaultWeight),
      )
      if (matrix.length > 0) {
        if (nested === undefined) {
          for (const p of participants) delete out[p]
        }
        out.score_matrix = matrix
      }
    }
  }

  // Normalize score_matrix entries.
  const rawMatrix = out.score_matrix
  if (Array.isArray(rawMatrix)) {
    out.score_matrix = rawMatrix.map((cell: unknown): NormalizedScoreCell => {
      const c = isPlainRecord(cell) ? cell : {}
      const rawScore = c.score
      return {
        participant_id: String(
          firstPresent(c, ['participant_id', 'participantId', 'side', 'participant']) ?? 'unknown',
        ),
        criterion: String(c.criterion ?? 'unknown'),
        weight: coerceWeight(c.weight, defaultWeight),
        // Non-numeric strings fall back to 3; anything else is passed through
        // numerically so that schema validation decides.
        score: typeof rawScore === 'string' ? Number(rawScore) || 3 : Number(rawScore),
        rationale: toText(firstPresent(c, RATIONALE_FIELD_NAMES)),
      }
    })
  }

  // Normalize recommendations.
  const rawRecommendations = out.recommendations
  if (Array.isArray(rawRecommendations)) {
    out.recommendations = rawRecommendations.map((r: unknown) => {
      if (isPlainRecord(r)) {
        return {
          audience: String(r.audience ?? r.role ?? 'general'),
          recommendation: String(r.recommendation ?? r.text ?? r.advice ?? ''),
        }
      }
      // Bare string (or null) entry: keep the text rather than dropping it.
      return { audience: 'general', recommendation: toText(r) }
    })
  }

  // Ensure key_findings is an array of strings.
  if (!out.key_findings) out.key_findings = []
  else if (typeof out.key_findings === 'string') out.key_findings = [out.key_findings]
  const rawFindings = out.key_findings
  if (Array.isArray(rawFindings)) {
    out.key_findings = rawFindings.map((f: unknown) => toText(f))
  }

  return out
}
217
+
97
218
  // ── Comparative Judge ─────────────────────────────────────────────────────
98
219
 
99
220
  export async function runComparativeJudge(opts: {
@@ -112,28 +233,24 @@ export async function runComparativeJudge(opts: {
112
233
 
113
234
  for (let attempt = 0; attempt <= 2; attempt++) {
114
235
  try {
115
- if (judge.invokeTool) {
116
- parsed = await judge.invokeTool({
117
- tool: SCORE_TOOL,
118
- prompt,
119
- cwd: workdir,
120
- timeoutMs: 120000,
121
- })
122
- raw = JSON.stringify(parsed)
123
- } else {
124
- const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
125
- raw = result.stdout
126
- const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
127
- const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
128
- parsed = JSON.parse(jsonStr)
129
- }
236
+ // Use spawn directly — prompt already includes Zod schema, invokeTool's
237
+ // redundant JSON Schema wrapper confuses the LLM.
238
+ const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
239
+ raw = result.stdout
240
+ const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
241
+ const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
242
+ if (!jsonStr) throw new Error('Empty LLM output')
243
+ parsed = JSON.parse(jsonStr)
244
+
245
+ // Normalize LLM output before Zod validation
246
+ const normalizedParsed = normalizeComparativeOutput(parsed as Record<string, unknown>)
130
247
 
131
248
  // Validate LLM output through Zod
132
249
  const llmResult = ComparativeReport.pick({
133
250
  score_matrix: true,
134
251
  key_findings: true,
135
252
  recommendations: true,
136
- }).parse(parsed)
253
+ }).parse(normalizedParsed)
137
254
 
138
255
  // Success — proceed to Pareto computation
139
256
  const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
@@ -166,15 +283,37 @@ export async function runComparativeJudge(opts: {
166
283
  }
167
284
  }
168
285
 
169
- // All retries exhausted: return fallback report
170
- const empty: typeof ComparativeReport._output = {
286
+ // All retries exhausted: build fallback report from per-cell verdicts
287
+ const scoreMatrix: typeof ScoreCell._output[] = []
288
+ for (const v of verdicts) {
289
+ const jv = v.verdict as Record<string, unknown> | null
290
+ const criteria = (Array.isArray(jv?.criteria) ? jv!.criteria : []) as { name?: string; passed?: boolean; note?: string }[]
291
+ for (const c of criteria) {
292
+ scoreMatrix.push(ScoreCell.parse({
293
+ participant_id: v.participantId,
294
+ criterion: c.name ?? 'unknown',
295
+ weight: 1 / (manifest.criteria.length || 1),
296
+ score: c.passed ? 5 : 1,
297
+ rationale: c.note ?? (c.passed ? 'PASS' : 'FAIL'),
298
+ }))
299
+ }
300
+ }
301
+ const participantScores = manifest.participants.map(p => {
302
+ const pScores: Record<string, number> = {}
303
+ for (const cell of scoreMatrix) {
304
+ if (cell.participant_id === p.id) pScores[cell.criterion] = cell.score
305
+ }
306
+ return { participant_id: p.id, scores: pScores }
307
+ })
308
+ const pareto = computePareto(participantScores)
309
+
310
+ return ComparativeReport.parse({
171
311
  arena_id: manifest.id,
172
312
  generated_at: new Date().toISOString(),
173
- score_matrix: [],
313
+ score_matrix: scoreMatrix,
174
314
  weighted_totals: {},
175
- pareto: [],
176
- key_findings: [`Comparative judge failed after 3 attempts: ${lastError}`],
315
+ pareto,
316
+ key_findings: [`Comparative judge unavailable; scores derived from per-cell verdicts. Last error: ${lastError}`],
177
317
  recommendations: [],
178
- }
179
- return empty
318
+ })
180
319
  }
package/src/runner.ts CHANGED
@@ -103,12 +103,21 @@ export async function runArenaFromToml(opts: {
103
103
  try {
104
104
  const agent = useAgent(resolvePlayer(cell.player))
105
105
  const result = await runAgentScenario({
106
- scenarioPath: resolve(taskPath),
106
+ scenarioPath: taskAbs,
107
107
  agent,
108
- setupWorkdir(_scenario: AgentScenario, workdir: string) {
108
+ async setupWorkdir(_scenario: AgentScenario, workdir: string) {
109
109
  mkdirSync(workdir, { recursive: true })
110
- const deckContent = readFileSync(resolve(cell.deck), 'utf-8')
110
+ const deckContent = readFileSync(cell.deck, 'utf-8')
111
111
  writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
112
+
113
+ // Link skills into .claude/skills/ so claude -p can discover them
114
+ const deckCli = resolve(import.meta.dir, '..', '..', 'lythoskill-deck', 'src', 'cli.ts')
115
+ const linkProc = Bun.spawn(['bun', 'run', deckCli, 'link'], {
116
+ cwd: workdir,
117
+ env: { ...process.env, HOME: process.env.HOME },
118
+ })
119
+ await linkProc.exited
120
+ log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
112
121
  },
113
122
  baseDir: join(artifactsDir, 'runs', cell.side),
114
123
  })