@lythos/skill-arena 0.9.11 → 0.9.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.11",
3
+ "version": "0.9.12",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -78,6 +78,28 @@ ${criteriaDesc}
78
78
  For each participant, score them 1-5 on each criterion. Provide a brief rationale.
79
79
  Score meanings: 1=poor, 3=acceptable, 5=excellent.
80
80
 
81
+ ## Output Schema
82
+ Your response must conform to this Zod schema:
83
+ \`\`\`ts
84
+ z.object({
85
+ score_matrix: z.array(z.object({
86
+ participant_id: z.string(),
87
+ criterion: z.string(),
88
+ weight: z.number().min(0).max(1),
89
+ score: z.number().int().min(1).max(5),
90
+ rationale: z.string(),
91
+ })),
92
+ key_findings: z.array(z.string()),
93
+ recommendations: z.array(z.object({
94
+ audience: z.string(),
95
+ recommendation: z.string(),
96
+ })),
97
+ })
98
+ \`\`\`
99
+ score_matrix is a FLAT ARRAY of objects — NOT nested by participant or criterion.
100
+ weight: 0.25 for each cell (1 / num_criteria).
101
+ score: 1=poor, 3=acceptable, 5=excellent.
102
+
81
103
  Use the submit_scores tool to return your structured evaluation.`
82
104
  }
83
105
 
@@ -94,6 +116,112 @@ function toScoreMatrix(
94
116
  return scores.map(s => ScoreCell.parse(s))
95
117
  }
96
118
 
119
+ // ── LLM Output Normalization (handle common schema mismatches) ─────────────
120
+
121
/** One `score_matrix` entry in the flat shape the judge schema expects. */
interface NormalizedScoreCell {
  participant_id: string
  criterion: string
  weight: number
  score: number
  rationale: string
}

/** Score used when the model gave no usable numeric score. */
const FALLBACK_SCORE = 3
/** Weight used only when the number of criteria cannot be determined. */
const FALLBACK_WEIGHT = 0.25
/** Rationale length cap applied to cells rebuilt from a pivot table. */
const MAX_PIVOT_RATIONALE_LENGTH = 300
/** Field names a model may use instead of `rationale` inside a cell object. */
const RATIONALE_KEYS: readonly string[] = ['rationale', 'reason', 'note', 'notes', 'explanation']
/** Suffixes that mark a sibling key as the rationale of `<criterion>`. */
const RATIONALE_SUFFIXES: readonly string[] = ['_rationale', '_reason', '_note', '_notes']
/** Top-level keys that are never participant names. */
const RESERVED_TOP_LEVEL_KEYS: readonly string[] = ['score_matrix', 'key_findings', 'recommendations']

/** True for non-null, non-array objects. */
function isPlainRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/** Converts any LLM-supplied value to text without producing "[object Object]". */
function llmValueToText(value: unknown): string {
  if (value === undefined || value === null) return ''
  if (typeof value === 'string') return value
  if (typeof value === 'object') {
    try {
      return JSON.stringify(value) ?? ''
    } catch {
      return String(value)
    }
  }
  return String(value)
}

/** Parses a finite number from a number or numeric string; `undefined` otherwise. */
function coerceLlmNumber(value: unknown): number | undefined {
  if (typeof value === 'number') return Number.isFinite(value) ? value : undefined
  if (typeof value === 'string') {
    const trimmed = value.trim()
    if (trimmed === '') return undefined
    const parsed = Number(trimmed)
    return Number.isFinite(parsed) ? parsed : undefined
  }
  return undefined
}

/** Rounds and clamps a score onto the integer 1-5 scale. */
function clampToScoreScale(score: number): number {
  return Math.max(1, Math.min(5, Math.round(score)))
}

/** Returns the first value among `keys` that is present and non-empty. */
function firstPresent(source: Record<string, unknown>, keys: readonly string[]): unknown {
  for (const key of keys) {
    const value = source[key]
    if (value !== undefined && value !== null && value !== '') return value
  }
  return undefined
}

/**
 * Flattens a pivot table `{ participant: { criterion: ... } }` into score cells.
 * A criterion value may be a number, a numeric string, a descriptive string
 * (kept as the rationale), or an object such as `{ score, rationale }`.
 * The input objects are never mutated.
 */
function pivotToScoreCells(
  source: Record<string, unknown>,
  participants: readonly string[],
): NormalizedScoreCell[] {
  const cells: NormalizedScoreCell[] = []
  for (const participant of participants) {
    const entry = source[participant]
    if (!isPlainRecord(entry)) continue
    const criteria = Object.keys(entry).filter(
      key => !RATIONALE_SUFFIXES.some(suffix => key.endsWith(suffix)),
    )
    if (criteria.length === 0) continue
    // Equal weighting across this participant's criteria (1 / num_criteria).
    const weight = 1 / criteria.length
    for (const criterion of criteria) {
      const raw = entry[criterion]
      let rationale: unknown = firstPresent(
        entry,
        RATIONALE_SUFFIXES.map(suffix => `${criterion}${suffix}`),
      )
      let score: number | undefined
      if (isPlainRecord(raw)) {
        score = coerceLlmNumber(raw.score)
        // The rationale usually lives inside the cell object itself.
        rationale = rationale ?? firstPresent(raw, RATIONALE_KEYS)
      } else {
        score = coerceLlmNumber(raw)
        // A non-numeric string in the score position is most likely the rationale.
        if (score === undefined && typeof raw === 'string' && raw.trim() !== '') {
          rationale = rationale ?? raw
        }
      }
      cells.push({
        participant_id: participant,
        criterion,
        weight,
        score: clampToScoreScale(score ?? FALLBACK_SCORE),
        rationale: llmValueToText(rationale).slice(0, MAX_PIVOT_RATIONALE_LENGTH),
      })
    }
  }
  return cells
}

/** Maps field-name variants of one flat cell onto the canonical cell shape. */
function normalizeScoreCell(cell: Record<string, unknown>, defaultWeight: number): NormalizedScoreCell {
  const participant = firstPresent(cell, ['participant_id', 'participantId', 'side'])
  const weight = coerceLlmNumber(cell.weight)
  let normalizedWeight = defaultWeight
  if (weight !== undefined) {
    // Values above 1 are assumed to be percentages. Negative or still-too-large
    // results are passed through so schema validation can reject them.
    normalizedWeight = weight > 1 ? weight / 100 : weight
  }
  return {
    participant_id: llmValueToText(participant ?? 'unknown'),
    criterion: llmValueToText(cell.criterion ?? 'unknown'),
    weight: normalizedWeight,
    score: clampToScoreScale(coerceLlmNumber(cell.score) ?? FALLBACK_SCORE),
    rationale: llmValueToText(firstPresent(cell, RATIONALE_KEYS)),
  }
}

/** Normalizes one recommendation; bare strings become a 'general' recommendation. */
function normalizeRecommendation(entry: unknown): { audience: string; recommendation: string } {
  if (!isPlainRecord(entry)) {
    return { audience: 'general', recommendation: llmValueToText(entry) }
  }
  return {
    audience: llmValueToText(entry.audience ?? entry.role ?? 'general'),
    recommendation: llmValueToText(entry.recommendation ?? entry.text ?? entry.advice ?? ''),
  }
}

/**
 * Repairs common shape mismatches in a judge LLM's JSON output so that it can
 * pass schema validation.
 *
 * Handles:
 * - pivot-table scores, either at the top level (two or more participant
 *   objects) or nested under `score_matrix`, converted to a flat cell array;
 * - cell field-name variants (`participantId`, `side`, `reason`, `notes`, ...);
 * - scores and weights given as strings, fractional or out-of-range scores,
 *   and percentage-scale weights;
 * - recommendations given as bare strings or with `role` / `text` / `advice`;
 * - `key_findings` that is missing or a single string.
 *
 * @param parsed Parsed JSON object returned by the model. Not mutated.
 * @returns A shallow copy of `parsed` with the normalized fields replaced.
 */
function normalizeComparativeOutput(parsed: Record<string, unknown>): Record<string, unknown> {
  const out: Record<string, unknown> = { ...parsed }

  if (!Array.isArray(out.score_matrix)) {
    const nested = out.score_matrix
    if (isPlainRecord(nested)) {
      // score_matrix itself was emitted as { participant: { criterion: ... } }.
      const cells = pivotToScoreCells(nested, Object.keys(nested))
      if (cells.length > 0) out.score_matrix = cells
    } else {
      const participants = Object.keys(out).filter(
        key => !RESERVED_TOP_LEVEL_KEYS.includes(key) && isPlainRecord(out[key]),
      )
      // Require two participants so a stray object key is not mistaken for a pivot table.
      if (participants.length >= 2) {
        const cells = pivotToScoreCells(out, participants)
        if (cells.length > 0) {
          out.score_matrix = cells
          for (const participant of participants) delete out[participant]
        }
      }
    }
  }

  if (Array.isArray(out.score_matrix)) {
    const rawCells: unknown[] = out.score_matrix
    const records = rawCells.map((cell): Record<string, unknown> => (isPlainRecord(cell) ? cell : {}))
    // Cells without a weight share it equally across the distinct criteria.
    const distinctCriteria = new Set(records.map(cell => llmValueToText(cell.criterion ?? 'unknown')))
    const defaultWeight = distinctCriteria.size > 0 ? 1 / distinctCriteria.size : FALLBACK_WEIGHT
    out.score_matrix = records.map(cell => normalizeScoreCell(cell, defaultWeight))
  }

  if (Array.isArray(out.recommendations)) {
    const rawRecommendations: unknown[] = out.recommendations
    out.recommendations = rawRecommendations.map(entry => normalizeRecommendation(entry))
  }

  const findings = out.key_findings
  if (!findings) {
    out.key_findings = []
  } else if (typeof findings === 'string') {
    out.key_findings = [findings]
  } else if (Array.isArray(findings)) {
    const rawFindings: unknown[] = findings
    out.key_findings = rawFindings.map(finding => llmValueToText(finding))
  }

  return out
}
224
+
97
225
  // ── Comparative Judge ─────────────────────────────────────────────────────
98
226
 
99
227
  export async function runComparativeJudge(opts: {
@@ -128,12 +256,15 @@ export async function runComparativeJudge(opts: {
128
256
  parsed = JSON.parse(jsonStr)
129
257
  }
130
258
 
259
+ // Normalize LLM output before Zod validation
260
+ const normalizedParsed = normalizeComparativeOutput(parsed as Record<string, unknown>)
261
+
131
262
  // Validate LLM output through Zod
132
263
  const llmResult = ComparativeReport.pick({
133
264
  score_matrix: true,
134
265
  key_findings: true,
135
266
  recommendations: true,
136
- }).parse(parsed)
267
+ }).parse(normalizedParsed)
137
268
 
138
269
  // Success — proceed to Pareto computation
139
270
  const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
package/src/runner.ts CHANGED
@@ -103,11 +103,11 @@ export async function runArenaFromToml(opts: {
103
103
  try {
104
104
  const agent = useAgent(resolvePlayer(cell.player))
105
105
  const result = await runAgentScenario({
106
- scenarioPath: resolve(taskPath),
106
+ scenarioPath: taskAbs,
107
107
  agent,
108
108
  setupWorkdir(_scenario: AgentScenario, workdir: string) {
109
109
  mkdirSync(workdir, { recursive: true })
110
- const deckContent = readFileSync(resolve(cell.deck), 'utf-8')
110
+ const deckContent = readFileSync(cell.deck, 'utf-8')
111
111
  writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
112
112
  },
113
113
  baseDir: join(artifactsDir, 'runs', cell.side),