@lythos/skill-arena 0.9.7 → 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.7",
3
+ "version": "0.9.9",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
package/src/arena-toml.ts CHANGED
@@ -27,6 +27,9 @@ export const ArenaToml = z.object({
27
27
  criteria: z.array(z.string()).min(1),
28
28
  runs_per_side: z.number().int().positive().default(1),
29
29
  max_participants: z.number().int().min(2).max(5).default(5),
30
+ model: z.string().optional(), // e.g. "claude-sonnet-4-6"
31
+ endpoint: z.string().optional(), // e.g. "api.anthropic.com"
32
+ notes: z.string().optional(), // freeform reproducibility notes
30
33
  }),
31
34
  side: z.array(Side).min(2).max(5),
32
35
  })
@@ -106,61 +106,75 @@ export async function runComparativeJudge(opts: {
106
106
 
107
107
  const prompt = buildComparativePrompt({ manifest, verdicts })
108
108
 
109
- let raw: string
109
+ let raw = ''
110
110
  let parsed: unknown
111
-
112
- if (judge.invokeTool) {
113
- parsed = await judge.invokeTool({
114
- tool: SCORE_TOOL,
115
- prompt,
116
- cwd: workdir,
117
- timeoutMs: 120000,
118
- })
119
- raw = JSON.stringify(parsed)
120
- } else {
121
- const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
122
- raw = result.stdout
123
- const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
124
- const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
125
- parsed = JSON.parse(jsonStr)
126
- }
127
-
128
- // Validate LLM output
129
- const llmResult = ComparativeReport.pick({
130
- score_matrix: true,
131
- key_findings: true,
132
- recommendations: true,
133
- }).parse(parsed)
134
-
135
- const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
136
-
137
- // Pareto: deterministic, never delegated to LLM
138
- const participantScores = manifest.participants.map(p => {
139
- const pScores: Record<string, number> = {}
140
- for (const cell of scoreMatrix) {
141
- if (cell.participant_id === p.id) {
142
- pScores[cell.criterion] = cell.score
111
+ let lastError: string | undefined
112
+
113
+ for (let attempt = 0; attempt <= 2; attempt++) {
114
+ try {
115
+ if (judge.invokeTool) {
116
+ parsed = await judge.invokeTool({
117
+ tool: SCORE_TOOL,
118
+ prompt,
119
+ cwd: workdir,
120
+ timeoutMs: 120000,
121
+ })
122
+ raw = JSON.stringify(parsed)
123
+ } else {
124
+ const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
125
+ raw = result.stdout
126
+ const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
127
+ const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
128
+ parsed = JSON.parse(jsonStr)
143
129
  }
144
- }
145
- return { participant_id: p.id, scores: pScores }
146
- })
147
130
 
148
- const pareto = computePareto(participantScores)
131
+ // Validate LLM output through Zod
132
+ const llmResult = ComparativeReport.pick({
133
+ score_matrix: true,
134
+ key_findings: true,
135
+ recommendations: true,
136
+ }).parse(parsed)
137
+
138
+ // Success — proceed to Pareto computation
139
+ const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
140
+ const participantScores = manifest.participants.map(p => {
141
+ const pScores: Record<string, number> = {}
142
+ for (const cell of scoreMatrix) {
143
+ if (cell.participant_id === p.id) pScores[cell.criterion] = cell.score
144
+ }
145
+ return { participant_id: p.id, scores: pScores }
146
+ })
147
+ const pareto = computePareto(participantScores)
148
+ const weightedTotals: Record<string, number> = {}
149
+ for (const p of manifest.participants) {
150
+ const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
151
+ weightedTotals[p.id] = pCells.reduce((sum, c) => sum + c.score * c.weight, 0) / (pCells.length || 1)
152
+ }
149
153
 
150
- // Weighted totals (equal weight by default)
151
- const weightedTotals: Record<string, number> = {}
152
- for (const p of manifest.participants) {
153
- const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
154
- weightedTotals[p.id] = pCells.reduce((sum, c) => sum + c.score * c.weight, 0) / (pCells.length || 1)
154
+ return ComparativeReport.parse({
155
+ arena_id: manifest.id,
156
+ generated_at: new Date().toISOString(),
157
+ score_matrix: scoreMatrix,
158
+ weighted_totals: weightedTotals,
159
+ pareto,
160
+ key_findings: llmResult.key_findings ?? [],
161
+ recommendations: llmResult.recommendations ?? [],
162
+ })
163
+ } catch (e) {
164
+ lastError = e instanceof Error ? e.message : String(e)
165
+ if (attempt < 2) continue // retry
166
+ }
155
167
  }
156
168
 
157
- return ComparativeReport.parse({
169
+ // All retries exhausted: return fallback report
170
+ const empty: typeof ComparativeReport._output = {
158
171
  arena_id: manifest.id,
159
172
  generated_at: new Date().toISOString(),
160
- score_matrix: scoreMatrix,
161
- weighted_totals: weightedTotals,
162
- pareto,
163
- key_findings: llmResult.key_findings ?? [],
164
- recommendations: llmResult.recommendations ?? [],
165
- })
173
+ score_matrix: [],
174
+ weighted_totals: {},
175
+ pareto: [],
176
+ key_findings: [`Comparative judge failed after 3 attempts: ${lastError}`],
177
+ recommendations: [],
178
+ }
179
+ return empty
166
180
  }
package/src/runner.ts CHANGED
@@ -119,6 +119,14 @@ export async function runArenaFromToml(opts: {
119
119
  criteria: [],
120
120
  }) as JudgeVerdict
121
121
 
122
+ // Persist per-cell verdict + agent output for auditability
123
+ writeFileSync(join(cellDir, 'judge-verdict.json'), JSON.stringify({
124
+ ...v,
125
+ agent_stdout: result.agentResult.stdout.slice(0, 5000),
126
+ agent_stderr: result.agentResult.stderr.slice(0, 1000),
127
+ duration_ms: result.agentResult.durationMs,
128
+ }, null, 2) + '\n')
129
+
122
130
  if (!verdictsBySide.has(cell.side)) verdictsBySide.set(cell.side, [])
123
131
  verdictsBySide.get(cell.side)!.push(v)
124
132
  } catch (e) {