npm - @lythos/skill-arena - Versions diffs - 0.9.7 → 0.9.9 - Mend

@lythos/skill-arena 0.9.7 → 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/package.json +1 -1
package/src/arena-toml.ts +3 -0
package/src/comparative-judge.ts +63 -49
package/src/runner.ts +8 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lythos/skill-arena",
-  "version": "0.9.7",
+  "version": "0.9.9",
   "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
   "keywords": [
     "ai-agent",

package/src/arena-toml.ts CHANGED Viewed

@@ -27,6 +27,9 @@ export const ArenaToml = z.object({
     criteria: z.array(z.string()).min(1),
     runs_per_side: z.number().int().positive().default(1),
     max_participants: z.number().int().min(2).max(5).default(5),
+    model: z.string().optional(),  // e.g. "claude-sonnet-4-6"
+    endpoint: z.string().optional(), // e.g. "api.anthropic.com"
+    notes: z.string().optional(),  // freeform reproducibility notes
   }),
   side: z.array(Side).min(2).max(5),
 })

package/src/comparative-judge.ts CHANGED Viewed

@@ -106,61 +106,75 @@ export async function runComparativeJudge(opts: {
   const prompt = buildComparativePrompt({ manifest, verdicts })
-  let raw: string
+  let raw = ''
   let parsed: unknown
-  if (judge.invokeTool) {
-    parsed = await judge.invokeTool({
-      tool: SCORE_TOOL,
-      prompt,
-      cwd: workdir,
-      timeoutMs: 120000,
-    })
-    raw = JSON.stringify(parsed)
-  } else {
-    const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
-    raw = result.stdout
-    const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
-    const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
-    parsed = JSON.parse(jsonStr)
-  }
-  // Validate LLM output
-  const llmResult = ComparativeReport.pick({
-    score_matrix: true,
-    key_findings: true,
-    recommendations: true,
-  }).parse(parsed)
-  const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
-  // Pareto: deterministic, never delegated to LLM
-  const participantScores = manifest.participants.map(p => {
-    const pScores: Record<string, number> = {}
-    for (const cell of scoreMatrix) {
-      if (cell.participant_id === p.id) {
-        pScores[cell.criterion] = cell.score
+  let lastError: string | undefined
+  for (let attempt = 0; attempt <= 2; attempt++) {
+    try {
+      if (judge.invokeTool) {
+        parsed = await judge.invokeTool({
+          tool: SCORE_TOOL,
+          prompt,
+          cwd: workdir,
+          timeoutMs: 120000,
+        })
+        raw = JSON.stringify(parsed)
+      } else {
+        const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
+        raw = result.stdout
+        const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
+        const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
+        parsed = JSON.parse(jsonStr)
       }
-    }
-    return { participant_id: p.id, scores: pScores }
-  })
-  const pareto = computePareto(participantScores)
+      // Validate LLM output through Zod
+      const llmResult = ComparativeReport.pick({
+        score_matrix: true,
+        key_findings: true,
+        recommendations: true,
+      }).parse(parsed)
+      // Success — proceed to Pareto computation
+      const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
+      const participantScores = manifest.participants.map(p => {
+        const pScores: Record<string, number> = {}
+        for (const cell of scoreMatrix) {
+          if (cell.participant_id === p.id) pScores[cell.criterion] = cell.score
+        }
+        return { participant_id: p.id, scores: pScores }
+      })
+      const pareto = computePareto(participantScores)
+      const weightedTotals: Record<string, number> = {}
+      for (const p of manifest.participants) {
+        const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
+        weightedTotals[p.id] = pCells.reduce((sum, c) => sum + c.score * c.weight, 0) / (pCells.length || 1)
+      }
-  // Weighted totals (equal weight by default)
-  const weightedTotals: Record<string, number> = {}
-  for (const p of manifest.participants) {
-    const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
-    weightedTotals[p.id] = pCells.reduce((sum, c) => sum + c.score * c.weight, 0) / (pCells.length || 1)
+      return ComparativeReport.parse({
+        arena_id: manifest.id,
+        generated_at: new Date().toISOString(),
+        score_matrix: scoreMatrix,
+        weighted_totals: weightedTotals,
+        pareto,
+        key_findings: llmResult.key_findings ?? [],
+        recommendations: llmResult.recommendations ?? [],
+      })
+    } catch (e) {
+      lastError = e instanceof Error ? e.message : String(e)
+      if (attempt < 2) continue // retry
+    }
   }
-  return ComparativeReport.parse({
+  // All retries exhausted: return fallback report
+  const empty: typeof ComparativeReport._output = {
     arena_id: manifest.id,
     generated_at: new Date().toISOString(),
-    score_matrix: scoreMatrix,
-    weighted_totals: weightedTotals,
-    pareto,
-    key_findings: llmResult.key_findings ?? [],
-    recommendations: llmResult.recommendations ?? [],
-  })
+    score_matrix: [],
+    weighted_totals: {},
+    pareto: [],
+    key_findings: [`Comparative judge failed after 3 attempts: ${lastError}`],
+    recommendations: [],
+  }
+  return empty
 }

package/src/runner.ts CHANGED Viewed

@@ -119,6 +119,14 @@ export async function runArenaFromToml(opts: {
         criteria: [],
       }) as JudgeVerdict
+      // Persist per-cell verdict + agent output for auditability
+      writeFileSync(join(cellDir, 'judge-verdict.json'), JSON.stringify({
+        ...v,
+        agent_stdout: result.agentResult.stdout.slice(0, 5000),
+        agent_stderr: result.agentResult.stderr.slice(0, 1000),
+        duration_ms: result.agentResult.durationMs,
+      }, null, 2) + '\n')
       if (!verdictsBySide.has(cell.side)) verdictsBySide.set(cell.side, [])
       verdictsBySide.get(cell.side)!.push(v)
     } catch (e) {