npm - @lythos/skill-arena - Versions diffs - 0.9.6 → 0.9.8 - Mend

@lythos/skill-arena 0.9.6 → 0.9.8

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (5) hide show

package/package.json +1 -1
package/src/arena-toml.ts +3 -0
package/src/cli.ts +3 -3
package/src/comparative-judge.ts +63 -49
package/src/runner.ts +18 -5

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lythos/skill-arena",
-  "version": "0.9.6",
+  "version": "0.9.8",
   "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
   "keywords": [
     "ai-agent",

package/src/arena-toml.ts CHANGED Viewed

@@ -27,6 +27,9 @@ export const ArenaToml = z.object({
     criteria: z.array(z.string()).min(1),
     runs_per_side: z.number().int().positive().default(1),
     max_participants: z.number().int().min(2).max(5).default(5),
+    model: z.string().optional(),  // e.g. "claude-sonnet-4-6"
+    endpoint: z.string().optional(), // e.g. "api.anthropic.com"
+    notes: z.string().optional(),  // freeform reproducibility notes
   }),
   side: z.array(Side).min(2).max(5),
 })

package/src/cli.ts CHANGED Viewed

@@ -593,11 +593,11 @@ async function runProgrammaticArena(argv: string[]) {
     const configPath = (options as Record<string, string | undefined>).config!
     const toml = parseArenaToml(readFileSync(configPath, 'utf-8'))
+    const { dirname } = await import('node:path')
     const result = await runArenaFromToml({
       toml,
-      taskPath: toml.arena.task.startsWith('/') || toml.arena.task.startsWith('./')
-        ? toml.arena.task
-        : (options as Record<string, string | undefined>).task ?? toml.arena.task,
+      taskPath: toml.arena.task,
+      configDir: dirname(configPath),  // resolve relative paths against config file dir
       outDir: (options as Record<string, string | undefined>).out,
       dryRun,
     })

package/src/comparative-judge.ts CHANGED Viewed

@@ -106,61 +106,75 @@ export async function runComparativeJudge(opts: {
   const prompt = buildComparativePrompt({ manifest, verdicts })
-  let raw: string
+  let raw = ''
   let parsed: unknown
-  if (judge.invokeTool) {
-    parsed = await judge.invokeTool({
-      tool: SCORE_TOOL,
-      prompt,
-      cwd: workdir,
-      timeoutMs: 120000,
-    })
-    raw = JSON.stringify(parsed)
-  } else {
-    const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
-    raw = result.stdout
-    const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
-    const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
-    parsed = JSON.parse(jsonStr)
-  }
-  // Validate LLM output
-  const llmResult = ComparativeReport.pick({
-    score_matrix: true,
-    key_findings: true,
-    recommendations: true,
-  }).parse(parsed)
-  const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
-  // Pareto: deterministic, never delegated to LLM
-  const participantScores = manifest.participants.map(p => {
-    const pScores: Record<string, number> = {}
-    for (const cell of scoreMatrix) {
-      if (cell.participant_id === p.id) {
-        pScores[cell.criterion] = cell.score
+  let lastError: string | undefined
+  for (let attempt = 0; attempt <= 2; attempt++) {
+    try {
+      if (judge.invokeTool) {
+        parsed = await judge.invokeTool({
+          tool: SCORE_TOOL,
+          prompt,
+          cwd: workdir,
+          timeoutMs: 120000,
+        })
+        raw = JSON.stringify(parsed)
+      } else {
+        const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
+        raw = result.stdout
+        const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
+        const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
+        parsed = JSON.parse(jsonStr)
       }
-    }
-    return { participant_id: p.id, scores: pScores }
-  })
-  const pareto = computePareto(participantScores)
+      // Validate LLM output through Zod
+      const llmResult = ComparativeReport.pick({
+        score_matrix: true,
+        key_findings: true,
+        recommendations: true,
+      }).parse(parsed)
+      // Success — proceed to Pareto computation
+      const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
+      const participantScores = manifest.participants.map(p => {
+        const pScores: Record<string, number> = {}
+        for (const cell of scoreMatrix) {
+          if (cell.participant_id === p.id) pScores[cell.criterion] = cell.score
+        }
+        return { participant_id: p.id, scores: pScores }
+      })
+      const pareto = computePareto(participantScores)
+      const weightedTotals: Record<string, number> = {}
+      for (const p of manifest.participants) {
+        const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
+        weightedTotals[p.id] = pCells.reduce((sum, c) => sum + c.score * c.weight, 0) / (pCells.length || 1)
+      }
-  // Weighted totals (equal weight by default)
-  const weightedTotals: Record<string, number> = {}
-  for (const p of manifest.participants) {
-    const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
-    weightedTotals[p.id] = pCells.reduce((sum, c) => sum + c.score * c.weight, 0) / (pCells.length || 1)
+      return ComparativeReport.parse({
+        arena_id: manifest.id,
+        generated_at: new Date().toISOString(),
+        score_matrix: scoreMatrix,
+        weighted_totals: weightedTotals,
+        pareto,
+        key_findings: llmResult.key_findings ?? [],
+        recommendations: llmResult.recommendations ?? [],
+      })
+    } catch (e) {
+      lastError = e instanceof Error ? e.message : String(e)
+      if (attempt < 2) continue // retry
+    }
   }
-  return ComparativeReport.parse({
+  // All retries exhausted: return fallback report
+  const empty: typeof ComparativeReport._output = {
     arena_id: manifest.id,
     generated_at: new Date().toISOString(),
-    score_matrix: scoreMatrix,
-    weighted_totals: weightedTotals,
-    pareto,
-    key_findings: llmResult.key_findings ?? [],
-    recommendations: llmResult.recommendations ?? [],
-  })
+    score_matrix: [],
+    weighted_totals: {},
+    pareto: [],
+    key_findings: [`Comparative judge failed after 3 attempts: ${lastError}`],
+    recommendations: [],
+  }
+  return empty
 }

package/src/runner.ts CHANGED Viewed

@@ -43,10 +43,23 @@ export async function runArenaFromToml(opts: {
   outDir?: string
   dryRun?: boolean
   log?: (msg: string) => void
+  configDir?: string    // for resolving relative paths
 }): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
-  const { toml, taskPath, outDir, dryRun, log } = opts
+  const { toml, taskPath, outDir, dryRun, log, configDir } = opts
-  const plan = buildExecutionPlan(toml)
+  // Resolve relative paths against config dir (anti-footgun: cwd may differ)
+  const resolvePath = (p: string) => {
+    if (p.startsWith('/')) return p
+    if (configDir) return resolve(configDir, p)
+    return resolve(p)
+  }
+  const taskAbs = resolvePath(taskPath)
+  const resolvedToml: ArenaToml = {
+    ...toml,
+    side: toml.side.map(s => ({ ...s, deck: resolvePath(s.deck) })),
+  }
+  const plan = buildExecutionPlan(resolvedToml)
   // dry-run: return plan without executing
   if (dryRun) {
@@ -58,13 +71,13 @@ export async function runArenaFromToml(opts: {
   const arenaId = `arena-${stamp()}`
   const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)
-  const resolved = resolveSides(toml)
+  const resolved = resolveSides(resolvedToml)
   // Build manifest
   const manifest = ArenaManifest.parse({
     id: arenaId,
     created_at: new Date().toISOString(),
-    task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
+    task: readFileSync(taskAbs, 'utf-8').slice(0, 200),
     mode: 'decks',
     participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
       id: r.side.name,
@@ -73,7 +86,7 @@ export async function runArenaFromToml(opts: {
       deck: r.side.deck,
       description: `${r.playerName} × ${r.side.deck}`,
     })),
-    criteria: toml.arena.criteria,
+    criteria: resolvedToml.arena.criteria,
     status: 'running',
   })