npm - @lythos/skill-arena - Version diff - 0.9.12 → 0.9.14 - Mend

@lythos/skill-arena 0.9.12 → 0.9.14

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json +1 -1
package/src/comparative-judge.ts +37 -29
package/src/runner.ts +10 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lythos/skill-arena",
-  "version": "0.9.12",
+  "version": "0.9.14",
   "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
   "keywords": [
     "ai-agent",

package/src/comparative-judge.ts CHANGED Viewed

@@ -1,4 +1,3 @@
-import { zodToJsonSchema } from 'zod-to-json-schema'
 import { ComparativeReport, ScoreCell, ParetoEntry } from '@lythos/test-utils/schema'
 import type { AgentAdapter } from '@lythos/test-utils/agents'
 import type { ArenaManifest } from '@lythos/test-utils/schema'
@@ -103,12 +102,6 @@ score: 1=poor, 3=acceptable, 5=excellent.
 Use the submit_scores tool to return your structured evaluation.`
 }
-const SCORE_TOOL = {
-  name: 'submit_scores',
-  description: 'Submit per-participant scores for each criterion with rationales',
-  input_schema: zodToJsonSchema(ComparativeReport.pick({ score_matrix: true, key_findings: true, recommendations: true })) as Record<string, unknown>,
-}
 function toScoreMatrix(
   manifest: ArenaManifest,
   scores: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]
@@ -240,21 +233,14 @@ export async function runComparativeJudge(opts: {
   for (let attempt = 0; attempt <= 2; attempt++) {
     try {
-      if (judge.invokeTool) {
-        parsed = await judge.invokeTool({
-          tool: SCORE_TOOL,
-          prompt,
-          cwd: workdir,
-          timeoutMs: 120000,
-        })
-        raw = JSON.stringify(parsed)
-      } else {
-        const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
-        raw = result.stdout
-        const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
-        const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
-        parsed = JSON.parse(jsonStr)
-      }
+      // Use spawn directly — prompt already includes Zod schema, invokeTool's
+      // redundant JSON Schema wrapper confuses the LLM.
+      const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
+      raw = result.stdout
+      const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
+      const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
+      if (!jsonStr) throw new Error('Empty LLM output')
+      parsed = JSON.parse(jsonStr)
       // Normalize LLM output before Zod validation
       const normalizedParsed = normalizeComparativeOutput(parsed as Record<string, unknown>)
@@ -297,15 +283,37 @@ export async function runComparativeJudge(opts: {
     }
   }
-  // All retries exhausted: return fallback report
-  const empty: typeof ComparativeReport._output = {
+  // All retries exhausted: build fallback report from per-cell verdicts
+  const scoreMatrix: typeof ScoreCell._output[] = []
+  for (const v of verdicts) {
+    const jv = v.verdict as Record<string, unknown> | null
+    const criteria = (Array.isArray(jv?.criteria) ? jv!.criteria : []) as { name?: string; passed?: boolean; note?: string }[]
+    for (const c of criteria) {
+      scoreMatrix.push(ScoreCell.parse({
+        participant_id: v.participantId,
+        criterion: c.name ?? 'unknown',
+        weight: 1 / (manifest.criteria.length || 1),
+        score: c.passed ? 5 : 1,
+        rationale: c.note ?? (c.passed ? 'PASS' : 'FAIL'),
+      }))
+    }
+  }
+  const participantScores = manifest.participants.map(p => {
+    const pScores: Record<string, number> = {}
+    for (const cell of scoreMatrix) {
+      if (cell.participant_id === p.id) pScores[cell.criterion] = cell.score
+    }
+    return { participant_id: p.id, scores: pScores }
+  })
+  const pareto = computePareto(participantScores)
+  return ComparativeReport.parse({
     arena_id: manifest.id,
     generated_at: new Date().toISOString(),
-    score_matrix: [],
+    score_matrix: scoreMatrix,
     weighted_totals: {},
-    pareto: [],
-    key_findings: [`Comparative judge failed after 3 attempts: ${lastError}`],
+    pareto,
+    key_findings: [`Comparative judge unavailable; scores derived from per-cell verdicts. Last error: ${lastError}`],
     recommendations: [],
-  }
-  return empty
+  })
 }

package/src/runner.ts CHANGED Viewed

@@ -105,10 +105,19 @@ export async function runArenaFromToml(opts: {
       const result = await runAgentScenario({
         scenarioPath: taskAbs,
         agent,
-        setupWorkdir(_scenario: AgentScenario, workdir: string) {
+        async setupWorkdir(_scenario: AgentScenario, workdir: string) {
           mkdirSync(workdir, { recursive: true })
           const deckContent = readFileSync(cell.deck, 'utf-8')
           writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
+          // Link skills into .claude/skills/ so claude -p can discover them
+          const deckCli = resolve(import.meta.dir, '..', '..', 'lythoskill-deck', 'src', 'cli.ts')
+          const linkProc = Bun.spawn(['bun', 'run', deckCli, 'link'], {
+            cwd: workdir,
+            env: { ...process.env, HOME: process.env.HOME },
+          })
+          await linkProc.exited
+          log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
         },
         baseDir: join(artifactsDir, 'runs', cell.side),
       })