@lythos/skill-arena 0.9.12 → 0.9.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.12",
3
+ "version": "0.9.13",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -1,4 +1,3 @@
1
- import { zodToJsonSchema } from 'zod-to-json-schema'
2
1
  import { ComparativeReport, ScoreCell, ParetoEntry } from '@lythos/test-utils/schema'
3
2
  import type { AgentAdapter } from '@lythos/test-utils/agents'
4
3
  import type { ArenaManifest } from '@lythos/test-utils/schema'
@@ -103,12 +102,6 @@ score: 1=poor, 3=acceptable, 5=excellent.
103
102
  Use the submit_scores tool to return your structured evaluation.`
104
103
  }
105
104
 
106
- const SCORE_TOOL = {
107
- name: 'submit_scores',
108
- description: 'Submit per-participant scores for each criterion with rationales',
109
- input_schema: zodToJsonSchema(ComparativeReport.pick({ score_matrix: true, key_findings: true, recommendations: true })) as Record<string, unknown>,
110
- }
111
-
112
105
  function toScoreMatrix(
113
106
  manifest: ArenaManifest,
114
107
  scores: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]
@@ -240,21 +233,14 @@ export async function runComparativeJudge(opts: {
240
233
 
241
234
  for (let attempt = 0; attempt <= 2; attempt++) {
242
235
  try {
243
- if (judge.invokeTool) {
244
- parsed = await judge.invokeTool({
245
- tool: SCORE_TOOL,
246
- prompt,
247
- cwd: workdir,
248
- timeoutMs: 120000,
249
- })
250
- raw = JSON.stringify(parsed)
251
- } else {
252
- const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
253
- raw = result.stdout
254
- const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
255
- const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
256
- parsed = JSON.parse(jsonStr)
257
- }
236
+ // Use spawn directly — prompt already includes Zod schema, invokeTool's
237
+ // redundant JSON Schema wrapper confuses the LLM.
238
+ const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
239
+ raw = result.stdout
240
+ const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
241
+ const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
242
+ if (!jsonStr) throw new Error('Empty LLM output')
243
+ parsed = JSON.parse(jsonStr)
258
244
 
259
245
  // Normalize LLM output before Zod validation
260
246
  const normalizedParsed = normalizeComparativeOutput(parsed as Record<string, unknown>)
@@ -297,15 +283,37 @@ export async function runComparativeJudge(opts: {
297
283
  }
298
284
  }
299
285
 
300
- // All retries exhausted: return fallback report
301
- const empty: typeof ComparativeReport._output = {
286
+ // All retries exhausted: build fallback report from per-cell verdicts
287
+ const scoreMatrix: typeof ScoreCell._output[] = []
288
+ for (const v of verdicts) {
289
+ const jv = v.verdict as Record<string, unknown> | null
290
+ const criteria = (Array.isArray(jv?.criteria) ? jv!.criteria : []) as { name?: string; passed?: boolean; note?: string }[]
291
+ for (const c of criteria) {
292
+ scoreMatrix.push(ScoreCell.parse({
293
+ participant_id: v.participantId,
294
+ criterion: c.name ?? 'unknown',
295
+ weight: 1 / (manifest.criteria.length || 1),
296
+ score: c.passed ? 5 : 1,
297
+ rationale: c.note ?? (c.passed ? 'PASS' : 'FAIL'),
298
+ }))
299
+ }
300
+ }
301
+ const participantScores = manifest.participants.map(p => {
302
+ const pScores: Record<string, number> = {}
303
+ for (const cell of scoreMatrix) {
304
+ if (cell.participant_id === p.id) pScores[cell.criterion] = cell.score
305
+ }
306
+ return { participant_id: p.id, scores: pScores }
307
+ })
308
+ const pareto = computePareto(participantScores)
309
+
310
+ return ComparativeReport.parse({
302
311
  arena_id: manifest.id,
303
312
  generated_at: new Date().toISOString(),
304
- score_matrix: [],
313
+ score_matrix: scoreMatrix,
305
314
  weighted_totals: {},
306
- pareto: [],
307
- key_findings: [`Comparative judge failed after 3 attempts: ${lastError}`],
315
+ pareto,
316
+ key_findings: [`Comparative judge unavailable; scores derived from per-cell verdicts. Last error: ${lastError}`],
308
317
  recommendations: [],
309
- }
310
- return empty
318
+ })
311
319
  }
package/src/runner.ts CHANGED
@@ -105,10 +105,19 @@ export async function runArenaFromToml(opts: {
105
105
  const result = await runAgentScenario({
106
106
  scenarioPath: taskAbs,
107
107
  agent,
108
- setupWorkdir(_scenario: AgentScenario, workdir: string) {
108
+ async setupWorkdir(_scenario: AgentScenario, workdir: string) {
109
109
  mkdirSync(workdir, { recursive: true })
110
110
  const deckContent = readFileSync(cell.deck, 'utf-8')
111
111
  writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
112
+
113
+ // Link skills into .claude/skills/ so claude -p can discover them
114
+ const deckCli = resolve(import.meta.dir, '..', '..', 'lythoskill-deck', 'src', 'cli.ts')
115
+ const linkProc = Bun.spawn(['bun', 'run', deckCli, 'link'], {
116
+ cwd: workdir,
117
+ env: { ...process.env, HOME: process.env.HOME },
118
+ })
119
+ await linkProc.exited
120
+ log?.(`[arena] deck link for ${cell.side}: exit ${linkProc.exitCode}`)
112
121
  },
113
122
  baseDir: join(artifactsDir, 'runs', cell.side),
114
123
  })