@lythos/skill-arena 0.9.6 → 0.9.8

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.6",
3
+ "version": "0.9.8",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
package/src/arena-toml.ts CHANGED
@@ -27,6 +27,9 @@ export const ArenaToml = z.object({
27
27
  criteria: z.array(z.string()).min(1),
28
28
  runs_per_side: z.number().int().positive().default(1),
29
29
  max_participants: z.number().int().min(2).max(5).default(5),
30
+ model: z.string().optional(), // e.g. "claude-sonnet-4-6"
31
+ endpoint: z.string().optional(), // e.g. "api.anthropic.com"
32
+ notes: z.string().optional(), // freeform reproducibility notes
30
33
  }),
31
34
  side: z.array(Side).min(2).max(5),
32
35
  })
package/src/cli.ts CHANGED
@@ -593,11 +593,11 @@ async function runProgrammaticArena(argv: string[]) {
593
593
  const configPath = (options as Record<string, string | undefined>).config!
594
594
 
595
595
  const toml = parseArenaToml(readFileSync(configPath, 'utf-8'))
596
+ const { dirname } = await import('node:path')
596
597
  const result = await runArenaFromToml({
597
598
  toml,
598
- taskPath: toml.arena.task.startsWith('/') || toml.arena.task.startsWith('./')
599
- ? toml.arena.task
600
- : (options as Record<string, string | undefined>).task ?? toml.arena.task,
599
+ taskPath: toml.arena.task,
600
+ configDir: dirname(configPath), // resolve relative paths against config file dir
601
601
  outDir: (options as Record<string, string | undefined>).out,
602
602
  dryRun,
603
603
  })
@@ -106,61 +106,75 @@ export async function runComparativeJudge(opts: {
106
106
 
107
107
  const prompt = buildComparativePrompt({ manifest, verdicts })
108
108
 
109
- let raw: string
109
+ let raw = ''
110
110
  let parsed: unknown
111
-
112
- if (judge.invokeTool) {
113
- parsed = await judge.invokeTool({
114
- tool: SCORE_TOOL,
115
- prompt,
116
- cwd: workdir,
117
- timeoutMs: 120000,
118
- })
119
- raw = JSON.stringify(parsed)
120
- } else {
121
- const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
122
- raw = result.stdout
123
- const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
124
- const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
125
- parsed = JSON.parse(jsonStr)
126
- }
127
-
128
- // Validate LLM output
129
- const llmResult = ComparativeReport.pick({
130
- score_matrix: true,
131
- key_findings: true,
132
- recommendations: true,
133
- }).parse(parsed)
134
-
135
- const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
136
-
137
- // Pareto: deterministic, never delegated to LLM
138
- const participantScores = manifest.participants.map(p => {
139
- const pScores: Record<string, number> = {}
140
- for (const cell of scoreMatrix) {
141
- if (cell.participant_id === p.id) {
142
- pScores[cell.criterion] = cell.score
111
+ let lastError: string | undefined
112
+
113
+ for (let attempt = 0; attempt <= 2; attempt++) {
114
+ try {
115
+ if (judge.invokeTool) {
116
+ parsed = await judge.invokeTool({
117
+ tool: SCORE_TOOL,
118
+ prompt,
119
+ cwd: workdir,
120
+ timeoutMs: 120000,
121
+ })
122
+ raw = JSON.stringify(parsed)
123
+ } else {
124
+ const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
125
+ raw = result.stdout
126
+ const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
127
+ const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
128
+ parsed = JSON.parse(jsonStr)
143
129
  }
144
- }
145
- return { participant_id: p.id, scores: pScores }
146
- })
147
130
 
148
- const pareto = computePareto(participantScores)
131
+ // Validate LLM output through Zod
132
+ const llmResult = ComparativeReport.pick({
133
+ score_matrix: true,
134
+ key_findings: true,
135
+ recommendations: true,
136
+ }).parse(parsed)
137
+
138
+ // Success — proceed to Pareto computation
139
+ const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)
140
+ const participantScores = manifest.participants.map(p => {
141
+ const pScores: Record<string, number> = {}
142
+ for (const cell of scoreMatrix) {
143
+ if (cell.participant_id === p.id) pScores[cell.criterion] = cell.score
144
+ }
145
+ return { participant_id: p.id, scores: pScores }
146
+ })
147
+ const pareto = computePareto(participantScores)
148
+ const weightedTotals: Record<string, number> = {}
149
+ for (const p of manifest.participants) {
150
+ const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
151
+ weightedTotals[p.id] = pCells.reduce((sum, c) => sum + c.score * c.weight, 0) / (pCells.length || 1)
152
+ }
149
153
 
150
- // Weighted totals (equal weight by default)
151
- const weightedTotals: Record<string, number> = {}
152
- for (const p of manifest.participants) {
153
- const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
154
- weightedTotals[p.id] = pCells.reduce((sum, c) => sum + c.score * c.weight, 0) / (pCells.length || 1)
154
+ return ComparativeReport.parse({
155
+ arena_id: manifest.id,
156
+ generated_at: new Date().toISOString(),
157
+ score_matrix: scoreMatrix,
158
+ weighted_totals: weightedTotals,
159
+ pareto,
160
+ key_findings: llmResult.key_findings ?? [],
161
+ recommendations: llmResult.recommendations ?? [],
162
+ })
163
+ } catch (e) {
164
+ lastError = e instanceof Error ? e.message : String(e)
165
+ if (attempt < 2) continue // retry
166
+ }
155
167
  }
156
168
 
157
- return ComparativeReport.parse({
169
+ // All retries exhausted: return fallback report
170
+ const empty: typeof ComparativeReport._output = {
158
171
  arena_id: manifest.id,
159
172
  generated_at: new Date().toISOString(),
160
- score_matrix: scoreMatrix,
161
- weighted_totals: weightedTotals,
162
- pareto,
163
- key_findings: llmResult.key_findings ?? [],
164
- recommendations: llmResult.recommendations ?? [],
165
- })
173
+ score_matrix: [],
174
+ weighted_totals: {},
175
+ pareto: [],
176
+ key_findings: [`Comparative judge failed after 3 attempts: ${lastError}`],
177
+ recommendations: [],
178
+ }
179
+ return empty
166
180
  }
package/src/runner.ts CHANGED
@@ -43,10 +43,23 @@ export async function runArenaFromToml(opts: {
43
43
  outDir?: string
44
44
  dryRun?: boolean
45
45
  log?: (msg: string) => void
46
+ configDir?: string // for resolving relative paths
46
47
  }): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
47
- const { toml, taskPath, outDir, dryRun, log } = opts
48
+ const { toml, taskPath, outDir, dryRun, log, configDir } = opts
48
49
 
49
- const plan = buildExecutionPlan(toml)
50
+ // Resolve relative paths against config dir (anti-footgun: cwd may differ)
51
+ const resolvePath = (p: string) => {
52
+ if (p.startsWith('/')) return p
53
+ if (configDir) return resolve(configDir, p)
54
+ return resolve(p)
55
+ }
56
+ const taskAbs = resolvePath(taskPath)
57
+ const resolvedToml: ArenaToml = {
58
+ ...toml,
59
+ side: toml.side.map(s => ({ ...s, deck: resolvePath(s.deck) })),
60
+ }
61
+
62
+ const plan = buildExecutionPlan(resolvedToml)
50
63
 
51
64
  // dry-run: return plan without executing
52
65
  if (dryRun) {
@@ -58,13 +71,13 @@ export async function runArenaFromToml(opts: {
58
71
 
59
72
  const arenaId = `arena-${stamp()}`
60
73
  const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)
61
- const resolved = resolveSides(toml)
74
+ const resolved = resolveSides(resolvedToml)
62
75
 
63
76
  // Build manifest
64
77
  const manifest = ArenaManifest.parse({
65
78
  id: arenaId,
66
79
  created_at: new Date().toISOString(),
67
- task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
80
+ task: readFileSync(taskAbs, 'utf-8').slice(0, 200),
68
81
  mode: 'decks',
69
82
  participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
70
83
  id: r.side.name,
@@ -73,7 +86,7 @@ export async function runArenaFromToml(opts: {
73
86
  deck: r.side.deck,
74
87
  description: `${r.playerName} × ${r.side.deck}`,
75
88
  })),
76
- criteria: toml.arena.criteria,
89
+ criteria: resolvedToml.arena.criteria,
77
90
  status: 'running',
78
91
  })
79
92