@lythos/skill-arena 0.14.3 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +7 -5
- package/src/arena-toml.ts +1 -0
- package/src/comparative-judge.ts +1 -0
- package/src/player.test.ts +5 -5
- package/src/player.ts +4 -2
- package/src/runner.test.ts +91 -0
- package/src/runner.ts +2 -0
package/README.md
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lythos/skill-arena",
|
|
3
|
-
"version": "0.14.3",
|
|
3
|
+
"version": "0.14.4",
|
|
4
4
|
"description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-agent",
|
|
@@ -42,13 +42,15 @@
|
|
|
42
42
|
"bun": ">=1.0.0"
|
|
43
43
|
},
|
|
44
44
|
"dependencies": {
|
|
45
|
-
"@lythos/cold-pool": "^0.14.
|
|
46
|
-
"@lythos/infra": "^0.14.
|
|
47
|
-
"@lythos/test-utils": "^0.14.
|
|
45
|
+
"@lythos/cold-pool": "^0.14.4",
|
|
46
|
+
"@lythos/infra": "^0.14.4",
|
|
47
|
+
"@lythos/test-utils": "^0.14.4",
|
|
48
48
|
"zod": "^3.24.0",
|
|
49
49
|
"zod-to-json-schema": "^3.25.2"
|
|
50
50
|
},
|
|
51
51
|
"optionalDependencies": {
|
|
52
|
-
"@lythos/agent-adapter-claude-sdk": "^0.14.
|
|
52
|
+
"@lythos/agent-adapter-claude-sdk": "^0.14.4",
|
|
53
|
+
"@lythos/agent-adapter-deepseek-serve": "^0.14.4",
|
|
54
|
+
"@lythos/agent-adapter-codex": "^0.14.4"
|
|
53
55
|
}
|
|
54
56
|
}
|
package/src/arena-toml.ts
CHANGED
|
@@ -66,6 +66,7 @@ export interface ExecutionPlan {
|
|
|
66
66
|
total_runs: number
|
|
67
67
|
}
|
|
68
68
|
|
|
69
|
+
/** Plan builder — pure data transform. Execution: arena/runner.ts spawns subagents. */
|
|
69
70
|
export function buildExecutionPlan(toml: ArenaToml): ExecutionPlan {
|
|
70
71
|
const cells: ExecutionCell[] = []
|
|
71
72
|
for (const side of toml.side) {
|
package/src/comparative-judge.ts
CHANGED
|
@@ -53,6 +53,7 @@ export function computePareto(vectors: { participant_id: string; scores: Record<
|
|
|
53
53
|
|
|
54
54
|
// ── Comparative Judge Prompt ──────────────────────────────────────────────
|
|
55
55
|
|
|
56
|
+
/** Pure prompt builder — no IO. Execution: arena vs mode subagent spawn (cli.ts vsRun). */
|
|
56
57
|
export function buildComparativePrompt(opts: {
|
|
57
58
|
manifest: ArenaManifest
|
|
58
59
|
verdicts: { participantId: string; verdict: unknown }[]
|
package/src/player.test.ts
CHANGED
|
@@ -21,11 +21,11 @@ deck = "./decks/rich.toml"
|
|
|
21
21
|
|
|
22
22
|
describe('resolvePlayer', () => {
|
|
23
23
|
test('maps claude-code → claude', () => {
|
|
24
|
-
expect(resolvePlayer('claude-code')).toBe('claude')
|
|
24
|
+
expect(resolvePlayer('claude-code')).toBe('claude-sdk')
|
|
25
25
|
})
|
|
26
26
|
|
|
27
27
|
test('maps Claude → claude (case insensitive)', () => {
|
|
28
|
-
expect(resolvePlayer('Claude')).toBe('claude')
|
|
28
|
+
expect(resolvePlayer('Claude')).toBe('claude-sdk')
|
|
29
29
|
})
|
|
30
30
|
|
|
31
31
|
test('maps kimi → kimi', () => {
|
|
@@ -37,7 +37,7 @@ describe('resolvePlayer', () => {
|
|
|
37
37
|
})
|
|
38
38
|
|
|
39
39
|
test('trims whitespace', () => {
|
|
40
|
-
expect(resolvePlayer(' claude-code ')).toBe('claude')
|
|
40
|
+
expect(resolvePlayer(' claude-code ')).toBe('claude-sdk')
|
|
41
41
|
})
|
|
42
42
|
})
|
|
43
43
|
|
|
@@ -45,7 +45,7 @@ describe('resolveSides', () => {
|
|
|
45
45
|
test('resolves all sides in arena.toml', () => {
|
|
46
46
|
const sides = resolveSides(toml)
|
|
47
47
|
expect(sides).toHaveLength(2)
|
|
48
|
-
expect(sides[0].platform).toBe('claude')
|
|
48
|
+
expect(sides[0].platform).toBe('claude-sdk')
|
|
49
49
|
expect(sides[1].platform).toBe('expert-architect')
|
|
50
50
|
expect(sides[0].playerName).toBe('claude-code')
|
|
51
51
|
})
|
|
@@ -63,7 +63,7 @@ describe('groupBySide', () => {
|
|
|
63
63
|
expect(groups).toHaveLength(2)
|
|
64
64
|
expect(groups[0].runs).toBe(3) // runs_per_side
|
|
65
65
|
expect(groups[1].runs).toBe(3)
|
|
66
|
-
expect(groups[0].platform).toBe('claude')
|
|
66
|
+
expect(groups[0].platform).toBe('claude-sdk')
|
|
67
67
|
})
|
|
68
68
|
|
|
69
69
|
test('control flag preserved', () => {
|
package/src/player.ts
CHANGED
|
@@ -12,10 +12,12 @@ export interface ResolvedSide {
|
|
|
12
12
|
|
|
13
13
|
/** Built-in player registry. Player names that map directly to useAgent platforms. */
|
|
14
14
|
const BUILTIN_PLAYERS: Record<string, string> = {
|
|
15
|
-
'claude': 'claude',
|
|
16
|
-
'claude-code': 'claude',
|
|
15
|
+
'claude': 'claude-sdk', // SDK mode (claude-cli is deprecated — ADR-20260518145235543)
|
|
16
|
+
'claude-code': 'claude-sdk',
|
|
17
|
+
'claude-cli': 'claude', // legacy CLI spawn — explicitly opt-in
|
|
17
18
|
'kimi': 'kimi',
|
|
18
19
|
'deepseek': 'deepseek',
|
|
20
|
+
'codex': 'codex',
|
|
19
21
|
'cursor': 'cursor',
|
|
20
22
|
'gemini': 'gemini',
|
|
21
23
|
}
|
|
package/src/runner.test.ts
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test'
|
|
2
|
+
import { buildArenaPrompt, formatPlanOutput } from './runner'
|
|
3
|
+
import { buildExecutionPlan, parseArenaToml } from './arena-toml'
|
|
4
|
+
|
|
5
|
+
describe('buildArenaPrompt — plan-mode (pure, no IO)', () => {
|
|
6
|
+
test('includes CWD, Deck, and output directory', () => {
|
|
7
|
+
const prompt = buildArenaPrompt({
|
|
8
|
+
brief: 'Write a hello world function',
|
|
9
|
+
cwd: '/tmp/arena-cell',
|
|
10
|
+
deckPath: '/tmp/test-deck.toml',
|
|
11
|
+
})
|
|
12
|
+
expect(prompt).toContain('CWD: /tmp/arena-cell')
|
|
13
|
+
expect(prompt).toContain('Deck: /tmp/test-deck.toml')
|
|
14
|
+
expect(prompt).toContain('Produce output to: /tmp/arena-cell/')
|
|
15
|
+
})
|
|
16
|
+
|
|
17
|
+
test('includes decision-log.jsonl mandatory instructions', () => {
|
|
18
|
+
const prompt = buildArenaPrompt({
|
|
19
|
+
brief: 'test',
|
|
20
|
+
cwd: '/tmp/arena-cell',
|
|
21
|
+
deckPath: '/tmp/test-deck.toml',
|
|
22
|
+
})
|
|
23
|
+
expect(prompt).toContain('MANDATORY — write decision-log.jsonl')
|
|
24
|
+
expect(prompt).toContain('t (seconds elapsed)')
|
|
25
|
+
expect(prompt).toContain('phase (setup/content/design/output)')
|
|
26
|
+
expect(prompt).toContain('decision (what you chose)')
|
|
27
|
+
expect(prompt).toContain('reason (why)')
|
|
28
|
+
})
|
|
29
|
+
|
|
30
|
+
test('includes robustness and tools instructions', () => {
|
|
31
|
+
const prompt = buildArenaPrompt({
|
|
32
|
+
brief: 'test',
|
|
33
|
+
cwd: '/tmp/arena-cell',
|
|
34
|
+
deckPath: '/tmp/test-deck.toml',
|
|
35
|
+
})
|
|
36
|
+
expect(prompt).toContain('ROBUSTNESS')
|
|
37
|
+
expect(prompt).toContain('TOOLS')
|
|
38
|
+
expect(prompt).toContain('.claude/skills/')
|
|
39
|
+
})
|
|
40
|
+
|
|
41
|
+
test('task brief appears at the end', () => {
|
|
42
|
+
const prompt = buildArenaPrompt({
|
|
43
|
+
brief: 'Generate a dark-mode CSS theme',
|
|
44
|
+
cwd: '/tmp/arena-cell',
|
|
45
|
+
deckPath: '/tmp/test-deck.toml',
|
|
46
|
+
})
|
|
47
|
+
expect(prompt).toContain('TASK:')
|
|
48
|
+
expect(prompt).toContain('Generate a dark-mode CSS theme')
|
|
49
|
+
// Brief should be at the end (after MANDATORY, ROBUSTNESS, TOOLS)
|
|
50
|
+
const briefIdx = prompt.indexOf('Generate a dark-mode CSS theme')
|
|
51
|
+
const mandatoryIdx = prompt.indexOf('MANDATORY')
|
|
52
|
+
const toolsIdx = prompt.indexOf('TOOLS')
|
|
53
|
+
expect(briefIdx).toBeGreaterThan(mandatoryIdx)
|
|
54
|
+
expect(briefIdx).toBeGreaterThan(toolsIdx)
|
|
55
|
+
})
|
|
56
|
+
|
|
57
|
+
test('outputDir overrides default output path', () => {
|
|
58
|
+
const prompt = buildArenaPrompt({
|
|
59
|
+
brief: 'test',
|
|
60
|
+
cwd: '/tmp/arena-cell',
|
|
61
|
+
deckPath: '/tmp/test-deck.toml',
|
|
62
|
+
outputDir: '/custom/output',
|
|
63
|
+
})
|
|
64
|
+
expect(prompt).toContain('Produce output to: /custom/output/')
|
|
65
|
+
})
|
|
66
|
+
|
|
67
|
+
test('preflightReport included when provided', () => {
|
|
68
|
+
const prompt = buildArenaPrompt({
|
|
69
|
+
brief: 'test',
|
|
70
|
+
cwd: '/tmp/arena-cell',
|
|
71
|
+
deckPath: '/tmp/test-deck.toml',
|
|
72
|
+
preflightReport: '✅ 3 skills linked, 0 missing',
|
|
73
|
+
})
|
|
74
|
+
expect(prompt).toContain('Preflight:')
|
|
75
|
+
expect(prompt).toContain('✅ 3 skills linked, 0 missing')
|
|
76
|
+
})
|
|
77
|
+
|
|
78
|
+
test('no preflight section when report not provided', () => {
|
|
79
|
+
const prompt = buildArenaPrompt({
|
|
80
|
+
brief: 'test',
|
|
81
|
+
cwd: '/tmp/arena-cell',
|
|
82
|
+
deckPath: '/tmp/test-deck.toml',
|
|
83
|
+
})
|
|
84
|
+
expect(prompt).not.toContain('Preflight:')
|
|
85
|
+
})
|
|
86
|
+
|
|
87
|
+
test('prompt is deterministic — same inputs = same output', () => {
|
|
88
|
+
const opts = { brief: 'test', cwd: '/tmp/a', deckPath: '/tmp/d.toml' }
|
|
89
|
+
expect(buildArenaPrompt(opts)).toBe(buildArenaPrompt(opts))
|
|
90
|
+
})
|
|
91
|
+
})
|
package/src/runner.ts
CHANGED
|
@@ -9,6 +9,7 @@ import { ArenaManifest, Player, type JudgeInput, type Evidence, type JudgeVerdic
|
|
|
9
9
|
import type { ArenaManifest as ArenaManifestType } from '@lythos/test-utils/schema'
|
|
10
10
|
try { await import('@lythos/agent-adapter-claude-sdk') } catch { /* package not installed */ }
|
|
11
11
|
try { await import('@lythos/agent-adapter-deepseek-serve') } catch { /* package not installed */ }
|
|
12
|
+
try { await import('@lythos/agent-adapter-codex') } catch { /* package not installed */ }
|
|
12
13
|
import { runComparativeJudge } from './comparative-judge'
|
|
13
14
|
import { parseArenaToml, buildExecutionPlan, type ArenaToml, type ExecutionPlan } from './arena-toml'
|
|
14
15
|
import { resolvePlayer, resolveSides } from './player'
|
|
@@ -54,6 +55,7 @@ function resolveJudgeText(toml: ArenaToml, configDir?: string): string | null {
|
|
|
54
55
|
|
|
55
56
|
// ── Prompt template (IoC: brief = variable, template = fixed contract) ────
|
|
56
57
|
|
|
58
|
+
/** Pure prompt builder — no IO. Execution: arena subagent spawn (cli.ts singleRun / vsRun). */
|
|
57
59
|
export function buildArenaPrompt(opts: {
|
|
58
60
|
brief: string
|
|
59
61
|
cwd: string
|