@lythos/skill-arena 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/package.json +1 -1
  2. package/src/cli.ts +324 -29
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.1.0",
3
+ "version": "0.3.0",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "license": "MIT",
6
6
  "type": "module",
package/src/cli.ts CHANGED
@@ -6,9 +6,9 @@
6
6
  */
7
7
 
8
8
  import {
9
- existsSync, mkdirSync, writeFileSync,
9
+ existsSync, mkdirSync, writeFileSync, readFileSync,
10
10
  } from 'node:fs'
11
- import { join, resolve } from 'node:path'
11
+ import { join, resolve, basename } from 'node:path'
12
12
 
13
13
  // ── 简单的 slugify ──────────────────────────────────────────
14
14
  function slugify(input: string): string {
@@ -29,8 +29,9 @@ function parseArgs(argv: string[]) {
29
29
  const options: Record<string, string | undefined> = {
30
30
  task: undefined,
31
31
  skills: undefined,
32
+ decks: undefined,
32
33
  criteria: 'syntax,context,logic,token',
33
- control: 'project-scribe',
34
+ control: 'lythoskill-project-scribe',
34
35
  dir: 'tmp',
35
36
  project: '.',
36
37
  }
@@ -42,6 +43,8 @@ function parseArgs(argv: string[]) {
42
43
  options.task = argv[++i]
43
44
  } else if (arg === '--skills' || arg === '-s') {
44
45
  options.skills = argv[++i]
46
+ } else if (arg === '--decks') {
47
+ options.decks = argv[++i]
45
48
  } else if (arg === '--criteria' || arg === '-c') {
46
49
  options.criteria = argv[++i]
47
50
  } else if (arg === '--control') {
@@ -67,15 +70,42 @@ export function runArena(argv: string[]) {
67
70
  process.exit(1)
68
71
  }
69
72
 
70
- const SKILLS = (options.skills || '').split(',').map(s => s.trim()).filter(Boolean)
71
- if (SKILLS.length < 2) {
73
+ const HAS_DECKS = !!options.decks
74
+ const HAS_SKILLS = !!options.skills
75
+
76
+ if (!HAS_DECKS && !HAS_SKILLS) {
77
+ console.error('❌ 请提供 --skills 或 --decks')
78
+ process.exit(1)
79
+ }
80
+ if (HAS_DECKS && HAS_SKILLS) {
81
+ console.error('❌ --skills 和 --decks 不能同时使用')
82
+ process.exit(1)
83
+ }
84
+
85
+ const DECK_PATHS = HAS_DECKS
86
+ ? (options.decks || '').split(',').map(s => s.trim()).filter(Boolean)
87
+ : []
88
+
89
+ const SKILLS = HAS_SKILLS
90
+ ? (options.skills || '').split(',').map(s => s.trim()).filter(Boolean)
91
+ : []
92
+
93
+ if (HAS_SKILLS && SKILLS.length < 2) {
72
94
  console.error('❌ 至少需要 2 个 skill 才能进行 arena')
73
95
  process.exit(1)
74
96
  }
75
- if (SKILLS.length > 5) {
97
+ if (HAS_SKILLS && SKILLS.length > 5) {
76
98
  console.error('❌ 一次 arena 最多 5 个 skill')
77
99
  process.exit(1)
78
100
  }
101
+ if (HAS_DECKS && DECK_PATHS.length < 2) {
102
+ console.error('❌ 至少需要 2 个 deck 才能进行 arena')
103
+ process.exit(1)
104
+ }
105
+ if (HAS_DECKS && DECK_PATHS.length > 5) {
106
+ console.error('❌ 一次 arena 最多 5 个 deck')
107
+ process.exit(1)
108
+ }
79
109
 
80
110
  const CRITERIA = (options.criteria || 'syntax,context,logic,token')
81
111
  .split(',').map(s => s.trim()).filter(Boolean)
@@ -93,15 +123,37 @@ export function runArena(argv: string[]) {
93
123
  mkdirSync(join(ARENA_DIR, 'runs'), { recursive: true })
94
124
 
95
125
  // ── 生成参与者与 deck ───────────────────────────────────────
96
- const participants = SKILLS.map((skill, i) => {
97
- const id = `run-${String(i + 1).padStart(2, '0')}`
98
- return {
99
- id,
100
- name: skill,
101
- skill_name: skill,
102
- deck_path: join(ARENA_DIR, 'decks', `arena-${id}.toml`),
103
- }
104
- })
126
+ let participants: { id: string; name: string; skill_name: string; deck_path: string }[]
127
+ let mode: 'single-skill' | 'full-deck'
128
+
129
+ if (HAS_DECKS) {
130
+ mode = 'full-deck'
131
+ participants = DECK_PATHS.map((deckPath, i) => {
132
+ const id = `run-${String(i + 1).padStart(2, '0')}`
133
+ const name = basename(deckPath).replace(/\.toml$/, '')
134
+ const destPath = join(ARENA_DIR, 'decks', `arena-${id}.toml`)
135
+ // Copy the provided deck to arena directory
136
+ if (existsSync(deckPath)) {
137
+ const content = readFileSync(deckPath, 'utf-8')
138
+ writeFileSync(destPath, content)
139
+ } else {
140
+ console.error(`❌ Deck 文件不存在: ${deckPath}`)
141
+ process.exit(1)
142
+ }
143
+ return { id, name, skill_name: name, deck_path: destPath }
144
+ })
145
+ } else {
146
+ mode = 'single-skill'
147
+ participants = SKILLS.map((skill, i) => {
148
+ const id = `run-${String(i + 1).padStart(2, '0')}`
149
+ return {
150
+ id,
151
+ name: skill,
152
+ skill_name: skill,
153
+ deck_path: join(ARENA_DIR, 'decks', `arena-${id}.toml`),
154
+ }
155
+ })
156
+ }
105
157
 
106
158
  const criteria = CRITERIA.map((c) => ({
107
159
  name: c,
@@ -109,8 +161,9 @@ export function runArena(argv: string[]) {
109
161
  weight: 1,
110
162
  }))
111
163
 
112
- for (const p of participants) {
113
- const deckContent = `# ============================================================
164
+ if (mode === 'single-skill') {
165
+ for (const p of participants) {
166
+ const deckContent = `# ============================================================
114
167
  # Arena Deck: ${p.id} — ${p.name}
115
168
  # ============================================================
116
169
  # 变量:${p.name}
@@ -124,11 +177,11 @@ max_cards = 10
124
177
 
125
178
  [tool]
126
179
  skills = [
127
- "${p.skill_name}",
128
- ${CONTROL_SKILLS.map(s => ` "${s}",`).join('\n')}
180
+ ${[...new Set([p.skill_name, ...CONTROL_SKILLS])].map(s => ` "${s}",`).join('\n')}
129
181
  ]
130
182
  `
131
- writeFileSync(p.deck_path, deckContent)
183
+ writeFileSync(p.deck_path, deckContent)
184
+ }
132
185
  }
133
186
 
134
187
  // ── 生成 arena.json ─────────────────────────────────────────
@@ -164,8 +217,14 @@ ${criteria.map(c => ` - ${c.label}`).join('\n')}
164
217
  arena_decks:
165
218
  ${participants.map(p => ` - ${p.deck_path.replace(PROJECT_DIR, '.')}`).join('\n')}
166
219
  judge_persona: |
167
- 你是一个中立的技能评测员。对比所有 subagent 的输出,
168
- evaluation_criteria 给出 1-5 分评分,最终给出 Winner 和选型建议。
220
+ ${mode === 'full-deck'
221
+ ? `你是一个多目标优化分析师。不要选 Winner
222
+ 对每个 deck 配置,按 evaluation_criteria 输出评分向量(1-5 分)。
223
+ 识别 Pareto 非支配解集——没有"最强",只有"在不同维度上的最优权衡"。
224
+ 对被支配的解,说明它被谁支配、在哪个维度上劣势。
225
+ 如果发现任何涌现 combo(多个 skill 组合产生 1+1>2 的效果),单独标注。`
226
+ : `你是一个中立的技能评测员。对比所有 subagent 的输出,
227
+ 按 evaluation_criteria 给出 1-5 分评分,最终给出 Winner 和选型建议。`}
169
228
  acceptance:
170
229
  ${participants.map(p => ` - Subagent ${p.id} 使用 ${p.deck_path.replace(PROJECT_DIR, '.')} 完成任务并写入 runs/${p.id}.md`).join('\n')}
171
230
  - Judge 读取所有 run 文件并生成 report.md
@@ -181,16 +240,16 @@ managed_dirs:
181
240
  ${participants.map(p => `### ${p.id} (${p.name})
182
241
  \`\`\`bash
183
242
  cd "${PROJECT_DIR}"
184
- bunx @lythos/skill-deck link --deck "${p.deck_path}"
243
+ bunx @lythos/skill-deck link --deck "${p.deck_path}" --workdir "${PROJECT_DIR}"
185
244
  # 然后执行任务,输出写入 "${join(ARENA_DIR, 'runs', `${p.id}.md`)}"
186
- bunx @lythos/skill-deck link --deck "${join(PROJECT_DIR, 'skill-deck.toml')}"
245
+ bunx @lythos/skill-deck link --deck "${join(PROJECT_DIR, 'skill-deck.toml')}" --workdir "${PROJECT_DIR}"
187
246
  \`\`\`
188
247
  `).join('')}
189
248
 
190
249
  ### Judge
191
250
  \`\`\`bash
192
251
  cd "${PROJECT_DIR}"
193
- bunx @lythos/skill-deck link --deck "${join(PROJECT_DIR, 'skill-deck.toml')}"
252
+ bunx @lythos/skill-deck link --deck "${join(PROJECT_DIR, 'skill-deck.toml')}" --workdir "${PROJECT_DIR}"
194
253
  # 读取所有 run 文件,生成 "${join(ARENA_DIR, 'report.md')}"
195
254
  \`\`\`
196
255
  `
@@ -204,9 +263,9 @@ bunx @lythos/skill-deck link --deck "${join(PROJECT_DIR, 'skill-deck.toml')}"
204
263
  ID: ${ARENA_ID}
205
264
  任务: ${TASK}
206
265
  目录: ${ARENA_DIR}
207
- 参与者: ${SKILLS.join(', ')}
208
- 控制变量: ${CONTROL_SKILLS.join(', ')}
209
- 评测维度: ${CRITERIA.join(', ')}
266
+ 模式: ${mode === 'full-deck' ? '完整 deck 配置对比' : '单 skill 对比'}
267
+ 参与者: ${participants.map(p => p.name).join(', ')}
268
+ ${mode === 'single-skill' ? `控制变量: ${CONTROL_SKILLS.join(', ')}\n` : ''}评测维度: ${CRITERIA.join(', ')}
210
269
 
211
270
  生成文件:
212
271
  📋 ${join(ARENA_DIR, 'arena.json')}
@@ -220,6 +279,242 @@ ID: ${ARENA_ID}
220
279
  `)
221
280
  }
222
281
 
282
+ // ── Viz: Report Visualizer ─────────────────────────────────
283
+
/** One scored checkpoint row extracted from a Markdown score table. */
interface ScoreRow {
  /** Label from the row's first cell, with `**` bold markers removed. */
  checkpoint: string
  /** Numeric scores keyed by column header (or `<section> Score` for sectioned tables). */
  scores: Record<string, number>
  /** Free-text notes for the row; empty when the row carries none. */
  notes: string
  /** Largest score value found in this row (never below 0). */
  maxScore: number
}

/**
 * Splits one trimmed Markdown table line into trimmed cells.
 * Empty cells are kept so every value stays aligned with its column header.
 */
function splitTableRow(line: string): string[] {
  return line
    .replace(/^\|/, '')
    .replace(/\|$/, '')
    .split('|')
    .map(cell => cell.trim())
}

/**
 * Parses the score tables of a judge report written in Markdown.
 *
 * A table is recognised by a header row that starts with `|` and contains
 * the word `Checkpoint`. Headings (`##`–`####`) mentioning Condition,
 * Variable or Group name the section that the following table belongs to.
 * Rows whose first cell starts with "Total" feed `summary` instead of `rows`.
 *
 * @param reportPath Path of the report file.
 * @returns The title (first `# ` heading, default "Arena Report"), the parsed
 *   rows, and the totals when any were found; `null` if the file is missing.
 */
function parseReportMd(reportPath: string): { title: string; rows: ScoreRow[]; summary?: Record<string, number> } | null {
  if (!existsSync(reportPath)) return null
  const text = readFileSync(reportPath, 'utf-8')

  const title = text.match(/^#\s+(.+)$/m)?.[1]?.trim() || 'Arena Report'

  const notesHeader = /notes?/i
  const rows: ScoreRow[] = []
  const summaries: Record<string, number> = {}

  let currentSection = ''
  let inTable = false
  let headers: string[] = []

  for (const line of text.split('\n')) {
    const trimmed = line.trim()

    // Section headings like "### Memory Condition" or "### Control Group"
    const sectionMatch = trimmed.match(/^#{2,4}\s+(.*Condition.*|.*Variable.*|.*Group.*)/i)
    if (sectionMatch) {
      currentSection = (sectionMatch[1] ?? '').replace(/[()]/g, '').trim()
      inTable = false
      continue
    }

    if (!trimmed.startsWith('|')) {
      // Any non-blank, non-table line ends the current table and its section
      if (inTable && trimmed !== '') {
        inTable = false
        currentSection = ''
      }
      continue
    }

    const isSeparator = trimmed.includes('---')
    if (!isSeparator && trimmed.includes('Checkpoint')) {
      inTable = true
      headers = splitTableRow(trimmed).slice(1)
      continue
    }
    if (!inTable || isSeparator) continue

    const cells = splitTableRow(trimmed)
    if (cells.length < 2) continue
    const checkpoint = (cells[0] ?? '').replace(/\*\*/g, '').trim()
    if (checkpoint === '') continue

    // Only columns that have both a header and a cell are considered
    const columnCount = Math.min(cells.length - 1, headers.length)

    // "Total" rows are collected as the summary, not as a checkpoint
    if (/^total/i.test(checkpoint)) {
      for (let col = 0; col < columnCount; col++) {
        const header = headers[col] ?? ''
        if (notesHeader.test(header)) continue // a note such as "3 retries" is not a total
        const total = parseFloat(cells[col + 1] ?? '')
        if (Number.isNaN(total)) continue
        const key = currentSection ? `${currentSection} ${header}`.trim() : header
        summaries[key] = total
      }
      continue
    }

    // Rows without a number in the first score column are in-table sub-headers
    if (Number.isNaN(parseFloat(cells[1] ?? ''))) continue

    // The section name replaces the header only for single-score tables
    // ("Score" or "Score | Notes"); otherwise two columns would share one key.
    const scoreColumns = headers.filter(h => !notesHeader.test(h)).length
    const sectioned = currentSection !== '' && headers.length <= 2 && scoreColumns <= 1

    const scores: Record<string, number> = {}
    let maxScore = 0
    for (let col = 0; col < columnCount; col++) {
      const header = headers[col] ?? ''
      if (notesHeader.test(header)) continue
      const score = parseFloat(cells[col + 1] ?? '')
      if (Number.isNaN(score)) continue
      scores[sectioned ? `${currentSection} Score` : header] = score
      maxScore = Math.max(maxScore, score)
    }
    if (Object.keys(scores).length === 0) continue

    // Notes come from the Notes column; without one, a trailing non-numeric
    // cell is treated as the note so a score is never reported as a note.
    const notesCol = headers.findIndex(h => notesHeader.test(h))
    const lastCell = cells[cells.length - 1] ?? ''
    let notes = ''
    if (notesCol >= 0) {
      notes = cells[notesCol + 1] ?? ''
    } else if (Number.isNaN(parseFloat(lastCell))) {
      notes = lastCell
    }

    rows.push({ checkpoint, scores, notes, maxScore })
  }

  return { title, rows, summary: Object.keys(summaries).length > 0 ? summaries : undefined }
}
388
+
/**
 * Renders a fixed-width horizontal bar made of `█` (filled) and `░` (empty).
 *
 * The fill ratio `value / max` is clamped to [0, 1], so out-of-range values
 * (negative scores, totals above the scale) and a non-positive `max` produce
 * an empty or full bar instead of making `String.prototype.repeat` throw.
 *
 * @param value Value to plot.
 * @param max Value that corresponds to a completely filled bar.
 * @param width Total number of cells in the bar.
 * @returns A string of exactly `width` cells (0 when `width` is not a positive number).
 */
function renderBar(value: number, max: number, width = 30): string {
  const cells = Number.isFinite(width) ? Math.max(0, Math.floor(width)) : 0
  const rawRatio = max > 0 ? value / max : 0
  const ratio = Number.isNaN(rawRatio) ? 0 : Math.min(1, Math.max(0, rawRatio))
  const filled = Math.round(ratio * cells)
  return '█'.repeat(filled) + '░'.repeat(cells - filled)
}
394
+
/**
 * Renders a parsed report as a text chart: a title box, one bar per score for
 * every checkpoint, and (when the report has totals) a "TOTAL SCORES" section.
 *
 * Checkpoint bars are scaled to the largest score found in any row (10 when
 * there is none). Total bars are scaled to `maxVal * rows.length`, widened to
 * the largest total when a total exceeds that ceiling. Values handed to
 * `renderBar` are clamped to the scale so an unexpected score can never turn
 * into a negative repeat count; the printed number is always the real score.
 *
 * @param report Result of `parseReportMd` (non-null).
 * @returns The chart as a multi-line string.
 */
function renderAsciiChart(report: NonNullable<ReturnType<typeof parseReportMd>>): string {
  const { title, rows, summary } = report
  const maxVal = rows.reduce((m, r) => Math.max(m, r.maxScore), 0) || 10
  const clampTo = (score: number, ceiling: number): number => Math.min(Math.max(score, 0), ceiling)

  let out = `\n╔══════════════════════════════════════════════════════════════════════╗\n`
  out += `║ 🏆 ${title.slice(0, 58).padEnd(58)} ║\n`
  out += `╚══════════════════════════════════════════════════════════════════════╝\n\n`

  // Per-checkpoint bars
  for (const row of rows) {
    out += `📋 ${row.checkpoint}\n`
    for (const [name, score] of Object.entries(row.scores)) {
      const bar = renderBar(clampTo(score, maxVal), maxVal)
      out += ` ${name.padEnd(12)} ${bar} ${score}/${maxVal}\n`
    }
    if (row.notes) {
      out += ` 💡 ${row.notes.slice(0, 80)}${row.notes.length > 80 ? '...' : ''}\n`
    }
    out += '\n'
  }

  // Summary totals
  if (summary) {
    const totals = Object.entries(summary)
    // A reported total may exceed maxVal * rows.length (e.g. it covers rows
    // the parser skipped); widen the scale rather than overflow the bar.
    const scale = Math.max(maxVal * rows.length, ...totals.map(([, total]) => total)) || 1

    out += `━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`
    out += `📊 TOTAL SCORES\n`
    for (const [name, score] of totals) {
      const bar = renderBar(clampTo(score, scale), scale)
      out += ` ${name.padEnd(12)} ${bar} ${score}\n`
    }
    out += '\n'
  }

  return out
}
430
+
/**
 * Renders a per-axis score table ("radar" view): a legend of participants
 * followed by one line per checkpoint with each participant's score.
 *
 * Participants are the union of score keys across all rows, in first-seen
 * order, so reports whose rows carry different keys (for example one table
 * per condition) are still compared. A score a row does not have is printed
 * as `-` so it cannot be mistaken for a real 0.
 *
 * @param report Any object exposing parsed `rows` (the result of `parseReportMd` qualifies).
 * @returns The table as a multi-line string, or `''` when there are no rows
 *   or fewer than two participants.
 */
function renderRadarChart(
  report: { rows: ReadonlyArray<{ checkpoint: string; scores: Record<string, number> }> },
): string {
  const { rows } = report
  if (rows.length === 0) return ''

  const participants = [...new Set(rows.flatMap(row => Object.keys(row.scores)))]
  if (participants.length < 2) return ''

  let out = `\n🕸️ RADAR CHART (MOO Scoring)\n\n`

  // Legend: one symbol per participant, reused cyclically past five
  const symbols = ['■', '●', '▲', '◆', '★']
  participants.forEach((name, index) => {
    out += ` ${symbols[index % symbols.length]} ${name}\n`
  })
  out += '\n'

  // Header: the axis column is 12 wide, every participant column 8 wide
  out += ` ${'Axis'.padEnd(12)} `
  for (const name of participants) out += `${name.slice(0, 8).padStart(8)} `
  out += '\n'
  out += ` ${'─'.repeat(14 + participants.length * 9)}\n`

  // One line per checkpoint (the axes of the radar)
  for (const row of rows) {
    out += ` ${row.checkpoint.slice(0, 12).padEnd(12)} `
    for (const name of participants) {
      const score = row.scores[name]
      out += `${(score === undefined ? '-' : String(score)).padStart(8)} `
    }
    out += '\n'
  }

  return out
}
474
+
/**
 * `viz` sub-command: prints a summary of an arena directory and, when the
 * judge report exists, its ASCII bar chart and per-axis score table.
 *
 * Exits the process with code 1 when `arena.json` is missing or is not valid
 * JSON. Missing or malformed metadata fields are printed as placeholders
 * instead of crashing, since the file may have been edited by hand.
 *
 * @param argv Arguments after `viz`; the first one that does not start with
 *   `-` is the arena directory (defaults to the current directory).
 */
function runViz(argv: string[]) {
  const arenaDir = argv.find(a => !a.startsWith('-')) || '.'
  const resolvedDir = resolve(arenaDir)

  const arenaJsonPath = join(resolvedDir, 'arena.json')
  const reportPath = join(resolvedDir, 'report.md')

  if (!existsSync(arenaJsonPath)) {
    console.error(`❌ 找不到 arena.json: ${arenaJsonPath}`)
    process.exit(1)
  }

  let arenaJson: unknown
  try {
    arenaJson = JSON.parse(readFileSync(arenaJsonPath, 'utf-8'))
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err)
    console.error(`❌ 无法解析 arena.json: ${arenaJsonPath} (${reason})`)
    process.exit(1)
  }

  // JSON.parse yields untyped data: narrow it before reading any field
  const isRecord = (v: unknown): v is Record<string, unknown> => typeof v === 'object' && v !== null
  const meta = isRecord(arenaJson) && isRecord(arenaJson.metadata) ? arenaJson.metadata : {}
  const participants: unknown[] = Array.isArray(meta.participants) ? meta.participants : []
  const names = participants.map(p => (isRecord(p) ? String(p.name ?? '?') : '?'))

  console.log(`\n🎮 Arena Viz: ${String(meta.id ?? '?')}`)
  console.log(` 任务: ${String(meta.task_description ?? '?')}`)
  console.log(` 参与者: ${names.join(', ')}`)

  if (!existsSync(reportPath)) {
    console.log(`\n⏳ report.md 尚未生成,请先运行 Judge`)
    return
  }

  const report = parseReportMd(reportPath)
  if (!report || report.rows.length === 0) {
    console.log(`\n⚠️ 无法从 report.md 解析评分数据`)
    return
  }

  console.log(renderAsciiChart(report))
  console.log(renderRadarChart(report))
}
508
+
509
+ // ── Main Entry ───────────────────────────────────────────────
510
+
223
511
  if (import.meta.main) {
224
- runArena(process.argv.slice(2))
512
+ const args = process.argv.slice(2)
513
+ const cmd = args[0]
514
+
515
+ if (cmd === 'viz') {
516
+ runViz(args.slice(1))
517
+ } else {
518
+ runArena(args)
519
+ }
225
520
  }