@einja/dev-cli 0.1.41 → 0.1.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +0 -1
  2. package/dist/cli.js +0 -1
  3. package/dist/cli.js.map +1 -1
  4. package/dist/commands/sync.d.ts.map +1 -1
  5. package/dist/commands/sync.js +1 -20
  6. package/dist/commands/sync.js.map +1 -1
  7. package/dist/commands/task-loop/lib/github-client.test.js.map +1 -1
  8. package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js +2 -2
  9. package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js.map +1 -1
  10. package/dist/lib/preset-update/file-copier.js +3 -3
  11. package/dist/lib/preset-update/file-copier.js.map +1 -1
  12. package/dist/lib/sync/file-filter.js +2 -2
  13. package/dist/lib/sync/file-filter.js.map +1 -1
  14. package/dist/lib/sync/file-filter.test.js +20 -0
  15. package/dist/lib/sync/file-filter.test.js.map +1 -1
  16. package/dist/lib/sync/marker-processor.js.map +1 -1
  17. package/dist/lib/sync/metadata-manager.js +1 -1
  18. package/dist/lib/sync/metadata-manager.js.map +1 -1
  19. package/dist/lib/sync/metadata-manager.test.js +3 -2
  20. package/dist/lib/sync/metadata-manager.test.js.map +1 -1
  21. package/dist/lib/sync/project-private-synchronizer.d.ts.map +1 -1
  22. package/dist/lib/sync/project-private-synchronizer.js +5 -1
  23. package/dist/lib/sync/project-private-synchronizer.js.map +1 -1
  24. package/dist/types/index.d.ts +0 -1
  25. package/dist/types/index.d.ts.map +1 -1
  26. package/package.json +1 -1
  27. package/presets/default/.claude/agents/einja/backend-architect.md +17 -1
  28. package/presets/default/.claude/agents/einja/codex-agent.md +1 -1
  29. package/presets/default/.claude/agents/einja/design-engineer.md +1 -1
  30. package/presets/default/.claude/agents/einja/docs/docs-updater.md +3 -93
  31. package/presets/default/.claude/agents/einja/frontend-architect.md +17 -1
  32. package/presets/default/.claude/agents/einja/frontend-coder.md +1 -1
  33. package/presets/default/.claude/agents/einja/{specs/spec-design-generator.md → issue-specs/design-generator.md} +12 -7
  34. package/presets/default/.claude/agents/einja/{specs/spec-qa-generator.md → issue-specs/qa-generator.md} +6 -4
  35. package/presets/default/.claude/agents/einja/{specs/spec-requirements-generator.md → issue-specs/requirements-generator.md} +5 -5
  36. package/presets/default/.claude/agents/einja/{specs/spec-tasks-generator.md → issue-specs/tasks-generator.md} +13 -14
  37. package/presets/default/.claude/agents/einja/{specs/spec-tasks-validator.md → issue-specs/tasks-validator.md} +9 -9
  38. package/presets/default/.claude/agents/einja/issue-specs/ui-design-generator.md +114 -0
  39. package/presets/default/.claude/agents/einja/task/task-executer.md +9 -3
  40. package/presets/default/.claude/agents/einja/task/task-modification-analyzer.md +2 -2
  41. package/presets/default/.claude/agents/einja/task/task-qa.md +3 -3
  42. package/presets/default/.claude/agents/einja/task/task-reviewer.md +13 -1
  43. package/presets/default/.claude/commands/einja/einja-sync.md +119 -44
  44. package/presets/default/.claude/commands/einja/issue-exec.md +29 -19
  45. package/presets/default/.claude/commands/einja/sync-cursor-commands.md +6 -6
  46. package/presets/default/.claude/commands/einja/{update-docs-by-task-specs.md → update-docs-by-issue-specs.md} +58 -58
  47. package/presets/default/.claude/hooks/einja/plan-mode-skill-loader.sh +5 -1
  48. package/presets/default/.claude/settings.json +14 -4
  49. package/presets/default/.claude/skills/{einja-general-context-loader → _einja-general-context-loader}/SKILL.md +2 -2
  50. package/presets/default/.claude/skills/{einja-output-format → _einja-output-format}/SKILL.md +1 -1
  51. package/presets/default/.claude/skills/_einja-project-overview/SKILL.md +29 -0
  52. package/presets/default/.claude/skills/{einja-spec-context-loader → _einja-spec-context-loader}/SKILL.md +5 -5
  53. package/presets/default/.claude/skills/einja-coding-standards/references/testing-strategy.md +899 -0
  54. package/presets/default/.claude/skills/einja-conflict-resolver/SKILL.md +1 -1
  55. package/presets/default/.claude/skills/einja-create-pr/SKILL.md +138 -0
  56. package/presets/default/.claude/skills/einja-infra-maintenance/SKILL.md +779 -0
  57. package/presets/default/.claude/{commands/einja/spec-create.md → skills/einja-issue-spec-create/SKILL.md} +47 -24
  58. package/presets/default/.claude/skills/einja-issue-spec-generator/SKILL.md +105 -0
  59. package/presets/default/.claude/skills/einja-issue-spec-generator/references/format-rules.md +35 -0
  60. package/presets/default/.claude/skills/einja-issue-spec-validator/SKILL.md +130 -0
  61. package/presets/default/.claude/skills/einja-issue-spec-validator/references/validation-rules.md +52 -0
  62. package/presets/default/.claude/skills/einja-npm-release/SKILL.md +242 -0
  63. package/presets/default/.claude/skills/einja-skill-creator/SKILL.md +68 -12
  64. package/presets/default/.claude/skills/einja-skill-creator/scripts/aggregate_benchmark.py +368 -121
  65. package/presets/default/.claude/skills/einja-skill-creator/scripts/compare_runs.py +154 -0
  66. package/presets/default/.claude/skills/einja-skill-creator/scripts/generate_report.py +14 -7
  67. package/presets/default/.claude/skills/einja-skill-creator/scripts/improve_description.py +2 -7
  68. package/presets/default/.claude/skills/einja-skill-creator/scripts/run_loop.py +263 -183
  69. package/presets/default/.claude/skills/einja-skill-first/SKILL.md +265 -0
  70. package/presets/default/.claude/skills/einja-subagent-question-protocol/SKILL.md +98 -0
  71. package/presets/default/.claude/skills/einja-task-commit/SKILL.md +7 -7
  72. package/presets/default/.claude/{commands/einja/task-exec.md → skills/einja-task-exec/SKILL.md} +3 -78
  73. package/presets/default/.claude/skills/einja-task-qa/SKILL.md +4 -4
  74. package/presets/default/.claude/skills/einja-task-qa/references/troubleshooting.md +1 -1
  75. package/presets/default/.claude/skills/einja-task-qa/references/usage-patterns.md +2 -2
  76. package/presets/default/.claude/skills/einja-team-exec/SKILL.md +165 -0
  77. package/presets/default/CLAUDE.md.template +21 -6
  78. package/presets/default/docs/einja/instructions/deployment-setup.md +1 -1
  79. package/presets/default/docs/einja/instructions/issue-exec-workflow.md +11 -11
  80. package/presets/default/docs/einja/instructions/local-server-environment-and-worktree.md +1 -1
  81. package/presets/default/docs/einja/instructions/setup-flow.md +279 -0
  82. package/presets/default/docs/einja/instructions/task-execute.md +42 -42
  83. package/presets/default/docs/einja/steering/acceptance-criteria-and-qa-guide.md +1 -1
  84. package/presets/default/docs/einja/steering/branch-strategy.md +1 -1
  85. package/presets/default/docs/einja/steering/development-workflow.md +93 -25
  86. package/presets/default/docs/einja/steering/infrastructure/deployment.md +107 -0
  87. package/presets/default/docs/einja/steering/task-management.md +9 -13
  88. package/presets/default/scripts/ensure-serena.sh +2 -2
  89. package/presets/default/scripts/env-rotate-secrets.ts +66 -6
  90. package/presets/default/scripts/init-github.ts +363 -0
  91. package/presets/default/scripts/init.sh +11 -5
  92. package/presets/default/scripts/setup-dev.ts +16 -1
  93. package/dist/lib/sync/backup-manager.d.ts +0 -50
  94. package/dist/lib/sync/backup-manager.d.ts.map +0 -1
  95. package/dist/lib/sync/backup-manager.js +0 -117
  96. package/dist/lib/sync/backup-manager.js.map +0 -1
  97. package/dist/lib/sync/backup-manager.test.d.ts +0 -2
  98. package/dist/lib/sync/backup-manager.test.d.ts.map +0 -1
  99. package/dist/lib/sync/backup-manager.test.js +0 -155
  100. package/dist/lib/sync/backup-manager.test.js.map +0 -1
  101. package/presets/default/.claude/agents/einja/git/conflict-resolver.md +0 -152
  102. package/presets/default/.claude/hooks/einja/validate-git-commit.sh +0 -239
  103. package/presets/default/.claude/skills/einja-project-overview/SKILL.md +0 -39
@@ -1,153 +1,400 @@
1
1
  #!/usr/bin/env python3
2
- """複数スキルのベンチマーク結果を集約。
2
+ """
3
+ ベンチマーク実行結果を集約してサマリー統計を生成。
4
+
5
+ 実行ディレクトリ内の grading.json ファイルを読み込み、以下を生成する:
6
+ - 各メトリクス(平均・標準偏差・最小・最大)を含む run_summary
7
+ - with_skill / without_skill 設定間のデルタ
8
+
9
+ 使用方法:
10
+ python aggregate_benchmark.py <benchmark_dir>
11
+
12
+ 例:
13
+ python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
3
14
 
4
- 複数のrun_loop.py出力を受け取り、全スキルのスコアを
5
- サマリーテーブルとして表示する。
15
+ 対応するディレクトリ構成:
16
+
17
+ ワークスペース構成(skill-creator イテレーションから):
18
+ <benchmark_dir>/
19
+ └── eval-N/
20
+ ├── with_skill/
21
+ │ ├── run-1/grading.json
22
+ │ └── run-2/grading.json
23
+ └── without_skill/
24
+ ├── run-1/grading.json
25
+ └── run-2/grading.json
26
+
27
+ レガシー構成(runs/ サブディレクトリあり):
28
+ <benchmark_dir>/
29
+ └── runs/
30
+ └── eval-N/
31
+ ├── with_skill/
32
+ │ └── run-1/grading.json
33
+ └── without_skill/
34
+ └── run-1/grading.json
6
35
  """
7
36
 
8
37
  import argparse
9
38
  import json
39
+ import math
10
40
  import sys
41
+ from datetime import datetime, timezone
11
42
  from pathlib import Path
12
43
 
13
44
 
14
- def aggregate_results(result_files: list[str]) -> dict:
15
- """複数のrun_loop.py出力ファイルを集約する。"""
16
- skills = []
45
+ def calculate_stats(values: list[float]) -> dict:
46
+ """値リストの平均・標準偏差・最小・最大を計算する。"""
47
+ if not values:
48
+ return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}
17
49
 
18
- for filepath in result_files:
19
- try:
20
- data = json.loads(Path(filepath).read_text())
21
- except (json.JSONDecodeError, FileNotFoundError) as e:
22
- print(f"警告: {filepath} の読み込みに失敗しました: {e}", file=sys.stderr)
23
- continue
50
+ n = len(values)
51
+ mean = sum(values) / n
52
+
53
+ if n > 1:
54
+ variance = sum((x - mean) ** 2 for x in values) / (n - 1)
55
+ stddev = math.sqrt(variance)
56
+ else:
57
+ stddev = 0.0
58
+
59
+ return {
60
+ "mean": round(mean, 4),
61
+ "stddev": round(stddev, 4),
62
+ "min": round(min(values), 4),
63
+ "max": round(max(values), 4)
64
+ }
65
+
66
+
67
+ def load_run_results(benchmark_dir: Path) -> dict:
68
+ """
69
+ ベンチマークディレクトリから全実行結果を読み込む。
70
+
71
+ 設定名(例: "with_skill"/"without_skill" または "new_skill"/"old_skill")を
72
+ キーとして実行結果リストを格納した dict を返す。
73
+ """
74
+ # eval ディレクトリが benchmark_dir 直下または runs/ 以下の両方に対応
75
+ runs_dir = benchmark_dir / "runs"
76
+ if runs_dir.exists():
77
+ search_dir = runs_dir
78
+ elif list(benchmark_dir.glob("eval-*")):
79
+ search_dir = benchmark_dir
80
+ else:
81
+ print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
82
+ return {}
83
+
84
+ results: dict[str, list] = {}
85
+
86
+ for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))):
87
+ metadata_path = eval_dir / "eval_metadata.json"
88
+ if metadata_path.exists():
89
+ try:
90
+ with open(metadata_path) as mf:
91
+ eval_id = json.load(mf).get("eval_id", eval_idx)
92
+ except (json.JSONDecodeError, OSError):
93
+ eval_id = eval_idx
94
+ else:
95
+ try:
96
+ eval_id = int(eval_dir.name.split("-")[1])
97
+ except ValueError:
98
+ eval_id = eval_idx
99
+
100
+ # 設定ディレクトリをハードコードせず動的に検出
101
+ for config_dir in sorted(eval_dir.iterdir()):
102
+ if not config_dir.is_dir():
103
+ continue
104
+ # run-* を含まないディレクトリ(inputs, outputs 等)はスキップ
105
+ if not list(config_dir.glob("run-*")):
106
+ continue
107
+ config = config_dir.name
108
+ if config not in results:
109
+ results[config] = []
110
+
111
+ for run_dir in sorted(config_dir.glob("run-*")):
112
+ run_number = int(run_dir.name.split("-")[1])
113
+ grading_file = run_dir / "grading.json"
114
+
115
+ if not grading_file.exists():
116
+ print(f"Warning: grading.json not found in {run_dir}")
117
+ continue
118
+
119
+ try:
120
+ with open(grading_file) as f:
121
+ grading = json.load(f)
122
+ except json.JSONDecodeError as e:
123
+ print(f"Warning: Invalid JSON in {grading_file}: {e}")
124
+ continue
125
+
126
+ # メトリクスを抽出
127
+ result = {
128
+ "eval_id": eval_id,
129
+ "run_number": run_number,
130
+ "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0),
131
+ "passed": grading.get("summary", {}).get("passed", 0),
132
+ "failed": grading.get("summary", {}).get("failed", 0),
133
+ "total": grading.get("summary", {}).get("total", 0),
134
+ }
135
+
136
+ # タイミングを抽出(grading.json 優先、なければ timing.json を参照)
137
+ timing = grading.get("timing", {})
138
+ result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
139
+ timing_file = run_dir / "timing.json"
140
+ if result["time_seconds"] == 0.0 and timing_file.exists():
141
+ try:
142
+ with open(timing_file) as tf:
143
+ timing_data = json.load(tf)
144
+ result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
145
+ result["tokens"] = timing_data.get("total_tokens", 0)
146
+ except json.JSONDecodeError:
147
+ pass
148
+
149
+ # 実行メトリクスを抽出
150
+ metrics = grading.get("execution_metrics", {})
151
+ result["tool_calls"] = metrics.get("total_tool_calls", 0)
152
+ if not result.get("tokens"):
153
+ result["tokens"] = metrics.get("output_chars", 0)
154
+ result["errors"] = metrics.get("errors_encountered", 0)
155
+
156
+ # expectations を抽出(viewer に必要なフィールド: text, passed, evidence)
157
+ raw_expectations = grading.get("expectations", [])
158
+ for exp in raw_expectations:
159
+ if "text" not in exp or "passed" not in exp:
160
+ print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
161
+ result["expectations"] = raw_expectations
162
+
163
+ # user_notes_summary からノートを抽出
164
+ notes_summary = grading.get("user_notes_summary", {})
165
+ notes = []
166
+ notes.extend(notes_summary.get("uncertainties", []))
167
+ notes.extend(notes_summary.get("needs_review", []))
168
+ notes.extend(notes_summary.get("workarounds", []))
169
+ result["notes"] = notes
170
+
171
+ results[config].append(result)
172
+
173
+ return results
174
+
175
+
176
+ def aggregate_results(results: dict) -> dict:
177
+ """
178
+ 実行結果をサマリー統計に集約する。
179
+
180
+ 各設定のスタットと設定間のデルタを含む run_summary を返す。
181
+ """
182
+ run_summary = {}
183
+ configs = list(results.keys())
24
184
 
25
- history = data.get("history", [])
26
- if not history:
27
- print(f"警告: {filepath} に履歴がありません", file=sys.stderr)
185
+ for config in configs:
186
+ runs = results.get(config, [])
187
+
188
+ if not runs:
189
+ run_summary[config] = {
190
+ "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
191
+ "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
192
+ "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
193
+ }
28
194
  continue
29
195
 
30
- # 最良のイテレーションを見つける(テスト > トレーニングで優先)
31
- best_idx = 0
32
- best_test = -1
33
- best_train = -1
34
- for i, h in enumerate(history):
35
- t_passed = h.get("test_passed", -1)
36
- tr_passed = h.get("train_passed", h.get("passed", 0))
37
- if t_passed > best_test or (t_passed == best_test and tr_passed > best_train):
38
- best_test = t_passed
39
- best_train = tr_passed
40
- best_idx = i
41
-
42
- best = history[best_idx]
43
- original = history[0]
44
-
45
- skill_entry = {
46
- "skill_name": data.get("skill_name", Path(filepath).stem),
47
- "file": filepath,
48
- "iterations": len(history),
49
- "best_iteration": best_idx,
50
- "original_description": data.get("original_description", ""),
51
- "best_description": best.get("description", ""),
52
- "original_train_score": f"{original.get('train_passed', original.get('passed', 0))}/{original.get('train_total', original.get('total', 0))}",
53
- "best_train_score": f"{best.get('train_passed', best.get('passed', 0))}/{best.get('train_total', best.get('total', 0))}",
54
- "original_train_passed": original.get("train_passed", original.get("passed", 0)),
55
- "original_train_total": original.get("train_total", original.get("total", 0)),
56
- "best_train_passed": best.get("train_passed", best.get("passed", 0)),
57
- "best_train_total": best.get("train_total", best.get("total", 0)),
196
+ pass_rates = [r["pass_rate"] for r in runs]
197
+ times = [r["time_seconds"] for r in runs]
198
+ tokens = [r.get("tokens", 0) for r in runs]
199
+
200
+ run_summary[config] = {
201
+ "pass_rate": calculate_stats(pass_rates),
202
+ "time_seconds": calculate_stats(times),
203
+ "tokens": calculate_stats(tokens)
58
204
  }
59
205
 
60
- # テストスコア(存在する場合)
61
- if best.get("test_passed") is not None:
62
- skill_entry["original_test_score"] = f"{original.get('test_passed', '?')}/{original.get('test_total', '?')}"
63
- skill_entry["best_test_score"] = f"{best.get('test_passed', '?')}/{best.get('test_total', '?')}"
64
- skill_entry["best_test_passed"] = best.get("test_passed", 0)
65
- skill_entry["best_test_total"] = best.get("test_total", 0)
66
-
67
- skills.append(skill_entry)
68
-
69
- # トレーニングスコアでソート(降順)
70
- skills.sort(
71
- key=lambda s: (
72
- s.get("best_test_passed", 0) / max(s.get("best_test_total", 1), 1),
73
- s["best_train_passed"] / max(s["best_train_total"], 1),
74
- ),
75
- reverse=True,
76
- )
206
+ # 最初の2設定間のデルタを計算
207
+ if len(configs) >= 2:
208
+ primary = run_summary.get(configs[0], {})
209
+ baseline = run_summary.get(configs[1], {})
210
+ else:
211
+ primary = run_summary.get(configs[0], {}) if configs else {}
212
+ baseline = {}
77
213
 
78
- # 全体サマリーの計算
79
- total_train_passed = sum(s["best_train_passed"] for s in skills)
80
- total_train_total = sum(s["best_train_total"] for s in skills)
81
- total_test_passed = sum(s.get("best_test_passed", 0) for s in skills if "best_test_passed" in s)
82
- total_test_total = sum(s.get("best_test_total", 0) for s in skills if "best_test_total" in s)
214
+ delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0)
215
+ delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0)
216
+ delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0)
83
217
 
84
- return {
85
- "skills": skills,
86
- "summary": {
87
- "total_skills": len(skills),
88
- "total_train_passed": total_train_passed,
89
- "total_train_total": total_train_total,
90
- "total_train_score": f"{total_train_passed}/{total_train_total}",
91
- "total_test_passed": total_test_passed,
92
- "total_test_total": total_test_total,
93
- "total_test_score": f"{total_test_passed}/{total_test_total}" if total_test_total > 0 else None,
218
+ run_summary["delta"] = {
219
+ "pass_rate": f"{delta_pass_rate:+.2f}",
220
+ "time_seconds": f"{delta_time:+.1f}",
221
+ "tokens": f"{delta_tokens:+.0f}"
222
+ }
223
+
224
+ return run_summary
225
+
226
+
227
+ def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
228
+ """
229
+ 実行結果から完全な benchmark.json を生成する。
230
+ """
231
+ results = load_run_results(benchmark_dir)
232
+ run_summary = aggregate_results(results)
233
+
234
+ # benchmark.json 用の runs 配列を構築
235
+ runs = []
236
+ for config in results:
237
+ for result in results[config]:
238
+ runs.append({
239
+ "eval_id": result["eval_id"],
240
+ "configuration": config,
241
+ "run_number": result["run_number"],
242
+ "result": {
243
+ "pass_rate": result["pass_rate"],
244
+ "passed": result["passed"],
245
+ "failed": result["failed"],
246
+ "total": result["total"],
247
+ "time_seconds": result["time_seconds"],
248
+ "tokens": result.get("tokens", 0),
249
+ "tool_calls": result.get("tool_calls", 0),
250
+ "errors": result.get("errors", 0)
251
+ },
252
+ "expectations": result["expectations"],
253
+ "notes": result["notes"]
254
+ })
255
+
256
+ # 結果から eval ID を決定
257
+ eval_ids = sorted(set(
258
+ r["eval_id"]
259
+ for config in results.values()
260
+ for r in config
261
+ ))
262
+
263
+ benchmark = {
264
+ "metadata": {
265
+ "skill_name": skill_name or "<skill-name>",
266
+ "skill_path": skill_path or "<path/to/skill>",
267
+ "executor_model": "<model-name>",
268
+ "analyzer_model": "<model-name>",
269
+ "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
270
+ "evals_run": eval_ids,
271
+ "runs_per_configuration": 3
94
272
  },
273
+ "runs": runs,
274
+ "run_summary": run_summary,
275
+ "notes": [] # アナライザーが埋める
95
276
  }
96
277
 
278
+ return benchmark
279
+
280
+
281
+ def generate_markdown(benchmark: dict) -> str:
282
+ """benchmark データから人間が読みやすい benchmark.md を生成する。"""
283
+ metadata = benchmark["metadata"]
284
+ run_summary = benchmark["run_summary"]
285
+
286
+ # 設定名を取得("delta" を除く)
287
+ configs = [k for k in run_summary if k != "delta"]
288
+ config_a = configs[0] if len(configs) >= 1 else "config_a"
289
+ config_b = configs[1] if len(configs) >= 2 else "config_b"
290
+ label_a = config_a.replace("_", " ").title()
291
+ label_b = config_b.replace("_", " ").title()
292
+
293
+ lines = [
294
+ f"# Skill Benchmark: {metadata['skill_name']}",
295
+ "",
296
+ f"**Model**: {metadata['executor_model']}",
297
+ f"**Date**: {metadata['timestamp']}",
298
+ f"**Evals**: {', '.join(map(str, metadata['evals_run']))} ({metadata['runs_per_configuration']} runs each per configuration)",
299
+ "",
300
+ "## Summary",
301
+ "",
302
+ f"| Metric | {label_a} | {label_b} | Delta |",
303
+ "|--------|------------|---------------|-------|",
304
+ ]
97
305
 
98
- def print_table(aggregated: dict, verbose: bool = False) -> None:
99
- """集約結果をテーブル形式でstderrに出力する。"""
100
- skills = aggregated["skills"]
101
- summary = aggregated["summary"]
102
-
103
- has_test = any("best_test_score" in s for s in skills)
104
-
105
- # ヘッダー
106
- header = f"{'スキル名':<30} {'トレーニング(元)':<14} {'トレーニング(最良)':<14}"
107
- if has_test:
108
- header += f" {'テスト(元)':<12} {'テスト(最良)':<12}"
109
- header += f" {'回数':<6} {'最良回':<6}"
110
- print(header, file=sys.stderr)
111
- print("-" * len(header), file=sys.stderr)
112
-
113
- # 各スキル
114
- for s in skills:
115
- line = f"{s['skill_name']:<30} {s['original_train_score']:<14} {s['best_train_score']:<14}"
116
- if has_test:
117
- orig_test = s.get("original_test_score", "-")
118
- best_test = s.get("best_test_score", "-")
119
- line += f" {orig_test:<12} {best_test:<12}"
120
- line += f" {s['iterations']:<6} {s['best_iteration']:<6}"
121
- print(line, file=sys.stderr)
122
-
123
- if verbose:
124
- print(f" オリジナル: {s['original_description'][:80]}...", file=sys.stderr)
125
- print(f" 最良: {s['best_description'][:80]}...", file=sys.stderr)
126
-
127
- # サマリー
128
- print("-" * len(header), file=sys.stderr)
129
- total_line = f"{'合計':<30} {'':<14} {summary['total_train_score']:<14}"
130
- if has_test and summary.get("total_test_score"):
131
- total_line += f" {'':<12} {summary['total_test_score']:<12}"
132
- total_line += f" {summary['total_skills']} スキル"
133
- print(total_line, file=sys.stderr)
306
+ a_summary = run_summary.get(config_a, {})
307
+ b_summary = run_summary.get(config_b, {})
308
+ delta = run_summary.get("delta", {})
309
+
310
+ # パスレートのフォーマット
311
+ a_pr = a_summary.get("pass_rate", {})
312
+ b_pr = b_summary.get("pass_rate", {})
313
+ lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |")
314
+
315
+ # タイムのフォーマット
316
+ a_time = a_summary.get("time_seconds", {})
317
+ b_time = b_summary.get("time_seconds", {})
318
+ lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |")
319
+
320
+ # トークン数のフォーマット
321
+ a_tokens = a_summary.get("tokens", {})
322
+ b_tokens = b_summary.get("tokens", {})
323
+ lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |")
324
+
325
+ # ノートセクション
326
+ if benchmark.get("notes"):
327
+ lines.extend([
328
+ "",
329
+ "## Notes",
330
+ ""
331
+ ])
332
+ for note in benchmark["notes"]:
333
+ lines.append(f"- {note}")
334
+
335
+ return "\n".join(lines)
134
336
 
135
337
 
136
338
  def main():
137
- parser = argparse.ArgumentParser(description="複数スキルのベンチマーク結果を集約")
138
- parser.add_argument("files", nargs="+", help="run_loop.pyのJSON出力ファイル(複数指定可)")
139
- parser.add_argument("--verbose", action="store_true", help="各スキルのdescriptionも表示")
140
- parser.add_argument("--json", action="store_true", help="JSON形式で標準出力に出力")
339
+ parser = argparse.ArgumentParser(
340
+ description="ベンチマーク実行結果を集約してサマリー統計を生成"
341
+ )
342
+ parser.add_argument(
343
+ "benchmark_dir",
344
+ type=Path,
345
+ help="ベンチマークディレクトリへのパス"
346
+ )
347
+ parser.add_argument(
348
+ "--skill-name",
349
+ default="",
350
+ help="ベンチマーク対象スキルの名前"
351
+ )
352
+ parser.add_argument(
353
+ "--skill-path",
354
+ default="",
355
+ help="ベンチマーク対象スキルへのパス"
356
+ )
357
+ parser.add_argument(
358
+ "--output", "-o",
359
+ type=Path,
360
+ help="benchmark.json の出力パス(デフォルト: <benchmark_dir>/benchmark.json)"
361
+ )
362
+
141
363
  args = parser.parse_args()
142
364
 
143
- aggregated = aggregate_results(args.files)
365
+ if not args.benchmark_dir.exists():
366
+ print(f"Directory not found: {args.benchmark_dir}")
367
+ sys.exit(1)
368
+
369
+ # ベンチマーク生成
370
+ benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)
371
+
372
+ # 出力パスを決定
373
+ output_json = args.output or (args.benchmark_dir / "benchmark.json")
374
+ output_md = output_json.with_suffix(".md")
375
+
376
+ # benchmark.json を書き込み
377
+ with open(output_json, "w") as f:
378
+ json.dump(benchmark, f, indent=2)
379
+ print(f"Generated: {output_json}")
380
+
381
+ # benchmark.md を書き込み
382
+ markdown = generate_markdown(benchmark)
383
+ with open(output_md, "w") as f:
384
+ f.write(markdown)
385
+ print(f"Generated: {output_md}")
144
386
 
145
- # テーブル表示
146
- print_table(aggregated, verbose=args.verbose)
387
+ # サマリーを表示
388
+ run_summary = benchmark["run_summary"]
389
+ configs = [k for k in run_summary if k != "delta"]
390
+ delta = run_summary.get("delta", {})
147
391
 
148
- # JSON出力
149
- if args.json:
150
- print(json.dumps(aggregated, indent=2, ensure_ascii=False))
392
+ print(f"\nSummary:")
393
+ for config in configs:
394
+ pr = run_summary[config]["pass_rate"]["mean"]
395
+ label = config.replace("_", " ").title()
396
+ print(f" {label}: {pr*100:.1f}% pass rate")
397
+ print(f" Delta: {delta.get('pass_rate', '—')}")
151
398
 
152
399
 
153
400
  if __name__ == "__main__":