@einja/dev-cli 0.1.40 → 0.1.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. package/README.md +89 -1
  2. package/dist/cli.js +1 -0
  3. package/dist/cli.js.map +1 -1
  4. package/dist/commands/init.d.ts.map +1 -1
  5. package/dist/commands/init.js +71 -1
  6. package/dist/commands/init.js.map +1 -1
  7. package/dist/commands/list.js.map +1 -1
  8. package/dist/commands/sync.d.ts.map +1 -1
  9. package/dist/commands/sync.js +187 -13
  10. package/dist/commands/sync.js.map +1 -1
  11. package/dist/commands/task-loop/lib/github-client.test.js.map +1 -1
  12. package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js +2 -2
  13. package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js.map +1 -1
  14. package/dist/lib/dependency-checker.d.ts.map +1 -1
  15. package/dist/lib/merger.d.ts +12 -0
  16. package/dist/lib/merger.d.ts.map +1 -1
  17. package/dist/lib/merger.js +28 -0
  18. package/dist/lib/merger.js.map +1 -1
  19. package/dist/lib/preset-update/cli-repo-detector.d.ts.map +1 -1
  20. package/dist/lib/preset-update/file-copier.d.ts.map +1 -1
  21. package/dist/lib/preset-update/file-copier.js +3 -3
  22. package/dist/lib/preset-update/file-copier.js.map +1 -1
  23. package/dist/lib/preset-update/preset-finder.d.ts.map +1 -1
  24. package/dist/lib/preset.d.ts.map +1 -1
  25. package/dist/lib/sync/category-validator.d.ts +1 -1
  26. package/dist/lib/sync/category-validator.d.ts.map +1 -1
  27. package/dist/lib/sync/category-validator.js +2 -1
  28. package/dist/lib/sync/category-validator.js.map +1 -1
  29. package/dist/lib/sync/category-validator.test.js +3 -1
  30. package/dist/lib/sync/category-validator.test.js.map +1 -1
  31. package/dist/lib/sync/conflict-reporter.d.ts.map +1 -1
  32. package/dist/lib/sync/diff-engine.d.ts.map +1 -1
  33. package/dist/lib/sync/file-filter.d.ts.map +1 -1
  34. package/dist/lib/sync/file-filter.js +1 -0
  35. package/dist/lib/sync/file-filter.js.map +1 -1
  36. package/dist/lib/sync/integration.test.js +255 -69
  37. package/dist/lib/sync/integration.test.js.map +1 -1
  38. package/dist/lib/sync/json-processor.d.ts +4 -4
  39. package/dist/lib/sync/json-processor.d.ts.map +1 -1
  40. package/dist/lib/sync/json-processor.js +11 -11
  41. package/dist/lib/sync/json-processor.js.map +1 -1
  42. package/dist/lib/sync/marker-processor.d.ts +60 -8
  43. package/dist/lib/sync/marker-processor.d.ts.map +1 -1
  44. package/dist/lib/sync/marker-processor.js +117 -26
  45. package/dist/lib/sync/marker-processor.js.map +1 -1
  46. package/dist/lib/sync/marker-processor.test.js +261 -40
  47. package/dist/lib/sync/marker-processor.test.js.map +1 -1
  48. package/dist/lib/sync/metadata-manager.d.ts +4 -0
  49. package/dist/lib/sync/metadata-manager.d.ts.map +1 -1
  50. package/dist/lib/sync/metadata-manager.js +15 -0
  51. package/dist/lib/sync/metadata-manager.js.map +1 -1
  52. package/dist/lib/sync/metadata-manager.test.js +69 -0
  53. package/dist/lib/sync/metadata-manager.test.js.map +1 -1
  54. package/dist/lib/sync/orphan-cleaner.d.ts +29 -0
  55. package/dist/lib/sync/orphan-cleaner.d.ts.map +1 -0
  56. package/dist/lib/sync/orphan-cleaner.js +80 -0
  57. package/dist/lib/sync/orphan-cleaner.js.map +1 -0
  58. package/dist/lib/sync/orphan-cleaner.test.d.ts +2 -0
  59. package/dist/lib/sync/orphan-cleaner.test.d.ts.map +1 -0
  60. package/dist/lib/sync/orphan-cleaner.test.js +169 -0
  61. package/dist/lib/sync/orphan-cleaner.test.js.map +1 -0
  62. package/dist/lib/sync/project-private-synchronizer.d.ts +52 -0
  63. package/dist/lib/sync/project-private-synchronizer.d.ts.map +1 -0
  64. package/dist/lib/sync/project-private-synchronizer.js +110 -0
  65. package/dist/lib/sync/project-private-synchronizer.js.map +1 -0
  66. package/dist/lib/sync/project-private-synchronizer.test.d.ts +2 -0
  67. package/dist/lib/sync/project-private-synchronizer.test.d.ts.map +1 -0
  68. package/dist/lib/sync/project-private-synchronizer.test.js +348 -0
  69. package/dist/lib/sync/project-private-synchronizer.test.js.map +1 -0
  70. package/dist/types/index.d.ts +1 -0
  71. package/dist/types/index.d.ts.map +1 -1
  72. package/dist/types/sync.d.ts +36 -6
  73. package/dist/types/sync.d.ts.map +1 -1
  74. package/dist/types/sync.js +2 -2
  75. package/dist/types/sync.js.map +1 -1
  76. package/package.json +5 -4
  77. package/presets/default/.claude/agents/einja/Explore.md +140 -0
  78. package/presets/default/.claude/agents/einja/backend-architect.md +21 -1
  79. package/presets/default/.claude/agents/einja/codex-agent.md +5 -1
  80. package/presets/default/.claude/agents/einja/design-engineer.md +5 -1
  81. package/presets/default/.claude/agents/einja/docs/docs-updater.md +7 -93
  82. package/presets/default/.claude/agents/einja/frontend-architect.md +21 -1
  83. package/presets/default/.claude/agents/einja/frontend-coder.md +5 -1
  84. package/presets/default/.claude/agents/einja/{specs/spec-design-generator.md → issue-specs/design-generator.md} +16 -8
  85. package/presets/default/.claude/agents/einja/{specs/spec-qa-generator.md → issue-specs/qa-generator.md} +10 -4
  86. package/presets/default/.claude/agents/einja/{specs/spec-requirements-generator.md → issue-specs/requirements-generator.md} +9 -6
  87. package/presets/default/.claude/agents/einja/{specs/spec-tasks-generator.md → issue-specs/tasks-generator.md} +19 -16
  88. package/presets/default/.claude/agents/einja/{specs/spec-tasks-validator.md → issue-specs/tasks-validator.md} +13 -9
  89. package/presets/default/.claude/agents/einja/issue-specs/ui-design-generator.md +114 -0
  90. package/presets/default/.claude/agents/einja/task/task-executer.md +64 -116
  91. package/presets/default/.claude/agents/einja/task/task-modification-analyzer.md +6 -2
  92. package/presets/default/.claude/agents/einja/task/task-qa.md +7 -3
  93. package/presets/default/.claude/agents/einja/task/task-reviewer.md +17 -1
  94. package/presets/default/.claude/commands/einja/einja-sync.md +124 -45
  95. package/presets/default/.claude/commands/einja/frontend-implement.md +3 -1
  96. package/presets/default/.claude/commands/einja/issue-exec.md +413 -0
  97. package/presets/default/.claude/commands/einja/start-dev.md +4 -0
  98. package/presets/default/.claude/commands/einja/sync-cursor-commands.md +10 -6
  99. package/presets/default/.claude/commands/einja/{update-docs-by-task-specs.md → update-docs-by-issue-specs.md} +61 -57
  100. package/presets/default/.claude/hooks/einja/plan-mode-skill-loader.sh +27 -0
  101. package/presets/default/.claude/settings.json +29 -5
  102. package/presets/default/.claude/skills/{einja-general-context-loader → _einja-general-context-loader}/SKILL.md +6 -2
  103. package/presets/default/.claude/skills/{einja-output-format → _einja-output-format}/SKILL.md +5 -1
  104. package/presets/default/.claude/skills/_einja-project-overview/SKILL.md +29 -0
  105. package/presets/default/.claude/skills/{einja-spec-context-loader → _einja-spec-context-loader}/SKILL.md +9 -5
  106. package/presets/default/.claude/skills/einja-coding-standards/references/testing-strategy.md +899 -0
  107. package/presets/default/.claude/skills/einja-conflict-resolver/SKILL.md +5 -1
  108. package/presets/default/.claude/skills/einja-create-pr/SKILL.md +138 -0
  109. package/presets/default/.claude/skills/einja-infra-maintenance/SKILL.md +779 -0
  110. package/presets/default/.claude/{commands/einja/spec-create.md → skills/einja-issue-spec-create/SKILL.md} +60 -23
  111. package/presets/default/.claude/skills/einja-issue-spec-generator/SKILL.md +105 -0
  112. package/presets/default/.claude/skills/einja-issue-spec-generator/references/format-rules.md +35 -0
  113. package/presets/default/.claude/skills/einja-issue-spec-validator/SKILL.md +130 -0
  114. package/presets/default/.claude/skills/einja-issue-spec-validator/references/validation-rules.md +52 -0
  115. package/presets/default/.claude/skills/einja-npm-release/SKILL.md +242 -0
  116. package/presets/default/.claude/skills/einja-skill-creator/SKILL.md +311 -263
  117. package/presets/default/.claude/skills/einja-skill-creator/agents/analyzer.md +274 -0
  118. package/presets/default/.claude/skills/einja-skill-creator/agents/comparator.md +202 -0
  119. package/presets/default/.claude/skills/einja-skill-creator/agents/grader.md +195 -0
  120. package/presets/default/.claude/skills/einja-skill-creator/assets/eval_review.html +146 -0
  121. package/presets/default/.claude/skills/einja-skill-creator/eval-viewer/generate_review.py +471 -0
  122. package/presets/default/.claude/skills/einja-skill-creator/eval-viewer/viewer.html +1325 -0
  123. package/presets/default/.claude/skills/einja-skill-creator/references/schemas.md +430 -0
  124. package/presets/default/.claude/skills/einja-skill-creator/scripts/aggregate_benchmark.py +401 -0
  125. package/presets/default/.claude/skills/einja-skill-creator/scripts/compare_runs.py +154 -0
  126. package/presets/default/.claude/skills/einja-skill-creator/scripts/generate_report.py +272 -0
  127. package/presets/default/.claude/skills/einja-skill-creator/scripts/improve_description.py +247 -0
  128. package/presets/default/.claude/skills/einja-skill-creator/scripts/init_skill.py +13 -19
  129. package/presets/default/.claude/skills/einja-skill-creator/scripts/package_skill.py +36 -7
  130. package/presets/default/.claude/skills/einja-skill-creator/scripts/run_eval.py +310 -0
  131. package/presets/default/.claude/skills/einja-skill-creator/scripts/run_loop.py +375 -0
  132. package/presets/default/.claude/skills/einja-skill-creator/scripts/utils.py +48 -0
  133. package/presets/default/.claude/skills/einja-skill-first/SKILL.md +265 -0
  134. package/presets/default/.claude/skills/einja-subagent-question-protocol/SKILL.md +98 -0
  135. package/presets/default/.claude/skills/einja-task-commit/SKILL.md +11 -7
  136. package/presets/default/.claude/{commands/einja/task-exec.md → skills/einja-task-exec/SKILL.md} +106 -89
  137. package/presets/default/.claude/skills/einja-task-qa/SKILL.md +8 -4
  138. package/presets/default/.claude/skills/einja-task-qa/references/troubleshooting.md +1 -1
  139. package/presets/default/.claude/skills/einja-task-qa/references/usage-patterns.md +2 -2
  140. package/presets/default/.claude/skills/einja-team-exec/SKILL.md +165 -0
  141. package/presets/default/.envrc +5 -0
  142. package/presets/default/.mcp.json +2 -12
  143. package/presets/default/CLAUDE.md.template +45 -8
  144. package/presets/default/docs/einja/example/specs/issues/issue999-example-task/tasks.md +1 -1
  145. package/presets/default/docs/einja/instructions/deployment-setup.md +4 -9
  146. package/presets/default/docs/einja/instructions/environment-setup.md +3 -8
  147. package/presets/default/docs/einja/instructions/issue-exec-workflow.md +276 -0
  148. package/presets/default/docs/einja/instructions/local-server-environment-and-worktree.md +71 -9
  149. package/presets/default/docs/einja/instructions/neon-cli-reference.md +3 -8
  150. package/presets/default/docs/einja/instructions/setup-flow.md +279 -0
  151. package/presets/default/docs/einja/instructions/task-execute.md +63 -68
  152. package/presets/default/docs/einja/instructions/vercel-cli-reference.md +17 -10
  153. package/presets/default/docs/einja/steering/README.md +11 -11
  154. package/presets/default/docs/einja/steering/acceptance-criteria-and-qa-guide.md +4 -9
  155. package/presets/default/docs/einja/steering/architecture.md +3 -8
  156. package/presets/default/docs/einja/steering/branch-strategy.md +63 -70
  157. package/presets/default/docs/einja/steering/commit-rules.md +3 -8
  158. package/presets/default/docs/einja/steering/db-schema-design.md +3 -8
  159. package/presets/default/docs/einja/steering/development/api-development.md +3 -8
  160. package/presets/default/docs/einja/steering/development/backend-architecture.md +3 -8
  161. package/presets/default/docs/einja/steering/development/coding-standards.md +723 -0
  162. package/presets/default/docs/einja/steering/development/component-design.md +502 -0
  163. package/presets/default/docs/einja/steering/development/database-guidelines.md +2 -2
  164. package/presets/default/docs/einja/steering/development/frontend-development.md +3 -8
  165. package/presets/default/docs/einja/steering/development/playwright-guidelines.md +59 -0
  166. package/presets/default/docs/einja/steering/development/review-guidelines.md +3 -8
  167. package/presets/default/docs/einja/steering/development/testing-strategy.md +3 -8
  168. package/presets/default/docs/einja/steering/development-workflow.md +155 -140
  169. package/presets/default/docs/einja/steering/infrastructure/deployment.md +156 -55
  170. package/presets/default/docs/einja/steering/infrastructure/environment-variables.md +4 -8
  171. package/presets/default/docs/einja/steering/product.md +3 -8
  172. package/presets/default/docs/einja/steering/task-management.md +22 -110
  173. package/presets/default/scripts/ensure-serena.sh +75 -0
  174. package/presets/default/scripts/env-rotate-secrets.ts +396 -0
  175. package/presets/default/scripts/env-show.ts +130 -0
  176. package/presets/default/scripts/env.ts +479 -0
  177. package/presets/default/scripts/init-github.ts +363 -0
  178. package/presets/default/scripts/init.sh +98 -0
  179. package/presets/default/scripts/lib/env-common.ts +108 -0
  180. package/presets/default/scripts/lib/worktree-config.ts +64 -0
  181. package/presets/default/scripts/setup-dev.ts +655 -0
  182. package/presets/default/scripts/stop-serena.sh +25 -0
  183. package/presets/default/scripts/worktree/dev.ts +872 -0
  184. package/dist/lib/sync/seed-synchronizer.d.ts +0 -27
  185. package/dist/lib/sync/seed-synchronizer.d.ts.map +0 -1
  186. package/dist/lib/sync/seed-synchronizer.js +0 -72
  187. package/dist/lib/sync/seed-synchronizer.js.map +0 -1
  188. package/dist/lib/sync/seed-synchronizer.test.d.ts +0 -2
  189. package/dist/lib/sync/seed-synchronizer.test.d.ts.map +0 -1
  190. package/dist/lib/sync/seed-synchronizer.test.js +0 -147
  191. package/dist/lib/sync/seed-synchronizer.test.js.map +0 -1
  192. package/presets/default/.claude/agents/einja/git/conflict-resolver.md +0 -148
  193. package/presets/default/.claude/hooks/einja/validate-git-commit.sh +0 -239
  194. package/presets/default/.claude/skills/einja-api-development/SKILL.md +0 -14
  195. package/presets/default/.claude/skills/einja-backend-architecture/SKILL.md +0 -18
  196. package/presets/default/.claude/skills/einja-coding-standards/SKILL.md +0 -132
  197. package/presets/default/.claude/skills/einja-coding-standards/references/import-conventions.md +0 -69
  198. package/presets/default/.claude/skills/einja-coding-standards/references/naming-conventions.md +0 -107
  199. package/presets/default/.claude/skills/einja-coding-standards/references/prohibited-patterns.md +0 -169
  200. package/presets/default/.claude/skills/einja-coding-standards/references/typescript-rules.md +0 -247
  201. package/presets/default/.claude/skills/einja-component-design/SKILL.md +0 -109
  202. package/presets/default/.claude/skills/einja-component-design/references/directory-structure.md +0 -117
  203. package/presets/default/.claude/skills/einja-component-design/references/props-patterns.md +0 -159
  204. package/presets/default/.claude/skills/einja-component-design/references/styling-guide.md +0 -122
  205. package/presets/default/.claude/skills/einja-frontend-development/SKILL.md +0 -14
  206. package/presets/default/.claude/skills/einja-project-overview/SKILL.md +0 -35
  207. package/presets/default/docs/einja/instructions/task-vibe-kanban-loop.md +0 -565
@@ -0,0 +1,310 @@
1
+ #!/usr/bin/env python3
2
+ """スキルdescriptionのトリガー評価を実行。
3
+
4
+ スキルのdescriptionが一連のクエリに対してClaudeのスキル使用を
5
+ トリガーするかどうかをテストする。結果をJSONで出力。
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import os
11
+ import select
12
+ import subprocess
13
+ import sys
14
+ import time
15
+ import uuid
16
+ from concurrent.futures import ProcessPoolExecutor, as_completed
17
+ from pathlib import Path
18
+
19
+ try:
20
+ from scripts.utils import parse_skill_md
21
+ except ImportError:
22
+ from utils import parse_skill_md
23
+
24
+
25
def find_project_root() -> Path:
    """Locate the project root by searching upward from the working directory.

    The current working directory and then each of its ancestors are checked
    in order; the first one that contains a ``.claude`` directory is returned.
    This is intended to mirror how Claude Code discovers the project root, so
    that command files created there are picked up by ``claude -p``.

    Returns:
        The nearest directory (cwd first) containing ``.claude/``, or the
        current working directory itself when no such directory exists.
    """
    start = Path.cwd()
    candidates = (start, *start.parents)
    # next() with a default gives "first match, else cwd" without an explicit loop.
    return next(
        (directory for directory in candidates if (directory / ".claude").is_dir()),
        start,
    )
36
+
37
+
38
+ def run_single_query(
39
+ query: str,
40
+ skill_name: str,
41
+ skill_description: str,
42
+ timeout: int,
43
+ project_root: str,
44
+ model: str | None = None,
45
+ ) -> bool:
46
+ """単一のクエリを実行し、スキルがトリガーされたかどうかを返す。
47
+
48
+ .claude/commands/にコマンドファイルを作成してClaudeのavailable_skillsリストに
49
+ 表示させ、`claude -p`で生のクエリを実行する。
50
+ --include-partial-messagesを使用してストリームイベント(content_block_start)から
51
+ 早期にトリガーを検出する。
52
+ """
53
+ unique_id = uuid.uuid4().hex[:8]
54
+ clean_name = f"{skill_name}-skill-{unique_id}"
55
+ project_commands_dir = Path(project_root) / ".claude" / "commands"
56
+ command_file = project_commands_dir / f"{clean_name}.md"
57
+
58
+ try:
59
+ project_commands_dir.mkdir(parents=True, exist_ok=True)
60
+ # description内のクォートでの破損を避けるためYAMLブロックスカラーを使用
61
+ indented_desc = "\n ".join(skill_description.split("\n"))
62
+ command_content = (
63
+ f"---\n"
64
+ f"description: |\n"
65
+ f" {indented_desc}\n"
66
+ f"---\n\n"
67
+ f"# {skill_name}\n\n"
68
+ f"This skill handles: {skill_description}\n"
69
+ )
70
+ command_file.write_text(command_content)
71
+
72
+ cmd = [
73
+ "claude",
74
+ "-p", query,
75
+ "--output-format", "stream-json",
76
+ "--verbose",
77
+ "--include-partial-messages",
78
+ ]
79
+ if model:
80
+ cmd.extend(["--model", model])
81
+
82
+ # Claude Codeセッション内でclaude -pのネストを許可するためCLAUDECODE環境変数を除去
83
+ env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
84
+
85
+ process = subprocess.Popen(
86
+ cmd,
87
+ stdout=subprocess.PIPE,
88
+ stderr=subprocess.DEVNULL,
89
+ cwd=project_root,
90
+ env=env,
91
+ )
92
+
93
+ triggered = False
94
+ start_time = time.time()
95
+ buffer = ""
96
+ # ストリームイベント検出用の状態追跡
97
+ pending_tool_name = None
98
+ accumulated_json = ""
99
+
100
+ try:
101
+ while time.time() - start_time < timeout:
102
+ if process.poll() is not None:
103
+ remaining = process.stdout.read()
104
+ if remaining:
105
+ buffer += remaining.decode("utf-8", errors="replace")
106
+ break
107
+
108
+ ready, _, _ = select.select([process.stdout], [], [], 1.0)
109
+ if not ready:
110
+ continue
111
+
112
+ chunk = os.read(process.stdout.fileno(), 8192)
113
+ if not chunk:
114
+ break
115
+ buffer += chunk.decode("utf-8", errors="replace")
116
+
117
+ while "\n" in buffer:
118
+ line, buffer = buffer.split("\n", 1)
119
+ line = line.strip()
120
+ if not line:
121
+ continue
122
+
123
+ try:
124
+ event = json.loads(line)
125
+ except json.JSONDecodeError:
126
+ continue
127
+
128
+ # ストリームイベントによる早期検出
129
+ if event.get("type") == "stream_event":
130
+ se = event.get("event", {})
131
+ se_type = se.get("type", "")
132
+
133
+ if se_type == "content_block_start":
134
+ cb = se.get("content_block", {})
135
+ if cb.get("type") == "tool_use":
136
+ tool_name = cb.get("name", "")
137
+ if tool_name in ("Skill", "Read"):
138
+ pending_tool_name = tool_name
139
+ accumulated_json = ""
140
+ else:
141
+ return False
142
+
143
+ elif se_type == "content_block_delta" and pending_tool_name:
144
+ delta = se.get("delta", {})
145
+ if delta.get("type") == "input_json_delta":
146
+ accumulated_json += delta.get("partial_json", "")
147
+ if clean_name in accumulated_json:
148
+ return True
149
+
150
+ elif se_type in ("content_block_stop", "message_stop"):
151
+ if pending_tool_name:
152
+ return clean_name in accumulated_json
153
+ if se_type == "message_stop":
154
+ return False
155
+
156
+ # フォールバック: 完全なassistantメッセージ
157
+ elif event.get("type") == "assistant":
158
+ message = event.get("message", {})
159
+ for content_item in message.get("content", []):
160
+ if content_item.get("type") != "tool_use":
161
+ continue
162
+ tool_name = content_item.get("name", "")
163
+ tool_input = content_item.get("input", {})
164
+ if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
165
+ triggered = True
166
+ elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
167
+ triggered = True
168
+ return triggered
169
+
170
+ elif event.get("type") == "result":
171
+ return triggered
172
+ finally:
173
+ # 任意の終了パス(return、例外、タイムアウト)でプロセスをクリーンアップ
174
+ if process.poll() is None:
175
+ process.kill()
176
+ process.wait()
177
+
178
+ return triggered
179
+ finally:
180
+ if command_file.exists():
181
+ command_file.unlink()
182
+
183
+
184
def run_eval(
    eval_set: list[dict],
    skill_name: str,
    description: str,
    num_workers: int,
    timeout: int,
    project_root: Path,
    runs_per_query: int = 1,
    trigger_threshold: float = 0.5,
    model: str | None = None,
) -> dict:
    """Run the whole evaluation set and return per-query results plus a summary.

    Each item of ``eval_set`` must provide ``"query"`` and ``"should_trigger"``.
    Every query is executed ``runs_per_query`` times in a process pool via
    ``run_single_query``; a run that raises is logged to stderr and counted as
    "not triggered".

    Args:
        eval_set: Evaluation items (dicts with ``query`` and ``should_trigger``).
        skill_name: Name of the skill under test.
        description: Skill description to evaluate.
        num_workers: Maximum number of worker processes.
        timeout: Per-query timeout in seconds, forwarded to ``run_single_query``.
        project_root: Project root, forwarded (as a string) to ``run_single_query``.
        runs_per_query: How many times each query is executed.
        trigger_threshold: A should-trigger query passes when its trigger rate is
            at or above this value; a should-not-trigger query passes when its
            rate is below it.
        model: Optional model name forwarded to ``run_single_query``.

    Returns:
        A dict with ``skill_name``, ``description``, ``results`` (one entry per
        distinct query, in ``eval_set`` order) and ``summary`` (``total``,
        ``passed``, ``failed``).
    """
    # Register queries up front, in eval_set order, so the output order does not
    # depend on which worker happens to finish first. For duplicate query
    # strings the first occurrence supplies the metadata.
    query_items: dict[str, dict] = {}
    query_triggers: dict[str, list[bool]] = {}
    for item in eval_set:
        query_items.setdefault(item["query"], item)
        query_triggers.setdefault(item["query"], [])

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        future_to_query = {
            executor.submit(
                run_single_query,
                item["query"],
                skill_name,
                description,
                timeout,
                str(project_root),
                model,
            ): item["query"]
            for item in eval_set
            for _ in range(runs_per_query)
        }

        for future in as_completed(future_to_query):
            query = future_to_query[future]
            try:
                outcome = future.result()
            except Exception as e:
                # Best effort: one failed run must not abort the whole evaluation.
                print(f"警告: クエリが失敗しました: {e}", file=sys.stderr)
                outcome = False
            query_triggers[query].append(outcome)

    results = []
    for query, triggers in query_triggers.items():
        if not triggers:
            # No runs were scheduled for this query (e.g. runs_per_query < 1).
            continue
        should_trigger = query_items[query]["should_trigger"]
        trigger_rate = sum(triggers) / len(triggers)
        if should_trigger:
            did_pass = trigger_rate >= trigger_threshold
        else:
            did_pass = trigger_rate < trigger_threshold
        results.append({
            "query": query,
            "should_trigger": should_trigger,
            "trigger_rate": trigger_rate,
            "triggers": sum(triggers),
            "runs": len(triggers),
            "pass": did_pass,
        })

    passed = sum(1 for r in results if r["pass"])
    total = len(results)

    return {
        "skill_name": skill_name,
        "description": description,
        "results": results,
        "summary": {
            "total": total,
            "passed": passed,
            "failed": total - passed,
        },
    }
257
+
258
+
259
def main():
    """Command-line entry point for the trigger evaluation.

    Parses the arguments, loads the evaluation set JSON and the skill's
    ``SKILL.md``, runs ``run_eval`` and prints the resulting dict as indented
    JSON on stdout. With ``--verbose`` progress and a per-query pass/fail
    listing are written to stderr. Exits with status 1 when the skill directory
    has no ``SKILL.md``.
    """
    parser = argparse.ArgumentParser(description="スキルdescriptionのトリガー評価を実行")
    # Flags are declared as data and registered in this exact order.
    option_table = (
        ("--eval-set", {"required": True, "help": "評価セットJSONファイルへのパス"}),
        ("--skill-path", {"required": True, "help": "スキルディレクトリへのパス"}),
        ("--description", {"default": None, "help": "テスト用descriptionの上書き"}),
        ("--num-workers", {"type": int, "default": 10, "help": "並行ワーカー数"}),
        ("--timeout", {"type": int, "default": 30, "help": "クエリごとのタイムアウト(秒)"}),
        ("--runs-per-query", {"type": int, "default": 3, "help": "クエリごとの実行回数"}),
        ("--trigger-threshold", {"type": float, "default": 0.5, "help": "トリガー率の閾値"}),
        ("--model", {"default": None, "help": "claude -pに使用するモデル(デフォルト: ユーザー設定のモデル)"}),
        ("--verbose", {"action": "store_true", "help": "進捗をstderrに出力"}),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    eval_set = json.loads(Path(args.eval_set).read_text())
    skill_path = Path(args.skill_path)

    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        print(f"エラー: {skill_path} にSKILL.mdが見つかりません", file=sys.stderr)
        sys.exit(1)

    # The third element (the skill body) is not needed here.
    name, original_description, _content = parse_skill_md(skill_path)
    description = args.description if args.description else original_description
    project_root = find_project_root()

    if args.verbose:
        print(f"評価中: {description}", file=sys.stderr)

    output = run_eval(
        eval_set=eval_set,
        skill_name=name,
        description=description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        project_root=project_root,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        model=args.model,
    )

    if args.verbose:
        summary = output["summary"]
        print(f"結果: {summary['passed']}/{summary['total']} パス", file=sys.stderr)
        for entry in output["results"]:
            status = "PASS" if entry["pass"] else "FAIL"
            rate_str = f"{entry['triggers']}/{entry['runs']}"
            print(f" [{status}] rate={rate_str} expected={entry['should_trigger']}: {entry['query'][:70]}", file=sys.stderr)

    print(json.dumps(output, indent=2))
307
+
308
+
309
+ if __name__ == "__main__":
310
+ main()
@@ -0,0 +1,375 @@
1
+ #!/usr/bin/env python3
2
+ """評価+改善ループを実行。全パスまたは最大イテレーション到達まで繰り返す。
3
+
4
+ run_eval.pyとimprove_description.pyをループで組み合わせ、
5
+ 履歴を追跡し最良のdescriptionを返す。
6
+ 過学習防止のためtrain/test分割(fraction指定)に対応。
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import random
12
+ import sys
13
+ import tempfile
14
+ import time
15
+ import webbrowser
16
+ from pathlib import Path
17
+
18
+ try:
19
+ from scripts.generate_report import generate_html
20
+ from scripts.improve_description import improve_description
21
+ from scripts.run_eval import find_project_root, run_eval
22
+ from scripts.utils import parse_skill_md
23
+ except ImportError:
24
+ from generate_report import generate_html
25
+ from improve_description import improve_description
26
+ from run_eval import find_project_root, run_eval
27
+ from utils import parse_skill_md
28
+
29
+ import anthropic
30
+
31
+
32
def split_eval_set(
    eval_set: list[dict],
    holdout: float,
    seed: int = 42,
) -> tuple[list[dict], list[dict]]:
    """Split the evaluation set into train and test portions (stratified).

    ``holdout`` is the fraction of the data to hold out (e.g. 0.4 = 40%). Items
    are grouped by ``should_trigger`` (treated as ``True`` when the key is
    missing); each group is shuffled with a ``random.Random(seed)`` generator
    and the first ``max(1, int(len(group) * holdout))`` items of each group go
    to the test set.

    Args:
        eval_set: Evaluation items; not modified.
        holdout: Fraction to hold out. When ``<= 0`` nothing is held out.
        seed: Seed for the shuffle, making the split reproducible.

    Returns:
        ``(train_set, test_set)``. For ``holdout <= 0`` this is the original
        ``eval_set`` object and an empty list.
    """
    if holdout <= 0:
        return eval_set, []

    rng = random.Random(seed)

    # Separate into should-trigger and should-not-trigger groups in one pass.
    positives: list[dict] = []
    negatives: list[dict] = []
    for entry in eval_set:
        bucket = positives if entry.get("should_trigger", True) else negatives
        bucket.append(entry)

    train_part: list[dict] = []
    test_part: list[dict] = []
    # Positives are handled first so the RNG is consumed in a fixed order.
    for group in (positives, negatives):
        rng.shuffle(group)
        cut = max(1, int(len(group) * holdout))
        test_part.extend(group[:cut])
        train_part.extend(group[cut:])

    return train_part, test_part
66
+
67
+
68
+ def run_loop(
69
+ eval_set: list[dict],
70
+ skill_path: Path,
71
+ description_override: str | None,
72
+ num_workers: int,
73
+ timeout: int,
74
+ max_iterations: int,
75
+ runs_per_query: int,
76
+ trigger_threshold: float,
77
+ holdout: float,
78
+ seed: int | None,
79
+ model: str | None,
80
+ improve_model: str,
81
+ verbose: bool,
82
+ live_report_path: Path | None = None,
83
+ log_dir: Path | None = None,
84
+ ) -> dict:
85
+ """評価+改善ループのメイン関数。"""
86
+ project_root = find_project_root()
87
+ name, original_description, content = parse_skill_md(skill_path)
88
+ current_description = description_override or original_description
89
+
90
+ # train/test分割(holdoutが0より大きい場合のみ)
91
+ if holdout > 0:
92
+ train_set, test_set = split_eval_set(eval_set, holdout, seed if seed is not None else 42)
93
+ if verbose:
94
+ print(f"分割: トレーニング {len(train_set)} 件, テスト {len(test_set)} 件 (holdout={holdout})", file=sys.stderr)
95
+ else:
96
+ train_set = eval_set
97
+ test_set = []
98
+
99
+ client = anthropic.Anthropic()
100
+ history: list[dict] = []
101
+ exit_reason = "unknown"
102
+
103
+ for iteration in range(1, max_iterations + 1):
104
+ if verbose:
105
+ print(f"\n{'='*60}", file=sys.stderr)
106
+ print(f"イテレーション {iteration}/{max_iterations}", file=sys.stderr)
107
+ print(f"description: {current_description}", file=sys.stderr)
108
+ print(f"{'='*60}", file=sys.stderr)
109
+
110
+ # train + test を一括で並行評価(効率化)
111
+ all_queries = train_set + test_set
112
+ t0 = time.time()
113
+ all_results = run_eval(
114
+ eval_set=all_queries,
115
+ skill_name=name,
116
+ description=current_description,
117
+ num_workers=num_workers,
118
+ timeout=timeout,
119
+ project_root=project_root,
120
+ runs_per_query=runs_per_query,
121
+ trigger_threshold=trigger_threshold,
122
+ model=model,
123
+ )
124
+ eval_elapsed = time.time() - t0
125
+
126
+ # クエリの一致でtrain/testに結果を振り分け
127
+ train_queries_set = {q["query"] for q in train_set}
128
+ train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set]
129
+ test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set]
130
+
131
+ train_passed = sum(1 for r in train_result_list if r["pass"])
132
+ train_total = len(train_result_list)
133
+ train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total}
134
+ train_results = {"results": train_result_list, "summary": train_summary}
135
+
136
+ if test_set:
137
+ test_passed = sum(1 for r in test_result_list if r["pass"])
138
+ test_total = len(test_result_list)
139
+ test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total}
140
+ test_results = {"results": test_result_list, "summary": test_summary}
141
+ else:
142
+ test_results = None
143
+ test_summary = None
144
+
145
+ # 履歴エントリの構築(レポートジェネレーターとの後方互換性を保持)
146
+ history.append({
147
+ "iteration": iteration,
148
+ "description": current_description,
149
+ "train_passed": train_summary["passed"],
150
+ "train_failed": train_summary["failed"],
151
+ "train_total": train_summary["total"],
152
+ "train_results": train_results["results"],
153
+ "test_passed": test_summary["passed"] if test_summary else None,
154
+ "test_failed": test_summary["failed"] if test_summary else None,
155
+ "test_total": test_summary["total"] if test_summary else None,
156
+ "test_results": test_results["results"] if test_results else None,
157
+ # レポートジェネレーター後方互換
158
+ "passed": train_summary["passed"],
159
+ "failed": train_summary["failed"],
160
+ "total": train_summary["total"],
161
+ "results": train_results["results"],
162
+ })
163
+
164
+ # ライブレポートを更新(指定されている場合)
165
+ if live_report_path:
166
+ partial_output = {
167
+ "original_description": original_description,
168
+ "best_description": current_description,
169
+ "best_score": "in progress",
170
+ "iterations_run": len(history),
171
+ "holdout": holdout,
172
+ "train_size": len(train_set),
173
+ "test_size": len(test_set),
174
+ "history": history,
175
+ }
176
+ live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
177
+ if verbose:
178
+ print(f"レポートを更新しました: {live_report_path}", file=sys.stderr)
179
+
180
+ if verbose:
181
def print_eval_stats(label: str, results: list[dict], elapsed: float) -> None:
    """Print aggregate and per-query trigger statistics to stderr.

    Args:
        label: Prefix for the summary line (e.g. the name of the data split).
        results: Per-query result dicts. Each entry is read for the keys
            ``"triggers"``, ``"runs"``, ``"pass"`` and ``"query"``; the
            optional ``"should_trigger"`` key defaults to ``True``.
        elapsed: Elapsed seconds shown at the end of the summary line.
    """

    def expects_trigger(result: dict) -> bool:
        # Single source of truth for the default, so the confusion-matrix
        # split and the per-query line can never disagree (the per-query line
        # used to index the key directly and raised KeyError when it was
        # absent).
        return result.get("should_trigger", True)

    pos = [r for r in results if expects_trigger(r)]
    neg = [r for r in results if not expects_trigger(r)]

    # Confusion matrix counted over individual runs, not over queries.
    tp = sum(r["triggers"] for r in pos)
    fn = sum(r["runs"] for r in pos) - tp
    fp = sum(r["triggers"] for r in neg)
    tn = sum(r["runs"] for r in neg) - fp
    total = tp + tn + fp + fn
    correct = tp + tn

    # With an empty denominator, precision/recall are reported as 100% and
    # accuracy as 0%.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
    accuracy = correct / total if total > 0 else 0.0

    print(
        f"{label}: {correct}/{total} 正解, "
        f"precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)",
        file=sys.stderr,
    )
    for r in results:
        status = "PASS" if r["pass"] else "FAIL"
        rate_str = f"{r['triggers']}/{r['runs']}"
        print(
            f" [{status}] rate={rate_str} expected={expects_trigger(r)}: {r['query'][:60]}",
            file=sys.stderr,
        )
206
+
207
+ print_eval_stats("Train", train_results["results"], eval_elapsed)
208
+ if test_summary:
209
+ print_eval_stats("Test ", test_results["results"], 0) # type: ignore[index]
210
+
211
+ # train全パスなら終了(testは過学習モニタリング用のみ)
212
+ if train_summary["failed"] == 0:
213
+ exit_reason = f"all_passed (iteration {iteration})"
214
+ if verbose:
215
+ print(f"\nイテレーション {iteration} でtrain全クエリパス!ループを終了します。", file=sys.stderr)
216
+ break
217
+
218
+ if iteration == max_iterations:
219
+ exit_reason = f"max_iterations ({max_iterations})"
220
+ if verbose:
221
+ print(f"\n最大イテレーション数到達 ({max_iterations})。", file=sys.stderr)
222
+ break
223
+
224
+ # descriptionを改善(train結果のみ使用)
225
+ if verbose:
226
+ print(f"\ndescriptionを改善中...", file=sys.stderr)
227
+
228
+ t0 = time.time()
229
+ # 過学習防止のため、改善モデルにtest_スコアを見せないようにブラインド処理
230
+ blinded_history = [
231
+ {k: v for k, v in h.items() if not k.startswith("test_")}
232
+ for h in history
233
+ ]
234
+ new_description = improve_description(
235
+ client=client,
236
+ skill_name=name,
237
+ skill_content=content,
238
+ current_description=current_description,
239
+ eval_results=train_results,
240
+ history=blinded_history,
241
+ model=improve_model,
242
+ log_dir=log_dir,
243
+ iteration=iteration,
244
+ )
245
+ improve_elapsed = time.time() - t0
246
+
247
+ if verbose:
248
+ print(f"新しいdescription ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)
249
+
250
+ current_description = new_description
251
+
252
+ # 最良のdescriptionを選択(testセットあり→test優先、なし→train)
253
+ if test_set:
254
+ best = max(history, key=lambda h: h["test_passed"] or 0)
255
+ best_score = f"{best['test_passed']}/{best['test_total']}"
256
+ else:
257
+ best = max(history, key=lambda h: h["train_passed"])
258
+ best_score = f"{best['train_passed']}/{best['train_total']}"
259
+
260
+ if verbose:
261
+ print(f"\n終了理由: {exit_reason}", file=sys.stderr)
262
+ print(f"最良スコア: {best_score} (イテレーション {best['iteration']})", file=sys.stderr)
263
+
264
+ return {
265
+ "exit_reason": exit_reason,
266
+ "original_description": original_description,
267
+ "best_description": best["description"],
268
+ "best_score": best_score,
269
+ "best_train_score": f"{best['train_passed']}/{best['train_total']}",
270
+ "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
271
+ "final_description": current_description,
272
+ "iterations_run": len(history),
273
+ "holdout": holdout,
274
+ "train_size": len(train_set),
275
+ "test_size": len(test_set),
276
+ "history": history,
277
+ }
278
+
279
+
280
def main() -> None:
    """Command-line entry point for the eval + improve loop.

    Parses the CLI arguments, loads the eval set, checks that the skill
    directory contains ``SKILL.md`` (exits with status 1 otherwise), prepares
    the optional live HTML report and results/log directories, runs
    ``run_loop`` and finally emits the results:

    * the JSON result is always printed to stdout;
    * with ``--results-dir`` it is also saved as ``results.json`` in a
      timestamped subdirectory, together with ``report.html`` when a report
      is enabled;
    * unless ``--report none`` is given, the live report file is overwritten
      with the final (non-refreshing) HTML report.
    """
    parser = argparse.ArgumentParser(description="評価+改善ループを実行")
    parser.add_argument("--eval-set", required=True, help="評価セットJSONファイルへのパス")
    parser.add_argument("--skill-path", required=True, help="スキルディレクトリへのパス")
    parser.add_argument("--description", default=None, help="開始descriptionを上書き")
    parser.add_argument("--num-workers", type=int, default=10, help="並行ワーカー数(デフォルト: 10)")
    parser.add_argument("--timeout", type=int, default=30, help="クエリごとのタイムアウト秒数(デフォルト: 30)")
    parser.add_argument("--max-iterations", type=int, default=5, help="最大イテレーション数(デフォルト: 5)")
    parser.add_argument("--runs-per-query", type=int, default=3, help="クエリごとの実行回数(デフォルト: 3)")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="トリガー率の閾値(デフォルト: 0.5)")
    parser.add_argument("--holdout", type=float, default=0.4, help="テスト用ホールドアウト割合(0で無効、デフォルト: 0.4)")
    parser.add_argument("--seed", type=int, default=None, help="train/test分割のランダムシード")
    parser.add_argument("--model", default=None, help="評価時にclaude -pに使用するモデル")
    parser.add_argument("--improve-model", default="claude-sonnet-4-20250514", help="description改善に使用するモデル(デフォルト: claude-sonnet-4-20250514)")
    parser.add_argument("--verbose", action="store_true", help="進捗をstderrに出力")
    parser.add_argument("--report", default="auto", help="HTMLレポートの出力先パス('auto'で一時ファイル自動起動、'none'で無効)")
    parser.add_argument("--results-dir", default=None, help="タイムスタンプ付きサブディレクトリに全出力(results.json, report.html, logs)を保存")
    parser.add_argument("--log-dir", default=None, help="改善トランスクリプトのログディレクトリ(--results-dirより優先)")
    args = parser.parse_args()

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    eval_set = json.loads(Path(args.eval_set).read_text(encoding="utf-8"))
    skill_path = Path(args.skill_path)

    if not (skill_path / "SKILL.md").exists():
        print(f"エラー: {skill_path} にSKILL.mdが見つかりません", file=sys.stderr)
        sys.exit(1)

    name, _, _ = parse_skill_md(skill_path)

    # Set up the live report path.
    live_report_path: Path | None
    if args.report == "none":
        live_report_path = None
    else:
        if args.report == "auto":
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
        else:
            live_report_path = Path(args.report)
            # A user-supplied path may point into a directory that does not
            # exist yet; create it instead of failing on the first write.
            live_report_path.parent.mkdir(parents=True, exist_ok=True)
        # Write a placeholder page so the browser has something to show right
        # away. The text is non-ASCII, so the file is written as UTF-8 and the
        # page declares that charset explicitly.
        live_report_path.write_text(
            "<html><head><meta charset='utf-8'><meta http-equiv='refresh' content='5'></head>"
            "<body><h1>最適化ループを開始しています...</h1></body></html>",
            encoding="utf-8",
        )
        # webbrowser.open() is only specified for URLs; passing a bare file
        # name is documented as unsupported and non-portable.
        webbrowser.open(live_report_path.resolve().as_uri())

    # Decide the output directory up front (created before run_loop so that
    # logs can be stored in it).
    results_dir: Path | None
    if args.results_dir:
        timestamp = time.strftime("%Y-%m-%d_%H%M%S")
        results_dir = Path(args.results_dir) / timestamp
        results_dir.mkdir(parents=True, exist_ok=True)
    else:
        results_dir = None

    # An explicit --log-dir wins; otherwise fall back to <results_dir>/logs.
    log_dir: Path | None
    if args.log_dir:
        log_dir = Path(args.log_dir)
    elif results_dir:
        log_dir = results_dir / "logs"
    else:
        log_dir = None

    output = run_loop(
        eval_set=eval_set,
        skill_path=skill_path,
        description_override=args.description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        max_iterations=args.max_iterations,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        holdout=args.holdout,
        seed=args.seed,
        model=args.model,
        improve_model=args.improve_model,
        verbose=args.verbose,
        live_report_path=live_report_path,
        log_dir=log_dir,
    )

    # JSON output.
    json_output = json.dumps(output, indent=2)
    print(json_output)
    if results_dir:
        (results_dir / "results.json").write_text(json_output, encoding="utf-8")

    # Final HTML report (auto refresh off). Rendered once and reused for both
    # destinations instead of generating the identical document twice.
    if live_report_path is not None:
        final_html = generate_html(output, auto_refresh=False, skill_name=name)
        live_report_path.write_text(final_html, encoding="utf-8")
        print(f"\nレポート: {live_report_path}", file=sys.stderr)
        if results_dir:
            (results_dir / "report.html").write_text(final_html, encoding="utf-8")

    if results_dir:
        print(f"結果を保存しました: {results_dir}", file=sys.stderr)
372
+
373
+
374
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()