@einja/dev-cli 0.1.39 → 0.1.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. package/README.md +89 -1
  2. package/dist/cli.js +1 -0
  3. package/dist/cli.js.map +1 -1
  4. package/dist/commands/init.d.ts.map +1 -1
  5. package/dist/commands/init.js +71 -1
  6. package/dist/commands/init.js.map +1 -1
  7. package/dist/commands/list.js.map +1 -1
  8. package/dist/commands/sync.d.ts.map +1 -1
  9. package/dist/commands/sync.js +187 -13
  10. package/dist/commands/sync.js.map +1 -1
  11. package/dist/lib/dependency-checker.d.ts.map +1 -1
  12. package/dist/lib/merger.d.ts +12 -0
  13. package/dist/lib/merger.d.ts.map +1 -1
  14. package/dist/lib/merger.js +28 -0
  15. package/dist/lib/merger.js.map +1 -1
  16. package/dist/lib/preset-update/cli-repo-detector.d.ts.map +1 -1
  17. package/dist/lib/preset-update/file-copier.d.ts.map +1 -1
  18. package/dist/lib/preset-update/preset-finder.d.ts.map +1 -1
  19. package/dist/lib/preset.d.ts.map +1 -1
  20. package/dist/lib/sync/category-validator.d.ts +1 -1
  21. package/dist/lib/sync/category-validator.d.ts.map +1 -1
  22. package/dist/lib/sync/category-validator.js +2 -1
  23. package/dist/lib/sync/category-validator.js.map +1 -1
  24. package/dist/lib/sync/category-validator.test.js +3 -1
  25. package/dist/lib/sync/category-validator.test.js.map +1 -1
  26. package/dist/lib/sync/conflict-reporter.d.ts.map +1 -1
  27. package/dist/lib/sync/diff-engine.d.ts.map +1 -1
  28. package/dist/lib/sync/file-filter.d.ts.map +1 -1
  29. package/dist/lib/sync/file-filter.js +1 -0
  30. package/dist/lib/sync/file-filter.js.map +1 -1
  31. package/dist/lib/sync/integration.test.js +255 -69
  32. package/dist/lib/sync/integration.test.js.map +1 -1
  33. package/dist/lib/sync/json-processor.d.ts +4 -4
  34. package/dist/lib/sync/json-processor.d.ts.map +1 -1
  35. package/dist/lib/sync/json-processor.js +11 -11
  36. package/dist/lib/sync/json-processor.js.map +1 -1
  37. package/dist/lib/sync/marker-processor.d.ts +60 -8
  38. package/dist/lib/sync/marker-processor.d.ts.map +1 -1
  39. package/dist/lib/sync/marker-processor.js +117 -26
  40. package/dist/lib/sync/marker-processor.js.map +1 -1
  41. package/dist/lib/sync/marker-processor.test.js +261 -40
  42. package/dist/lib/sync/marker-processor.test.js.map +1 -1
  43. package/dist/lib/sync/metadata-manager.d.ts +4 -0
  44. package/dist/lib/sync/metadata-manager.d.ts.map +1 -1
  45. package/dist/lib/sync/metadata-manager.js +15 -0
  46. package/dist/lib/sync/metadata-manager.js.map +1 -1
  47. package/dist/lib/sync/metadata-manager.test.js +68 -0
  48. package/dist/lib/sync/metadata-manager.test.js.map +1 -1
  49. package/dist/lib/sync/orphan-cleaner.d.ts +29 -0
  50. package/dist/lib/sync/orphan-cleaner.d.ts.map +1 -0
  51. package/dist/lib/sync/orphan-cleaner.js +80 -0
  52. package/dist/lib/sync/orphan-cleaner.js.map +1 -0
  53. package/dist/lib/sync/orphan-cleaner.test.d.ts +2 -0
  54. package/dist/lib/sync/orphan-cleaner.test.d.ts.map +1 -0
  55. package/dist/lib/sync/orphan-cleaner.test.js +169 -0
  56. package/dist/lib/sync/orphan-cleaner.test.js.map +1 -0
  57. package/dist/lib/sync/project-private-synchronizer.d.ts +52 -0
  58. package/dist/lib/sync/project-private-synchronizer.d.ts.map +1 -0
  59. package/dist/lib/sync/project-private-synchronizer.js +106 -0
  60. package/dist/lib/sync/project-private-synchronizer.js.map +1 -0
  61. package/dist/lib/sync/project-private-synchronizer.test.d.ts +2 -0
  62. package/dist/lib/sync/project-private-synchronizer.test.d.ts.map +1 -0
  63. package/dist/lib/sync/project-private-synchronizer.test.js +348 -0
  64. package/dist/lib/sync/project-private-synchronizer.test.js.map +1 -0
  65. package/dist/types/index.d.ts +1 -0
  66. package/dist/types/index.d.ts.map +1 -1
  67. package/dist/types/sync.d.ts +36 -6
  68. package/dist/types/sync.d.ts.map +1 -1
  69. package/dist/types/sync.js +2 -2
  70. package/dist/types/sync.js.map +1 -1
  71. package/package.json +5 -4
  72. package/presets/default/.claude/agents/einja/Explore.md +140 -0
  73. package/presets/default/.claude/agents/einja/backend-architect.md +4 -0
  74. package/presets/default/.claude/agents/einja/codex-agent.md +4 -0
  75. package/presets/default/.claude/agents/einja/design-engineer.md +4 -0
  76. package/presets/default/.claude/agents/einja/docs/docs-updater.md +4 -0
  77. package/presets/default/.claude/agents/einja/frontend-architect.md +4 -0
  78. package/presets/default/.claude/agents/einja/frontend-coder.md +4 -0
  79. package/presets/default/.claude/agents/einja/git/conflict-resolver.md +4 -0
  80. package/presets/default/.claude/agents/einja/specs/spec-design-generator.md +4 -1
  81. package/presets/default/.claude/agents/einja/specs/spec-qa-generator.md +4 -0
  82. package/presets/default/.claude/agents/einja/specs/spec-requirements-generator.md +4 -1
  83. package/presets/default/.claude/agents/einja/specs/spec-tasks-generator.md +6 -2
  84. package/presets/default/.claude/agents/einja/specs/spec-tasks-validator.md +4 -0
  85. package/presets/default/.claude/agents/einja/task/task-executer.md +57 -115
  86. package/presets/default/.claude/agents/einja/task/task-modification-analyzer.md +4 -0
  87. package/presets/default/.claude/agents/einja/task/task-qa.md +4 -0
  88. package/presets/default/.claude/agents/einja/task/task-reviewer.md +4 -0
  89. package/presets/default/.claude/commands/einja/einja-sync.md +5 -1
  90. package/presets/default/.claude/commands/einja/frontend-implement.md +3 -1
  91. package/presets/default/.claude/commands/einja/issue-exec.md +403 -0
  92. package/presets/default/.claude/commands/einja/spec-create.md +15 -1
  93. package/presets/default/.claude/commands/einja/start-dev.md +4 -0
  94. package/presets/default/.claude/commands/einja/sync-cursor-commands.md +4 -0
  95. package/presets/default/.claude/commands/einja/task-exec.md +106 -14
  96. package/presets/default/.claude/commands/einja/update-docs-by-task-specs.md +4 -0
  97. package/presets/default/.claude/hooks/einja/plan-mode-skill-loader.sh +23 -0
  98. package/presets/default/.claude/settings.json +15 -1
  99. package/presets/default/.claude/skills/einja-conflict-resolver/SKILL.md +4 -0
  100. package/presets/default/.claude/skills/einja-general-context-loader/SKILL.md +4 -0
  101. package/presets/default/.claude/skills/einja-output-format/SKILL.md +4 -0
  102. package/presets/default/.claude/skills/einja-project-overview/SKILL.md +7 -3
  103. package/presets/default/.claude/skills/einja-skill-creator/SKILL.md +266 -274
  104. package/presets/default/.claude/skills/einja-skill-creator/agents/analyzer.md +274 -0
  105. package/presets/default/.claude/skills/einja-skill-creator/agents/comparator.md +202 -0
  106. package/presets/default/.claude/skills/einja-skill-creator/agents/grader.md +195 -0
  107. package/presets/default/.claude/skills/einja-skill-creator/assets/eval_review.html +146 -0
  108. package/presets/default/.claude/skills/einja-skill-creator/eval-viewer/generate_review.py +471 -0
  109. package/presets/default/.claude/skills/einja-skill-creator/eval-viewer/viewer.html +1325 -0
  110. package/presets/default/.claude/skills/einja-skill-creator/references/schemas.md +430 -0
  111. package/presets/default/.claude/skills/einja-skill-creator/scripts/aggregate_benchmark.py +154 -0
  112. package/presets/default/.claude/skills/einja-skill-creator/scripts/generate_report.py +265 -0
  113. package/presets/default/.claude/skills/einja-skill-creator/scripts/improve_description.py +252 -0
  114. package/presets/default/.claude/skills/einja-skill-creator/scripts/init_skill.py +13 -19
  115. package/presets/default/.claude/skills/einja-skill-creator/scripts/package_skill.py +36 -7
  116. package/presets/default/.claude/skills/einja-skill-creator/scripts/run_eval.py +310 -0
  117. package/presets/default/.claude/skills/einja-skill-creator/scripts/run_loop.py +295 -0
  118. package/presets/default/.claude/skills/einja-skill-creator/scripts/utils.py +48 -0
  119. package/presets/default/.claude/skills/einja-spec-context-loader/SKILL.md +4 -0
  120. package/presets/default/.claude/skills/einja-task-commit/SKILL.md +4 -0
  121. package/presets/default/.claude/skills/einja-task-qa/SKILL.md +4 -0
  122. package/presets/default/.envrc +5 -0
  123. package/presets/default/.mcp.json +2 -12
  124. package/presets/default/CLAUDE.md.template +26 -4
  125. package/presets/default/docs/einja/example/specs/issues/issue999-example-task/tasks.md +1 -1
  126. package/presets/default/docs/einja/instructions/deployment-setup.md +3 -8
  127. package/presets/default/docs/einja/instructions/environment-setup.md +3 -8
  128. package/presets/default/docs/einja/instructions/issue-exec-workflow.md +276 -0
  129. package/presets/default/docs/einja/instructions/local-server-environment-and-worktree.md +70 -8
  130. package/presets/default/docs/einja/instructions/neon-cli-reference.md +3 -8
  131. package/presets/default/docs/einja/instructions/task-execute.md +23 -28
  132. package/presets/default/docs/einja/instructions/vercel-cli-reference.md +17 -10
  133. package/presets/default/docs/einja/steering/README.md +11 -11
  134. package/presets/default/docs/einja/steering/acceptance-criteria-and-qa-guide.md +3 -8
  135. package/presets/default/docs/einja/steering/architecture.md +3 -8
  136. package/presets/default/docs/einja/steering/branch-strategy.md +63 -70
  137. package/presets/default/docs/einja/steering/commit-rules.md +3 -8
  138. package/presets/default/docs/einja/steering/db-schema-design.md +3 -8
  139. package/presets/default/docs/einja/steering/development/api-development.md +3 -8
  140. package/presets/default/docs/einja/steering/development/backend-architecture.md +3 -8
  141. package/presets/default/docs/einja/steering/development/coding-standards.md +723 -0
  142. package/presets/default/docs/einja/steering/development/component-design.md +502 -0
  143. package/presets/default/docs/einja/steering/development/database-guidelines.md +54 -5
  144. package/presets/default/docs/einja/steering/development/frontend-development.md +3 -8
  145. package/presets/default/docs/einja/steering/development/playwright-guidelines.md +59 -0
  146. package/presets/default/docs/einja/steering/development/review-guidelines.md +3 -8
  147. package/presets/default/docs/einja/steering/development/testing-strategy.md +3 -8
  148. package/presets/default/docs/einja/steering/development-workflow.md +71 -124
  149. package/presets/default/docs/einja/steering/infrastructure/deployment.md +49 -55
  150. package/presets/default/docs/einja/steering/infrastructure/environment-variables.md +4 -8
  151. package/presets/default/docs/einja/steering/product.md +3 -8
  152. package/presets/default/docs/einja/steering/task-management.md +14 -98
  153. package/presets/default/scripts/ensure-serena.sh +75 -0
  154. package/presets/default/scripts/env-rotate-secrets.ts +336 -0
  155. package/presets/default/scripts/env-show.ts +130 -0
  156. package/presets/default/scripts/env.ts +479 -0
  157. package/presets/default/scripts/init.sh +92 -0
  158. package/presets/default/scripts/lib/env-common.ts +108 -0
  159. package/presets/default/scripts/lib/worktree-config.ts +64 -0
  160. package/presets/default/scripts/setup-dev.ts +640 -0
  161. package/presets/default/scripts/stop-serena.sh +25 -0
  162. package/presets/default/scripts/worktree/dev.ts +872 -0
  163. package/dist/lib/sync/seed-synchronizer.d.ts +0 -27
  164. package/dist/lib/sync/seed-synchronizer.d.ts.map +0 -1
  165. package/dist/lib/sync/seed-synchronizer.js +0 -72
  166. package/dist/lib/sync/seed-synchronizer.js.map +0 -1
  167. package/dist/lib/sync/seed-synchronizer.test.d.ts +0 -2
  168. package/dist/lib/sync/seed-synchronizer.test.d.ts.map +0 -1
  169. package/dist/lib/sync/seed-synchronizer.test.js +0 -147
  170. package/dist/lib/sync/seed-synchronizer.test.js.map +0 -1
  171. package/presets/default/.claude/skills/einja-api-development/SKILL.md +0 -14
  172. package/presets/default/.claude/skills/einja-backend-architecture/SKILL.md +0 -18
  173. package/presets/default/.claude/skills/einja-coding-standards/SKILL.md +0 -132
  174. package/presets/default/.claude/skills/einja-coding-standards/references/import-conventions.md +0 -69
  175. package/presets/default/.claude/skills/einja-coding-standards/references/naming-conventions.md +0 -107
  176. package/presets/default/.claude/skills/einja-coding-standards/references/prohibited-patterns.md +0 -169
  177. package/presets/default/.claude/skills/einja-coding-standards/references/typescript-rules.md +0 -247
  178. package/presets/default/.claude/skills/einja-component-design/SKILL.md +0 -109
  179. package/presets/default/.claude/skills/einja-component-design/references/directory-structure.md +0 -117
  180. package/presets/default/.claude/skills/einja-component-design/references/props-patterns.md +0 -159
  181. package/presets/default/.claude/skills/einja-component-design/references/styling-guide.md +0 -122
  182. package/presets/default/.claude/skills/einja-frontend-development/SKILL.md +0 -14
  183. package/presets/default/docs/einja/instructions/task-vibe-kanban-loop.md +0 -565
@@ -0,0 +1,310 @@
1
+ #!/usr/bin/env python3
2
+ """スキルdescriptionのトリガー評価を実行。
3
+
4
+ スキルのdescriptionが一連のクエリに対してClaudeのスキル使用を
5
+ トリガーするかどうかをテストする。結果をJSONで出力。
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import os
11
+ import select
12
+ import subprocess
13
+ import sys
14
+ import time
15
+ import uuid
16
+ from concurrent.futures import ProcessPoolExecutor, as_completed
17
+ from pathlib import Path
18
+
19
+ try:
20
+ from scripts.utils import parse_skill_md
21
+ except ImportError:
22
+ from utils import parse_skill_md
23
+
24
+
25
def find_project_root() -> Path:
    """Locate the project root by searching upward from the working directory.

    The current directory and then each of its ancestors is inspected, nearest
    first; the first one that contains a ``.claude`` directory is returned.
    This mimics how Claude Code discovers the project root, so that command
    files created by this script land where ``claude -p`` looks for them.

    Returns:
        The nearest directory containing ``.claude/``, or the current working
        directory itself when no such directory exists.
    """
    start = Path.cwd()
    search_order = (start, *start.parents)
    return next(
        (folder for folder in search_order if (folder / ".claude").is_dir()),
        start,
    )
36
+
37
+
38
+ def run_single_query(
39
+ query: str,
40
+ skill_name: str,
41
+ skill_description: str,
42
+ timeout: int,
43
+ project_root: str,
44
+ model: str | None = None,
45
+ ) -> bool:
46
+ """単一のクエリを実行し、スキルがトリガーされたかどうかを返す。
47
+
48
+ .claude/commands/にコマンドファイルを作成してClaudeのavailable_skillsリストに
49
+ 表示させ、`claude -p`で生のクエリを実行する。
50
+ --include-partial-messagesを使用してストリームイベント(content_block_start)から
51
+ 早期にトリガーを検出する。
52
+ """
53
+ unique_id = uuid.uuid4().hex[:8]
54
+ clean_name = f"{skill_name}-skill-{unique_id}"
55
+ project_commands_dir = Path(project_root) / ".claude" / "commands"
56
+ command_file = project_commands_dir / f"{clean_name}.md"
57
+
58
+ try:
59
+ project_commands_dir.mkdir(parents=True, exist_ok=True)
60
+ # description内のクォートでの破損を避けるためYAMLブロックスカラーを使用
61
+ indented_desc = "\n ".join(skill_description.split("\n"))
62
+ command_content = (
63
+ f"---\n"
64
+ f"description: |\n"
65
+ f" {indented_desc}\n"
66
+ f"---\n\n"
67
+ f"# {skill_name}\n\n"
68
+ f"This skill handles: {skill_description}\n"
69
+ )
70
+ command_file.write_text(command_content)
71
+
72
+ cmd = [
73
+ "claude",
74
+ "-p", query,
75
+ "--output-format", "stream-json",
76
+ "--verbose",
77
+ "--include-partial-messages",
78
+ ]
79
+ if model:
80
+ cmd.extend(["--model", model])
81
+
82
+ # Claude Codeセッション内でclaude -pのネストを許可するためCLAUDECODE環境変数を除去
83
+ env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
84
+
85
+ process = subprocess.Popen(
86
+ cmd,
87
+ stdout=subprocess.PIPE,
88
+ stderr=subprocess.DEVNULL,
89
+ cwd=project_root,
90
+ env=env,
91
+ )
92
+
93
+ triggered = False
94
+ start_time = time.time()
95
+ buffer = ""
96
+ # ストリームイベント検出用の状態追跡
97
+ pending_tool_name = None
98
+ accumulated_json = ""
99
+
100
+ try:
101
+ while time.time() - start_time < timeout:
102
+ if process.poll() is not None:
103
+ remaining = process.stdout.read()
104
+ if remaining:
105
+ buffer += remaining.decode("utf-8", errors="replace")
106
+ break
107
+
108
+ ready, _, _ = select.select([process.stdout], [], [], 1.0)
109
+ if not ready:
110
+ continue
111
+
112
+ chunk = os.read(process.stdout.fileno(), 8192)
113
+ if not chunk:
114
+ break
115
+ buffer += chunk.decode("utf-8", errors="replace")
116
+
117
+ while "\n" in buffer:
118
+ line, buffer = buffer.split("\n", 1)
119
+ line = line.strip()
120
+ if not line:
121
+ continue
122
+
123
+ try:
124
+ event = json.loads(line)
125
+ except json.JSONDecodeError:
126
+ continue
127
+
128
+ # ストリームイベントによる早期検出
129
+ if event.get("type") == "stream_event":
130
+ se = event.get("event", {})
131
+ se_type = se.get("type", "")
132
+
133
+ if se_type == "content_block_start":
134
+ cb = se.get("content_block", {})
135
+ if cb.get("type") == "tool_use":
136
+ tool_name = cb.get("name", "")
137
+ if tool_name in ("Skill", "Read"):
138
+ pending_tool_name = tool_name
139
+ accumulated_json = ""
140
+ else:
141
+ return False
142
+
143
+ elif se_type == "content_block_delta" and pending_tool_name:
144
+ delta = se.get("delta", {})
145
+ if delta.get("type") == "input_json_delta":
146
+ accumulated_json += delta.get("partial_json", "")
147
+ if clean_name in accumulated_json:
148
+ return True
149
+
150
+ elif se_type in ("content_block_stop", "message_stop"):
151
+ if pending_tool_name:
152
+ return clean_name in accumulated_json
153
+ if se_type == "message_stop":
154
+ return False
155
+
156
+ # フォールバック: 完全なassistantメッセージ
157
+ elif event.get("type") == "assistant":
158
+ message = event.get("message", {})
159
+ for content_item in message.get("content", []):
160
+ if content_item.get("type") != "tool_use":
161
+ continue
162
+ tool_name = content_item.get("name", "")
163
+ tool_input = content_item.get("input", {})
164
+ if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
165
+ triggered = True
166
+ elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
167
+ triggered = True
168
+ return triggered
169
+
170
+ elif event.get("type") == "result":
171
+ return triggered
172
+ finally:
173
+ # 任意の終了パス(return、例外、タイムアウト)でプロセスをクリーンアップ
174
+ if process.poll() is None:
175
+ process.kill()
176
+ process.wait()
177
+
178
+ return triggered
179
+ finally:
180
+ if command_file.exists():
181
+ command_file.unlink()
182
+
183
+
184
+ def run_eval(
185
+ eval_set: list[dict],
186
+ skill_name: str,
187
+ description: str,
188
+ num_workers: int,
189
+ timeout: int,
190
+ project_root: Path,
191
+ runs_per_query: int = 1,
192
+ trigger_threshold: float = 0.5,
193
+ model: str | None = None,
194
+ ) -> dict:
195
+ """評価セット全体を実行し、結果を返す。"""
196
+ results = []
197
+
198
+ with ProcessPoolExecutor(max_workers=num_workers) as executor:
199
+ future_to_info = {}
200
+ for item in eval_set:
201
+ for run_idx in range(runs_per_query):
202
+ future = executor.submit(
203
+ run_single_query,
204
+ item["query"],
205
+ skill_name,
206
+ description,
207
+ timeout,
208
+ str(project_root),
209
+ model,
210
+ )
211
+ future_to_info[future] = (item, run_idx)
212
+
213
+ query_triggers: dict[str, list[bool]] = {}
214
+ query_items: dict[str, dict] = {}
215
+ for future in as_completed(future_to_info):
216
+ item, _ = future_to_info[future]
217
+ query = item["query"]
218
+ query_items[query] = item
219
+ if query not in query_triggers:
220
+ query_triggers[query] = []
221
+ try:
222
+ query_triggers[query].append(future.result())
223
+ except Exception as e:
224
+ print(f"警告: クエリが失敗しました: {e}", file=sys.stderr)
225
+ query_triggers[query].append(False)
226
+
227
+ for query, triggers in query_triggers.items():
228
+ item = query_items[query]
229
+ trigger_rate = sum(triggers) / len(triggers)
230
+ should_trigger = item["should_trigger"]
231
+ if should_trigger:
232
+ did_pass = trigger_rate >= trigger_threshold
233
+ else:
234
+ did_pass = trigger_rate < trigger_threshold
235
+ results.append({
236
+ "query": query,
237
+ "should_trigger": should_trigger,
238
+ "trigger_rate": trigger_rate,
239
+ "triggers": sum(triggers),
240
+ "runs": len(triggers),
241
+ "pass": did_pass,
242
+ })
243
+
244
+ passed = sum(1 for r in results if r["pass"])
245
+ total = len(results)
246
+
247
+ return {
248
+ "skill_name": skill_name,
249
+ "description": description,
250
+ "results": results,
251
+ "summary": {
252
+ "total": total,
253
+ "passed": passed,
254
+ "failed": total - passed,
255
+ },
256
+ }
257
+
258
+
259
def main():
    """Command-line entry point: evaluate one description and print JSON.

    Parses the arguments, loads the evaluation set and the skill's SKILL.md,
    runs the evaluation and prints the result dict as JSON on stdout. With
    ``--verbose`` a human-readable progress summary goes to stderr. Exits with
    status 1 when the skill directory has no SKILL.md.
    """
    parser = argparse.ArgumentParser(description="スキルdescriptionのトリガー評価を実行")
    parser.add_argument("--eval-set", required=True, help="評価セットJSONファイルへのパス")
    parser.add_argument("--skill-path", required=True, help="スキルディレクトリへのパス")
    parser.add_argument("--description", default=None, help="テスト用descriptionの上書き")
    parser.add_argument("--num-workers", type=int, default=10, help="並行ワーカー数")
    parser.add_argument("--timeout", type=int, default=30, help="クエリごとのタイムアウト(秒)")
    parser.add_argument("--runs-per-query", type=int, default=3, help="クエリごとの実行回数")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="トリガー率の閾値")
    parser.add_argument("--model", default=None, help="claude -pに使用するモデル(デフォルト: ユーザー設定のモデル)")
    parser.add_argument("--verbose", action="store_true", help="進捗をstderrに出力")
    args = parser.parse_args()

    # JSON is UTF-8 by specification; do not depend on the locale's encoding.
    eval_set = json.loads(Path(args.eval_set).read_text(encoding="utf-8"))
    skill_path = Path(args.skill_path)

    if not (skill_path / "SKILL.md").exists():
        print(f"エラー: {skill_path} にSKILL.mdが見つかりません", file=sys.stderr)
        sys.exit(1)

    # The full SKILL.md text (third element) is not needed here.
    name, original_description, _ = parse_skill_md(skill_path)
    # An empty --description falls back to the one from SKILL.md.
    description = args.description or original_description
    project_root = find_project_root()

    if args.verbose:
        print(f"評価中: {description}", file=sys.stderr)

    output = run_eval(
        eval_set=eval_set,
        skill_name=name,
        description=description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        project_root=project_root,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        model=args.model,
    )

    if args.verbose:
        summary = output["summary"]
        print(f"結果: {summary['passed']}/{summary['total']} パス", file=sys.stderr)
        for r in output["results"]:
            status = "PASS" if r["pass"] else "FAIL"
            rate_str = f"{r['triggers']}/{r['runs']}"
            print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}", file=sys.stderr)

    print(json.dumps(output, indent=2))
307
+
308
+
309
+ if __name__ == "__main__":
310
+ main()
@@ -0,0 +1,295 @@
1
+ #!/usr/bin/env python3
2
+ """評価+改善ループを実行。全パスまたは最大イテレーション到達まで繰り返す。
3
+
4
+ run_eval.pyとimprove_description.pyをループで組み合わせ、
5
+ 履歴を追跡し最良のdescriptionを返す。
6
+ 過学習防止のためtrain/test分割に対応。
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import random
12
+ import sys
13
+ import time
14
+ from pathlib import Path
15
+
16
+ try:
17
+ from scripts.generate_report import generate_html
18
+ from scripts.improve_description import improve_description
19
+ from scripts.run_eval import find_project_root, run_eval
20
+ from scripts.utils import parse_skill_md
21
+ except ImportError:
22
+ from generate_report import generate_html
23
+ from improve_description import improve_description
24
+ from run_eval import find_project_root, run_eval
25
+ from utils import parse_skill_md
26
+
27
+ import anthropic
28
+
29
+
30
+ def split_eval_set(
31
+ eval_set: list[dict],
32
+ holdout: int,
33
+ seed: int | None = None,
34
+ ) -> tuple[list[dict], list[dict]]:
35
+ """評価セットをトレーニングとテストに分割する。
36
+
37
+ holdoutが0の場合、全データをトレーニングに使用する。
38
+ should_trigger=Trueとshould_trigger=Falseの両方から
39
+ 均等にホールドアウトする。
40
+ """
41
+ if holdout <= 0:
42
+ return eval_set, []
43
+
44
+ rng = random.Random(seed)
45
+
46
+ positive = [item for item in eval_set if item.get("should_trigger", True)]
47
+ negative = [item for item in eval_set if not item.get("should_trigger", True)]
48
+
49
+ # 正例と負例から均等にホールドアウト
50
+ pos_holdout = holdout // 2
51
+ neg_holdout = holdout - pos_holdout
52
+
53
+ # 上限調整
54
+ pos_holdout = min(pos_holdout, len(positive) - 1) if len(positive) > 1 else 0
55
+ neg_holdout = min(neg_holdout, len(negative) - 1) if len(negative) > 1 else 0
56
+
57
+ rng.shuffle(positive)
58
+ rng.shuffle(negative)
59
+
60
+ test_set = positive[:pos_holdout] + negative[:neg_holdout]
61
+ train_set = positive[pos_holdout:] + negative[neg_holdout:]
62
+
63
+ return train_set, test_set
64
+
65
+
66
+ def run_loop(
67
+ eval_set_path: str,
68
+ skill_path: str,
69
+ max_iterations: int = 10,
70
+ num_workers: int = 10,
71
+ timeout: int = 30,
72
+ runs_per_query: int = 3,
73
+ trigger_threshold: float = 0.5,
74
+ holdout: int = 0,
75
+ seed: int | None = None,
76
+ model: str | None = None,
77
+ improve_model: str = "claude-sonnet-4-20250514",
78
+ verbose: bool = False,
79
+ report_path: str | None = None,
80
+ log_dir: str | None = None,
81
+ ) -> dict:
82
+ """評価+改善ループのメイン関数。"""
83
+ eval_set = json.loads(Path(eval_set_path).read_text())
84
+ skill_dir = Path(skill_path)
85
+
86
+ if not (skill_dir / "SKILL.md").exists():
87
+ print(f"エラー: {skill_dir} にSKILL.mdが見つかりません", file=sys.stderr)
88
+ sys.exit(1)
89
+
90
+ name, original_description, content = parse_skill_md(skill_dir)
91
+ project_root = find_project_root()
92
+
93
+ # train/test分割
94
+ train_set, test_set = split_eval_set(eval_set, holdout, seed)
95
+
96
+ if verbose:
97
+ print(f"スキル: {name}", file=sys.stderr)
98
+ print(f"トレーニングクエリ: {len(train_set)}, テストクエリ: {len(test_set)}", file=sys.stderr)
99
+ print(f"最大イテレーション: {max_iterations}", file=sys.stderr)
100
+ print(f"オリジナルdescription: {original_description}", file=sys.stderr)
101
+
102
+ client = anthropic.Anthropic()
103
+ current_description = original_description
104
+ history: list[dict] = []
105
+ improve_history: list[dict] = []
106
+ log_path = Path(log_dir) if log_dir else None
107
+
108
+ for iteration in range(max_iterations):
109
+ if verbose:
110
+ print(f"\n--- イテレーション {iteration} ---", file=sys.stderr)
111
+ print(f"description: {current_description[:100]}...", file=sys.stderr)
112
+
113
+ # トレーニング評価
114
+ train_results = run_eval(
115
+ eval_set=train_set,
116
+ skill_name=name,
117
+ description=current_description,
118
+ num_workers=num_workers,
119
+ timeout=timeout,
120
+ project_root=project_root,
121
+ runs_per_query=runs_per_query,
122
+ trigger_threshold=trigger_threshold,
123
+ model=model,
124
+ )
125
+
126
+ # テスト評価(テストセットがある場合)
127
+ test_results = None
128
+ if test_set:
129
+ test_results = run_eval(
130
+ eval_set=test_set,
131
+ skill_name=name,
132
+ description=current_description,
133
+ num_workers=num_workers,
134
+ timeout=timeout,
135
+ project_root=project_root,
136
+ runs_per_query=runs_per_query,
137
+ trigger_threshold=trigger_threshold,
138
+ model=model,
139
+ )
140
+
141
+ # 履歴エントリの構築
142
+ entry: dict = {
143
+ "description": current_description,
144
+ "train_passed": train_results["summary"]["passed"],
145
+ "train_failed": train_results["summary"]["failed"],
146
+ "train_total": train_results["summary"]["total"],
147
+ "train_results": train_results["results"],
148
+ }
149
+ if test_results:
150
+ entry["test_passed"] = test_results["summary"]["passed"]
151
+ entry["test_failed"] = test_results["summary"]["failed"]
152
+ entry["test_total"] = test_results["summary"]["total"]
153
+ entry["test_results"] = test_results["results"]
154
+
155
+ history.append(entry)
156
+
157
+ if verbose:
158
+ train_s = f"{train_results['summary']['passed']}/{train_results['summary']['total']}"
159
+ msg = f"トレーニングスコア: {train_s}"
160
+ if test_results:
161
+ test_s = f"{test_results['summary']['passed']}/{test_results['summary']['total']}"
162
+ msg += f", テストスコア: {test_s}"
163
+ print(msg, file=sys.stderr)
164
+
165
+ # レポート更新
166
+ if report_path:
167
+ output_data = {"history": history, "holdout": holdout}
168
+ report_html = generate_html(output_data, auto_refresh=True, skill_name=name)
169
+ Path(report_path).write_text(report_html)
170
+ if verbose:
171
+ print(f"レポートを更新しました: {report_path}", file=sys.stderr)
172
+
173
+ # 全パスなら終了
174
+ if train_results["summary"]["failed"] == 0:
175
+ if test_results is None or test_results["summary"]["failed"] == 0:
176
+ if verbose:
177
+ print("全クエリパス。ループを終了します。", file=sys.stderr)
178
+ break
179
+
180
+ # 最終イテレーションでなければ改善
181
+ if iteration < max_iterations - 1:
182
+ if verbose:
183
+ print("descriptionを改善中...", file=sys.stderr)
184
+
185
+ new_description = improve_description(
186
+ client=client,
187
+ skill_name=name,
188
+ skill_content=content,
189
+ current_description=current_description,
190
+ eval_results=train_results,
191
+ history=improve_history,
192
+ model=improve_model,
193
+ test_results=test_results,
194
+ log_dir=log_path,
195
+ iteration=iteration,
196
+ )
197
+
198
+ # 改善履歴を更新
199
+ improve_entry: dict = {
200
+ "description": current_description,
201
+ "train_passed": train_results["summary"]["passed"],
202
+ "train_total": train_results["summary"]["total"],
203
+ "results": train_results["results"],
204
+ }
205
+ if test_results:
206
+ improve_entry["test_passed"] = test_results["summary"]["passed"]
207
+ improve_entry["test_total"] = test_results["summary"]["total"]
208
+ improve_history.append(improve_entry)
209
+
210
+ current_description = new_description
211
+
212
+ if verbose:
213
+ print(f"新しいdescription: {new_description[:100]}...", file=sys.stderr)
214
+
215
+ # 最良のdescriptionを選択(テスト > トレーニングで優先)
216
+ best_idx = 0
217
+ best_test = -1
218
+ best_train = -1
219
+ for i, h in enumerate(history):
220
+ t_passed = h.get("test_passed", -1)
221
+ tr_passed = h.get("train_passed", h.get("passed", 0))
222
+ if t_passed > best_test or (t_passed == best_test and tr_passed > best_train):
223
+ best_test = t_passed
224
+ best_train = tr_passed
225
+ best_idx = i
226
+
227
+ best = history[best_idx]
228
+
229
+ # 最終レポート(auto_refreshオフ)
230
+ if report_path:
231
+ output_data = {"history": history, "holdout": holdout}
232
+ report_html = generate_html(output_data, auto_refresh=False, skill_name=name)
233
+ Path(report_path).write_text(report_html)
234
+
235
+ output = {
236
+ "skill_name": name,
237
+ "original_description": original_description,
238
+ "best_description": best["description"],
239
+ "best_iteration": best_idx,
240
+ "history": history,
241
+ "holdout": holdout,
242
+ }
243
+
244
+ if verbose:
245
+ print(f"\n最良のdescription (イテレーション {best_idx}): {best['description']}", file=sys.stderr)
246
+ train_s = f"{best['train_passed']}/{best['train_total']}"
247
+ msg = f"最良スコア - トレーニング: {train_s}"
248
+ if best.get("test_passed") is not None:
249
+ test_s = f"{best['test_passed']}/{best['test_total']}"
250
+ msg += f", テスト: {test_s}"
251
+ print(msg, file=sys.stderr)
252
+
253
+ return output
254
+
255
+
256
def main():
    """Command-line entry point for the evaluate-and-improve loop.

    Declares the options, forwards the parsed values to ``run_loop`` and prints
    the dict it returns as indented JSON on stdout.
    """
    parser = argparse.ArgumentParser(description="評価+改善ループを実行")

    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_table = [
        ("--eval-set", {"required": True, "help": "評価セットJSONファイルへのパス"}),
        ("--skill-path", {"required": True, "help": "スキルディレクトリへのパス"}),
        ("--max-iterations", {"type": int, "default": 10, "help": "最大イテレーション数(デフォルト: 10)"}),
        ("--num-workers", {"type": int, "default": 10, "help": "並行ワーカー数(デフォルト: 10)"}),
        ("--timeout", {"type": int, "default": 30, "help": "クエリごとのタイムアウト秒数(デフォルト: 30)"}),
        ("--runs-per-query", {"type": int, "default": 3, "help": "クエリごとの実行回数(デフォルト: 3)"}),
        ("--trigger-threshold", {"type": float, "default": 0.5, "help": "トリガー率の閾値(デフォルト: 0.5)"}),
        ("--holdout", {"type": int, "default": 0, "help": "テスト用ホールドアウトクエリ数(デフォルト: 0)"}),
        ("--seed", {"type": int, "default": None, "help": "train/test分割のランダムシード"}),
        ("--model", {"default": None, "help": "評価時にclaude -pに使用するモデル"}),
        ("--improve-model", {"default": "claude-sonnet-4-20250514", "help": "description改善に使用するモデル(デフォルト: claude-sonnet-4-20250514)"}),
        ("--verbose", {"action": "store_true", "help": "進捗をstderrに出力"}),
        ("--report", {"default": None, "help": "HTMLレポートの出力先パス(ライブ更新あり)"}),
        ("--log-dir", {"default": None, "help": "改善トランスクリプトのログディレクトリ"}),
    ]
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)

    # Option names match run_loop's parameters except for the two renamed below.
    loop_kwargs = dict(vars(parser.parse_args()))
    loop_kwargs["eval_set_path"] = loop_kwargs.pop("eval_set")
    loop_kwargs["report_path"] = loop_kwargs.pop("report")

    result = run_loop(**loop_kwargs)
    print(json.dumps(result, indent=2))
292
+
293
+
294
+ if __name__ == "__main__":
295
+ main()
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env python3
2
+ """スキルクリエイタースクリプト共通ユーティリティ。"""
3
+
4
+ from pathlib import Path
5
+
6
+
7
+
8
def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
    """Parse a skill's SKILL.md and return ``(name, description, full_content)``.

    Only the ``name:`` and ``description:`` keys of the leading ``---``
    delimited front matter are interpreted. A description may be given inline
    (surrounding quotes are stripped) or as a YAML block scalar introduced by
    ``>``, ``|``, ``>-``, ``|-``, ``>+`` or ``|+``; block-scalar lines are
    stripped and joined with single spaces, and blank lines inside the block
    are skipped.

    Args:
        skill_path: Directory that contains ``SKILL.md``.

    Returns:
        The skill name, its description and the complete file text. Name and
        description are empty strings when the respective key is absent.

    Raises:
        ValueError: If the file does not start with ``---`` or the closing
            ``---`` of the front matter is missing.
    """
    # Explicit UTF-8 so non-ASCII skill files do not depend on the locale.
    content = (skill_path / "SKILL.md").read_text(encoding="utf-8")
    lines = content.split("\n")

    if lines[0].strip() != "---":
        raise ValueError("SKILL.mdにフロントマターがありません(開始の---がありません)")

    end_idx = next(
        (idx for idx, text in enumerate(lines[1:], start=1) if text.strip() == "---"),
        None,
    )
    if end_idx is None:
        raise ValueError("SKILL.mdにフロントマターがありません(終了の---がありません)")

    block_indicators = (">", "|", ">-", "|-", ">+", "|+")
    name = ""
    description = ""
    frontmatter_lines = lines[1:end_idx]
    total = len(frontmatter_lines)
    i = 0
    while i < total:
        line = frontmatter_lines[i]
        i += 1
        if line.startswith("name:"):
            name = line[len("name:"):].strip().strip('"').strip("'")
        elif line.startswith("description:"):
            value = line[len("description:"):].strip()
            if value not in block_indicators:
                description = value.strip('"').strip("'")
                continue
            # Block scalar: gather the indented lines that follow. Blank lines
            # are legal inside a block scalar, so they must not end it.
            pieces: list[str] = []
            while i < total:
                candidate = frontmatter_lines[i]
                if candidate.strip() and not candidate.startswith((" ", "\t")):
                    break
                if candidate.strip():
                    pieces.append(candidate.strip())
                i += 1
            description = " ".join(pieces)

    return name, description, content
@@ -175,3 +175,7 @@ interface Example {
175
175
  ---
176
176
 
177
177
  **最終更新**: 2025-01-10
178
+
179
+ <!-- @einja:project-private:start id="einja-spec-context-loader-project" -->
180
+ <!-- プロジェクト固有の情報を記入 -->
181
+ <!-- @einja:project-private:end -->
@@ -267,3 +267,7 @@ EOF
267
267
  ---
268
268
 
269
269
  **最終更新**: 2026-01-10
270
+
271
+ <!-- @einja:project-private:start id="einja-task-commit-project" -->
272
+ <!-- プロジェクト固有の情報を記入 -->
273
+ <!-- @einja:project-private:end -->
@@ -306,3 +306,7 @@ AskUserQuestion:
306
306
  ---
307
307
 
308
308
  **最終更新**: 2025-12-20
309
+
310
+ <!-- @einja:project-private:start id="einja-task-qa-project" -->
311
+ <!-- プロジェクト固有の情報を記入 -->
312
+ <!-- @einja:project-private:end -->