@einja/dev-cli 0.1.39 → 0.1.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -1
- package/dist/cli.js +1 -0
- package/dist/cli.js.map +1 -1
- package/dist/commands/init.d.ts.map +1 -1
- package/dist/commands/init.js +71 -1
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/list.js.map +1 -1
- package/dist/commands/sync.d.ts.map +1 -1
- package/dist/commands/sync.js +187 -13
- package/dist/commands/sync.js.map +1 -1
- package/dist/lib/dependency-checker.d.ts.map +1 -1
- package/dist/lib/merger.d.ts +12 -0
- package/dist/lib/merger.d.ts.map +1 -1
- package/dist/lib/merger.js +28 -0
- package/dist/lib/merger.js.map +1 -1
- package/dist/lib/preset-update/cli-repo-detector.d.ts.map +1 -1
- package/dist/lib/preset-update/file-copier.d.ts.map +1 -1
- package/dist/lib/preset-update/preset-finder.d.ts.map +1 -1
- package/dist/lib/preset.d.ts.map +1 -1
- package/dist/lib/sync/category-validator.d.ts +1 -1
- package/dist/lib/sync/category-validator.d.ts.map +1 -1
- package/dist/lib/sync/category-validator.js +2 -1
- package/dist/lib/sync/category-validator.js.map +1 -1
- package/dist/lib/sync/category-validator.test.js +3 -1
- package/dist/lib/sync/category-validator.test.js.map +1 -1
- package/dist/lib/sync/conflict-reporter.d.ts.map +1 -1
- package/dist/lib/sync/diff-engine.d.ts.map +1 -1
- package/dist/lib/sync/file-filter.d.ts.map +1 -1
- package/dist/lib/sync/file-filter.js +1 -0
- package/dist/lib/sync/file-filter.js.map +1 -1
- package/dist/lib/sync/integration.test.js +255 -69
- package/dist/lib/sync/integration.test.js.map +1 -1
- package/dist/lib/sync/json-processor.d.ts +4 -4
- package/dist/lib/sync/json-processor.d.ts.map +1 -1
- package/dist/lib/sync/json-processor.js +11 -11
- package/dist/lib/sync/json-processor.js.map +1 -1
- package/dist/lib/sync/marker-processor.d.ts +60 -8
- package/dist/lib/sync/marker-processor.d.ts.map +1 -1
- package/dist/lib/sync/marker-processor.js +117 -26
- package/dist/lib/sync/marker-processor.js.map +1 -1
- package/dist/lib/sync/marker-processor.test.js +261 -40
- package/dist/lib/sync/marker-processor.test.js.map +1 -1
- package/dist/lib/sync/metadata-manager.d.ts +4 -0
- package/dist/lib/sync/metadata-manager.d.ts.map +1 -1
- package/dist/lib/sync/metadata-manager.js +15 -0
- package/dist/lib/sync/metadata-manager.js.map +1 -1
- package/dist/lib/sync/metadata-manager.test.js +68 -0
- package/dist/lib/sync/metadata-manager.test.js.map +1 -1
- package/dist/lib/sync/orphan-cleaner.d.ts +29 -0
- package/dist/lib/sync/orphan-cleaner.d.ts.map +1 -0
- package/dist/lib/sync/orphan-cleaner.js +80 -0
- package/dist/lib/sync/orphan-cleaner.js.map +1 -0
- package/dist/lib/sync/orphan-cleaner.test.d.ts +2 -0
- package/dist/lib/sync/orphan-cleaner.test.d.ts.map +1 -0
- package/dist/lib/sync/orphan-cleaner.test.js +169 -0
- package/dist/lib/sync/orphan-cleaner.test.js.map +1 -0
- package/dist/lib/sync/project-private-synchronizer.d.ts +52 -0
- package/dist/lib/sync/project-private-synchronizer.d.ts.map +1 -0
- package/dist/lib/sync/project-private-synchronizer.js +106 -0
- package/dist/lib/sync/project-private-synchronizer.js.map +1 -0
- package/dist/lib/sync/project-private-synchronizer.test.d.ts +2 -0
- package/dist/lib/sync/project-private-synchronizer.test.d.ts.map +1 -0
- package/dist/lib/sync/project-private-synchronizer.test.js +348 -0
- package/dist/lib/sync/project-private-synchronizer.test.js.map +1 -0
- package/dist/types/index.d.ts +1 -0
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/sync.d.ts +36 -6
- package/dist/types/sync.d.ts.map +1 -1
- package/dist/types/sync.js +2 -2
- package/dist/types/sync.js.map +1 -1
- package/package.json +5 -4
- package/presets/default/.claude/agents/einja/Explore.md +140 -0
- package/presets/default/.claude/agents/einja/backend-architect.md +4 -0
- package/presets/default/.claude/agents/einja/codex-agent.md +4 -0
- package/presets/default/.claude/agents/einja/design-engineer.md +4 -0
- package/presets/default/.claude/agents/einja/docs/docs-updater.md +4 -0
- package/presets/default/.claude/agents/einja/frontend-architect.md +4 -0
- package/presets/default/.claude/agents/einja/frontend-coder.md +4 -0
- package/presets/default/.claude/agents/einja/git/conflict-resolver.md +4 -0
- package/presets/default/.claude/agents/einja/specs/spec-design-generator.md +4 -1
- package/presets/default/.claude/agents/einja/specs/spec-qa-generator.md +4 -0
- package/presets/default/.claude/agents/einja/specs/spec-requirements-generator.md +4 -1
- package/presets/default/.claude/agents/einja/specs/spec-tasks-generator.md +6 -2
- package/presets/default/.claude/agents/einja/specs/spec-tasks-validator.md +4 -0
- package/presets/default/.claude/agents/einja/task/task-executer.md +57 -115
- package/presets/default/.claude/agents/einja/task/task-modification-analyzer.md +4 -0
- package/presets/default/.claude/agents/einja/task/task-qa.md +4 -0
- package/presets/default/.claude/agents/einja/task/task-reviewer.md +4 -0
- package/presets/default/.claude/commands/einja/einja-sync.md +5 -1
- package/presets/default/.claude/commands/einja/frontend-implement.md +3 -1
- package/presets/default/.claude/commands/einja/issue-exec.md +403 -0
- package/presets/default/.claude/commands/einja/spec-create.md +15 -1
- package/presets/default/.claude/commands/einja/start-dev.md +4 -0
- package/presets/default/.claude/commands/einja/sync-cursor-commands.md +4 -0
- package/presets/default/.claude/commands/einja/task-exec.md +106 -14
- package/presets/default/.claude/commands/einja/update-docs-by-task-specs.md +4 -0
- package/presets/default/.claude/hooks/einja/plan-mode-skill-loader.sh +23 -0
- package/presets/default/.claude/settings.json +15 -1
- package/presets/default/.claude/skills/einja-conflict-resolver/SKILL.md +4 -0
- package/presets/default/.claude/skills/einja-general-context-loader/SKILL.md +4 -0
- package/presets/default/.claude/skills/einja-output-format/SKILL.md +4 -0
- package/presets/default/.claude/skills/einja-project-overview/SKILL.md +7 -3
- package/presets/default/.claude/skills/einja-skill-creator/SKILL.md +266 -274
- package/presets/default/.claude/skills/einja-skill-creator/agents/analyzer.md +274 -0
- package/presets/default/.claude/skills/einja-skill-creator/agents/comparator.md +202 -0
- package/presets/default/.claude/skills/einja-skill-creator/agents/grader.md +195 -0
- package/presets/default/.claude/skills/einja-skill-creator/assets/eval_review.html +146 -0
- package/presets/default/.claude/skills/einja-skill-creator/eval-viewer/generate_review.py +471 -0
- package/presets/default/.claude/skills/einja-skill-creator/eval-viewer/viewer.html +1325 -0
- package/presets/default/.claude/skills/einja-skill-creator/references/schemas.md +430 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/aggregate_benchmark.py +154 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/generate_report.py +265 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/improve_description.py +252 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/init_skill.py +13 -19
- package/presets/default/.claude/skills/einja-skill-creator/scripts/package_skill.py +36 -7
- package/presets/default/.claude/skills/einja-skill-creator/scripts/run_eval.py +310 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/run_loop.py +295 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/utils.py +48 -0
- package/presets/default/.claude/skills/einja-spec-context-loader/SKILL.md +4 -0
- package/presets/default/.claude/skills/einja-task-commit/SKILL.md +4 -0
- package/presets/default/.claude/skills/einja-task-qa/SKILL.md +4 -0
- package/presets/default/.envrc +5 -0
- package/presets/default/.mcp.json +2 -12
- package/presets/default/CLAUDE.md.template +26 -4
- package/presets/default/docs/einja/example/specs/issues/issue999-example-task/tasks.md +1 -1
- package/presets/default/docs/einja/instructions/deployment-setup.md +3 -8
- package/presets/default/docs/einja/instructions/environment-setup.md +3 -8
- package/presets/default/docs/einja/instructions/issue-exec-workflow.md +276 -0
- package/presets/default/docs/einja/instructions/local-server-environment-and-worktree.md +70 -8
- package/presets/default/docs/einja/instructions/neon-cli-reference.md +3 -8
- package/presets/default/docs/einja/instructions/task-execute.md +23 -28
- package/presets/default/docs/einja/instructions/vercel-cli-reference.md +17 -10
- package/presets/default/docs/einja/steering/README.md +11 -11
- package/presets/default/docs/einja/steering/acceptance-criteria-and-qa-guide.md +3 -8
- package/presets/default/docs/einja/steering/architecture.md +3 -8
- package/presets/default/docs/einja/steering/branch-strategy.md +63 -70
- package/presets/default/docs/einja/steering/commit-rules.md +3 -8
- package/presets/default/docs/einja/steering/db-schema-design.md +3 -8
- package/presets/default/docs/einja/steering/development/api-development.md +3 -8
- package/presets/default/docs/einja/steering/development/backend-architecture.md +3 -8
- package/presets/default/docs/einja/steering/development/coding-standards.md +723 -0
- package/presets/default/docs/einja/steering/development/component-design.md +502 -0
- package/presets/default/docs/einja/steering/development/database-guidelines.md +54 -5
- package/presets/default/docs/einja/steering/development/frontend-development.md +3 -8
- package/presets/default/docs/einja/steering/development/playwright-guidelines.md +59 -0
- package/presets/default/docs/einja/steering/development/review-guidelines.md +3 -8
- package/presets/default/docs/einja/steering/development/testing-strategy.md +3 -8
- package/presets/default/docs/einja/steering/development-workflow.md +71 -124
- package/presets/default/docs/einja/steering/infrastructure/deployment.md +49 -55
- package/presets/default/docs/einja/steering/infrastructure/environment-variables.md +4 -8
- package/presets/default/docs/einja/steering/product.md +3 -8
- package/presets/default/docs/einja/steering/task-management.md +14 -98
- package/presets/default/scripts/ensure-serena.sh +75 -0
- package/presets/default/scripts/env-rotate-secrets.ts +336 -0
- package/presets/default/scripts/env-show.ts +130 -0
- package/presets/default/scripts/env.ts +479 -0
- package/presets/default/scripts/init.sh +92 -0
- package/presets/default/scripts/lib/env-common.ts +108 -0
- package/presets/default/scripts/lib/worktree-config.ts +64 -0
- package/presets/default/scripts/setup-dev.ts +640 -0
- package/presets/default/scripts/stop-serena.sh +25 -0
- package/presets/default/scripts/worktree/dev.ts +872 -0
- package/dist/lib/sync/seed-synchronizer.d.ts +0 -27
- package/dist/lib/sync/seed-synchronizer.d.ts.map +0 -1
- package/dist/lib/sync/seed-synchronizer.js +0 -72
- package/dist/lib/sync/seed-synchronizer.js.map +0 -1
- package/dist/lib/sync/seed-synchronizer.test.d.ts +0 -2
- package/dist/lib/sync/seed-synchronizer.test.d.ts.map +0 -1
- package/dist/lib/sync/seed-synchronizer.test.js +0 -147
- package/dist/lib/sync/seed-synchronizer.test.js.map +0 -1
- package/presets/default/.claude/skills/einja-api-development/SKILL.md +0 -14
- package/presets/default/.claude/skills/einja-backend-architecture/SKILL.md +0 -18
- package/presets/default/.claude/skills/einja-coding-standards/SKILL.md +0 -132
- package/presets/default/.claude/skills/einja-coding-standards/references/import-conventions.md +0 -69
- package/presets/default/.claude/skills/einja-coding-standards/references/naming-conventions.md +0 -107
- package/presets/default/.claude/skills/einja-coding-standards/references/prohibited-patterns.md +0 -169
- package/presets/default/.claude/skills/einja-coding-standards/references/typescript-rules.md +0 -247
- package/presets/default/.claude/skills/einja-component-design/SKILL.md +0 -109
- package/presets/default/.claude/skills/einja-component-design/references/directory-structure.md +0 -117
- package/presets/default/.claude/skills/einja-component-design/references/props-patterns.md +0 -159
- package/presets/default/.claude/skills/einja-component-design/references/styling-guide.md +0 -122
- package/presets/default/.claude/skills/einja-frontend-development/SKILL.md +0 -14
- package/presets/default/docs/einja/instructions/task-vibe-kanban-loop.md +0 -565
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""スキルdescriptionのトリガー評価を実行。
|
|
3
|
+
|
|
4
|
+
スキルのdescriptionが一連のクエリに対してClaudeのスキル使用を
|
|
5
|
+
トリガーするかどうかをテストする。結果をJSONで出力。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
import select
|
|
12
|
+
import subprocess
|
|
13
|
+
import sys
|
|
14
|
+
import time
|
|
15
|
+
import uuid
|
|
16
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
try:
|
|
20
|
+
from scripts.utils import parse_skill_md
|
|
21
|
+
except ImportError:
|
|
22
|
+
from utils import parse_skill_md
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def find_project_root() -> Path:
|
|
26
|
+
"""cwdから上方向にウォークし、.claude/を探してプロジェクトルートを見つける。
|
|
27
|
+
|
|
28
|
+
Claude Codeがプロジェクトルートを発見する方法を模倣し、
|
|
29
|
+
作成するコマンドファイルがclaude -pの検索対象に入るようにする。
|
|
30
|
+
"""
|
|
31
|
+
current = Path.cwd()
|
|
32
|
+
for parent in [current, *current.parents]:
|
|
33
|
+
if (parent / ".claude").is_dir():
|
|
34
|
+
return parent
|
|
35
|
+
return current
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def run_single_query(
|
|
39
|
+
query: str,
|
|
40
|
+
skill_name: str,
|
|
41
|
+
skill_description: str,
|
|
42
|
+
timeout: int,
|
|
43
|
+
project_root: str,
|
|
44
|
+
model: str | None = None,
|
|
45
|
+
) -> bool:
|
|
46
|
+
"""単一のクエリを実行し、スキルがトリガーされたかどうかを返す。
|
|
47
|
+
|
|
48
|
+
.claude/commands/にコマンドファイルを作成してClaudeのavailable_skillsリストに
|
|
49
|
+
表示させ、`claude -p`で生のクエリを実行する。
|
|
50
|
+
--include-partial-messagesを使用してストリームイベント(content_block_start)から
|
|
51
|
+
早期にトリガーを検出する。
|
|
52
|
+
"""
|
|
53
|
+
unique_id = uuid.uuid4().hex[:8]
|
|
54
|
+
clean_name = f"{skill_name}-skill-{unique_id}"
|
|
55
|
+
project_commands_dir = Path(project_root) / ".claude" / "commands"
|
|
56
|
+
command_file = project_commands_dir / f"{clean_name}.md"
|
|
57
|
+
|
|
58
|
+
try:
|
|
59
|
+
project_commands_dir.mkdir(parents=True, exist_ok=True)
|
|
60
|
+
# description内のクォートでの破損を避けるためYAMLブロックスカラーを使用
|
|
61
|
+
indented_desc = "\n ".join(skill_description.split("\n"))
|
|
62
|
+
command_content = (
|
|
63
|
+
f"---\n"
|
|
64
|
+
f"description: |\n"
|
|
65
|
+
f" {indented_desc}\n"
|
|
66
|
+
f"---\n\n"
|
|
67
|
+
f"# {skill_name}\n\n"
|
|
68
|
+
f"This skill handles: {skill_description}\n"
|
|
69
|
+
)
|
|
70
|
+
command_file.write_text(command_content)
|
|
71
|
+
|
|
72
|
+
cmd = [
|
|
73
|
+
"claude",
|
|
74
|
+
"-p", query,
|
|
75
|
+
"--output-format", "stream-json",
|
|
76
|
+
"--verbose",
|
|
77
|
+
"--include-partial-messages",
|
|
78
|
+
]
|
|
79
|
+
if model:
|
|
80
|
+
cmd.extend(["--model", model])
|
|
81
|
+
|
|
82
|
+
# Claude Codeセッション内でclaude -pのネストを許可するためCLAUDECODE環境変数を除去
|
|
83
|
+
env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
|
|
84
|
+
|
|
85
|
+
process = subprocess.Popen(
|
|
86
|
+
cmd,
|
|
87
|
+
stdout=subprocess.PIPE,
|
|
88
|
+
stderr=subprocess.DEVNULL,
|
|
89
|
+
cwd=project_root,
|
|
90
|
+
env=env,
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
triggered = False
|
|
94
|
+
start_time = time.time()
|
|
95
|
+
buffer = ""
|
|
96
|
+
# ストリームイベント検出用の状態追跡
|
|
97
|
+
pending_tool_name = None
|
|
98
|
+
accumulated_json = ""
|
|
99
|
+
|
|
100
|
+
try:
|
|
101
|
+
while time.time() - start_time < timeout:
|
|
102
|
+
if process.poll() is not None:
|
|
103
|
+
remaining = process.stdout.read()
|
|
104
|
+
if remaining:
|
|
105
|
+
buffer += remaining.decode("utf-8", errors="replace")
|
|
106
|
+
break
|
|
107
|
+
|
|
108
|
+
ready, _, _ = select.select([process.stdout], [], [], 1.0)
|
|
109
|
+
if not ready:
|
|
110
|
+
continue
|
|
111
|
+
|
|
112
|
+
chunk = os.read(process.stdout.fileno(), 8192)
|
|
113
|
+
if not chunk:
|
|
114
|
+
break
|
|
115
|
+
buffer += chunk.decode("utf-8", errors="replace")
|
|
116
|
+
|
|
117
|
+
while "\n" in buffer:
|
|
118
|
+
line, buffer = buffer.split("\n", 1)
|
|
119
|
+
line = line.strip()
|
|
120
|
+
if not line:
|
|
121
|
+
continue
|
|
122
|
+
|
|
123
|
+
try:
|
|
124
|
+
event = json.loads(line)
|
|
125
|
+
except json.JSONDecodeError:
|
|
126
|
+
continue
|
|
127
|
+
|
|
128
|
+
# ストリームイベントによる早期検出
|
|
129
|
+
if event.get("type") == "stream_event":
|
|
130
|
+
se = event.get("event", {})
|
|
131
|
+
se_type = se.get("type", "")
|
|
132
|
+
|
|
133
|
+
if se_type == "content_block_start":
|
|
134
|
+
cb = se.get("content_block", {})
|
|
135
|
+
if cb.get("type") == "tool_use":
|
|
136
|
+
tool_name = cb.get("name", "")
|
|
137
|
+
if tool_name in ("Skill", "Read"):
|
|
138
|
+
pending_tool_name = tool_name
|
|
139
|
+
accumulated_json = ""
|
|
140
|
+
else:
|
|
141
|
+
return False
|
|
142
|
+
|
|
143
|
+
elif se_type == "content_block_delta" and pending_tool_name:
|
|
144
|
+
delta = se.get("delta", {})
|
|
145
|
+
if delta.get("type") == "input_json_delta":
|
|
146
|
+
accumulated_json += delta.get("partial_json", "")
|
|
147
|
+
if clean_name in accumulated_json:
|
|
148
|
+
return True
|
|
149
|
+
|
|
150
|
+
elif se_type in ("content_block_stop", "message_stop"):
|
|
151
|
+
if pending_tool_name:
|
|
152
|
+
return clean_name in accumulated_json
|
|
153
|
+
if se_type == "message_stop":
|
|
154
|
+
return False
|
|
155
|
+
|
|
156
|
+
# フォールバック: 完全なassistantメッセージ
|
|
157
|
+
elif event.get("type") == "assistant":
|
|
158
|
+
message = event.get("message", {})
|
|
159
|
+
for content_item in message.get("content", []):
|
|
160
|
+
if content_item.get("type") != "tool_use":
|
|
161
|
+
continue
|
|
162
|
+
tool_name = content_item.get("name", "")
|
|
163
|
+
tool_input = content_item.get("input", {})
|
|
164
|
+
if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
|
|
165
|
+
triggered = True
|
|
166
|
+
elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
|
|
167
|
+
triggered = True
|
|
168
|
+
return triggered
|
|
169
|
+
|
|
170
|
+
elif event.get("type") == "result":
|
|
171
|
+
return triggered
|
|
172
|
+
finally:
|
|
173
|
+
# 任意の終了パス(return、例外、タイムアウト)でプロセスをクリーンアップ
|
|
174
|
+
if process.poll() is None:
|
|
175
|
+
process.kill()
|
|
176
|
+
process.wait()
|
|
177
|
+
|
|
178
|
+
return triggered
|
|
179
|
+
finally:
|
|
180
|
+
if command_file.exists():
|
|
181
|
+
command_file.unlink()
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def run_eval(
|
|
185
|
+
eval_set: list[dict],
|
|
186
|
+
skill_name: str,
|
|
187
|
+
description: str,
|
|
188
|
+
num_workers: int,
|
|
189
|
+
timeout: int,
|
|
190
|
+
project_root: Path,
|
|
191
|
+
runs_per_query: int = 1,
|
|
192
|
+
trigger_threshold: float = 0.5,
|
|
193
|
+
model: str | None = None,
|
|
194
|
+
) -> dict:
|
|
195
|
+
"""評価セット全体を実行し、結果を返す。"""
|
|
196
|
+
results = []
|
|
197
|
+
|
|
198
|
+
with ProcessPoolExecutor(max_workers=num_workers) as executor:
|
|
199
|
+
future_to_info = {}
|
|
200
|
+
for item in eval_set:
|
|
201
|
+
for run_idx in range(runs_per_query):
|
|
202
|
+
future = executor.submit(
|
|
203
|
+
run_single_query,
|
|
204
|
+
item["query"],
|
|
205
|
+
skill_name,
|
|
206
|
+
description,
|
|
207
|
+
timeout,
|
|
208
|
+
str(project_root),
|
|
209
|
+
model,
|
|
210
|
+
)
|
|
211
|
+
future_to_info[future] = (item, run_idx)
|
|
212
|
+
|
|
213
|
+
query_triggers: dict[str, list[bool]] = {}
|
|
214
|
+
query_items: dict[str, dict] = {}
|
|
215
|
+
for future in as_completed(future_to_info):
|
|
216
|
+
item, _ = future_to_info[future]
|
|
217
|
+
query = item["query"]
|
|
218
|
+
query_items[query] = item
|
|
219
|
+
if query not in query_triggers:
|
|
220
|
+
query_triggers[query] = []
|
|
221
|
+
try:
|
|
222
|
+
query_triggers[query].append(future.result())
|
|
223
|
+
except Exception as e:
|
|
224
|
+
print(f"警告: クエリが失敗しました: {e}", file=sys.stderr)
|
|
225
|
+
query_triggers[query].append(False)
|
|
226
|
+
|
|
227
|
+
for query, triggers in query_triggers.items():
|
|
228
|
+
item = query_items[query]
|
|
229
|
+
trigger_rate = sum(triggers) / len(triggers)
|
|
230
|
+
should_trigger = item["should_trigger"]
|
|
231
|
+
if should_trigger:
|
|
232
|
+
did_pass = trigger_rate >= trigger_threshold
|
|
233
|
+
else:
|
|
234
|
+
did_pass = trigger_rate < trigger_threshold
|
|
235
|
+
results.append({
|
|
236
|
+
"query": query,
|
|
237
|
+
"should_trigger": should_trigger,
|
|
238
|
+
"trigger_rate": trigger_rate,
|
|
239
|
+
"triggers": sum(triggers),
|
|
240
|
+
"runs": len(triggers),
|
|
241
|
+
"pass": did_pass,
|
|
242
|
+
})
|
|
243
|
+
|
|
244
|
+
passed = sum(1 for r in results if r["pass"])
|
|
245
|
+
total = len(results)
|
|
246
|
+
|
|
247
|
+
return {
|
|
248
|
+
"skill_name": skill_name,
|
|
249
|
+
"description": description,
|
|
250
|
+
"results": results,
|
|
251
|
+
"summary": {
|
|
252
|
+
"total": total,
|
|
253
|
+
"passed": passed,
|
|
254
|
+
"failed": total - passed,
|
|
255
|
+
},
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def main():
    """Command-line entry point: evaluate a skill description and print JSON.

    Parses the arguments, loads the evaluation set (read as UTF-8), checks that
    the skill directory contains ``SKILL.md`` (exits with status 1 otherwise),
    runs ``run_eval`` with either the ``--description`` override or the
    description parsed from the skill, optionally prints progress to stderr,
    and writes the result dict to stdout as indented JSON.
    """
    parser = argparse.ArgumentParser(description="スキルdescriptionのトリガー評価を実行")
    parser.add_argument("--eval-set", required=True, help="評価セットJSONファイルへのパス")
    parser.add_argument("--skill-path", required=True, help="スキルディレクトリへのパス")
    parser.add_argument("--description", default=None, help="テスト用descriptionの上書き")
    parser.add_argument("--num-workers", type=int, default=10, help="並行ワーカー数")
    parser.add_argument("--timeout", type=int, default=30, help="クエリごとのタイムアウト(秒)")
    parser.add_argument("--runs-per-query", type=int, default=3, help="クエリごとの実行回数")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="トリガー率の閾値")
    parser.add_argument("--model", default=None, help="claude -pに使用するモデル(デフォルト: ユーザー設定のモデル)")
    parser.add_argument("--verbose", action="store_true", help="進捗をstderrに出力")
    args = parser.parse_args()

    # Explicit UTF-8: eval sets may contain non-ASCII queries and the platform
    # default encoding is not guaranteed to decode them.
    eval_set = json.loads(Path(args.eval_set).read_text(encoding="utf-8"))
    skill_path = Path(args.skill_path)

    if not (skill_path / "SKILL.md").exists():
        print(f"エラー: {skill_path} にSKILL.mdが見つかりません", file=sys.stderr)
        sys.exit(1)

    name, original_description, content = parse_skill_md(skill_path)
    # An empty --description falls back to the skill's own description.
    description = args.description or original_description
    project_root = find_project_root()

    if args.verbose:
        print(f"評価中: {description}", file=sys.stderr)

    output = run_eval(
        eval_set=eval_set,
        skill_name=name,
        description=description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        project_root=project_root,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        model=args.model,
    )

    if args.verbose:
        summary = output["summary"]
        print(f"結果: {summary['passed']}/{summary['total']} パス", file=sys.stderr)
        for r in output["results"]:
            status = "PASS" if r["pass"] else "FAIL"
            rate_str = f"{r['triggers']}/{r['runs']}"
            print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}", file=sys.stderr)

    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""評価+改善ループを実行。全パスまたは最大イテレーション到達まで繰り返す。
|
|
3
|
+
|
|
4
|
+
run_eval.pyとimprove_description.pyをループで組み合わせ、
|
|
5
|
+
履歴を追跡し最良のdescriptionを返す。
|
|
6
|
+
過学習防止のためtrain/test分割に対応。
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import json
|
|
11
|
+
import random
|
|
12
|
+
import sys
|
|
13
|
+
import time
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
from scripts.generate_report import generate_html
|
|
18
|
+
from scripts.improve_description import improve_description
|
|
19
|
+
from scripts.run_eval import find_project_root, run_eval
|
|
20
|
+
from scripts.utils import parse_skill_md
|
|
21
|
+
except ImportError:
|
|
22
|
+
from generate_report import generate_html
|
|
23
|
+
from improve_description import improve_description
|
|
24
|
+
from run_eval import find_project_root, run_eval
|
|
25
|
+
from utils import parse_skill_md
|
|
26
|
+
|
|
27
|
+
import anthropic
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def split_eval_set(
|
|
31
|
+
eval_set: list[dict],
|
|
32
|
+
holdout: int,
|
|
33
|
+
seed: int | None = None,
|
|
34
|
+
) -> tuple[list[dict], list[dict]]:
|
|
35
|
+
"""評価セットをトレーニングとテストに分割する。
|
|
36
|
+
|
|
37
|
+
holdoutが0の場合、全データをトレーニングに使用する。
|
|
38
|
+
should_trigger=Trueとshould_trigger=Falseの両方から
|
|
39
|
+
均等にホールドアウトする。
|
|
40
|
+
"""
|
|
41
|
+
if holdout <= 0:
|
|
42
|
+
return eval_set, []
|
|
43
|
+
|
|
44
|
+
rng = random.Random(seed)
|
|
45
|
+
|
|
46
|
+
positive = [item for item in eval_set if item.get("should_trigger", True)]
|
|
47
|
+
negative = [item for item in eval_set if not item.get("should_trigger", True)]
|
|
48
|
+
|
|
49
|
+
# 正例と負例から均等にホールドアウト
|
|
50
|
+
pos_holdout = holdout // 2
|
|
51
|
+
neg_holdout = holdout - pos_holdout
|
|
52
|
+
|
|
53
|
+
# 上限調整
|
|
54
|
+
pos_holdout = min(pos_holdout, len(positive) - 1) if len(positive) > 1 else 0
|
|
55
|
+
neg_holdout = min(neg_holdout, len(negative) - 1) if len(negative) > 1 else 0
|
|
56
|
+
|
|
57
|
+
rng.shuffle(positive)
|
|
58
|
+
rng.shuffle(negative)
|
|
59
|
+
|
|
60
|
+
test_set = positive[:pos_holdout] + negative[:neg_holdout]
|
|
61
|
+
train_set = positive[pos_holdout:] + negative[neg_holdout:]
|
|
62
|
+
|
|
63
|
+
return train_set, test_set
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def run_loop(
|
|
67
|
+
eval_set_path: str,
|
|
68
|
+
skill_path: str,
|
|
69
|
+
max_iterations: int = 10,
|
|
70
|
+
num_workers: int = 10,
|
|
71
|
+
timeout: int = 30,
|
|
72
|
+
runs_per_query: int = 3,
|
|
73
|
+
trigger_threshold: float = 0.5,
|
|
74
|
+
holdout: int = 0,
|
|
75
|
+
seed: int | None = None,
|
|
76
|
+
model: str | None = None,
|
|
77
|
+
improve_model: str = "claude-sonnet-4-20250514",
|
|
78
|
+
verbose: bool = False,
|
|
79
|
+
report_path: str | None = None,
|
|
80
|
+
log_dir: str | None = None,
|
|
81
|
+
) -> dict:
|
|
82
|
+
"""評価+改善ループのメイン関数。"""
|
|
83
|
+
eval_set = json.loads(Path(eval_set_path).read_text())
|
|
84
|
+
skill_dir = Path(skill_path)
|
|
85
|
+
|
|
86
|
+
if not (skill_dir / "SKILL.md").exists():
|
|
87
|
+
print(f"エラー: {skill_dir} にSKILL.mdが見つかりません", file=sys.stderr)
|
|
88
|
+
sys.exit(1)
|
|
89
|
+
|
|
90
|
+
name, original_description, content = parse_skill_md(skill_dir)
|
|
91
|
+
project_root = find_project_root()
|
|
92
|
+
|
|
93
|
+
# train/test分割
|
|
94
|
+
train_set, test_set = split_eval_set(eval_set, holdout, seed)
|
|
95
|
+
|
|
96
|
+
if verbose:
|
|
97
|
+
print(f"スキル: {name}", file=sys.stderr)
|
|
98
|
+
print(f"トレーニングクエリ: {len(train_set)}, テストクエリ: {len(test_set)}", file=sys.stderr)
|
|
99
|
+
print(f"最大イテレーション: {max_iterations}", file=sys.stderr)
|
|
100
|
+
print(f"オリジナルdescription: {original_description}", file=sys.stderr)
|
|
101
|
+
|
|
102
|
+
client = anthropic.Anthropic()
|
|
103
|
+
current_description = original_description
|
|
104
|
+
history: list[dict] = []
|
|
105
|
+
improve_history: list[dict] = []
|
|
106
|
+
log_path = Path(log_dir) if log_dir else None
|
|
107
|
+
|
|
108
|
+
for iteration in range(max_iterations):
|
|
109
|
+
if verbose:
|
|
110
|
+
print(f"\n--- イテレーション {iteration} ---", file=sys.stderr)
|
|
111
|
+
print(f"description: {current_description[:100]}...", file=sys.stderr)
|
|
112
|
+
|
|
113
|
+
# トレーニング評価
|
|
114
|
+
train_results = run_eval(
|
|
115
|
+
eval_set=train_set,
|
|
116
|
+
skill_name=name,
|
|
117
|
+
description=current_description,
|
|
118
|
+
num_workers=num_workers,
|
|
119
|
+
timeout=timeout,
|
|
120
|
+
project_root=project_root,
|
|
121
|
+
runs_per_query=runs_per_query,
|
|
122
|
+
trigger_threshold=trigger_threshold,
|
|
123
|
+
model=model,
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
# テスト評価(テストセットがある場合)
|
|
127
|
+
test_results = None
|
|
128
|
+
if test_set:
|
|
129
|
+
test_results = run_eval(
|
|
130
|
+
eval_set=test_set,
|
|
131
|
+
skill_name=name,
|
|
132
|
+
description=current_description,
|
|
133
|
+
num_workers=num_workers,
|
|
134
|
+
timeout=timeout,
|
|
135
|
+
project_root=project_root,
|
|
136
|
+
runs_per_query=runs_per_query,
|
|
137
|
+
trigger_threshold=trigger_threshold,
|
|
138
|
+
model=model,
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
# 履歴エントリの構築
|
|
142
|
+
entry: dict = {
|
|
143
|
+
"description": current_description,
|
|
144
|
+
"train_passed": train_results["summary"]["passed"],
|
|
145
|
+
"train_failed": train_results["summary"]["failed"],
|
|
146
|
+
"train_total": train_results["summary"]["total"],
|
|
147
|
+
"train_results": train_results["results"],
|
|
148
|
+
}
|
|
149
|
+
if test_results:
|
|
150
|
+
entry["test_passed"] = test_results["summary"]["passed"]
|
|
151
|
+
entry["test_failed"] = test_results["summary"]["failed"]
|
|
152
|
+
entry["test_total"] = test_results["summary"]["total"]
|
|
153
|
+
entry["test_results"] = test_results["results"]
|
|
154
|
+
|
|
155
|
+
history.append(entry)
|
|
156
|
+
|
|
157
|
+
if verbose:
|
|
158
|
+
train_s = f"{train_results['summary']['passed']}/{train_results['summary']['total']}"
|
|
159
|
+
msg = f"トレーニングスコア: {train_s}"
|
|
160
|
+
if test_results:
|
|
161
|
+
test_s = f"{test_results['summary']['passed']}/{test_results['summary']['total']}"
|
|
162
|
+
msg += f", テストスコア: {test_s}"
|
|
163
|
+
print(msg, file=sys.stderr)
|
|
164
|
+
|
|
165
|
+
# レポート更新
|
|
166
|
+
if report_path:
|
|
167
|
+
output_data = {"history": history, "holdout": holdout}
|
|
168
|
+
report_html = generate_html(output_data, auto_refresh=True, skill_name=name)
|
|
169
|
+
Path(report_path).write_text(report_html)
|
|
170
|
+
if verbose:
|
|
171
|
+
print(f"レポートを更新しました: {report_path}", file=sys.stderr)
|
|
172
|
+
|
|
173
|
+
# 全パスなら終了
|
|
174
|
+
if train_results["summary"]["failed"] == 0:
|
|
175
|
+
if test_results is None or test_results["summary"]["failed"] == 0:
|
|
176
|
+
if verbose:
|
|
177
|
+
print("全クエリパス。ループを終了します。", file=sys.stderr)
|
|
178
|
+
break
|
|
179
|
+
|
|
180
|
+
# 最終イテレーションでなければ改善
|
|
181
|
+
if iteration < max_iterations - 1:
|
|
182
|
+
if verbose:
|
|
183
|
+
print("descriptionを改善中...", file=sys.stderr)
|
|
184
|
+
|
|
185
|
+
new_description = improve_description(
|
|
186
|
+
client=client,
|
|
187
|
+
skill_name=name,
|
|
188
|
+
skill_content=content,
|
|
189
|
+
current_description=current_description,
|
|
190
|
+
eval_results=train_results,
|
|
191
|
+
history=improve_history,
|
|
192
|
+
model=improve_model,
|
|
193
|
+
test_results=test_results,
|
|
194
|
+
log_dir=log_path,
|
|
195
|
+
iteration=iteration,
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
# 改善履歴を更新
|
|
199
|
+
improve_entry: dict = {
|
|
200
|
+
"description": current_description,
|
|
201
|
+
"train_passed": train_results["summary"]["passed"],
|
|
202
|
+
"train_total": train_results["summary"]["total"],
|
|
203
|
+
"results": train_results["results"],
|
|
204
|
+
}
|
|
205
|
+
if test_results:
|
|
206
|
+
improve_entry["test_passed"] = test_results["summary"]["passed"]
|
|
207
|
+
improve_entry["test_total"] = test_results["summary"]["total"]
|
|
208
|
+
improve_history.append(improve_entry)
|
|
209
|
+
|
|
210
|
+
current_description = new_description
|
|
211
|
+
|
|
212
|
+
if verbose:
|
|
213
|
+
print(f"新しいdescription: {new_description[:100]}...", file=sys.stderr)
|
|
214
|
+
|
|
215
|
+
# 最良のdescriptionを選択(テスト > トレーニングで優先)
|
|
216
|
+
best_idx = 0
|
|
217
|
+
best_test = -1
|
|
218
|
+
best_train = -1
|
|
219
|
+
for i, h in enumerate(history):
|
|
220
|
+
t_passed = h.get("test_passed", -1)
|
|
221
|
+
tr_passed = h.get("train_passed", h.get("passed", 0))
|
|
222
|
+
if t_passed > best_test or (t_passed == best_test and tr_passed > best_train):
|
|
223
|
+
best_test = t_passed
|
|
224
|
+
best_train = tr_passed
|
|
225
|
+
best_idx = i
|
|
226
|
+
|
|
227
|
+
best = history[best_idx]
|
|
228
|
+
|
|
229
|
+
# 最終レポート(auto_refreshオフ)
|
|
230
|
+
if report_path:
|
|
231
|
+
output_data = {"history": history, "holdout": holdout}
|
|
232
|
+
report_html = generate_html(output_data, auto_refresh=False, skill_name=name)
|
|
233
|
+
Path(report_path).write_text(report_html)
|
|
234
|
+
|
|
235
|
+
output = {
|
|
236
|
+
"skill_name": name,
|
|
237
|
+
"original_description": original_description,
|
|
238
|
+
"best_description": best["description"],
|
|
239
|
+
"best_iteration": best_idx,
|
|
240
|
+
"history": history,
|
|
241
|
+
"holdout": holdout,
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
if verbose:
|
|
245
|
+
print(f"\n最良のdescription (イテレーション {best_idx}): {best['description']}", file=sys.stderr)
|
|
246
|
+
train_s = f"{best['train_passed']}/{best['train_total']}"
|
|
247
|
+
msg = f"最良スコア - トレーニング: {train_s}"
|
|
248
|
+
if best.get("test_passed") is not None:
|
|
249
|
+
test_s = f"{best['test_passed']}/{best['test_total']}"
|
|
250
|
+
msg += f", テスト: {test_s}"
|
|
251
|
+
print(msg, file=sys.stderr)
|
|
252
|
+
|
|
253
|
+
return output
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def main():
    """Command-line entry point: parse options, run the eval/improve loop, print JSON.

    Every command-line option is forwarded to ``run_loop`` as a keyword
    argument, and the dictionary it returns is written to stdout as
    indented JSON.
    """
    cli = argparse.ArgumentParser(description="評価+改善ループを実行")

    # (flag, add_argument keyword arguments), registered in this order so the
    # generated --help output lists the options in the same sequence.
    option_specs = [
        ("--eval-set", {"required": True, "help": "評価セットJSONファイルへのパス"}),
        ("--skill-path", {"required": True, "help": "スキルディレクトリへのパス"}),
        ("--max-iterations", {"type": int, "default": 10, "help": "最大イテレーション数(デフォルト: 10)"}),
        ("--num-workers", {"type": int, "default": 10, "help": "並行ワーカー数(デフォルト: 10)"}),
        ("--timeout", {"type": int, "default": 30, "help": "クエリごとのタイムアウト秒数(デフォルト: 30)"}),
        ("--runs-per-query", {"type": int, "default": 3, "help": "クエリごとの実行回数(デフォルト: 3)"}),
        ("--trigger-threshold", {"type": float, "default": 0.5, "help": "トリガー率の閾値(デフォルト: 0.5)"}),
        ("--holdout", {"type": int, "default": 0, "help": "テスト用ホールドアウトクエリ数(デフォルト: 0)"}),
        ("--seed", {"type": int, "default": None, "help": "train/test分割のランダムシード"}),
        ("--model", {"default": None, "help": "評価時にclaude -pに使用するモデル"}),
        (
            "--improve-model",
            {
                "default": "claude-sonnet-4-20250514",
                "help": "description改善に使用するモデル(デフォルト: claude-sonnet-4-20250514)",
            },
        ),
        ("--verbose", {"action": "store_true", "help": "進捗をstderrに出力"}),
        ("--report", {"default": None, "help": "HTMLレポートの出力先パス(ライブ更新あり)"}),
        ("--log-dir", {"default": None, "help": "改善トランスクリプトのログディレクトリ"}),
    ]
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()

    # Map parsed options onto run_loop's keyword parameters; only --eval-set
    # and --report are renamed on the way through.
    loop_kwargs = {
        "eval_set_path": opts.eval_set,
        "skill_path": opts.skill_path,
        "max_iterations": opts.max_iterations,
        "num_workers": opts.num_workers,
        "timeout": opts.timeout,
        "runs_per_query": opts.runs_per_query,
        "trigger_threshold": opts.trigger_threshold,
        "holdout": opts.holdout,
        "seed": opts.seed,
        "model": opts.model,
        "improve_model": opts.improve_model,
        "verbose": opts.verbose,
        "report_path": opts.report,
        "log_dir": opts.log_dir,
    }
    result = run_loop(**loop_kwargs)

    print(json.dumps(result, indent=2))
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""スキルクリエイタースクリプト共通ユーティリティ。"""
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
    """Parse a skill's SKILL.md and return ``(name, description, full_content)``.

    Only the front matter (the lines between the opening ``---`` and the next
    ``---``) is inspected, and only the top-level ``name:`` and
    ``description:`` keys are read; this is a minimal line-based reader, not a
    full YAML parser.

    Args:
        skill_path: Directory that contains the ``SKILL.md`` file.

    Returns:
        A tuple ``(name, description, content)`` where ``content`` is the whole
        file text. ``name`` and ``description`` are empty strings when the
        corresponding key is absent. Surrounding quotes are stripped from
        single-line values; a block-scalar description (``>``, ``|`` with an
        optional ``-``/``+`` chomping indicator) is collapsed onto one line
        with its parts joined by single spaces.

    Raises:
        ValueError: If the file does not start with ``---`` or the closing
            ``---`` of the front matter is missing.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    content = (skill_path / "SKILL.md").read_text(encoding="utf-8")
    lines = content.split("\n")

    if lines[0].strip() != "---":
        raise ValueError("SKILL.mdにフロントマターがありません(開始の---がありません)")

    # Locate the line that closes the front matter.
    end_idx = None
    for i, line in enumerate(lines[1:], start=1):
        if line.strip() == "---":
            end_idx = i
            break

    if end_idx is None:
        raise ValueError("SKILL.mdにフロントマターがありません(終了の---がありません)")

    name = ""
    description = ""
    frontmatter_lines = lines[1:end_idx]
    # Block-scalar headers, including the "keep" (+) chomping variants.
    block_indicators = (">", "|", ">-", "|-", ">+", "|+")

    i = 0
    while i < len(frontmatter_lines):
        line = frontmatter_lines[i]
        if line.startswith("name:"):
            name = line[len("name:"):].strip().strip('"').strip("'")
        elif line.startswith("description:"):
            value = line[len("description:"):].strip()
            if value in block_indicators:
                # Gather the indented continuation lines. Blank lines are
                # allowed inside the block and do not terminate it; the block
                # ends at the first non-blank line that is not indented.
                parts: list[str] = []
                i += 1
                while i < len(frontmatter_lines):
                    candidate = frontmatter_lines[i]
                    if not candidate.startswith((" ", "\t")) and candidate.strip():
                        break
                    parts.append(candidate.strip())
                    i += 1
                description = " ".join(part for part in parts if part)
                # ``i`` already points at the next unread line.
                continue
            description = value.strip('"').strip("'")
        i += 1

    return name, description, content
|