@einja/dev-cli 0.1.40 → 0.1.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -1
- package/dist/cli.js +1 -0
- package/dist/cli.js.map +1 -1
- package/dist/commands/init.d.ts.map +1 -1
- package/dist/commands/init.js +71 -1
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/list.js.map +1 -1
- package/dist/commands/sync.d.ts.map +1 -1
- package/dist/commands/sync.js +187 -13
- package/dist/commands/sync.js.map +1 -1
- package/dist/commands/task-loop/lib/github-client.test.js.map +1 -1
- package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js +2 -2
- package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js.map +1 -1
- package/dist/lib/dependency-checker.d.ts.map +1 -1
- package/dist/lib/merger.d.ts +12 -0
- package/dist/lib/merger.d.ts.map +1 -1
- package/dist/lib/merger.js +28 -0
- package/dist/lib/merger.js.map +1 -1
- package/dist/lib/preset-update/cli-repo-detector.d.ts.map +1 -1
- package/dist/lib/preset-update/file-copier.d.ts.map +1 -1
- package/dist/lib/preset-update/file-copier.js +3 -3
- package/dist/lib/preset-update/file-copier.js.map +1 -1
- package/dist/lib/preset-update/preset-finder.d.ts.map +1 -1
- package/dist/lib/preset.d.ts.map +1 -1
- package/dist/lib/sync/category-validator.d.ts +1 -1
- package/dist/lib/sync/category-validator.d.ts.map +1 -1
- package/dist/lib/sync/category-validator.js +2 -1
- package/dist/lib/sync/category-validator.js.map +1 -1
- package/dist/lib/sync/category-validator.test.js +3 -1
- package/dist/lib/sync/category-validator.test.js.map +1 -1
- package/dist/lib/sync/conflict-reporter.d.ts.map +1 -1
- package/dist/lib/sync/diff-engine.d.ts.map +1 -1
- package/dist/lib/sync/file-filter.d.ts.map +1 -1
- package/dist/lib/sync/file-filter.js +1 -0
- package/dist/lib/sync/file-filter.js.map +1 -1
- package/dist/lib/sync/integration.test.js +255 -69
- package/dist/lib/sync/integration.test.js.map +1 -1
- package/dist/lib/sync/json-processor.d.ts +4 -4
- package/dist/lib/sync/json-processor.d.ts.map +1 -1
- package/dist/lib/sync/json-processor.js +11 -11
- package/dist/lib/sync/json-processor.js.map +1 -1
- package/dist/lib/sync/marker-processor.d.ts +60 -8
- package/dist/lib/sync/marker-processor.d.ts.map +1 -1
- package/dist/lib/sync/marker-processor.js +117 -26
- package/dist/lib/sync/marker-processor.js.map +1 -1
- package/dist/lib/sync/marker-processor.test.js +261 -40
- package/dist/lib/sync/marker-processor.test.js.map +1 -1
- package/dist/lib/sync/metadata-manager.d.ts +4 -0
- package/dist/lib/sync/metadata-manager.d.ts.map +1 -1
- package/dist/lib/sync/metadata-manager.js +15 -0
- package/dist/lib/sync/metadata-manager.js.map +1 -1
- package/dist/lib/sync/metadata-manager.test.js +69 -0
- package/dist/lib/sync/metadata-manager.test.js.map +1 -1
- package/dist/lib/sync/orphan-cleaner.d.ts +29 -0
- package/dist/lib/sync/orphan-cleaner.d.ts.map +1 -0
- package/dist/lib/sync/orphan-cleaner.js +80 -0
- package/dist/lib/sync/orphan-cleaner.js.map +1 -0
- package/dist/lib/sync/orphan-cleaner.test.d.ts +2 -0
- package/dist/lib/sync/orphan-cleaner.test.d.ts.map +1 -0
- package/dist/lib/sync/orphan-cleaner.test.js +169 -0
- package/dist/lib/sync/orphan-cleaner.test.js.map +1 -0
- package/dist/lib/sync/project-private-synchronizer.d.ts +52 -0
- package/dist/lib/sync/project-private-synchronizer.d.ts.map +1 -0
- package/dist/lib/sync/project-private-synchronizer.js +110 -0
- package/dist/lib/sync/project-private-synchronizer.js.map +1 -0
- package/dist/lib/sync/project-private-synchronizer.test.d.ts +2 -0
- package/dist/lib/sync/project-private-synchronizer.test.d.ts.map +1 -0
- package/dist/lib/sync/project-private-synchronizer.test.js +348 -0
- package/dist/lib/sync/project-private-synchronizer.test.js.map +1 -0
- package/dist/types/index.d.ts +1 -0
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/sync.d.ts +36 -6
- package/dist/types/sync.d.ts.map +1 -1
- package/dist/types/sync.js +2 -2
- package/dist/types/sync.js.map +1 -1
- package/package.json +5 -4
- package/presets/default/.claude/agents/einja/Explore.md +140 -0
- package/presets/default/.claude/agents/einja/backend-architect.md +21 -1
- package/presets/default/.claude/agents/einja/codex-agent.md +5 -1
- package/presets/default/.claude/agents/einja/design-engineer.md +5 -1
- package/presets/default/.claude/agents/einja/docs/docs-updater.md +7 -93
- package/presets/default/.claude/agents/einja/frontend-architect.md +21 -1
- package/presets/default/.claude/agents/einja/frontend-coder.md +5 -1
- package/presets/default/.claude/agents/einja/{specs/spec-design-generator.md → issue-specs/design-generator.md} +16 -8
- package/presets/default/.claude/agents/einja/{specs/spec-qa-generator.md → issue-specs/qa-generator.md} +10 -4
- package/presets/default/.claude/agents/einja/{specs/spec-requirements-generator.md → issue-specs/requirements-generator.md} +9 -6
- package/presets/default/.claude/agents/einja/{specs/spec-tasks-generator.md → issue-specs/tasks-generator.md} +19 -16
- package/presets/default/.claude/agents/einja/{specs/spec-tasks-validator.md → issue-specs/tasks-validator.md} +13 -9
- package/presets/default/.claude/agents/einja/issue-specs/ui-design-generator.md +114 -0
- package/presets/default/.claude/agents/einja/task/task-executer.md +64 -116
- package/presets/default/.claude/agents/einja/task/task-modification-analyzer.md +6 -2
- package/presets/default/.claude/agents/einja/task/task-qa.md +7 -3
- package/presets/default/.claude/agents/einja/task/task-reviewer.md +17 -1
- package/presets/default/.claude/commands/einja/einja-sync.md +124 -45
- package/presets/default/.claude/commands/einja/frontend-implement.md +3 -1
- package/presets/default/.claude/commands/einja/issue-exec.md +413 -0
- package/presets/default/.claude/commands/einja/start-dev.md +4 -0
- package/presets/default/.claude/commands/einja/sync-cursor-commands.md +10 -6
- package/presets/default/.claude/commands/einja/{update-docs-by-task-specs.md → update-docs-by-issue-specs.md} +61 -57
- package/presets/default/.claude/hooks/einja/plan-mode-skill-loader.sh +27 -0
- package/presets/default/.claude/settings.json +29 -5
- package/presets/default/.claude/skills/{einja-general-context-loader → _einja-general-context-loader}/SKILL.md +6 -2
- package/presets/default/.claude/skills/{einja-output-format → _einja-output-format}/SKILL.md +5 -1
- package/presets/default/.claude/skills/_einja-project-overview/SKILL.md +29 -0
- package/presets/default/.claude/skills/{einja-spec-context-loader → _einja-spec-context-loader}/SKILL.md +9 -5
- package/presets/default/.claude/skills/einja-coding-standards/references/testing-strategy.md +899 -0
- package/presets/default/.claude/skills/einja-conflict-resolver/SKILL.md +5 -1
- package/presets/default/.claude/skills/einja-create-pr/SKILL.md +138 -0
- package/presets/default/.claude/skills/einja-infra-maintenance/SKILL.md +779 -0
- package/presets/default/.claude/{commands/einja/spec-create.md → skills/einja-issue-spec-create/SKILL.md} +60 -23
- package/presets/default/.claude/skills/einja-issue-spec-generator/SKILL.md +105 -0
- package/presets/default/.claude/skills/einja-issue-spec-generator/references/format-rules.md +35 -0
- package/presets/default/.claude/skills/einja-issue-spec-validator/SKILL.md +130 -0
- package/presets/default/.claude/skills/einja-issue-spec-validator/references/validation-rules.md +52 -0
- package/presets/default/.claude/skills/einja-npm-release/SKILL.md +242 -0
- package/presets/default/.claude/skills/einja-skill-creator/SKILL.md +311 -263
- package/presets/default/.claude/skills/einja-skill-creator/agents/analyzer.md +274 -0
- package/presets/default/.claude/skills/einja-skill-creator/agents/comparator.md +202 -0
- package/presets/default/.claude/skills/einja-skill-creator/agents/grader.md +195 -0
- package/presets/default/.claude/skills/einja-skill-creator/assets/eval_review.html +146 -0
- package/presets/default/.claude/skills/einja-skill-creator/eval-viewer/generate_review.py +471 -0
- package/presets/default/.claude/skills/einja-skill-creator/eval-viewer/viewer.html +1325 -0
- package/presets/default/.claude/skills/einja-skill-creator/references/schemas.md +430 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/compare_runs.py +154 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/generate_report.py +272 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/improve_description.py +247 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/init_skill.py +13 -19
- package/presets/default/.claude/skills/einja-skill-creator/scripts/package_skill.py +36 -7
- package/presets/default/.claude/skills/einja-skill-creator/scripts/run_eval.py +310 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/run_loop.py +375 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/utils.py +48 -0
- package/presets/default/.claude/skills/einja-skill-first/SKILL.md +265 -0
- package/presets/default/.claude/skills/einja-subagent-question-protocol/SKILL.md +98 -0
- package/presets/default/.claude/skills/einja-task-commit/SKILL.md +11 -7
- package/presets/default/.claude/{commands/einja/task-exec.md → skills/einja-task-exec/SKILL.md} +106 -89
- package/presets/default/.claude/skills/einja-task-qa/SKILL.md +8 -4
- package/presets/default/.claude/skills/einja-task-qa/references/troubleshooting.md +1 -1
- package/presets/default/.claude/skills/einja-task-qa/references/usage-patterns.md +2 -2
- package/presets/default/.claude/skills/einja-team-exec/SKILL.md +165 -0
- package/presets/default/.envrc +5 -0
- package/presets/default/.mcp.json +2 -12
- package/presets/default/CLAUDE.md.template +45 -8
- package/presets/default/docs/einja/example/specs/issues/issue999-example-task/tasks.md +1 -1
- package/presets/default/docs/einja/instructions/deployment-setup.md +4 -9
- package/presets/default/docs/einja/instructions/environment-setup.md +3 -8
- package/presets/default/docs/einja/instructions/issue-exec-workflow.md +276 -0
- package/presets/default/docs/einja/instructions/local-server-environment-and-worktree.md +71 -9
- package/presets/default/docs/einja/instructions/neon-cli-reference.md +3 -8
- package/presets/default/docs/einja/instructions/setup-flow.md +279 -0
- package/presets/default/docs/einja/instructions/task-execute.md +63 -68
- package/presets/default/docs/einja/instructions/vercel-cli-reference.md +17 -10
- package/presets/default/docs/einja/steering/README.md +11 -11
- package/presets/default/docs/einja/steering/acceptance-criteria-and-qa-guide.md +4 -9
- package/presets/default/docs/einja/steering/architecture.md +3 -8
- package/presets/default/docs/einja/steering/branch-strategy.md +63 -70
- package/presets/default/docs/einja/steering/commit-rules.md +3 -8
- package/presets/default/docs/einja/steering/db-schema-design.md +3 -8
- package/presets/default/docs/einja/steering/development/api-development.md +3 -8
- package/presets/default/docs/einja/steering/development/backend-architecture.md +3 -8
- package/presets/default/docs/einja/steering/development/coding-standards.md +723 -0
- package/presets/default/docs/einja/steering/development/component-design.md +502 -0
- package/presets/default/docs/einja/steering/development/database-guidelines.md +2 -2
- package/presets/default/docs/einja/steering/development/frontend-development.md +3 -8
- package/presets/default/docs/einja/steering/development/playwright-guidelines.md +59 -0
- package/presets/default/docs/einja/steering/development/review-guidelines.md +3 -8
- package/presets/default/docs/einja/steering/development/testing-strategy.md +3 -8
- package/presets/default/docs/einja/steering/development-workflow.md +155 -140
- package/presets/default/docs/einja/steering/infrastructure/deployment.md +156 -55
- package/presets/default/docs/einja/steering/infrastructure/environment-variables.md +4 -8
- package/presets/default/docs/einja/steering/product.md +3 -8
- package/presets/default/docs/einja/steering/task-management.md +22 -110
- package/presets/default/scripts/ensure-serena.sh +75 -0
- package/presets/default/scripts/env-rotate-secrets.ts +396 -0
- package/presets/default/scripts/env-show.ts +130 -0
- package/presets/default/scripts/env.ts +479 -0
- package/presets/default/scripts/init-github.ts +363 -0
- package/presets/default/scripts/init.sh +98 -0
- package/presets/default/scripts/lib/env-common.ts +108 -0
- package/presets/default/scripts/lib/worktree-config.ts +64 -0
- package/presets/default/scripts/setup-dev.ts +655 -0
- package/presets/default/scripts/stop-serena.sh +25 -0
- package/presets/default/scripts/worktree/dev.ts +872 -0
- package/dist/lib/sync/seed-synchronizer.d.ts +0 -27
- package/dist/lib/sync/seed-synchronizer.d.ts.map +0 -1
- package/dist/lib/sync/seed-synchronizer.js +0 -72
- package/dist/lib/sync/seed-synchronizer.js.map +0 -1
- package/dist/lib/sync/seed-synchronizer.test.d.ts +0 -2
- package/dist/lib/sync/seed-synchronizer.test.d.ts.map +0 -1
- package/dist/lib/sync/seed-synchronizer.test.js +0 -147
- package/dist/lib/sync/seed-synchronizer.test.js.map +0 -1
- package/presets/default/.claude/agents/einja/git/conflict-resolver.md +0 -148
- package/presets/default/.claude/hooks/einja/validate-git-commit.sh +0 -239
- package/presets/default/.claude/skills/einja-api-development/SKILL.md +0 -14
- package/presets/default/.claude/skills/einja-backend-architecture/SKILL.md +0 -18
- package/presets/default/.claude/skills/einja-coding-standards/SKILL.md +0 -132
- package/presets/default/.claude/skills/einja-coding-standards/references/import-conventions.md +0 -69
- package/presets/default/.claude/skills/einja-coding-standards/references/naming-conventions.md +0 -107
- package/presets/default/.claude/skills/einja-coding-standards/references/prohibited-patterns.md +0 -169
- package/presets/default/.claude/skills/einja-coding-standards/references/typescript-rules.md +0 -247
- package/presets/default/.claude/skills/einja-component-design/SKILL.md +0 -109
- package/presets/default/.claude/skills/einja-component-design/references/directory-structure.md +0 -117
- package/presets/default/.claude/skills/einja-component-design/references/props-patterns.md +0 -159
- package/presets/default/.claude/skills/einja-component-design/references/styling-guide.md +0 -122
- package/presets/default/.claude/skills/einja-frontend-development/SKILL.md +0 -14
- package/presets/default/.claude/skills/einja-project-overview/SKILL.md +0 -35
- package/presets/default/docs/einja/instructions/task-vibe-kanban-loop.md +0 -565
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""スキルdescriptionのトリガー評価を実行。
|
|
3
|
+
|
|
4
|
+
スキルのdescriptionが一連のクエリに対してClaudeのスキル使用を
|
|
5
|
+
トリガーするかどうかをテストする。結果をJSONで出力。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
import select
|
|
12
|
+
import subprocess
|
|
13
|
+
import sys
|
|
14
|
+
import time
|
|
15
|
+
import uuid
|
|
16
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
try:
|
|
20
|
+
from scripts.utils import parse_skill_md
|
|
21
|
+
except ImportError:
|
|
22
|
+
from utils import parse_skill_md
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def find_project_root() -> Path:
    """Locate the project root by searching upward from the working directory.

    The current working directory and then each of its ancestors are checked,
    nearest first, for a ``.claude`` directory. This mimics how Claude Code
    discovers its project root, so that the command files this script creates
    land where ``claude -p`` will look for them.

    Returns:
        The nearest directory that contains ``.claude/``, or the current
        working directory itself when no such directory exists.
    """
    start = Path.cwd()
    candidates = (start, *start.parents)
    return next(
        (directory for directory in candidates if (directory / ".claude").is_dir()),
        start,
    )
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def run_single_query(
    query: str,
    skill_name: str,
    skill_description: str,
    timeout: int,
    project_root: str,
    model: str | None = None,
) -> bool:
    """Run one query through ``claude -p`` and report whether the skill fired.

    A temporary command file carrying the description is written to
    ``<project_root>/.claude/commands/`` so that it shows up in Claude's
    available-skills list, then ``claude -p`` is run with the raw query.
    ``--include-partial-messages`` is used so the trigger can be detected
    early from stream events instead of waiting for the full response.

    Args:
        query: Prompt passed to ``claude -p``.
        skill_name: Base name of the skill; a random suffix is appended so
            concurrent runs do not collide.
        skill_description: Description text placed in the command file.
        timeout: Maximum number of seconds to watch the process output.
        project_root: Directory used both as the subprocess working directory
            and as the parent of ``.claude/commands``.
        model: Optional model name forwarded via ``--model``.

    Returns:
        True when a ``Skill`` or ``Read`` tool call referencing the temporary
        command name is observed; False otherwise (including on timeout, or
        when the first tool call is some other tool).
    """
    unique_id = uuid.uuid4().hex[:8]
    clean_name = f"{skill_name}-skill-{unique_id}"
    project_commands_dir = Path(project_root) / ".claude" / "commands"
    command_file = project_commands_dir / f"{clean_name}.md"

    try:
        project_commands_dir.mkdir(parents=True, exist_ok=True)
        # Use a YAML block scalar so quotes inside the description cannot
        # break the front matter.
        indented_desc = "\n ".join(skill_description.split("\n"))
        command_content = (
            f"---\n"
            f"description: |\n"
            f" {indented_desc}\n"
            f"---\n\n"
            f"# {skill_name}\n\n"
            f"This skill handles: {skill_description}\n"
        )
        # Write as UTF-8 explicitly: descriptions may contain non-ASCII text
        # and the platform default encoding is not guaranteed to handle it.
        command_file.write_text(command_content, encoding="utf-8")

        cmd = [
            "claude",
            "-p", query,
            "--output-format", "stream-json",
            "--verbose",
            "--include-partial-messages",
        ]
        if model:
            cmd.extend(["--model", model])

        # Drop CLAUDECODE so that `claude -p` may be nested inside a running
        # Claude Code session.
        env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            cwd=project_root,
            env=env,
        )

        triggered = False
        start_time = time.time()
        # Keep raw bytes and decode one complete line at a time. Decoding
        # each 8 KiB chunk separately would mangle any multi-byte UTF-8
        # character that straddles a chunk boundary.
        buffer = b""
        # State for stream-event based detection.
        pending_tool_name = None
        accumulated_json = ""

        try:
            finished = False
            while not finished and time.time() - start_time < timeout:
                if process.poll() is not None:
                    # Process already exited: drain what is left and still
                    # parse it below, so trailing events are not lost.
                    chunk = process.stdout.read() or b""
                    finished = True
                else:
                    ready, _, _ = select.select([process.stdout], [], [], 1.0)
                    if not ready:
                        continue
                    chunk = os.read(process.stdout.fileno(), 8192)
                    if not chunk:
                        # EOF on the pipe.
                        finished = True

                buffer += chunk

                while b"\n" in buffer:
                    raw_line, buffer = buffer.split(b"\n", 1)
                    line = raw_line.decode("utf-8", errors="replace").strip()
                    if not line:
                        continue

                    try:
                        event = json.loads(line)
                    except json.JSONDecodeError:
                        continue

                    # Early detection from partial stream events.
                    if event.get("type") == "stream_event":
                        se = event.get("event", {})
                        se_type = se.get("type", "")

                        if se_type == "content_block_start":
                            cb = se.get("content_block", {})
                            if cb.get("type") == "tool_use":
                                tool_name = cb.get("name", "")
                                if tool_name in ("Skill", "Read"):
                                    pending_tool_name = tool_name
                                    accumulated_json = ""
                                else:
                                    # Any other tool being called first means
                                    # the skill was not chosen.
                                    return False

                        elif se_type == "content_block_delta" and pending_tool_name:
                            delta = se.get("delta", {})
                            if delta.get("type") == "input_json_delta":
                                accumulated_json += delta.get("partial_json", "")
                                if clean_name in accumulated_json:
                                    return True

                        elif se_type in ("content_block_stop", "message_stop"):
                            if pending_tool_name:
                                return clean_name in accumulated_json
                            if se_type == "message_stop":
                                return False

                    # Fallback: a complete assistant message.
                    elif event.get("type") == "assistant":
                        message = event.get("message", {})
                        for content_item in message.get("content", []):
                            if content_item.get("type") != "tool_use":
                                continue
                            tool_name = content_item.get("name", "")
                            tool_input = content_item.get("input", {})
                            if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
                                triggered = True
                            elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
                                triggered = True
                        return triggered

                    elif event.get("type") == "result":
                        return triggered
        finally:
            # Clean up the process on every exit path (return, exception,
            # timeout) and release the pipe explicitly.
            if process.poll() is None:
                process.kill()
                process.wait()
            if process.stdout is not None:
                process.stdout.close()

        return triggered
    finally:
        if command_file.exists():
            command_file.unlink()
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def run_eval(
|
|
185
|
+
eval_set: list[dict],
|
|
186
|
+
skill_name: str,
|
|
187
|
+
description: str,
|
|
188
|
+
num_workers: int,
|
|
189
|
+
timeout: int,
|
|
190
|
+
project_root: Path,
|
|
191
|
+
runs_per_query: int = 1,
|
|
192
|
+
trigger_threshold: float = 0.5,
|
|
193
|
+
model: str | None = None,
|
|
194
|
+
) -> dict:
|
|
195
|
+
"""評価セット全体を実行し、結果を返す。"""
|
|
196
|
+
results = []
|
|
197
|
+
|
|
198
|
+
with ProcessPoolExecutor(max_workers=num_workers) as executor:
|
|
199
|
+
future_to_info = {}
|
|
200
|
+
for item in eval_set:
|
|
201
|
+
for run_idx in range(runs_per_query):
|
|
202
|
+
future = executor.submit(
|
|
203
|
+
run_single_query,
|
|
204
|
+
item["query"],
|
|
205
|
+
skill_name,
|
|
206
|
+
description,
|
|
207
|
+
timeout,
|
|
208
|
+
str(project_root),
|
|
209
|
+
model,
|
|
210
|
+
)
|
|
211
|
+
future_to_info[future] = (item, run_idx)
|
|
212
|
+
|
|
213
|
+
query_triggers: dict[str, list[bool]] = {}
|
|
214
|
+
query_items: dict[str, dict] = {}
|
|
215
|
+
for future in as_completed(future_to_info):
|
|
216
|
+
item, _ = future_to_info[future]
|
|
217
|
+
query = item["query"]
|
|
218
|
+
query_items[query] = item
|
|
219
|
+
if query not in query_triggers:
|
|
220
|
+
query_triggers[query] = []
|
|
221
|
+
try:
|
|
222
|
+
query_triggers[query].append(future.result())
|
|
223
|
+
except Exception as e:
|
|
224
|
+
print(f"警告: クエリが失敗しました: {e}", file=sys.stderr)
|
|
225
|
+
query_triggers[query].append(False)
|
|
226
|
+
|
|
227
|
+
for query, triggers in query_triggers.items():
|
|
228
|
+
item = query_items[query]
|
|
229
|
+
trigger_rate = sum(triggers) / len(triggers)
|
|
230
|
+
should_trigger = item["should_trigger"]
|
|
231
|
+
if should_trigger:
|
|
232
|
+
did_pass = trigger_rate >= trigger_threshold
|
|
233
|
+
else:
|
|
234
|
+
did_pass = trigger_rate < trigger_threshold
|
|
235
|
+
results.append({
|
|
236
|
+
"query": query,
|
|
237
|
+
"should_trigger": should_trigger,
|
|
238
|
+
"trigger_rate": trigger_rate,
|
|
239
|
+
"triggers": sum(triggers),
|
|
240
|
+
"runs": len(triggers),
|
|
241
|
+
"pass": did_pass,
|
|
242
|
+
})
|
|
243
|
+
|
|
244
|
+
passed = sum(1 for r in results if r["pass"])
|
|
245
|
+
total = len(results)
|
|
246
|
+
|
|
247
|
+
return {
|
|
248
|
+
"skill_name": skill_name,
|
|
249
|
+
"description": description,
|
|
250
|
+
"results": results,
|
|
251
|
+
"summary": {
|
|
252
|
+
"total": total,
|
|
253
|
+
"passed": passed,
|
|
254
|
+
"failed": total - passed,
|
|
255
|
+
},
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def main():
    """Command-line entry point: evaluate a skill description and print JSON.

    Parses the arguments, loads the eval set, checks that the skill directory
    contains ``SKILL.md`` (exiting with status 1 otherwise), runs the
    evaluation and prints the result dict as JSON on stdout. With
    ``--verbose``, progress and a per-query summary go to stderr.
    """
    parser = argparse.ArgumentParser(description="スキルdescriptionのトリガー評価を実行")
    parser.add_argument("--eval-set", required=True, help="評価セットJSONファイルへのパス")
    parser.add_argument("--skill-path", required=True, help="スキルディレクトリへのパス")
    parser.add_argument("--description", default=None, help="テスト用descriptionの上書き")
    parser.add_argument("--num-workers", type=int, default=10, help="並行ワーカー数")
    parser.add_argument("--timeout", type=int, default=30, help="クエリごとのタイムアウト(秒)")
    parser.add_argument("--runs-per-query", type=int, default=3, help="クエリごとの実行回数")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="トリガー率の閾値")
    parser.add_argument("--model", default=None, help="claude -pに使用するモデル(デフォルト: ユーザー設定のモデル)")
    parser.add_argument("--verbose", action="store_true", help="進捗をstderrに出力")
    args = parser.parse_args()

    # Read as UTF-8 explicitly: eval queries may contain non-ASCII text and
    # the locale's default encoding is not guaranteed to decode it.
    eval_set = json.loads(Path(args.eval_set).read_text(encoding="utf-8"))
    skill_path = Path(args.skill_path)

    if not (skill_path / "SKILL.md").exists():
        print(f"エラー: {skill_path} にSKILL.mdが見つかりません", file=sys.stderr)
        sys.exit(1)

    # The third value returned by parse_skill_md is not needed here.
    name, original_description, _ = parse_skill_md(skill_path)
    description = args.description or original_description
    project_root = find_project_root()

    if args.verbose:
        print(f"評価中: {description}", file=sys.stderr)

    output = run_eval(
        eval_set=eval_set,
        skill_name=name,
        description=description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        project_root=project_root,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        model=args.model,
    )

    if args.verbose:
        summary = output["summary"]
        print(f"結果: {summary['passed']}/{summary['total']} パス", file=sys.stderr)
        for r in output["results"]:
            status = "PASS" if r["pass"] else "FAIL"
            rate_str = f"{r['triggers']}/{r['runs']}"
            print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}", file=sys.stderr)

    print(json.dumps(output, indent=2))
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
if __name__ == "__main__":
|
|
310
|
+
main()
|
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""評価+改善ループを実行。全パスまたは最大イテレーション到達まで繰り返す。
|
|
3
|
+
|
|
4
|
+
run_eval.pyとimprove_description.pyをループで組み合わせ、
|
|
5
|
+
履歴を追跡し最良のdescriptionを返す。
|
|
6
|
+
過学習防止のためtrain/test分割(fraction指定)に対応。
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import json
|
|
11
|
+
import random
|
|
12
|
+
import sys
|
|
13
|
+
import tempfile
|
|
14
|
+
import time
|
|
15
|
+
import webbrowser
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
from scripts.generate_report import generate_html
|
|
20
|
+
from scripts.improve_description import improve_description
|
|
21
|
+
from scripts.run_eval import find_project_root, run_eval
|
|
22
|
+
from scripts.utils import parse_skill_md
|
|
23
|
+
except ImportError:
|
|
24
|
+
from generate_report import generate_html
|
|
25
|
+
from improve_description import improve_description
|
|
26
|
+
from run_eval import find_project_root, run_eval
|
|
27
|
+
from utils import parse_skill_md
|
|
28
|
+
|
|
29
|
+
import anthropic
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def split_eval_set(
    eval_set: list[dict],
    holdout: float,
    seed: int = 42,
) -> tuple[list[dict], list[dict]]:
    """Split an eval set into train and test parts by a holdout fraction.

    The split is stratified: items whose ``should_trigger`` is truthy (or
    missing) and items whose ``should_trigger`` is falsy are shuffled and cut
    separately, so both kinds contribute to the test set.

    Args:
        eval_set: Items to split.
        holdout: Fraction of each group to hold out (e.g. 0.4 = 40%). A value
            of 0 or less disables the split.
        seed: Seed for the shuffling RNG, making the split reproducible.

    Returns:
        ``(train_set, test_set)``. When ``holdout <= 0`` the original list is
        returned as the train set together with an empty test set. Otherwise
        at least one slot per group is reserved for the test set.
    """
    if holdout <= 0:
        return eval_set, []

    rng = random.Random(seed)

    # Partition by expected outcome; a missing flag counts as should-trigger.
    positives: list[dict] = []
    negatives: list[dict] = []
    for entry in eval_set:
        bucket = positives if entry.get("should_trigger", True) else negatives
        bucket.append(entry)

    held_out_parts: list[list[dict]] = []
    kept_parts: list[list[dict]] = []
    # Positives are shuffled before negatives; the order matters because
    # both shuffles draw from the same seeded RNG.
    for group in (positives, negatives):
        rng.shuffle(group)
        cut = max(1, int(len(group) * holdout))
        held_out_parts.append(group[:cut])
        kept_parts.append(group[cut:])

    test_set = [entry for part in held_out_parts for entry in part]
    train_set = [entry for part in kept_parts for entry in part]
    return train_set, test_set
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def run_loop(
|
|
69
|
+
eval_set: list[dict],
|
|
70
|
+
skill_path: Path,
|
|
71
|
+
description_override: str | None,
|
|
72
|
+
num_workers: int,
|
|
73
|
+
timeout: int,
|
|
74
|
+
max_iterations: int,
|
|
75
|
+
runs_per_query: int,
|
|
76
|
+
trigger_threshold: float,
|
|
77
|
+
holdout: float,
|
|
78
|
+
seed: int | None,
|
|
79
|
+
model: str | None,
|
|
80
|
+
improve_model: str,
|
|
81
|
+
verbose: bool,
|
|
82
|
+
live_report_path: Path | None = None,
|
|
83
|
+
log_dir: Path | None = None,
|
|
84
|
+
) -> dict:
|
|
85
|
+
"""評価+改善ループのメイン関数。"""
|
|
86
|
+
project_root = find_project_root()
|
|
87
|
+
name, original_description, content = parse_skill_md(skill_path)
|
|
88
|
+
current_description = description_override or original_description
|
|
89
|
+
|
|
90
|
+
# train/test分割(holdoutが0より大きい場合のみ)
|
|
91
|
+
if holdout > 0:
|
|
92
|
+
train_set, test_set = split_eval_set(eval_set, holdout, seed if seed is not None else 42)
|
|
93
|
+
if verbose:
|
|
94
|
+
print(f"分割: トレーニング {len(train_set)} 件, テスト {len(test_set)} 件 (holdout={holdout})", file=sys.stderr)
|
|
95
|
+
else:
|
|
96
|
+
train_set = eval_set
|
|
97
|
+
test_set = []
|
|
98
|
+
|
|
99
|
+
client = anthropic.Anthropic()
|
|
100
|
+
history: list[dict] = []
|
|
101
|
+
exit_reason = "unknown"
|
|
102
|
+
|
|
103
|
+
for iteration in range(1, max_iterations + 1):
|
|
104
|
+
if verbose:
|
|
105
|
+
print(f"\n{'='*60}", file=sys.stderr)
|
|
106
|
+
print(f"イテレーション {iteration}/{max_iterations}", file=sys.stderr)
|
|
107
|
+
print(f"description: {current_description}", file=sys.stderr)
|
|
108
|
+
print(f"{'='*60}", file=sys.stderr)
|
|
109
|
+
|
|
110
|
+
# train + test を一括で並行評価(効率化)
|
|
111
|
+
all_queries = train_set + test_set
|
|
112
|
+
t0 = time.time()
|
|
113
|
+
all_results = run_eval(
|
|
114
|
+
eval_set=all_queries,
|
|
115
|
+
skill_name=name,
|
|
116
|
+
description=current_description,
|
|
117
|
+
num_workers=num_workers,
|
|
118
|
+
timeout=timeout,
|
|
119
|
+
project_root=project_root,
|
|
120
|
+
runs_per_query=runs_per_query,
|
|
121
|
+
trigger_threshold=trigger_threshold,
|
|
122
|
+
model=model,
|
|
123
|
+
)
|
|
124
|
+
eval_elapsed = time.time() - t0
|
|
125
|
+
|
|
126
|
+
# クエリの一致でtrain/testに結果を振り分け
|
|
127
|
+
train_queries_set = {q["query"] for q in train_set}
|
|
128
|
+
train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set]
|
|
129
|
+
test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set]
|
|
130
|
+
|
|
131
|
+
train_passed = sum(1 for r in train_result_list if r["pass"])
|
|
132
|
+
train_total = len(train_result_list)
|
|
133
|
+
train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total}
|
|
134
|
+
train_results = {"results": train_result_list, "summary": train_summary}
|
|
135
|
+
|
|
136
|
+
if test_set:
|
|
137
|
+
test_passed = sum(1 for r in test_result_list if r["pass"])
|
|
138
|
+
test_total = len(test_result_list)
|
|
139
|
+
test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total}
|
|
140
|
+
test_results = {"results": test_result_list, "summary": test_summary}
|
|
141
|
+
else:
|
|
142
|
+
test_results = None
|
|
143
|
+
test_summary = None
|
|
144
|
+
|
|
145
|
+
# 履歴エントリの構築(レポートジェネレーターとの後方互換性を保持)
|
|
146
|
+
history.append({
|
|
147
|
+
"iteration": iteration,
|
|
148
|
+
"description": current_description,
|
|
149
|
+
"train_passed": train_summary["passed"],
|
|
150
|
+
"train_failed": train_summary["failed"],
|
|
151
|
+
"train_total": train_summary["total"],
|
|
152
|
+
"train_results": train_results["results"],
|
|
153
|
+
"test_passed": test_summary["passed"] if test_summary else None,
|
|
154
|
+
"test_failed": test_summary["failed"] if test_summary else None,
|
|
155
|
+
"test_total": test_summary["total"] if test_summary else None,
|
|
156
|
+
"test_results": test_results["results"] if test_results else None,
|
|
157
|
+
# レポートジェネレーター後方互換
|
|
158
|
+
"passed": train_summary["passed"],
|
|
159
|
+
"failed": train_summary["failed"],
|
|
160
|
+
"total": train_summary["total"],
|
|
161
|
+
"results": train_results["results"],
|
|
162
|
+
})
|
|
163
|
+
|
|
164
|
+
# ライブレポートを更新(指定されている場合)
|
|
165
|
+
if live_report_path:
|
|
166
|
+
partial_output = {
|
|
167
|
+
"original_description": original_description,
|
|
168
|
+
"best_description": current_description,
|
|
169
|
+
"best_score": "in progress",
|
|
170
|
+
"iterations_run": len(history),
|
|
171
|
+
"holdout": holdout,
|
|
172
|
+
"train_size": len(train_set),
|
|
173
|
+
"test_size": len(test_set),
|
|
174
|
+
"history": history,
|
|
175
|
+
}
|
|
176
|
+
live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
|
|
177
|
+
if verbose:
|
|
178
|
+
print(f"レポートを更新しました: {live_report_path}", file=sys.stderr)
|
|
179
|
+
|
|
180
|
+
if verbose:
|
|
181
|
+
def print_eval_stats(label: str, results: list[dict], elapsed: float) -> None:
    """Print an aggregate confusion-matrix summary and one line per query to stderr.

    Args:
        label: Prefix for the summary line (the callers pass "Train" / "Test ").
        results: Per-query result dicts. Each must provide "triggers", "runs",
            "pass" and "query"; "should_trigger" is optional and is treated
            as True when absent.
        elapsed: Seconds spent on the evaluation, shown in the summary line.
    """
    tp = fn = fp = tn = 0
    for r in results:
        hits = r["triggers"]
        misses = r["runs"] - hits
        # Every individual run is counted as one sample of the confusion matrix.
        if r.get("should_trigger", True):
            tp += hits
            fn += misses
        else:
            fp += hits
            tn += misses

    total = tp + tn + fp + fn
    # With no predicted (resp. actual) positives, precision (resp. recall) is
    # reported as 100% instead of dividing by zero.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
    accuracy = (tp + tn) / total if total > 0 else 0.0
    print(
        f"{label}: {tp+tn}/{total} 正解, "
        f"precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)",
        file=sys.stderr,
    )

    for r in results:
        status = "PASS" if r["pass"] else "FAIL"
        rate_str = f"{r['triggers']}/{r['runs']}"
        # Use the same defaulted lookup as the bucketing above so an entry
        # without "should_trigger" is reported instead of raising KeyError.
        expected = r.get("should_trigger", True)
        print(
            f" [{status}] rate={rate_str} expected={expected}: {r['query'][:60]}",
            file=sys.stderr,
        )
|
|
206
|
+
|
|
207
|
+
print_eval_stats("Train", train_results["results"], eval_elapsed)
|
|
208
|
+
if test_summary:
|
|
209
|
+
print_eval_stats("Test ", test_results["results"], 0) # type: ignore[index]
|
|
210
|
+
|
|
211
|
+
# train全パスなら終了(testは過学習モニタリング用のみ)
|
|
212
|
+
if train_summary["failed"] == 0:
|
|
213
|
+
exit_reason = f"all_passed (iteration {iteration})"
|
|
214
|
+
if verbose:
|
|
215
|
+
print(f"\nイテレーション {iteration} でtrain全クエリパス!ループを終了します。", file=sys.stderr)
|
|
216
|
+
break
|
|
217
|
+
|
|
218
|
+
if iteration == max_iterations:
|
|
219
|
+
exit_reason = f"max_iterations ({max_iterations})"
|
|
220
|
+
if verbose:
|
|
221
|
+
print(f"\n最大イテレーション数到達 ({max_iterations})。", file=sys.stderr)
|
|
222
|
+
break
|
|
223
|
+
|
|
224
|
+
# descriptionを改善(train結果のみ使用)
|
|
225
|
+
if verbose:
|
|
226
|
+
print(f"\ndescriptionを改善中...", file=sys.stderr)
|
|
227
|
+
|
|
228
|
+
t0 = time.time()
|
|
229
|
+
# 過学習防止のため、改善モデルにtest_スコアを見せないようにブラインド処理
|
|
230
|
+
blinded_history = [
|
|
231
|
+
{k: v for k, v in h.items() if not k.startswith("test_")}
|
|
232
|
+
for h in history
|
|
233
|
+
]
|
|
234
|
+
new_description = improve_description(
|
|
235
|
+
client=client,
|
|
236
|
+
skill_name=name,
|
|
237
|
+
skill_content=content,
|
|
238
|
+
current_description=current_description,
|
|
239
|
+
eval_results=train_results,
|
|
240
|
+
history=blinded_history,
|
|
241
|
+
model=improve_model,
|
|
242
|
+
log_dir=log_dir,
|
|
243
|
+
iteration=iteration,
|
|
244
|
+
)
|
|
245
|
+
improve_elapsed = time.time() - t0
|
|
246
|
+
|
|
247
|
+
if verbose:
|
|
248
|
+
print(f"新しいdescription ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)
|
|
249
|
+
|
|
250
|
+
current_description = new_description
|
|
251
|
+
|
|
252
|
+
# 最良のdescriptionを選択(testセットあり→test優先、なし→train)
|
|
253
|
+
if test_set:
|
|
254
|
+
best = max(history, key=lambda h: h["test_passed"] or 0)
|
|
255
|
+
best_score = f"{best['test_passed']}/{best['test_total']}"
|
|
256
|
+
else:
|
|
257
|
+
best = max(history, key=lambda h: h["train_passed"])
|
|
258
|
+
best_score = f"{best['train_passed']}/{best['train_total']}"
|
|
259
|
+
|
|
260
|
+
if verbose:
|
|
261
|
+
print(f"\n終了理由: {exit_reason}", file=sys.stderr)
|
|
262
|
+
print(f"最良スコア: {best_score} (イテレーション {best['iteration']})", file=sys.stderr)
|
|
263
|
+
|
|
264
|
+
return {
|
|
265
|
+
"exit_reason": exit_reason,
|
|
266
|
+
"original_description": original_description,
|
|
267
|
+
"best_description": best["description"],
|
|
268
|
+
"best_score": best_score,
|
|
269
|
+
"best_train_score": f"{best['train_passed']}/{best['train_total']}",
|
|
270
|
+
"best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
|
|
271
|
+
"final_description": current_description,
|
|
272
|
+
"iterations_run": len(history),
|
|
273
|
+
"holdout": holdout,
|
|
274
|
+
"train_size": len(train_set),
|
|
275
|
+
"test_size": len(test_set),
|
|
276
|
+
"history": history,
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def main() -> None:
    """Command-line entry point for the eval + improve loop.

    Parses the CLI options, loads the eval set, checks that the skill
    directory contains a SKILL.md, prepares the optional live HTML report and
    output directories, calls ``run_loop`` and then emits the result as JSON
    on stdout (and, when requested, as ``results.json`` / ``report.html``).

    Exits with status 1 when SKILL.md is missing from ``--skill-path``.
    """
    parser = argparse.ArgumentParser(description="評価+改善ループを実行")
    parser.add_argument("--eval-set", required=True, help="評価セットJSONファイルへのパス")
    parser.add_argument("--skill-path", required=True, help="スキルディレクトリへのパス")
    parser.add_argument("--description", default=None, help="開始descriptionを上書き")
    parser.add_argument("--num-workers", type=int, default=10, help="並行ワーカー数(デフォルト: 10)")
    parser.add_argument("--timeout", type=int, default=30, help="クエリごとのタイムアウト秒数(デフォルト: 30)")
    parser.add_argument("--max-iterations", type=int, default=5, help="最大イテレーション数(デフォルト: 5)")
    parser.add_argument("--runs-per-query", type=int, default=3, help="クエリごとの実行回数(デフォルト: 3)")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="トリガー率の閾値(デフォルト: 0.5)")
    parser.add_argument("--holdout", type=float, default=0.4, help="テスト用ホールドアウト割合(0で無効、デフォルト: 0.4)")
    parser.add_argument("--seed", type=int, default=None, help="train/test分割のランダムシード")
    parser.add_argument("--model", default=None, help="評価時にclaude -pに使用するモデル")
    parser.add_argument("--improve-model", default="claude-sonnet-4-20250514", help="description改善に使用するモデル(デフォルト: claude-sonnet-4-20250514)")
    parser.add_argument("--verbose", action="store_true", help="進捗をstderrに出力")
    parser.add_argument("--report", default="auto", help="HTMLレポートの出力先パス('auto'で一時ファイル自動起動、'none'で無効)")
    parser.add_argument("--results-dir", default=None, help="タイムスタンプ付きサブディレクトリに全出力(results.json, report.html, logs)を保存")
    parser.add_argument("--log-dir", default=None, help="改善トランスクリプトのログディレクトリ(--results-dirより優先)")
    args = parser.parse_args()

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    eval_set = json.loads(Path(args.eval_set).read_text(encoding="utf-8"))
    skill_path = Path(args.skill_path)

    if not (skill_path / "SKILL.md").exists():
        print(f"エラー: {skill_path} にSKILL.mdが見つかりません", file=sys.stderr)
        sys.exit(1)

    name, _, _ = parse_skill_md(skill_path)

    # Set up the live report path.
    live_report_path: Path | None
    if args.report != "none":
        if args.report == "auto":
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
        else:
            live_report_path = Path(args.report)
        # Write a placeholder page so the browser can open immediately. The
        # page contains non-ASCII text, so the charset is declared and the
        # file is written as UTF-8 regardless of the platform default.
        live_report_path.write_text(
            "<html><head><meta charset='utf-8'><meta http-equiv='refresh' content='5'></head>"
            "<body><h1>最適化ループを開始しています...</h1></body></html>",
            encoding="utf-8",
        )
        # webbrowser.open() is only specified for URLs; a bare (possibly
        # relative) filesystem path is not portable, so pass a file:// URI.
        webbrowser.open(live_report_path.resolve().as_uri())
    else:
        live_report_path = None

    # Decide the output directory up front so logs can be saved during run_loop.
    if args.results_dir:
        timestamp = time.strftime("%Y-%m-%d_%H%M%S")
        results_dir = Path(args.results_dir) / timestamp
        results_dir.mkdir(parents=True, exist_ok=True)
    else:
        results_dir = None

    # An explicit --log-dir wins; otherwise fall back to <results_dir>/logs.
    if args.log_dir:
        log_dir: Path | None = Path(args.log_dir)
    elif results_dir:
        log_dir = results_dir / "logs"
    else:
        log_dir = None

    output = run_loop(
        eval_set=eval_set,
        skill_path=skill_path,
        description_override=args.description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        max_iterations=args.max_iterations,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        holdout=args.holdout,
        seed=args.seed,
        model=args.model,
        improve_model=args.improve_model,
        verbose=args.verbose,
        live_report_path=live_report_path,
        log_dir=log_dir,
    )

    # JSON output (stdout, plus results.json when a results dir is in use).
    json_output = json.dumps(output, indent=2)
    print(json_output)
    if results_dir:
        (results_dir / "results.json").write_text(json_output, encoding="utf-8")

    # Final HTML report with auto-refresh turned off. It is rendered once and
    # reused for both destinations (assumes generate_html is deterministic for
    # identical arguments).
    if live_report_path:
        final_html = generate_html(output, auto_refresh=False, skill_name=name)
        live_report_path.write_text(final_html, encoding="utf-8")
        print(f"\nレポート: {live_report_path}", file=sys.stderr)
        if results_dir:
            (results_dir / "report.html").write_text(final_html, encoding="utf-8")

    if results_dir:
        print(f"結果を保存しました: {results_dir}", file=sys.stderr)


if __name__ == "__main__":
    main()
|