@einja/dev-cli 0.1.41 → 0.1.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/dist/cli.js +0 -1
- package/dist/cli.js.map +1 -1
- package/dist/commands/sync.d.ts.map +1 -1
- package/dist/commands/sync.js +1 -20
- package/dist/commands/sync.js.map +1 -1
- package/dist/commands/task-loop/lib/github-client.test.js.map +1 -1
- package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js +2 -2
- package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js.map +1 -1
- package/dist/lib/preset-update/file-copier.js +3 -3
- package/dist/lib/preset-update/file-copier.js.map +1 -1
- package/dist/lib/sync/file-filter.js +2 -2
- package/dist/lib/sync/file-filter.js.map +1 -1
- package/dist/lib/sync/file-filter.test.js +20 -0
- package/dist/lib/sync/file-filter.test.js.map +1 -1
- package/dist/lib/sync/marker-processor.js.map +1 -1
- package/dist/lib/sync/metadata-manager.js +1 -1
- package/dist/lib/sync/metadata-manager.js.map +1 -1
- package/dist/lib/sync/metadata-manager.test.js +3 -2
- package/dist/lib/sync/metadata-manager.test.js.map +1 -1
- package/dist/lib/sync/project-private-synchronizer.d.ts.map +1 -1
- package/dist/lib/sync/project-private-synchronizer.js +5 -1
- package/dist/lib/sync/project-private-synchronizer.js.map +1 -1
- package/dist/types/index.d.ts +0 -1
- package/dist/types/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/presets/default/.claude/agents/einja/backend-architect.md +17 -1
- package/presets/default/.claude/agents/einja/codex-agent.md +1 -1
- package/presets/default/.claude/agents/einja/design-engineer.md +1 -1
- package/presets/default/.claude/agents/einja/docs/docs-updater.md +3 -93
- package/presets/default/.claude/agents/einja/frontend-architect.md +17 -1
- package/presets/default/.claude/agents/einja/frontend-coder.md +1 -1
- package/presets/default/.claude/agents/einja/{specs/spec-design-generator.md → issue-specs/design-generator.md} +12 -7
- package/presets/default/.claude/agents/einja/{specs/spec-qa-generator.md → issue-specs/qa-generator.md} +6 -4
- package/presets/default/.claude/agents/einja/{specs/spec-requirements-generator.md → issue-specs/requirements-generator.md} +5 -5
- package/presets/default/.claude/agents/einja/{specs/spec-tasks-generator.md → issue-specs/tasks-generator.md} +13 -14
- package/presets/default/.claude/agents/einja/{specs/spec-tasks-validator.md → issue-specs/tasks-validator.md} +9 -9
- package/presets/default/.claude/agents/einja/issue-specs/ui-design-generator.md +114 -0
- package/presets/default/.claude/agents/einja/task/task-executer.md +9 -3
- package/presets/default/.claude/agents/einja/task/task-modification-analyzer.md +2 -2
- package/presets/default/.claude/agents/einja/task/task-qa.md +3 -3
- package/presets/default/.claude/agents/einja/task/task-reviewer.md +13 -1
- package/presets/default/.claude/commands/einja/einja-sync.md +119 -44
- package/presets/default/.claude/commands/einja/issue-exec.md +29 -19
- package/presets/default/.claude/commands/einja/sync-cursor-commands.md +6 -6
- package/presets/default/.claude/commands/einja/{update-docs-by-task-specs.md → update-docs-by-issue-specs.md} +58 -58
- package/presets/default/.claude/hooks/einja/plan-mode-skill-loader.sh +5 -1
- package/presets/default/.claude/settings.json +14 -4
- package/presets/default/.claude/skills/{einja-general-context-loader → _einja-general-context-loader}/SKILL.md +2 -2
- package/presets/default/.claude/skills/{einja-output-format → _einja-output-format}/SKILL.md +1 -1
- package/presets/default/.claude/skills/_einja-project-overview/SKILL.md +29 -0
- package/presets/default/.claude/skills/{einja-spec-context-loader → _einja-spec-context-loader}/SKILL.md +5 -5
- package/presets/default/.claude/skills/einja-coding-standards/references/testing-strategy.md +899 -0
- package/presets/default/.claude/skills/einja-conflict-resolver/SKILL.md +1 -1
- package/presets/default/.claude/skills/einja-create-pr/SKILL.md +138 -0
- package/presets/default/.claude/skills/einja-infra-maintenance/SKILL.md +779 -0
- package/presets/default/.claude/{commands/einja/spec-create.md → skills/einja-issue-spec-create/SKILL.md} +47 -24
- package/presets/default/.claude/skills/einja-issue-spec-generator/SKILL.md +105 -0
- package/presets/default/.claude/skills/einja-issue-spec-generator/references/format-rules.md +35 -0
- package/presets/default/.claude/skills/einja-issue-spec-validator/SKILL.md +130 -0
- package/presets/default/.claude/skills/einja-issue-spec-validator/references/validation-rules.md +52 -0
- package/presets/default/.claude/skills/einja-npm-release/SKILL.md +242 -0
- package/presets/default/.claude/skills/einja-skill-creator/SKILL.md +68 -12
- package/presets/default/.claude/skills/einja-skill-creator/scripts/aggregate_benchmark.py +368 -121
- package/presets/default/.claude/skills/einja-skill-creator/scripts/compare_runs.py +154 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/generate_report.py +14 -7
- package/presets/default/.claude/skills/einja-skill-creator/scripts/improve_description.py +2 -7
- package/presets/default/.claude/skills/einja-skill-creator/scripts/run_loop.py +263 -183
- package/presets/default/.claude/skills/einja-skill-first/SKILL.md +265 -0
- package/presets/default/.claude/skills/einja-subagent-question-protocol/SKILL.md +98 -0
- package/presets/default/.claude/skills/einja-task-commit/SKILL.md +7 -7
- package/presets/default/.claude/{commands/einja/task-exec.md → skills/einja-task-exec/SKILL.md} +3 -78
- package/presets/default/.claude/skills/einja-task-qa/SKILL.md +4 -4
- package/presets/default/.claude/skills/einja-task-qa/references/troubleshooting.md +1 -1
- package/presets/default/.claude/skills/einja-task-qa/references/usage-patterns.md +2 -2
- package/presets/default/.claude/skills/einja-team-exec/SKILL.md +165 -0
- package/presets/default/CLAUDE.md.template +21 -6
- package/presets/default/docs/einja/instructions/deployment-setup.md +1 -1
- package/presets/default/docs/einja/instructions/issue-exec-workflow.md +11 -11
- package/presets/default/docs/einja/instructions/local-server-environment-and-worktree.md +1 -1
- package/presets/default/docs/einja/instructions/setup-flow.md +279 -0
- package/presets/default/docs/einja/instructions/task-execute.md +42 -42
- package/presets/default/docs/einja/steering/acceptance-criteria-and-qa-guide.md +1 -1
- package/presets/default/docs/einja/steering/branch-strategy.md +1 -1
- package/presets/default/docs/einja/steering/development-workflow.md +93 -25
- package/presets/default/docs/einja/steering/infrastructure/deployment.md +107 -0
- package/presets/default/docs/einja/steering/task-management.md +9 -13
- package/presets/default/scripts/ensure-serena.sh +2 -2
- package/presets/default/scripts/env-rotate-secrets.ts +66 -6
- package/presets/default/scripts/init-github.ts +363 -0
- package/presets/default/scripts/init.sh +11 -5
- package/presets/default/scripts/setup-dev.ts +16 -1
- package/dist/lib/sync/backup-manager.d.ts +0 -50
- package/dist/lib/sync/backup-manager.d.ts.map +0 -1
- package/dist/lib/sync/backup-manager.js +0 -117
- package/dist/lib/sync/backup-manager.js.map +0 -1
- package/dist/lib/sync/backup-manager.test.d.ts +0 -2
- package/dist/lib/sync/backup-manager.test.d.ts.map +0 -1
- package/dist/lib/sync/backup-manager.test.js +0 -155
- package/dist/lib/sync/backup-manager.test.js.map +0 -1
- package/presets/default/.claude/agents/einja/git/conflict-resolver.md +0 -152
- package/presets/default/.claude/hooks/einja/validate-git-commit.sh +0 -239
- package/presets/default/.claude/skills/einja-project-overview/SKILL.md +0 -39
|
@@ -3,14 +3,16 @@
|
|
|
3
3
|
|
|
4
4
|
run_eval.pyとimprove_description.pyをループで組み合わせ、
|
|
5
5
|
履歴を追跡し最良のdescriptionを返す。
|
|
6
|
-
過学習防止のためtrain/test
|
|
6
|
+
過学習防止のためtrain/test分割(fraction指定)に対応。
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
9
|
import argparse
|
|
10
10
|
import json
|
|
11
11
|
import random
|
|
12
12
|
import sys
|
|
13
|
+
import tempfile
|
|
13
14
|
import time
|
|
15
|
+
import webbrowser
|
|
14
16
|
from pathlib import Path
|
|
15
17
|
|
|
16
18
|
try:
|
|
@@ -29,90 +31,87 @@ import anthropic
|
|
|
29
31
|
|
|
30
32
|
def split_eval_set(
|
|
31
33
|
eval_set: list[dict],
|
|
32
|
-
holdout:
|
|
33
|
-
seed: int
|
|
34
|
+
holdout: float,
|
|
35
|
+
seed: int = 42,
|
|
34
36
|
) -> tuple[list[dict], list[dict]]:
|
|
35
|
-
"""
|
|
37
|
+
"""評価セットをトレーニングとテストに分割する(fraction指定)。
|
|
36
38
|
|
|
39
|
+
holdoutは全体に対する割合(例: 0.4 = 40%)。
|
|
37
40
|
holdoutが0の場合、全データをトレーニングに使用する。
|
|
38
41
|
should_trigger=Trueとshould_trigger=Falseの両方から
|
|
39
|
-
|
|
42
|
+
均等にホールドアウトする(stratified split)。
|
|
40
43
|
"""
|
|
41
44
|
if holdout <= 0:
|
|
42
45
|
return eval_set, []
|
|
43
46
|
|
|
44
47
|
rng = random.Random(seed)
|
|
45
48
|
|
|
46
|
-
|
|
47
|
-
|
|
49
|
+
# should_triggerで分離
|
|
50
|
+
trigger = [e for e in eval_set if e.get("should_trigger", True)]
|
|
51
|
+
no_trigger = [e for e in eval_set if not e.get("should_trigger", True)]
|
|
48
52
|
|
|
49
|
-
#
|
|
50
|
-
|
|
51
|
-
|
|
53
|
+
# 各グループをシャッフル
|
|
54
|
+
rng.shuffle(trigger)
|
|
55
|
+
rng.shuffle(no_trigger)
|
|
52
56
|
|
|
53
|
-
#
|
|
54
|
-
|
|
55
|
-
|
|
57
|
+
# 分割点を割合で計算
|
|
58
|
+
n_trigger_test = max(1, int(len(trigger) * holdout))
|
|
59
|
+
n_no_trigger_test = max(1, int(len(no_trigger) * holdout))
|
|
56
60
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
test_set = positive[:pos_holdout] + negative[:neg_holdout]
|
|
61
|
-
train_set = positive[pos_holdout:] + negative[neg_holdout:]
|
|
61
|
+
# 分割
|
|
62
|
+
test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
|
|
63
|
+
train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]
|
|
62
64
|
|
|
63
65
|
return train_set, test_set
|
|
64
66
|
|
|
65
67
|
|
|
66
68
|
def run_loop(
|
|
67
|
-
|
|
68
|
-
skill_path:
|
|
69
|
-
|
|
70
|
-
num_workers: int
|
|
71
|
-
timeout: int
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
69
|
+
eval_set: list[dict],
|
|
70
|
+
skill_path: Path,
|
|
71
|
+
description_override: str | None,
|
|
72
|
+
num_workers: int,
|
|
73
|
+
timeout: int,
|
|
74
|
+
max_iterations: int,
|
|
75
|
+
runs_per_query: int,
|
|
76
|
+
trigger_threshold: float,
|
|
77
|
+
holdout: float,
|
|
78
|
+
seed: int | None,
|
|
79
|
+
model: str | None,
|
|
80
|
+
improve_model: str,
|
|
81
|
+
verbose: bool,
|
|
82
|
+
live_report_path: Path | None = None,
|
|
83
|
+
log_dir: Path | None = None,
|
|
81
84
|
) -> dict:
|
|
82
85
|
"""評価+改善ループのメイン関数。"""
|
|
83
|
-
eval_set = json.loads(Path(eval_set_path).read_text())
|
|
84
|
-
skill_dir = Path(skill_path)
|
|
85
|
-
|
|
86
|
-
if not (skill_dir / "SKILL.md").exists():
|
|
87
|
-
print(f"エラー: {skill_dir} にSKILL.mdが見つかりません", file=sys.stderr)
|
|
88
|
-
sys.exit(1)
|
|
89
|
-
|
|
90
|
-
name, original_description, content = parse_skill_md(skill_dir)
|
|
91
86
|
project_root = find_project_root()
|
|
87
|
+
name, original_description, content = parse_skill_md(skill_path)
|
|
88
|
+
current_description = description_override or original_description
|
|
92
89
|
|
|
93
|
-
# train/test
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
90
|
+
# train/test分割(holdoutが0より大きい場合のみ)
|
|
91
|
+
if holdout > 0:
|
|
92
|
+
train_set, test_set = split_eval_set(eval_set, holdout, seed if seed is not None else 42)
|
|
93
|
+
if verbose:
|
|
94
|
+
print(f"分割: トレーニング {len(train_set)} 件, テスト {len(test_set)} 件 (holdout={holdout})", file=sys.stderr)
|
|
95
|
+
else:
|
|
96
|
+
train_set = eval_set
|
|
97
|
+
test_set = []
|
|
101
98
|
|
|
102
99
|
client = anthropic.Anthropic()
|
|
103
|
-
current_description = original_description
|
|
104
100
|
history: list[dict] = []
|
|
105
|
-
|
|
106
|
-
log_path = Path(log_dir) if log_dir else None
|
|
101
|
+
exit_reason = "unknown"
|
|
107
102
|
|
|
108
|
-
for iteration in range(max_iterations):
|
|
103
|
+
for iteration in range(1, max_iterations + 1):
|
|
109
104
|
if verbose:
|
|
110
|
-
print(f"\n
|
|
111
|
-
print(f"
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
105
|
+
print(f"\n{'='*60}", file=sys.stderr)
|
|
106
|
+
print(f"イテレーション {iteration}/{max_iterations}", file=sys.stderr)
|
|
107
|
+
print(f"description: {current_description}", file=sys.stderr)
|
|
108
|
+
print(f"{'='*60}", file=sys.stderr)
|
|
109
|
+
|
|
110
|
+
# train + test を一括で並行評価(効率化)
|
|
111
|
+
all_queries = train_set + test_set
|
|
112
|
+
t0 = time.time()
|
|
113
|
+
all_results = run_eval(
|
|
114
|
+
eval_set=all_queries,
|
|
116
115
|
skill_name=name,
|
|
117
116
|
description=current_description,
|
|
118
117
|
num_workers=num_workers,
|
|
@@ -122,161 +121,227 @@ def run_loop(
|
|
|
122
121
|
trigger_threshold=trigger_threshold,
|
|
123
122
|
model=model,
|
|
124
123
|
)
|
|
124
|
+
eval_elapsed = time.time() - t0
|
|
125
|
+
|
|
126
|
+
# クエリの一致でtrain/testに結果を振り分け
|
|
127
|
+
train_queries_set = {q["query"] for q in train_set}
|
|
128
|
+
train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set]
|
|
129
|
+
test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set]
|
|
130
|
+
|
|
131
|
+
train_passed = sum(1 for r in train_result_list if r["pass"])
|
|
132
|
+
train_total = len(train_result_list)
|
|
133
|
+
train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total}
|
|
134
|
+
train_results = {"results": train_result_list, "summary": train_summary}
|
|
125
135
|
|
|
126
|
-
# テスト評価(テストセットがある場合)
|
|
127
|
-
test_results = None
|
|
128
136
|
if test_set:
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
# 履歴エントリの構築
|
|
142
|
-
entry: dict = {
|
|
137
|
+
test_passed = sum(1 for r in test_result_list if r["pass"])
|
|
138
|
+
test_total = len(test_result_list)
|
|
139
|
+
test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total}
|
|
140
|
+
test_results = {"results": test_result_list, "summary": test_summary}
|
|
141
|
+
else:
|
|
142
|
+
test_results = None
|
|
143
|
+
test_summary = None
|
|
144
|
+
|
|
145
|
+
# 履歴エントリの構築(レポートジェネレーターとの後方互換性を保持)
|
|
146
|
+
history.append({
|
|
147
|
+
"iteration": iteration,
|
|
143
148
|
"description": current_description,
|
|
144
|
-
"train_passed":
|
|
145
|
-
"train_failed":
|
|
146
|
-
"train_total":
|
|
149
|
+
"train_passed": train_summary["passed"],
|
|
150
|
+
"train_failed": train_summary["failed"],
|
|
151
|
+
"train_total": train_summary["total"],
|
|
147
152
|
"train_results": train_results["results"],
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
153
|
+
"test_passed": test_summary["passed"] if test_summary else None,
|
|
154
|
+
"test_failed": test_summary["failed"] if test_summary else None,
|
|
155
|
+
"test_total": test_summary["total"] if test_summary else None,
|
|
156
|
+
"test_results": test_results["results"] if test_results else None,
|
|
157
|
+
# レポートジェネレーター後方互換
|
|
158
|
+
"passed": train_summary["passed"],
|
|
159
|
+
"failed": train_summary["failed"],
|
|
160
|
+
"total": train_summary["total"],
|
|
161
|
+
"results": train_results["results"],
|
|
162
|
+
})
|
|
163
|
+
|
|
164
|
+
# ライブレポートを更新(指定されている場合)
|
|
165
|
+
if live_report_path:
|
|
166
|
+
partial_output = {
|
|
167
|
+
"original_description": original_description,
|
|
168
|
+
"best_description": current_description,
|
|
169
|
+
"best_score": "in progress",
|
|
170
|
+
"iterations_run": len(history),
|
|
171
|
+
"holdout": holdout,
|
|
172
|
+
"train_size": len(train_set),
|
|
173
|
+
"test_size": len(test_set),
|
|
174
|
+
"history": history,
|
|
175
|
+
}
|
|
176
|
+
live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
|
|
177
|
+
if verbose:
|
|
178
|
+
print(f"レポートを更新しました: {live_report_path}", file=sys.stderr)
|
|
156
179
|
|
|
157
180
|
if verbose:
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
181
|
+
def print_eval_stats(label: str, results: list[dict], elapsed: float) -> None:
|
|
182
|
+
pos = [r for r in results if r.get("should_trigger", True)]
|
|
183
|
+
neg = [r for r in results if not r.get("should_trigger", True)]
|
|
184
|
+
tp = sum(r["triggers"] for r in pos)
|
|
185
|
+
pos_runs = sum(r["runs"] for r in pos)
|
|
186
|
+
fn = pos_runs - tp
|
|
187
|
+
fp = sum(r["triggers"] for r in neg)
|
|
188
|
+
neg_runs = sum(r["runs"] for r in neg)
|
|
189
|
+
tn = neg_runs - fp
|
|
190
|
+
total = tp + tn + fp + fn
|
|
191
|
+
precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
|
|
192
|
+
recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
|
|
193
|
+
accuracy = (tp + tn) / total if total > 0 else 0.0
|
|
194
|
+
print(
|
|
195
|
+
f"{label}: {tp+tn}/{total} 正解, "
|
|
196
|
+
f"precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)",
|
|
197
|
+
file=sys.stderr,
|
|
198
|
+
)
|
|
199
|
+
for r in results:
|
|
200
|
+
status = "PASS" if r["pass"] else "FAIL"
|
|
201
|
+
rate_str = f"{r['triggers']}/{r['runs']}"
|
|
202
|
+
print(
|
|
203
|
+
f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}",
|
|
204
|
+
file=sys.stderr,
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
print_eval_stats("Train", train_results["results"], eval_elapsed)
|
|
208
|
+
if test_summary:
|
|
209
|
+
print_eval_stats("Test ", test_results["results"], 0) # type: ignore[index]
|
|
210
|
+
|
|
211
|
+
# train全パスなら終了(testは過学習モニタリング用のみ)
|
|
212
|
+
if train_summary["failed"] == 0:
|
|
213
|
+
exit_reason = f"all_passed (iteration {iteration})"
|
|
170
214
|
if verbose:
|
|
171
|
-
print(f"
|
|
172
|
-
|
|
173
|
-
# 全パスなら終了
|
|
174
|
-
if train_results["summary"]["failed"] == 0:
|
|
175
|
-
if test_results is None or test_results["summary"]["failed"] == 0:
|
|
176
|
-
if verbose:
|
|
177
|
-
print("全クエリパス。ループを終了します。", file=sys.stderr)
|
|
178
|
-
break
|
|
215
|
+
print(f"\nイテレーション {iteration} でtrain全クエリパス!ループを終了します。", file=sys.stderr)
|
|
216
|
+
break
|
|
179
217
|
|
|
180
|
-
|
|
181
|
-
|
|
218
|
+
if iteration == max_iterations:
|
|
219
|
+
exit_reason = f"max_iterations ({max_iterations})"
|
|
182
220
|
if verbose:
|
|
183
|
-
print("
|
|
184
|
-
|
|
185
|
-
new_description = improve_description(
|
|
186
|
-
client=client,
|
|
187
|
-
skill_name=name,
|
|
188
|
-
skill_content=content,
|
|
189
|
-
current_description=current_description,
|
|
190
|
-
eval_results=train_results,
|
|
191
|
-
history=improve_history,
|
|
192
|
-
model=improve_model,
|
|
193
|
-
test_results=test_results,
|
|
194
|
-
log_dir=log_path,
|
|
195
|
-
iteration=iteration,
|
|
196
|
-
)
|
|
197
|
-
|
|
198
|
-
# 改善履歴を更新
|
|
199
|
-
improve_entry: dict = {
|
|
200
|
-
"description": current_description,
|
|
201
|
-
"train_passed": train_results["summary"]["passed"],
|
|
202
|
-
"train_total": train_results["summary"]["total"],
|
|
203
|
-
"results": train_results["results"],
|
|
204
|
-
}
|
|
205
|
-
if test_results:
|
|
206
|
-
improve_entry["test_passed"] = test_results["summary"]["passed"]
|
|
207
|
-
improve_entry["test_total"] = test_results["summary"]["total"]
|
|
208
|
-
improve_history.append(improve_entry)
|
|
221
|
+
print(f"\n最大イテレーション数到達 ({max_iterations})。", file=sys.stderr)
|
|
222
|
+
break
|
|
209
223
|
|
|
210
|
-
|
|
224
|
+
# descriptionを改善(train結果のみ使用)
|
|
225
|
+
if verbose:
|
|
226
|
+
print(f"\ndescriptionを改善中...", file=sys.stderr)
|
|
227
|
+
|
|
228
|
+
t0 = time.time()
|
|
229
|
+
# 過学習防止のため、改善モデルにtest_スコアを見せないようにブラインド処理
|
|
230
|
+
blinded_history = [
|
|
231
|
+
{k: v for k, v in h.items() if not k.startswith("test_")}
|
|
232
|
+
for h in history
|
|
233
|
+
]
|
|
234
|
+
new_description = improve_description(
|
|
235
|
+
client=client,
|
|
236
|
+
skill_name=name,
|
|
237
|
+
skill_content=content,
|
|
238
|
+
current_description=current_description,
|
|
239
|
+
eval_results=train_results,
|
|
240
|
+
history=blinded_history,
|
|
241
|
+
model=improve_model,
|
|
242
|
+
log_dir=log_dir,
|
|
243
|
+
iteration=iteration,
|
|
244
|
+
)
|
|
245
|
+
improve_elapsed = time.time() - t0
|
|
211
246
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
output_data = {"history": history, "holdout": holdout}
|
|
232
|
-
report_html = generate_html(output_data, auto_refresh=False, skill_name=name)
|
|
233
|
-
Path(report_path).write_text(report_html)
|
|
234
|
-
|
|
235
|
-
output = {
|
|
236
|
-
"skill_name": name,
|
|
247
|
+
if verbose:
|
|
248
|
+
print(f"新しいdescription ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)
|
|
249
|
+
|
|
250
|
+
current_description = new_description
|
|
251
|
+
|
|
252
|
+
# 最良のdescriptionを選択(testセットあり→test優先、なし→train)
|
|
253
|
+
if test_set:
|
|
254
|
+
best = max(history, key=lambda h: h["test_passed"] or 0)
|
|
255
|
+
best_score = f"{best['test_passed']}/{best['test_total']}"
|
|
256
|
+
else:
|
|
257
|
+
best = max(history, key=lambda h: h["train_passed"])
|
|
258
|
+
best_score = f"{best['train_passed']}/{best['train_total']}"
|
|
259
|
+
|
|
260
|
+
if verbose:
|
|
261
|
+
print(f"\n終了理由: {exit_reason}", file=sys.stderr)
|
|
262
|
+
print(f"最良スコア: {best_score} (イテレーション {best['iteration']})", file=sys.stderr)
|
|
263
|
+
|
|
264
|
+
return {
|
|
265
|
+
"exit_reason": exit_reason,
|
|
237
266
|
"original_description": original_description,
|
|
238
267
|
"best_description": best["description"],
|
|
239
|
-
"
|
|
240
|
-
"
|
|
268
|
+
"best_score": best_score,
|
|
269
|
+
"best_train_score": f"{best['train_passed']}/{best['train_total']}",
|
|
270
|
+
"best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
|
|
271
|
+
"final_description": current_description,
|
|
272
|
+
"iterations_run": len(history),
|
|
241
273
|
"holdout": holdout,
|
|
274
|
+
"train_size": len(train_set),
|
|
275
|
+
"test_size": len(test_set),
|
|
276
|
+
"history": history,
|
|
242
277
|
}
|
|
243
278
|
|
|
244
|
-
if verbose:
|
|
245
|
-
print(f"\n最良のdescription (イテレーション {best_idx}): {best['description']}", file=sys.stderr)
|
|
246
|
-
train_s = f"{best['train_passed']}/{best['train_total']}"
|
|
247
|
-
msg = f"最良スコア - トレーニング: {train_s}"
|
|
248
|
-
if best.get("test_passed") is not None:
|
|
249
|
-
test_s = f"{best['test_passed']}/{best['test_total']}"
|
|
250
|
-
msg += f", テスト: {test_s}"
|
|
251
|
-
print(msg, file=sys.stderr)
|
|
252
|
-
|
|
253
|
-
return output
|
|
254
279
|
|
|
255
|
-
|
|
256
|
-
def main():
|
|
280
|
+
def main() -> None:
|
|
257
281
|
parser = argparse.ArgumentParser(description="評価+改善ループを実行")
|
|
258
282
|
parser.add_argument("--eval-set", required=True, help="評価セットJSONファイルへのパス")
|
|
259
283
|
parser.add_argument("--skill-path", required=True, help="スキルディレクトリへのパス")
|
|
260
|
-
parser.add_argument("--
|
|
284
|
+
parser.add_argument("--description", default=None, help="開始descriptionを上書き")
|
|
261
285
|
parser.add_argument("--num-workers", type=int, default=10, help="並行ワーカー数(デフォルト: 10)")
|
|
262
286
|
parser.add_argument("--timeout", type=int, default=30, help="クエリごとのタイムアウト秒数(デフォルト: 30)")
|
|
287
|
+
parser.add_argument("--max-iterations", type=int, default=5, help="最大イテレーション数(デフォルト: 5)")
|
|
263
288
|
parser.add_argument("--runs-per-query", type=int, default=3, help="クエリごとの実行回数(デフォルト: 3)")
|
|
264
289
|
parser.add_argument("--trigger-threshold", type=float, default=0.5, help="トリガー率の閾値(デフォルト: 0.5)")
|
|
265
|
-
parser.add_argument("--holdout", type=
|
|
290
|
+
parser.add_argument("--holdout", type=float, default=0.4, help="テスト用ホールドアウト割合(0で無効、デフォルト: 0.4)")
|
|
266
291
|
parser.add_argument("--seed", type=int, default=None, help="train/test分割のランダムシード")
|
|
267
292
|
parser.add_argument("--model", default=None, help="評価時にclaude -pに使用するモデル")
|
|
268
293
|
parser.add_argument("--improve-model", default="claude-sonnet-4-20250514", help="description改善に使用するモデル(デフォルト: claude-sonnet-4-20250514)")
|
|
269
294
|
parser.add_argument("--verbose", action="store_true", help="進捗をstderrに出力")
|
|
270
|
-
parser.add_argument("--report", default=
|
|
271
|
-
parser.add_argument("--
|
|
295
|
+
parser.add_argument("--report", default="auto", help="HTMLレポートの出力先パス('auto'で一時ファイル自動起動、'none'で無効)")
|
|
296
|
+
parser.add_argument("--results-dir", default=None, help="タイムスタンプ付きサブディレクトリに全出力(results.json, report.html, logs)を保存")
|
|
297
|
+
parser.add_argument("--log-dir", default=None, help="改善トランスクリプトのログディレクトリ(--results-dirより優先)")
|
|
272
298
|
args = parser.parse_args()
|
|
273
299
|
|
|
300
|
+
eval_set = json.loads(Path(args.eval_set).read_text())
|
|
301
|
+
skill_path = Path(args.skill_path)
|
|
302
|
+
|
|
303
|
+
if not (skill_path / "SKILL.md").exists():
|
|
304
|
+
print(f"エラー: {skill_path} にSKILL.mdが見つかりません", file=sys.stderr)
|
|
305
|
+
sys.exit(1)
|
|
306
|
+
|
|
307
|
+
name, _, _ = parse_skill_md(skill_path)
|
|
308
|
+
|
|
309
|
+
# ライブレポートパスのセットアップ
|
|
310
|
+
if args.report != "none":
|
|
311
|
+
if args.report == "auto":
|
|
312
|
+
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
|
313
|
+
live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
|
|
314
|
+
else:
|
|
315
|
+
live_report_path = Path(args.report)
|
|
316
|
+
# ブラウザで即座に開けるよう初期HTMLを書き込む
|
|
317
|
+
live_report_path.write_text("<html><body><h1>最適化ループを開始しています...</h1><meta http-equiv='refresh' content='5'></body></html>")
|
|
318
|
+
webbrowser.open(str(live_report_path))
|
|
319
|
+
else:
|
|
320
|
+
live_report_path = None
|
|
321
|
+
|
|
322
|
+
# 出力ディレクトリの決定(run_loop実行前に作成してlogsを保存可能にする)
|
|
323
|
+
if args.results_dir:
|
|
324
|
+
timestamp = time.strftime("%Y-%m-%d_%H%M%S")
|
|
325
|
+
results_dir = Path(args.results_dir) / timestamp
|
|
326
|
+
results_dir.mkdir(parents=True, exist_ok=True)
|
|
327
|
+
else:
|
|
328
|
+
results_dir = None
|
|
329
|
+
|
|
330
|
+
# --log-dir が明示指定されていればそちらを優先、なければ results_dir/logs
|
|
331
|
+
if args.log_dir:
|
|
332
|
+
log_dir: Path | None = Path(args.log_dir)
|
|
333
|
+
elif results_dir:
|
|
334
|
+
log_dir = results_dir / "logs"
|
|
335
|
+
else:
|
|
336
|
+
log_dir = None
|
|
337
|
+
|
|
274
338
|
output = run_loop(
|
|
275
|
-
|
|
276
|
-
skill_path=
|
|
277
|
-
|
|
339
|
+
eval_set=eval_set,
|
|
340
|
+
skill_path=skill_path,
|
|
341
|
+
description_override=args.description,
|
|
278
342
|
num_workers=args.num_workers,
|
|
279
343
|
timeout=args.timeout,
|
|
344
|
+
max_iterations=args.max_iterations,
|
|
280
345
|
runs_per_query=args.runs_per_query,
|
|
281
346
|
trigger_threshold=args.trigger_threshold,
|
|
282
347
|
holdout=args.holdout,
|
|
@@ -284,11 +349,26 @@ def main():
|
|
|
284
349
|
model=args.model,
|
|
285
350
|
improve_model=args.improve_model,
|
|
286
351
|
verbose=args.verbose,
|
|
287
|
-
|
|
288
|
-
log_dir=
|
|
352
|
+
live_report_path=live_report_path,
|
|
353
|
+
log_dir=log_dir,
|
|
289
354
|
)
|
|
290
355
|
|
|
291
|
-
|
|
356
|
+
# JSON出力
|
|
357
|
+
json_output = json.dumps(output, indent=2)
|
|
358
|
+
print(json_output)
|
|
359
|
+
if results_dir:
|
|
360
|
+
(results_dir / "results.json").write_text(json_output)
|
|
361
|
+
|
|
362
|
+
# 最終HTMLレポートの書き込み(auto_refreshオフ)
|
|
363
|
+
if live_report_path:
|
|
364
|
+
live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name))
|
|
365
|
+
print(f"\nレポート: {live_report_path}", file=sys.stderr)
|
|
366
|
+
|
|
367
|
+
if results_dir and live_report_path:
|
|
368
|
+
(results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name))
|
|
369
|
+
|
|
370
|
+
if results_dir:
|
|
371
|
+
print(f"結果を保存しました: {results_dir}", file=sys.stderr)
|
|
292
372
|
|
|
293
373
|
|
|
294
374
|
if __name__ == "__main__":
|