@einja/dev-cli 0.1.41 → 0.1.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +0 -1
  2. package/dist/cli.js +0 -1
  3. package/dist/cli.js.map +1 -1
  4. package/dist/commands/sync.d.ts.map +1 -1
  5. package/dist/commands/sync.js +1 -20
  6. package/dist/commands/sync.js.map +1 -1
  7. package/dist/commands/task-loop/lib/github-client.test.js.map +1 -1
  8. package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js +2 -2
  9. package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js.map +1 -1
  10. package/dist/lib/preset-update/file-copier.js +3 -3
  11. package/dist/lib/preset-update/file-copier.js.map +1 -1
  12. package/dist/lib/sync/file-filter.js +2 -2
  13. package/dist/lib/sync/file-filter.js.map +1 -1
  14. package/dist/lib/sync/file-filter.test.js +20 -0
  15. package/dist/lib/sync/file-filter.test.js.map +1 -1
  16. package/dist/lib/sync/marker-processor.js.map +1 -1
  17. package/dist/lib/sync/metadata-manager.js +1 -1
  18. package/dist/lib/sync/metadata-manager.js.map +1 -1
  19. package/dist/lib/sync/metadata-manager.test.js +3 -2
  20. package/dist/lib/sync/metadata-manager.test.js.map +1 -1
  21. package/dist/lib/sync/project-private-synchronizer.d.ts.map +1 -1
  22. package/dist/lib/sync/project-private-synchronizer.js +5 -1
  23. package/dist/lib/sync/project-private-synchronizer.js.map +1 -1
  24. package/dist/types/index.d.ts +0 -1
  25. package/dist/types/index.d.ts.map +1 -1
  26. package/package.json +1 -1
  27. package/presets/default/.claude/agents/einja/backend-architect.md +17 -1
  28. package/presets/default/.claude/agents/einja/codex-agent.md +1 -1
  29. package/presets/default/.claude/agents/einja/design-engineer.md +1 -1
  30. package/presets/default/.claude/agents/einja/docs/docs-updater.md +3 -93
  31. package/presets/default/.claude/agents/einja/frontend-architect.md +17 -1
  32. package/presets/default/.claude/agents/einja/frontend-coder.md +1 -1
  33. package/presets/default/.claude/agents/einja/{specs/spec-design-generator.md → issue-specs/design-generator.md} +12 -7
  34. package/presets/default/.claude/agents/einja/{specs/spec-qa-generator.md → issue-specs/qa-generator.md} +6 -4
  35. package/presets/default/.claude/agents/einja/{specs/spec-requirements-generator.md → issue-specs/requirements-generator.md} +5 -5
  36. package/presets/default/.claude/agents/einja/{specs/spec-tasks-generator.md → issue-specs/tasks-generator.md} +13 -14
  37. package/presets/default/.claude/agents/einja/{specs/spec-tasks-validator.md → issue-specs/tasks-validator.md} +9 -9
  38. package/presets/default/.claude/agents/einja/issue-specs/ui-design-generator.md +114 -0
  39. package/presets/default/.claude/agents/einja/task/task-executer.md +9 -3
  40. package/presets/default/.claude/agents/einja/task/task-modification-analyzer.md +2 -2
  41. package/presets/default/.claude/agents/einja/task/task-qa.md +3 -3
  42. package/presets/default/.claude/agents/einja/task/task-reviewer.md +13 -1
  43. package/presets/default/.claude/commands/einja/einja-sync.md +119 -44
  44. package/presets/default/.claude/commands/einja/issue-exec.md +29 -19
  45. package/presets/default/.claude/commands/einja/sync-cursor-commands.md +6 -6
  46. package/presets/default/.claude/commands/einja/{update-docs-by-task-specs.md → update-docs-by-issue-specs.md} +58 -58
  47. package/presets/default/.claude/hooks/einja/plan-mode-skill-loader.sh +5 -1
  48. package/presets/default/.claude/settings.json +14 -4
  49. package/presets/default/.claude/skills/{einja-general-context-loader → _einja-general-context-loader}/SKILL.md +2 -2
  50. package/presets/default/.claude/skills/{einja-output-format → _einja-output-format}/SKILL.md +1 -1
  51. package/presets/default/.claude/skills/_einja-project-overview/SKILL.md +29 -0
  52. package/presets/default/.claude/skills/{einja-spec-context-loader → _einja-spec-context-loader}/SKILL.md +5 -5
  53. package/presets/default/.claude/skills/einja-coding-standards/references/testing-strategy.md +899 -0
  54. package/presets/default/.claude/skills/einja-conflict-resolver/SKILL.md +1 -1
  55. package/presets/default/.claude/skills/einja-create-pr/SKILL.md +138 -0
  56. package/presets/default/.claude/skills/einja-infra-maintenance/SKILL.md +779 -0
  57. package/presets/default/.claude/{commands/einja/spec-create.md → skills/einja-issue-spec-create/SKILL.md} +47 -24
  58. package/presets/default/.claude/skills/einja-issue-spec-generator/SKILL.md +105 -0
  59. package/presets/default/.claude/skills/einja-issue-spec-generator/references/format-rules.md +35 -0
  60. package/presets/default/.claude/skills/einja-issue-spec-validator/SKILL.md +130 -0
  61. package/presets/default/.claude/skills/einja-issue-spec-validator/references/validation-rules.md +52 -0
  62. package/presets/default/.claude/skills/einja-npm-release/SKILL.md +242 -0
  63. package/presets/default/.claude/skills/einja-skill-creator/SKILL.md +68 -12
  64. package/presets/default/.claude/skills/einja-skill-creator/scripts/aggregate_benchmark.py +368 -121
  65. package/presets/default/.claude/skills/einja-skill-creator/scripts/compare_runs.py +154 -0
  66. package/presets/default/.claude/skills/einja-skill-creator/scripts/generate_report.py +14 -7
  67. package/presets/default/.claude/skills/einja-skill-creator/scripts/improve_description.py +2 -7
  68. package/presets/default/.claude/skills/einja-skill-creator/scripts/run_loop.py +263 -183
  69. package/presets/default/.claude/skills/einja-skill-first/SKILL.md +265 -0
  70. package/presets/default/.claude/skills/einja-subagent-question-protocol/SKILL.md +98 -0
  71. package/presets/default/.claude/skills/einja-task-commit/SKILL.md +7 -7
  72. package/presets/default/.claude/{commands/einja/task-exec.md → skills/einja-task-exec/SKILL.md} +3 -78
  73. package/presets/default/.claude/skills/einja-task-qa/SKILL.md +4 -4
  74. package/presets/default/.claude/skills/einja-task-qa/references/troubleshooting.md +1 -1
  75. package/presets/default/.claude/skills/einja-task-qa/references/usage-patterns.md +2 -2
  76. package/presets/default/.claude/skills/einja-team-exec/SKILL.md +165 -0
  77. package/presets/default/CLAUDE.md.template +21 -6
  78. package/presets/default/docs/einja/instructions/deployment-setup.md +1 -1
  79. package/presets/default/docs/einja/instructions/issue-exec-workflow.md +11 -11
  80. package/presets/default/docs/einja/instructions/local-server-environment-and-worktree.md +1 -1
  81. package/presets/default/docs/einja/instructions/setup-flow.md +279 -0
  82. package/presets/default/docs/einja/instructions/task-execute.md +42 -42
  83. package/presets/default/docs/einja/steering/acceptance-criteria-and-qa-guide.md +1 -1
  84. package/presets/default/docs/einja/steering/branch-strategy.md +1 -1
  85. package/presets/default/docs/einja/steering/development-workflow.md +93 -25
  86. package/presets/default/docs/einja/steering/infrastructure/deployment.md +107 -0
  87. package/presets/default/docs/einja/steering/task-management.md +9 -13
  88. package/presets/default/scripts/ensure-serena.sh +2 -2
  89. package/presets/default/scripts/env-rotate-secrets.ts +66 -6
  90. package/presets/default/scripts/init-github.ts +363 -0
  91. package/presets/default/scripts/init.sh +11 -5
  92. package/presets/default/scripts/setup-dev.ts +16 -1
  93. package/dist/lib/sync/backup-manager.d.ts +0 -50
  94. package/dist/lib/sync/backup-manager.d.ts.map +0 -1
  95. package/dist/lib/sync/backup-manager.js +0 -117
  96. package/dist/lib/sync/backup-manager.js.map +0 -1
  97. package/dist/lib/sync/backup-manager.test.d.ts +0 -2
  98. package/dist/lib/sync/backup-manager.test.d.ts.map +0 -1
  99. package/dist/lib/sync/backup-manager.test.js +0 -155
  100. package/dist/lib/sync/backup-manager.test.js.map +0 -1
  101. package/presets/default/.claude/agents/einja/git/conflict-resolver.md +0 -152
  102. package/presets/default/.claude/hooks/einja/validate-git-commit.sh +0 -239
  103. package/presets/default/.claude/skills/einja-project-overview/SKILL.md +0 -39
@@ -3,14 +3,16 @@
3
3
 
4
4
  run_eval.pyとimprove_description.pyをループで組み合わせ、
5
5
  履歴を追跡し最良のdescriptionを返す。
6
- 過学習防止のためtrain/test分割に対応。
6
+ 過学習防止のためtrain/test分割(fraction指定)に対応。
7
7
  """
8
8
 
9
9
  import argparse
10
10
  import json
11
11
  import random
12
12
  import sys
13
+ import tempfile
13
14
  import time
15
+ import webbrowser
14
16
  from pathlib import Path
15
17
 
16
18
  try:
@@ -29,90 +31,87 @@ import anthropic
29
31
 
30
32
  def split_eval_set(
31
33
  eval_set: list[dict],
32
- holdout: int,
33
- seed: int | None = None,
34
+ holdout: float,
35
+ seed: int = 42,
34
36
  ) -> tuple[list[dict], list[dict]]:
35
- """評価セットをトレーニングとテストに分割する。
37
+ """評価セットをトレーニングとテストに分割する(fraction指定)。
36
38
 
39
+ holdoutは全体に対する割合(例: 0.4 = 40%)。
37
40
  holdoutが0の場合、全データをトレーニングに使用する。
38
41
  should_trigger=Trueとshould_trigger=Falseの両方から
39
- 均等にホールドアウトする。
42
+ 均等にホールドアウトする(stratified split)。
40
43
  """
41
44
  if holdout <= 0:
42
45
  return eval_set, []
43
46
 
44
47
  rng = random.Random(seed)
45
48
 
46
- positive = [item for item in eval_set if item.get("should_trigger", True)]
47
- negative = [item for item in eval_set if not item.get("should_trigger", True)]
49
+ # should_triggerで分離
50
+ trigger = [e for e in eval_set if e.get("should_trigger", True)]
51
+ no_trigger = [e for e in eval_set if not e.get("should_trigger", True)]
48
52
 
49
- # 正例と負例から均等にホールドアウト
50
- pos_holdout = holdout // 2
51
- neg_holdout = holdout - pos_holdout
53
+ # 各グループをシャッフル
54
+ rng.shuffle(trigger)
55
+ rng.shuffle(no_trigger)
52
56
 
53
- # 上限調整
54
- pos_holdout = min(pos_holdout, len(positive) - 1) if len(positive) > 1 else 0
55
- neg_holdout = min(neg_holdout, len(negative) - 1) if len(negative) > 1 else 0
57
+ # 分割点を割合で計算
58
+ n_trigger_test = max(1, int(len(trigger) * holdout))
59
+ n_no_trigger_test = max(1, int(len(no_trigger) * holdout))
56
60
 
57
- rng.shuffle(positive)
58
- rng.shuffle(negative)
59
-
60
- test_set = positive[:pos_holdout] + negative[:neg_holdout]
61
- train_set = positive[pos_holdout:] + negative[neg_holdout:]
61
+ # 分割
62
+ test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
63
+ train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]
62
64
 
63
65
  return train_set, test_set
64
66
 
65
67
 
66
68
  def run_loop(
67
- eval_set_path: str,
68
- skill_path: str,
69
- max_iterations: int = 10,
70
- num_workers: int = 10,
71
- timeout: int = 30,
72
- runs_per_query: int = 3,
73
- trigger_threshold: float = 0.5,
74
- holdout: int = 0,
75
- seed: int | None = None,
76
- model: str | None = None,
77
- improve_model: str = "claude-sonnet-4-20250514",
78
- verbose: bool = False,
79
- report_path: str | None = None,
80
- log_dir: str | None = None,
69
+ eval_set: list[dict],
70
+ skill_path: Path,
71
+ description_override: str | None,
72
+ num_workers: int,
73
+ timeout: int,
74
+ max_iterations: int,
75
+ runs_per_query: int,
76
+ trigger_threshold: float,
77
+ holdout: float,
78
+ seed: int | None,
79
+ model: str | None,
80
+ improve_model: str,
81
+ verbose: bool,
82
+ live_report_path: Path | None = None,
83
+ log_dir: Path | None = None,
81
84
  ) -> dict:
82
85
  """評価+改善ループのメイン関数。"""
83
- eval_set = json.loads(Path(eval_set_path).read_text())
84
- skill_dir = Path(skill_path)
85
-
86
- if not (skill_dir / "SKILL.md").exists():
87
- print(f"エラー: {skill_dir} にSKILL.mdが見つかりません", file=sys.stderr)
88
- sys.exit(1)
89
-
90
- name, original_description, content = parse_skill_md(skill_dir)
91
86
  project_root = find_project_root()
87
+ name, original_description, content = parse_skill_md(skill_path)
88
+ current_description = description_override or original_description
92
89
 
93
- # train/test分割
94
- train_set, test_set = split_eval_set(eval_set, holdout, seed)
95
-
96
- if verbose:
97
- print(f"スキル: {name}", file=sys.stderr)
98
- print(f"トレーニングクエリ: {len(train_set)}, テストクエリ: {len(test_set)}", file=sys.stderr)
99
- print(f"最大イテレーション: {max_iterations}", file=sys.stderr)
100
- print(f"オリジナルdescription: {original_description}", file=sys.stderr)
90
+ # train/test分割(holdoutが0より大きい場合のみ)
91
+ if holdout > 0:
92
+ train_set, test_set = split_eval_set(eval_set, holdout, seed if seed is not None else 42)
93
+ if verbose:
94
+ print(f"分割: トレーニング {len(train_set)} 件, テスト {len(test_set)} 件 (holdout={holdout})", file=sys.stderr)
95
+ else:
96
+ train_set = eval_set
97
+ test_set = []
101
98
 
102
99
  client = anthropic.Anthropic()
103
- current_description = original_description
104
100
  history: list[dict] = []
105
- improve_history: list[dict] = []
106
- log_path = Path(log_dir) if log_dir else None
101
+ exit_reason = "unknown"
107
102
 
108
- for iteration in range(max_iterations):
103
+ for iteration in range(1, max_iterations + 1):
109
104
  if verbose:
110
- print(f"\n--- イテレーション {iteration} ---", file=sys.stderr)
111
- print(f"description: {current_description[:100]}...", file=sys.stderr)
112
-
113
- # トレーニング評価
114
- train_results = run_eval(
115
- eval_set=train_set,
105
+ print(f"\n{'='*60}", file=sys.stderr)
106
+ print(f"イテレーション {iteration}/{max_iterations}", file=sys.stderr)
107
+ print(f"description: {current_description}", file=sys.stderr)
108
+ print(f"{'='*60}", file=sys.stderr)
109
+
110
+ # train + test を一括で並行評価(効率化)
111
+ all_queries = train_set + test_set
112
+ t0 = time.time()
113
+ all_results = run_eval(
114
+ eval_set=all_queries,
116
115
  skill_name=name,
117
116
  description=current_description,
118
117
  num_workers=num_workers,
@@ -122,161 +121,227 @@ def run_loop(
122
121
  trigger_threshold=trigger_threshold,
123
122
  model=model,
124
123
  )
124
+ eval_elapsed = time.time() - t0
125
+
126
+ # クエリの一致でtrain/testに結果を振り分け
127
+ train_queries_set = {q["query"] for q in train_set}
128
+ train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set]
129
+ test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set]
130
+
131
+ train_passed = sum(1 for r in train_result_list if r["pass"])
132
+ train_total = len(train_result_list)
133
+ train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total}
134
+ train_results = {"results": train_result_list, "summary": train_summary}
125
135
 
126
- # テスト評価(テストセットがある場合)
127
- test_results = None
128
136
  if test_set:
129
- test_results = run_eval(
130
- eval_set=test_set,
131
- skill_name=name,
132
- description=current_description,
133
- num_workers=num_workers,
134
- timeout=timeout,
135
- project_root=project_root,
136
- runs_per_query=runs_per_query,
137
- trigger_threshold=trigger_threshold,
138
- model=model,
139
- )
140
-
141
- # 履歴エントリの構築
142
- entry: dict = {
137
+ test_passed = sum(1 for r in test_result_list if r["pass"])
138
+ test_total = len(test_result_list)
139
+ test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total}
140
+ test_results = {"results": test_result_list, "summary": test_summary}
141
+ else:
142
+ test_results = None
143
+ test_summary = None
144
+
145
+ # 履歴エントリの構築(レポートジェネレーターとの後方互換性を保持)
146
+ history.append({
147
+ "iteration": iteration,
143
148
  "description": current_description,
144
- "train_passed": train_results["summary"]["passed"],
145
- "train_failed": train_results["summary"]["failed"],
146
- "train_total": train_results["summary"]["total"],
149
+ "train_passed": train_summary["passed"],
150
+ "train_failed": train_summary["failed"],
151
+ "train_total": train_summary["total"],
147
152
  "train_results": train_results["results"],
148
- }
149
- if test_results:
150
- entry["test_passed"] = test_results["summary"]["passed"]
151
- entry["test_failed"] = test_results["summary"]["failed"]
152
- entry["test_total"] = test_results["summary"]["total"]
153
- entry["test_results"] = test_results["results"]
154
-
155
- history.append(entry)
153
+ "test_passed": test_summary["passed"] if test_summary else None,
154
+ "test_failed": test_summary["failed"] if test_summary else None,
155
+ "test_total": test_summary["total"] if test_summary else None,
156
+ "test_results": test_results["results"] if test_results else None,
157
+ # レポートジェネレーター後方互換
158
+ "passed": train_summary["passed"],
159
+ "failed": train_summary["failed"],
160
+ "total": train_summary["total"],
161
+ "results": train_results["results"],
162
+ })
163
+
164
+ # ライブレポートを更新(指定されている場合)
165
+ if live_report_path:
166
+ partial_output = {
167
+ "original_description": original_description,
168
+ "best_description": current_description,
169
+ "best_score": "in progress",
170
+ "iterations_run": len(history),
171
+ "holdout": holdout,
172
+ "train_size": len(train_set),
173
+ "test_size": len(test_set),
174
+ "history": history,
175
+ }
176
+ live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
177
+ if verbose:
178
+ print(f"レポートを更新しました: {live_report_path}", file=sys.stderr)
156
179
 
157
180
  if verbose:
158
- train_s = f"{train_results['summary']['passed']}/{train_results['summary']['total']}"
159
- msg = f"トレーニングスコア: {train_s}"
160
- if test_results:
161
- test_s = f"{test_results['summary']['passed']}/{test_results['summary']['total']}"
162
- msg += f", テストスコア: {test_s}"
163
- print(msg, file=sys.stderr)
164
-
165
- # レポート更新
166
- if report_path:
167
- output_data = {"history": history, "holdout": holdout}
168
- report_html = generate_html(output_data, auto_refresh=True, skill_name=name)
169
- Path(report_path).write_text(report_html)
181
+ def print_eval_stats(label: str, results: list[dict], elapsed: float) -> None:
182
+ pos = [r for r in results if r.get("should_trigger", True)]
183
+ neg = [r for r in results if not r.get("should_trigger", True)]
184
+ tp = sum(r["triggers"] for r in pos)
185
+ pos_runs = sum(r["runs"] for r in pos)
186
+ fn = pos_runs - tp
187
+ fp = sum(r["triggers"] for r in neg)
188
+ neg_runs = sum(r["runs"] for r in neg)
189
+ tn = neg_runs - fp
190
+ total = tp + tn + fp + fn
191
+ precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
192
+ recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
193
+ accuracy = (tp + tn) / total if total > 0 else 0.0
194
+ print(
195
+ f"{label}: {tp+tn}/{total} 正解, "
196
+ f"precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)",
197
+ file=sys.stderr,
198
+ )
199
+ for r in results:
200
+ status = "PASS" if r["pass"] else "FAIL"
201
+ rate_str = f"{r['triggers']}/{r['runs']}"
202
+ print(
203
+ f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}",
204
+ file=sys.stderr,
205
+ )
206
+
207
+ print_eval_stats("Train", train_results["results"], eval_elapsed)
208
+ if test_summary:
209
+ print_eval_stats("Test ", test_results["results"], 0) # type: ignore[index]
210
+
211
+ # train全パスなら終了(testは過学習モニタリング用のみ)
212
+ if train_summary["failed"] == 0:
213
+ exit_reason = f"all_passed (iteration {iteration})"
170
214
  if verbose:
171
- print(f"レポートを更新しました: {report_path}", file=sys.stderr)
172
-
173
- # 全パスなら終了
174
- if train_results["summary"]["failed"] == 0:
175
- if test_results is None or test_results["summary"]["failed"] == 0:
176
- if verbose:
177
- print("全クエリパス。ループを終了します。", file=sys.stderr)
178
- break
215
+ print(f"\nイテレーション {iteration} でtrain全クエリパス!ループを終了します。", file=sys.stderr)
216
+ break
179
217
 
180
- # 最終イテレーションでなければ改善
181
- if iteration < max_iterations - 1:
218
+ if iteration == max_iterations:
219
+ exit_reason = f"max_iterations ({max_iterations})"
182
220
  if verbose:
183
- print("descriptionを改善中...", file=sys.stderr)
184
-
185
- new_description = improve_description(
186
- client=client,
187
- skill_name=name,
188
- skill_content=content,
189
- current_description=current_description,
190
- eval_results=train_results,
191
- history=improve_history,
192
- model=improve_model,
193
- test_results=test_results,
194
- log_dir=log_path,
195
- iteration=iteration,
196
- )
197
-
198
- # 改善履歴を更新
199
- improve_entry: dict = {
200
- "description": current_description,
201
- "train_passed": train_results["summary"]["passed"],
202
- "train_total": train_results["summary"]["total"],
203
- "results": train_results["results"],
204
- }
205
- if test_results:
206
- improve_entry["test_passed"] = test_results["summary"]["passed"]
207
- improve_entry["test_total"] = test_results["summary"]["total"]
208
- improve_history.append(improve_entry)
221
+ print(f"\n最大イテレーション数到達 ({max_iterations})。", file=sys.stderr)
222
+ break
209
223
 
210
- current_description = new_description
224
+ # descriptionを改善(train結果のみ使用)
225
+ if verbose:
226
+ print(f"\ndescriptionを改善中...", file=sys.stderr)
227
+
228
+ t0 = time.time()
229
+ # 過学習防止のため、改善モデルにtest_スコアを見せないようにブラインド処理
230
+ blinded_history = [
231
+ {k: v for k, v in h.items() if not k.startswith("test_")}
232
+ for h in history
233
+ ]
234
+ new_description = improve_description(
235
+ client=client,
236
+ skill_name=name,
237
+ skill_content=content,
238
+ current_description=current_description,
239
+ eval_results=train_results,
240
+ history=blinded_history,
241
+ model=improve_model,
242
+ log_dir=log_dir,
243
+ iteration=iteration,
244
+ )
245
+ improve_elapsed = time.time() - t0
211
246
 
212
- if verbose:
213
- print(f"新しいdescription: {new_description[:100]}...", file=sys.stderr)
214
-
215
- # 最良のdescriptionを選択(テスト > トレーニングで優先)
216
- best_idx = 0
217
- best_test = -1
218
- best_train = -1
219
- for i, h in enumerate(history):
220
- t_passed = h.get("test_passed", -1)
221
- tr_passed = h.get("train_passed", h.get("passed", 0))
222
- if t_passed > best_test or (t_passed == best_test and tr_passed > best_train):
223
- best_test = t_passed
224
- best_train = tr_passed
225
- best_idx = i
226
-
227
- best = history[best_idx]
228
-
229
- # 最終レポート(auto_refreshオフ)
230
- if report_path:
231
- output_data = {"history": history, "holdout": holdout}
232
- report_html = generate_html(output_data, auto_refresh=False, skill_name=name)
233
- Path(report_path).write_text(report_html)
234
-
235
- output = {
236
- "skill_name": name,
247
+ if verbose:
248
+ print(f"新しいdescription ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)
249
+
250
+ current_description = new_description
251
+
252
+ # 最良のdescriptionを選択(testセットあり→test優先、なし→train)
253
+ if test_set:
254
+ best = max(history, key=lambda h: h["test_passed"] or 0)
255
+ best_score = f"{best['test_passed']}/{best['test_total']}"
256
+ else:
257
+ best = max(history, key=lambda h: h["train_passed"])
258
+ best_score = f"{best['train_passed']}/{best['train_total']}"
259
+
260
+ if verbose:
261
+ print(f"\n終了理由: {exit_reason}", file=sys.stderr)
262
+ print(f"最良スコア: {best_score} (イテレーション {best['iteration']})", file=sys.stderr)
263
+
264
+ return {
265
+ "exit_reason": exit_reason,
237
266
  "original_description": original_description,
238
267
  "best_description": best["description"],
239
- "best_iteration": best_idx,
240
- "history": history,
268
+ "best_score": best_score,
269
+ "best_train_score": f"{best['train_passed']}/{best['train_total']}",
270
+ "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
271
+ "final_description": current_description,
272
+ "iterations_run": len(history),
241
273
  "holdout": holdout,
274
+ "train_size": len(train_set),
275
+ "test_size": len(test_set),
276
+ "history": history,
242
277
  }
243
278
 
244
- if verbose:
245
- print(f"\n最良のdescription (イテレーション {best_idx}): {best['description']}", file=sys.stderr)
246
- train_s = f"{best['train_passed']}/{best['train_total']}"
247
- msg = f"最良スコア - トレーニング: {train_s}"
248
- if best.get("test_passed") is not None:
249
- test_s = f"{best['test_passed']}/{best['test_total']}"
250
- msg += f", テスト: {test_s}"
251
- print(msg, file=sys.stderr)
252
-
253
- return output
254
279
 
255
-
256
- def main():
280
+ def main() -> None:
257
281
  parser = argparse.ArgumentParser(description="評価+改善ループを実行")
258
282
  parser.add_argument("--eval-set", required=True, help="評価セットJSONファイルへのパス")
259
283
  parser.add_argument("--skill-path", required=True, help="スキルディレクトリへのパス")
260
- parser.add_argument("--max-iterations", type=int, default=10, help="最大イテレーション数(デフォルト: 10)")
284
+ parser.add_argument("--description", default=None, help="開始descriptionを上書き")
261
285
  parser.add_argument("--num-workers", type=int, default=10, help="並行ワーカー数(デフォルト: 10)")
262
286
  parser.add_argument("--timeout", type=int, default=30, help="クエリごとのタイムアウト秒数(デフォルト: 30)")
287
+ parser.add_argument("--max-iterations", type=int, default=5, help="最大イテレーション数(デフォルト: 5)")
263
288
  parser.add_argument("--runs-per-query", type=int, default=3, help="クエリごとの実行回数(デフォルト: 3)")
264
289
  parser.add_argument("--trigger-threshold", type=float, default=0.5, help="トリガー率の閾値(デフォルト: 0.5)")
265
- parser.add_argument("--holdout", type=int, default=0, help="テスト用ホールドアウトクエリ数(デフォルト: 0)")
290
+ parser.add_argument("--holdout", type=float, default=0.4, help="テスト用ホールドアウト割合(0で無効、デフォルト: 0.4)")
266
291
  parser.add_argument("--seed", type=int, default=None, help="train/test分割のランダムシード")
267
292
  parser.add_argument("--model", default=None, help="評価時にclaude -pに使用するモデル")
268
293
  parser.add_argument("--improve-model", default="claude-sonnet-4-20250514", help="description改善に使用するモデル(デフォルト: claude-sonnet-4-20250514)")
269
294
  parser.add_argument("--verbose", action="store_true", help="進捗をstderrに出力")
270
- parser.add_argument("--report", default=None, help="HTMLレポートの出力先パス(ライブ更新あり)")
271
- parser.add_argument("--log-dir", default=None, help="改善トランスクリプトのログディレクトリ")
295
+ parser.add_argument("--report", default="auto", help="HTMLレポートの出力先パス('auto'で一時ファイル自動起動、'none'で無効)")
296
+ parser.add_argument("--results-dir", default=None, help="タイムスタンプ付きサブディレクトリに全出力(results.json, report.html, logs)を保存")
297
+ parser.add_argument("--log-dir", default=None, help="改善トランスクリプトのログディレクトリ(--results-dirより優先)")
272
298
  args = parser.parse_args()
273
299
 
300
+ eval_set = json.loads(Path(args.eval_set).read_text())
301
+ skill_path = Path(args.skill_path)
302
+
303
+ if not (skill_path / "SKILL.md").exists():
304
+ print(f"エラー: {skill_path} にSKILL.mdが見つかりません", file=sys.stderr)
305
+ sys.exit(1)
306
+
307
+ name, _, _ = parse_skill_md(skill_path)
308
+
309
+ # ライブレポートパスのセットアップ
310
+ if args.report != "none":
311
+ if args.report == "auto":
312
+ timestamp = time.strftime("%Y%m%d_%H%M%S")
313
+ live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
314
+ else:
315
+ live_report_path = Path(args.report)
316
+ # ブラウザで即座に開けるよう初期HTMLを書き込む
317
+ live_report_path.write_text("<html><body><h1>最適化ループを開始しています...</h1><meta http-equiv='refresh' content='5'></body></html>")
318
+ webbrowser.open(str(live_report_path))
319
+ else:
320
+ live_report_path = None
321
+
322
+ # 出力ディレクトリの決定(run_loop実行前に作成してlogsを保存可能にする)
323
+ if args.results_dir:
324
+ timestamp = time.strftime("%Y-%m-%d_%H%M%S")
325
+ results_dir = Path(args.results_dir) / timestamp
326
+ results_dir.mkdir(parents=True, exist_ok=True)
327
+ else:
328
+ results_dir = None
329
+
330
+ # --log-dir が明示指定されていればそちらを優先、なければ results_dir/logs
331
+ if args.log_dir:
332
+ log_dir: Path | None = Path(args.log_dir)
333
+ elif results_dir:
334
+ log_dir = results_dir / "logs"
335
+ else:
336
+ log_dir = None
337
+
274
338
  output = run_loop(
275
- eval_set_path=args.eval_set,
276
- skill_path=args.skill_path,
277
- max_iterations=args.max_iterations,
339
+ eval_set=eval_set,
340
+ skill_path=skill_path,
341
+ description_override=args.description,
278
342
  num_workers=args.num_workers,
279
343
  timeout=args.timeout,
344
+ max_iterations=args.max_iterations,
280
345
  runs_per_query=args.runs_per_query,
281
346
  trigger_threshold=args.trigger_threshold,
282
347
  holdout=args.holdout,
@@ -284,11 +349,26 @@ def main():
284
349
  model=args.model,
285
350
  improve_model=args.improve_model,
286
351
  verbose=args.verbose,
287
- report_path=args.report,
288
- log_dir=args.log_dir,
352
+ live_report_path=live_report_path,
353
+ log_dir=log_dir,
289
354
  )
290
355
 
291
- print(json.dumps(output, indent=2))
356
+ # JSON出力
357
+ json_output = json.dumps(output, indent=2)
358
+ print(json_output)
359
+ if results_dir:
360
+ (results_dir / "results.json").write_text(json_output)
361
+
362
+ # 最終HTMLレポートの書き込み(auto_refreshオフ)
363
+ if live_report_path:
364
+ live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name))
365
+ print(f"\nレポート: {live_report_path}", file=sys.stderr)
366
+
367
+ if results_dir and live_report_path:
368
+ (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name))
369
+
370
+ if results_dir:
371
+ print(f"結果を保存しました: {results_dir}", file=sys.stderr)
292
372
 
293
373
 
294
374
  if __name__ == "__main__":