ai-engineering-init 1.6.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187) hide show
  1. package/.claude/agents/code-reviewer.md +3 -130
  2. package/.claude/hooks/skill-forced-eval.js +46 -60
  3. package/.claude/hooks/stop.js +24 -1
  4. package/.claude/settings.json +10 -1
  5. package/.claude/skills/api-development/SKILL.md +179 -130
  6. package/.claude/skills/architecture-design/SKILL.md +102 -212
  7. package/.claude/skills/backend-annotations/SKILL.md +166 -220
  8. package/.claude/skills/bug-detective/SKILL.md +225 -186
  9. package/.claude/skills/code-patterns/SKILL.md +127 -244
  10. package/.claude/skills/codex-code-review/SKILL.md +327 -0
  11. package/.claude/skills/collaborating-with-codex/SKILL.md +96 -113
  12. package/.claude/skills/crud-development/SKILL.md +226 -307
  13. package/.claude/skills/data-permission/SKILL.md +131 -202
  14. package/.claude/skills/database-ops/SKILL.md +158 -355
  15. package/.claude/skills/error-handler/SKILL.md +224 -285
  16. package/.claude/skills/file-oss-management/SKILL.md +174 -169
  17. package/.claude/skills/git-workflow/SKILL.md +123 -341
  18. package/.claude/skills/json-serialization/SKILL.md +121 -137
  19. package/.claude/skills/leniu-report-customization/SKILL.md +82 -2
  20. package/.claude/skills/leniu-report-standard-customization/SKILL.md +65 -2
  21. package/.claude/skills/loki-log-query/SKILL.md +400 -0
  22. package/.claude/skills/mysql-debug/SKILL.md +58 -22
  23. package/.claude/skills/performance-doctor/SKILL.md +83 -89
  24. package/.claude/skills/redis-cache/SKILL.md +134 -185
  25. package/.claude/skills/scheduled-jobs/SKILL.md +187 -224
  26. package/.claude/skills/security-guard/SKILL.md +168 -276
  27. package/.claude/skills/sms-mail/SKILL.md +266 -228
  28. package/.claude/skills/social-login/SKILL.md +257 -195
  29. package/.claude/skills/sync-back-merge/SKILL.md +66 -0
  30. package/.claude/skills/tenant-management/SKILL.md +172 -188
  31. package/.claude/skills/utils-toolkit/SKILL.md +214 -222
  32. package/.claude/skills/websocket-sse/SKILL.md +251 -172
  33. package/.claude/skills/workflow-engine/SKILL.md +178 -250
  34. package/.claude/skills/yunxiao-task-management/SKILL.md +489 -0
  35. package/.codex/skills/api-development/SKILL.md +179 -130
  36. package/.codex/skills/architecture-design/SKILL.md +102 -212
  37. package/.codex/skills/backend-annotations/SKILL.md +166 -220
  38. package/.codex/skills/bug-detective/SKILL.md +225 -186
  39. package/.codex/skills/code-patterns/SKILL.md +127 -244
  40. package/.codex/skills/collaborating-with-codex/SKILL.md +96 -113
  41. package/.codex/skills/crud-development/SKILL.md +226 -307
  42. package/.codex/skills/data-permission/SKILL.md +131 -202
  43. package/.codex/skills/database-ops/SKILL.md +158 -355
  44. package/.codex/skills/error-handler/SKILL.md +224 -285
  45. package/.codex/skills/file-oss-management/SKILL.md +174 -169
  46. package/.codex/skills/git-workflow/SKILL.md +123 -341
  47. package/.codex/skills/json-serialization/SKILL.md +121 -137
  48. package/.codex/skills/leniu-report-customization/SKILL.md +82 -2
  49. package/.codex/skills/leniu-report-standard-customization/SKILL.md +65 -2
  50. package/.codex/skills/loki-log-query/SKILL.md +400 -0
  51. package/.codex/skills/loki-log-query/environments.json +45 -0
  52. package/.codex/skills/mysql-debug/SKILL.md +58 -22
  53. package/.codex/skills/performance-doctor/SKILL.md +83 -89
  54. package/.codex/skills/redis-cache/SKILL.md +134 -185
  55. package/.codex/skills/scheduled-jobs/SKILL.md +187 -224
  56. package/.codex/skills/security-guard/SKILL.md +168 -276
  57. package/.codex/skills/skill-creator/LICENSE.txt +202 -0
  58. package/.codex/skills/skill-creator/SKILL.md +479 -0
  59. package/.codex/skills/skill-creator/agents/analyzer.md +274 -0
  60. package/.codex/skills/skill-creator/agents/comparator.md +202 -0
  61. package/.codex/skills/skill-creator/agents/grader.md +223 -0
  62. package/.codex/skills/skill-creator/assets/eval_review.html +146 -0
  63. package/.codex/skills/skill-creator/eval-viewer/generate_review.py +471 -0
  64. package/.codex/skills/skill-creator/eval-viewer/viewer.html +1325 -0
  65. package/.codex/skills/skill-creator/references/schemas.md +430 -0
  66. package/.codex/skills/skill-creator/scripts/__init__.py +0 -0
  67. package/.codex/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  68. package/.codex/skills/skill-creator/scripts/generate_report.py +326 -0
  69. package/.codex/skills/skill-creator/scripts/improve_description.py +248 -0
  70. package/.codex/skills/skill-creator/scripts/package_skill.py +136 -0
  71. package/.codex/skills/skill-creator/scripts/quick_validate.py +103 -0
  72. package/.codex/skills/skill-creator/scripts/run_eval.py +310 -0
  73. package/.codex/skills/skill-creator/scripts/run_loop.py +332 -0
  74. package/.codex/skills/skill-creator/scripts/utils.py +47 -0
  75. package/.codex/skills/sms-mail/SKILL.md +266 -228
  76. package/.codex/skills/social-login/SKILL.md +257 -195
  77. package/.codex/skills/sync-back-merge/SKILL.md +66 -0
  78. package/.codex/skills/tenant-management/SKILL.md +172 -188
  79. package/.codex/skills/utils-toolkit/SKILL.md +214 -222
  80. package/.codex/skills/websocket-sse/SKILL.md +251 -172
  81. package/.codex/skills/workflow-engine/SKILL.md +178 -250
  82. package/.codex/skills/yunxiao-task-management/SKILL.md +489 -0
  83. package/.cursor/hooks/cursor-skill-eval.js +66 -6
  84. package/.cursor/hooks/stop.js +23 -1
  85. package/.cursor/skills/api-development/SKILL.md +179 -130
  86. package/.cursor/skills/architecture-design/SKILL.md +102 -212
  87. package/.cursor/skills/backend-annotations/SKILL.md +166 -220
  88. package/.cursor/skills/bug-detective/SKILL.md +225 -186
  89. package/.cursor/skills/code-patterns/SKILL.md +127 -244
  90. package/.cursor/skills/collaborating-with-codex/SKILL.md +96 -113
  91. package/.cursor/skills/crud-development/SKILL.md +226 -307
  92. package/.cursor/skills/data-permission/SKILL.md +131 -202
  93. package/.cursor/skills/database-ops/SKILL.md +158 -355
  94. package/.cursor/skills/error-handler/SKILL.md +224 -285
  95. package/.cursor/skills/file-oss-management/SKILL.md +174 -169
  96. package/.cursor/skills/git-workflow/SKILL.md +123 -341
  97. package/.cursor/skills/json-serialization/SKILL.md +121 -137
  98. package/.cursor/skills/leniu-report-customization/SKILL.md +82 -2
  99. package/.cursor/skills/leniu-report-standard-customization/SKILL.md +65 -2
  100. package/.cursor/skills/loki-log-query/SKILL.md +400 -0
  101. package/.cursor/skills/loki-log-query/environments.json +45 -0
  102. package/.cursor/skills/mysql-debug/SKILL.md +58 -22
  103. package/.cursor/skills/performance-doctor/SKILL.md +83 -89
  104. package/.cursor/skills/redis-cache/SKILL.md +134 -185
  105. package/.cursor/skills/scheduled-jobs/SKILL.md +187 -224
  106. package/.cursor/skills/security-guard/SKILL.md +168 -276
  107. package/.cursor/skills/skill-creator/LICENSE.txt +202 -0
  108. package/.cursor/skills/skill-creator/SKILL.md +479 -0
  109. package/.cursor/skills/skill-creator/agents/analyzer.md +274 -0
  110. package/.cursor/skills/skill-creator/agents/comparator.md +202 -0
  111. package/.cursor/skills/skill-creator/agents/grader.md +223 -0
  112. package/.cursor/skills/skill-creator/assets/eval_review.html +146 -0
  113. package/.cursor/skills/skill-creator/eval-viewer/generate_review.py +471 -0
  114. package/.cursor/skills/skill-creator/eval-viewer/viewer.html +1325 -0
  115. package/.cursor/skills/skill-creator/references/schemas.md +430 -0
  116. package/.cursor/skills/skill-creator/scripts/__init__.py +0 -0
  117. package/.cursor/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  118. package/.cursor/skills/skill-creator/scripts/generate_report.py +326 -0
  119. package/.cursor/skills/skill-creator/scripts/improve_description.py +248 -0
  120. package/.cursor/skills/skill-creator/scripts/package_skill.py +136 -0
  121. package/.cursor/skills/skill-creator/scripts/quick_validate.py +103 -0
  122. package/.cursor/skills/skill-creator/scripts/run_eval.py +310 -0
  123. package/.cursor/skills/skill-creator/scripts/run_loop.py +332 -0
  124. package/.cursor/skills/skill-creator/scripts/utils.py +47 -0
  125. package/.cursor/skills/sms-mail/SKILL.md +266 -228
  126. package/.cursor/skills/social-login/SKILL.md +257 -195
  127. package/.cursor/skills/sync-back-merge/SKILL.md +66 -0
  128. package/.cursor/skills/tenant-management/SKILL.md +172 -188
  129. package/.cursor/skills/utils-toolkit/SKILL.md +214 -222
  130. package/.cursor/skills/websocket-sse/SKILL.md +251 -172
  131. package/.cursor/skills/workflow-engine/SKILL.md +178 -250
  132. package/.cursor/skills/yunxiao-task-management/SKILL.md +489 -0
  133. package/AGENTS.md +49 -540
  134. package/CLAUDE.md +73 -119
  135. package/README.md +37 -6
  136. package/bin/index.js +611 -25
  137. package/package.json +1 -1
  138. package/src/platform-map.json +4 -0
  139. package/src/skills/api-development/SKILL.md +179 -130
  140. package/src/skills/architecture-design/SKILL.md +102 -212
  141. package/src/skills/backend-annotations/SKILL.md +166 -220
  142. package/src/skills/bug-detective/SKILL.md +225 -186
  143. package/src/skills/code-patterns/SKILL.md +127 -244
  144. package/src/skills/codex-code-review/SKILL.md +261 -69
  145. package/src/skills/collaborating-with-codex/SKILL.md +96 -113
  146. package/src/skills/crud-development/SKILL.md +226 -307
  147. package/src/skills/data-permission/SKILL.md +131 -202
  148. package/src/skills/database-ops/SKILL.md +158 -355
  149. package/src/skills/error-handler/SKILL.md +224 -285
  150. package/src/skills/file-oss-management/SKILL.md +174 -169
  151. package/src/skills/git-workflow/SKILL.md +123 -341
  152. package/src/skills/json-serialization/SKILL.md +121 -137
  153. package/src/skills/leniu-report-customization/SKILL.md +82 -2
  154. package/src/skills/leniu-report-standard-customization/SKILL.md +65 -2
  155. package/src/skills/loki-log-query/SKILL.md +400 -0
  156. package/src/skills/loki-log-query/environments.json +45 -0
  157. package/src/skills/mysql-debug/SKILL.md +58 -22
  158. package/src/skills/performance-doctor/SKILL.md +83 -89
  159. package/src/skills/redis-cache/SKILL.md +134 -185
  160. package/src/skills/scheduled-jobs/SKILL.md +187 -224
  161. package/src/skills/security-guard/SKILL.md +168 -276
  162. package/src/skills/skill-creator/LICENSE.txt +202 -0
  163. package/src/skills/skill-creator/SKILL.md +479 -0
  164. package/src/skills/skill-creator/agents/analyzer.md +274 -0
  165. package/src/skills/skill-creator/agents/comparator.md +202 -0
  166. package/src/skills/skill-creator/agents/grader.md +223 -0
  167. package/src/skills/skill-creator/assets/eval_review.html +146 -0
  168. package/src/skills/skill-creator/eval-viewer/generate_review.py +471 -0
  169. package/src/skills/skill-creator/eval-viewer/viewer.html +1325 -0
  170. package/src/skills/skill-creator/references/schemas.md +430 -0
  171. package/src/skills/skill-creator/scripts/__init__.py +0 -0
  172. package/src/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  173. package/src/skills/skill-creator/scripts/generate_report.py +326 -0
  174. package/src/skills/skill-creator/scripts/improve_description.py +248 -0
  175. package/src/skills/skill-creator/scripts/package_skill.py +136 -0
  176. package/src/skills/skill-creator/scripts/quick_validate.py +103 -0
  177. package/src/skills/skill-creator/scripts/run_eval.py +310 -0
  178. package/src/skills/skill-creator/scripts/run_loop.py +332 -0
  179. package/src/skills/skill-creator/scripts/utils.py +47 -0
  180. package/src/skills/sms-mail/SKILL.md +266 -228
  181. package/src/skills/social-login/SKILL.md +257 -195
  182. package/src/skills/sync-back-merge/SKILL.md +66 -0
  183. package/src/skills/tenant-management/SKILL.md +172 -188
  184. package/src/skills/utils-toolkit/SKILL.md +214 -222
  185. package/src/skills/websocket-sse/SKILL.md +251 -172
  186. package/src/skills/workflow-engine/SKILL.md +178 -250
  187. package/src/skills/yunxiao-task-management/SKILL.md +489 -0
@@ -0,0 +1,332 @@
1
+ #!/usr/bin/env python3
2
+ """Run the eval + improve loop until all pass or max iterations reached.
3
+
4
+ Combines run_eval.py and improve_description.py in a loop, tracking history
5
+ and returning the best description found. Supports train/test split to prevent
6
+ overfitting.
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import random
12
+ import sys
13
+ import tempfile
14
+ import time
15
+ import webbrowser
16
+ from pathlib import Path
17
+
18
+ import anthropic
19
+
20
+ from scripts.generate_report import generate_html
21
+ from scripts.improve_description import improve_description
22
+ from scripts.run_eval import find_project_root, run_eval
23
+ from scripts.utils import parse_skill_md
24
+
25
+
26
def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]:
    """Split eval set into train and test sets, stratified by should_trigger.

    Args:
        eval_set: Eval entries; each must have a ``should_trigger`` key.
        holdout: Fraction of each stratum to place in the test set.
        seed: Seed for the shuffle, so the split is reproducible.

    Returns:
        A ``(train_set, test_set)`` tuple. Within each set, the
        should-trigger entries come first, followed by the rest.
    """
    # Use a private generator instead of random.seed(): same shuffle order for
    # a given seed, but the process-wide RNG state is left untouched.
    rng = random.Random(seed)

    # Separate by should_trigger
    trigger = [e for e in eval_set if e["should_trigger"]]
    no_trigger = [e for e in eval_set if not e["should_trigger"]]

    # Shuffle each group (trigger first, to keep the split stable per seed)
    rng.shuffle(trigger)
    rng.shuffle(no_trigger)

    # At least one entry per stratum goes to the test set; slicing keeps this
    # safe even when a stratum is empty.
    n_trigger_test = max(1, int(len(trigger) * holdout))
    n_no_trigger_test = max(1, int(len(no_trigger) * holdout))

    test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
    train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]

    return train_set, test_set
47
+
48
+
49
+ def run_loop(
50
+ eval_set: list[dict],
51
+ skill_path: Path,
52
+ description_override: str | None,
53
+ num_workers: int,
54
+ timeout: int,
55
+ max_iterations: int,
56
+ runs_per_query: int,
57
+ trigger_threshold: float,
58
+ holdout: float,
59
+ model: str,
60
+ verbose: bool,
61
+ live_report_path: Path | None = None,
62
+ log_dir: Path | None = None,
63
+ ) -> dict:
64
+ """Run the eval + improvement loop."""
65
+ project_root = find_project_root()
66
+ name, original_description, content = parse_skill_md(skill_path)
67
+ current_description = description_override or original_description
68
+
69
+ # Split into train/test if holdout > 0
70
+ if holdout > 0:
71
+ train_set, test_set = split_eval_set(eval_set, holdout)
72
+ if verbose:
73
+ print(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})", file=sys.stderr)
74
+ else:
75
+ train_set = eval_set
76
+ test_set = []
77
+
78
+ client = anthropic.Anthropic()
79
+ history = []
80
+ exit_reason = "unknown"
81
+
82
+ for iteration in range(1, max_iterations + 1):
83
+ if verbose:
84
+ print(f"\n{'='*60}", file=sys.stderr)
85
+ print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr)
86
+ print(f"Description: {current_description}", file=sys.stderr)
87
+ print(f"{'='*60}", file=sys.stderr)
88
+
89
+ # Evaluate train + test together in one batch for parallelism
90
+ all_queries = train_set + test_set
91
+ t0 = time.time()
92
+ all_results = run_eval(
93
+ eval_set=all_queries,
94
+ skill_name=name,
95
+ description=current_description,
96
+ num_workers=num_workers,
97
+ timeout=timeout,
98
+ project_root=project_root,
99
+ runs_per_query=runs_per_query,
100
+ trigger_threshold=trigger_threshold,
101
+ model=model,
102
+ )
103
+ eval_elapsed = time.time() - t0
104
+
105
+ # Split results back into train/test by matching queries
106
+ train_queries_set = {q["query"] for q in train_set}
107
+ train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set]
108
+ test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set]
109
+
110
+ train_passed = sum(1 for r in train_result_list if r["pass"])
111
+ train_total = len(train_result_list)
112
+ train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total}
113
+ train_results = {"results": train_result_list, "summary": train_summary}
114
+
115
+ if test_set:
116
+ test_passed = sum(1 for r in test_result_list if r["pass"])
117
+ test_total = len(test_result_list)
118
+ test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total}
119
+ test_results = {"results": test_result_list, "summary": test_summary}
120
+ else:
121
+ test_results = None
122
+ test_summary = None
123
+
124
+ history.append({
125
+ "iteration": iteration,
126
+ "description": current_description,
127
+ "train_passed": train_summary["passed"],
128
+ "train_failed": train_summary["failed"],
129
+ "train_total": train_summary["total"],
130
+ "train_results": train_results["results"],
131
+ "test_passed": test_summary["passed"] if test_summary else None,
132
+ "test_failed": test_summary["failed"] if test_summary else None,
133
+ "test_total": test_summary["total"] if test_summary else None,
134
+ "test_results": test_results["results"] if test_results else None,
135
+ # For backward compat with report generator
136
+ "passed": train_summary["passed"],
137
+ "failed": train_summary["failed"],
138
+ "total": train_summary["total"],
139
+ "results": train_results["results"],
140
+ })
141
+
142
+ # Write live report if path provided
143
+ if live_report_path:
144
+ partial_output = {
145
+ "original_description": original_description,
146
+ "best_description": current_description,
147
+ "best_score": "in progress",
148
+ "iterations_run": len(history),
149
+ "holdout": holdout,
150
+ "train_size": len(train_set),
151
+ "test_size": len(test_set),
152
+ "history": history,
153
+ }
154
+ live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
155
+
156
+ if verbose:
157
+ def print_eval_stats(label, results, elapsed):
158
+ pos = [r for r in results if r["should_trigger"]]
159
+ neg = [r for r in results if not r["should_trigger"]]
160
+ tp = sum(r["triggers"] for r in pos)
161
+ pos_runs = sum(r["runs"] for r in pos)
162
+ fn = pos_runs - tp
163
+ fp = sum(r["triggers"] for r in neg)
164
+ neg_runs = sum(r["runs"] for r in neg)
165
+ tn = neg_runs - fp
166
+ total = tp + tn + fp + fn
167
+ precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
168
+ recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
169
+ accuracy = (tp + tn) / total if total > 0 else 0.0
170
+ print(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)", file=sys.stderr)
171
+ for r in results:
172
+ status = "PASS" if r["pass"] else "FAIL"
173
+ rate_str = f"{r['triggers']}/{r['runs']}"
174
+ print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}", file=sys.stderr)
175
+
176
+ print_eval_stats("Train", train_results["results"], eval_elapsed)
177
+ if test_summary:
178
+ print_eval_stats("Test ", test_results["results"], 0)
179
+
180
+ if train_summary["failed"] == 0:
181
+ exit_reason = f"all_passed (iteration {iteration})"
182
+ if verbose:
183
+ print(f"\nAll train queries passed on iteration {iteration}!", file=sys.stderr)
184
+ break
185
+
186
+ if iteration == max_iterations:
187
+ exit_reason = f"max_iterations ({max_iterations})"
188
+ if verbose:
189
+ print(f"\nMax iterations reached ({max_iterations}).", file=sys.stderr)
190
+ break
191
+
192
+ # Improve the description based on train results
193
+ if verbose:
194
+ print(f"\nImproving description...", file=sys.stderr)
195
+
196
+ t0 = time.time()
197
+ # Strip test scores from history so improvement model can't see them
198
+ blinded_history = [
199
+ {k: v for k, v in h.items() if not k.startswith("test_")}
200
+ for h in history
201
+ ]
202
+ new_description = improve_description(
203
+ client=client,
204
+ skill_name=name,
205
+ skill_content=content,
206
+ current_description=current_description,
207
+ eval_results=train_results,
208
+ history=blinded_history,
209
+ model=model,
210
+ log_dir=log_dir,
211
+ iteration=iteration,
212
+ )
213
+ improve_elapsed = time.time() - t0
214
+
215
+ if verbose:
216
+ print(f"Proposed ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)
217
+
218
+ current_description = new_description
219
+
220
+ # Find the best iteration by TEST score (or train if no test set)
221
+ if test_set:
222
+ best = max(history, key=lambda h: h["test_passed"] or 0)
223
+ best_score = f"{best['test_passed']}/{best['test_total']}"
224
+ else:
225
+ best = max(history, key=lambda h: h["train_passed"])
226
+ best_score = f"{best['train_passed']}/{best['train_total']}"
227
+
228
+ if verbose:
229
+ print(f"\nExit reason: {exit_reason}", file=sys.stderr)
230
+ print(f"Best score: {best_score} (iteration {best['iteration']})", file=sys.stderr)
231
+
232
+ return {
233
+ "exit_reason": exit_reason,
234
+ "original_description": original_description,
235
+ "best_description": best["description"],
236
+ "best_score": best_score,
237
+ "best_train_score": f"{best['train_passed']}/{best['train_total']}",
238
+ "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
239
+ "final_description": current_description,
240
+ "iterations_run": len(history),
241
+ "holdout": holdout,
242
+ "train_size": len(train_set),
243
+ "test_size": len(test_set),
244
+ "history": history,
245
+ }
246
+
247
+
248
def main():
    """Command-line entry point: parse arguments, run the loop, save outputs.

    Prints the JSON result to stdout. Unless ``--report none`` is given, a
    live HTML report is opened in the browser and replaced by the final
    report once the loop finishes. With ``--results-dir``, ``results.json``
    and (if reporting is enabled) ``report.html`` are written to a
    timestamped subdirectory, and a ``logs`` path inside it is passed to
    the loop. Exits with status 1 if the skill directory has no SKILL.md.
    """
    parser = argparse.ArgumentParser(description="Run eval + improve loop")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override starting description")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
    parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)")
    parser.add_argument("--model", required=True, help="Model for improvement")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)")
    parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here")
    args = parser.parse_args()

    # JSON interchange text is UTF-8; do not depend on the locale encoding.
    eval_set = json.loads(Path(args.eval_set).read_text(encoding="utf-8"))
    skill_path = Path(args.skill_path)

    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    name, _, _ = parse_skill_md(skill_path)

    # Set up live report path
    if args.report != "none":
        if args.report == "auto":
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
        else:
            live_report_path = Path(args.report)
        # Open the report immediately so the user can watch
        live_report_path.write_text("<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>")
        # webbrowser.open() expects a URL; passing a bare filename is
        # documented as unsupported and non-portable, so hand it a file URI.
        webbrowser.open(live_report_path.resolve().as_uri())
    else:
        live_report_path = None

    # Determine output directory (create before run_loop so logs can be written)
    if args.results_dir:
        timestamp = time.strftime("%Y-%m-%d_%H%M%S")
        results_dir = Path(args.results_dir) / timestamp
        results_dir.mkdir(parents=True, exist_ok=True)
    else:
        results_dir = None

    log_dir = results_dir / "logs" if results_dir else None

    output = run_loop(
        eval_set=eval_set,
        skill_path=skill_path,
        description_override=args.description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        max_iterations=args.max_iterations,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        holdout=args.holdout,
        model=args.model,
        verbose=args.verbose,
        live_report_path=live_report_path,
        log_dir=log_dir,
    )

    # Save JSON output
    json_output = json.dumps(output, indent=2)
    print(json_output)
    if results_dir:
        (results_dir / "results.json").write_text(json_output)

    # Write final HTML report (without auto-refresh); render it once and
    # reuse the markup for the copy kept in the results directory.
    if live_report_path:
        final_html = generate_html(output, auto_refresh=False, skill_name=name)
        live_report_path.write_text(final_html)
        print(f"\nReport: {live_report_path}", file=sys.stderr)
        if results_dir:
            (results_dir / "report.html").write_text(final_html)

    if results_dir:
        print(f"Results saved to: {results_dir}", file=sys.stderr)


if __name__ == "__main__":
    main()
@@ -0,0 +1,47 @@
1
+ """Shared utilities for skill-creator scripts."""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+
7
def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
    """Parse a SKILL.md file, returning (name, description, full_content).

    Only the ``name:`` and ``description:`` keys of the leading ``---``
    delimited frontmatter are read. A description introduced by a YAML
    block-scalar indicator has its indented continuation lines joined with
    single spaces.

    Args:
        skill_path: Directory containing ``SKILL.md``.

    Returns:
        ``(name, description, content)`` where ``content`` is the whole file
        text; ``name`` and ``description`` are ``""`` when the key is absent.

    Raises:
        ValueError: If the opening or closing ``---`` delimiter is missing.
    """
    # Decode explicitly as UTF-8 rather than the locale encoding; "utf-8-sig"
    # also drops a leading BOM, which would otherwise hide the opening "---".
    content = (skill_path / "SKILL.md").read_text(encoding="utf-8-sig")
    lines = content.split("\n")

    if lines[0].strip() != "---":
        raise ValueError("SKILL.md missing frontmatter (no opening ---)")

    # Index of the first closing delimiter after the opening line.
    end_idx = next(
        (idx for idx, text in enumerate(lines[1:], start=1) if text.strip() == "---"),
        None,
    )
    if end_idx is None:
        raise ValueError("SKILL.md missing frontmatter (no closing ---)")

    # YAML block-scalar headers: folded/literal, with optional chomping.
    block_scalar_indicators = (">", "|", ">-", "|-", ">+", "|+")

    name = ""
    description = ""
    frontmatter_lines = lines[1:end_idx]
    i = 0
    while i < len(frontmatter_lines):
        line = frontmatter_lines[i]
        if line.startswith("name:"):
            name = line[len("name:"):].strip().strip('"').strip("'")
        elif line.startswith("description:"):
            value = line[len("description:"):].strip()
            if value in block_scalar_indicators:
                # Consume the indented continuation lines that follow.
                continuation_lines: list[str] = []
                i += 1
                while i < len(frontmatter_lines) and frontmatter_lines[i].startswith((" ", "\t")):
                    continuation_lines.append(frontmatter_lines[i].strip())
                    i += 1
                description = " ".join(continuation_lines)
                # i already points at the first unconsumed line.
                continue
            description = value.strip('"').strip("'")
        i += 1

    return name, description, content