@cleocode/skills 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. package/dispatch-config.json +404 -0
  2. package/index.d.ts +178 -0
  3. package/index.js +405 -0
  4. package/package.json +14 -0
  5. package/profiles/core.json +7 -0
  6. package/profiles/full.json +10 -0
  7. package/profiles/minimal.json +7 -0
  8. package/profiles/recommended.json +7 -0
  9. package/provider-skills-map.json +97 -0
  10. package/skills/_shared/cleo-style-guide.md +84 -0
  11. package/skills/_shared/manifest-operations.md +810 -0
  12. package/skills/_shared/placeholders.json +433 -0
  13. package/skills/_shared/skill-chaining-patterns.md +237 -0
  14. package/skills/_shared/subagent-protocol-base.md +223 -0
  15. package/skills/_shared/task-system-integration.md +232 -0
  16. package/skills/_shared/testing-framework-config.md +110 -0
  17. package/skills/ct-cleo/SKILL.md +490 -0
  18. package/skills/ct-cleo/references/anti-patterns.md +19 -0
  19. package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
  20. package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
  21. package/skills/ct-cleo/references/session-protocol.md +162 -0
  22. package/skills/ct-codebase-mapper/SKILL.md +82 -0
  23. package/skills/ct-contribution/SKILL.md +521 -0
  24. package/skills/ct-contribution/templates/contribution-init.json +21 -0
  25. package/skills/ct-dev-workflow/SKILL.md +423 -0
  26. package/skills/ct-docs-lookup/SKILL.md +66 -0
  27. package/skills/ct-docs-review/SKILL.md +175 -0
  28. package/skills/ct-docs-write/SKILL.md +108 -0
  29. package/skills/ct-documentor/SKILL.md +231 -0
  30. package/skills/ct-epic-architect/SKILL.md +305 -0
  31. package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
  32. package/skills/ct-epic-architect/references/commands.md +201 -0
  33. package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
  34. package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
  35. package/skills/ct-epic-architect/references/output-format.md +92 -0
  36. package/skills/ct-epic-architect/references/patterns.md +284 -0
  37. package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
  38. package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
  39. package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
  40. package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
  41. package/skills/ct-grade/SKILL.md +230 -0
  42. package/skills/ct-grade/agents/analysis-reporter.md +203 -0
  43. package/skills/ct-grade/agents/blind-comparator.md +157 -0
  44. package/skills/ct-grade/agents/scenario-runner.md +134 -0
  45. package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  46. package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
  47. package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
  48. package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
  49. package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
  50. package/skills/ct-grade/eval-viewer/viewer.html +219 -0
  51. package/skills/ct-grade/evals/evals.json +94 -0
  52. package/skills/ct-grade/references/ab-test-methodology.md +150 -0
  53. package/skills/ct-grade/references/domains.md +137 -0
  54. package/skills/ct-grade/references/grade-spec.md +236 -0
  55. package/skills/ct-grade/references/scenario-playbook.md +234 -0
  56. package/skills/ct-grade/references/token-tracking.md +120 -0
  57. package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
  58. package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
  59. package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
  60. package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
  61. package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
  62. package/skills/ct-grade/scripts/generate_report.py +283 -0
  63. package/skills/ct-grade/scripts/run_ab_test.py +504 -0
  64. package/skills/ct-grade/scripts/run_all.py +287 -0
  65. package/skills/ct-grade/scripts/setup_run.py +183 -0
  66. package/skills/ct-grade/scripts/token_tracker.py +630 -0
  67. package/skills/ct-grade-v2-1/SKILL.md +237 -0
  68. package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
  69. package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
  70. package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
  71. package/skills/ct-grade-v2-1/evals/evals.json +74 -0
  72. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
  73. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  74. package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
  75. package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
  76. package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
  77. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
  78. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
  79. package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
  80. package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
  81. package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
  82. package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
  83. package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
  84. package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
  85. package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
  86. package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
  87. package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
  88. package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
  89. package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
  90. package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
  91. package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
  92. package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
  93. package/skills/ct-memory/SKILL.md +84 -0
  94. package/skills/ct-orchestrator/INSTALL.md +61 -0
  95. package/skills/ct-orchestrator/README.md +69 -0
  96. package/skills/ct-orchestrator/SKILL.md +380 -0
  97. package/skills/ct-orchestrator/manifest-entry.json +19 -0
  98. package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
  99. package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
  100. package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
  101. package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
  102. package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
  103. package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
  104. package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
  105. package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
  106. package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
  107. package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
  108. package/skills/ct-research-agent/SKILL.md +226 -0
  109. package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
  110. package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
  111. package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
  112. package/skills/ct-skill-creator/SKILL.md +356 -0
  113. package/skills/ct-skill-creator/agents/analyzer.md +276 -0
  114. package/skills/ct-skill-creator/agents/comparator.md +204 -0
  115. package/skills/ct-skill-creator/agents/grader.md +225 -0
  116. package/skills/ct-skill-creator/assets/eval_review.html +146 -0
  117. package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
  118. package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
  119. package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
  120. package/skills/ct-skill-creator/manifest-entry.json +17 -0
  121. package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
  122. package/skills/ct-skill-creator/references/frontmatter.md +83 -0
  123. package/skills/ct-skill-creator/references/invocation-control.md +165 -0
  124. package/skills/ct-skill-creator/references/output-patterns.md +86 -0
  125. package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
  126. package/skills/ct-skill-creator/references/schemas.md +430 -0
  127. package/skills/ct-skill-creator/references/workflows.md +28 -0
  128. package/skills/ct-skill-creator/scripts/__init__.py +1 -0
  129. package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  130. package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
  131. package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
  132. package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
  133. package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
  134. package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
  135. package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
  136. package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
  137. package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
  138. package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
  139. package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
  140. package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
  141. package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
  142. package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
  143. package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
  144. package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
  145. package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
  146. package/skills/ct-skill-creator/scripts/utils.py +47 -0
  147. package/skills/ct-skill-validator/SKILL.md +178 -0
  148. package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
  149. package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
  150. package/skills/ct-skill-validator/evals/eval_set.json +14 -0
  151. package/skills/ct-skill-validator/evals/evals.json +52 -0
  152. package/skills/ct-skill-validator/manifest-entry.json +20 -0
  153. package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
  154. package/skills/ct-skill-validator/references/validation-rules.md +168 -0
  155. package/skills/ct-skill-validator/scripts/__init__.py +0 -0
  156. package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
  157. package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
  158. package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
  159. package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
  160. package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
  161. package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
  162. package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
  163. package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
  164. package/skills/ct-skill-validator/scripts/validate.py +422 -0
  165. package/skills/ct-spec-writer/SKILL.md +189 -0
  166. package/skills/ct-stickynote/README.md +14 -0
  167. package/skills/ct-stickynote/SKILL.md +46 -0
  168. package/skills/ct-task-executor/SKILL.md +296 -0
  169. package/skills/ct-validator/SKILL.md +216 -0
  170. package/skills/manifest.json +469 -0
  171. package/skills.json +281 -0
@@ -0,0 +1,310 @@
1
+ #!/usr/bin/env python3
2
+ """Run trigger evaluation for a skill description.
3
+
4
+ Tests whether a skill's description causes Claude to trigger (read the skill)
5
+ for a set of queries. Outputs results as JSON.
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import os
11
+ import select
12
+ import subprocess
13
+ import sys
14
+ import time
15
+ import uuid
16
+ from concurrent.futures import ProcessPoolExecutor, as_completed
17
+ from pathlib import Path
18
+
19
+ from scripts.utils import parse_skill_md
20
+
21
+
22
def find_project_root() -> Path:
    """Locate the project root: the nearest directory at or above cwd holding .claude/.

    Mirrors how Claude Code discovers its own project root, so the command
    file we create ends up where `claude -p` will look for it. Falls back
    to the current working directory when no .claude/ directory is found.
    """
    cwd = Path.cwd()
    return next(
        (candidate for candidate in (cwd, *cwd.parents) if (candidate / ".claude").is_dir()),
        cwd,
    )
33
+
34
+
35
def run_single_query(
    query: str,
    skill_name: str,
    skill_description: str,
    timeout: int,
    project_root: str,
    model: str | None = None,
) -> bool:
    """Run a single query through `claude -p` and report whether the skill triggered.

    A throwaway command file is written into .claude/commands/ (with a
    uuid-suffixed name) so the skill appears in Claude's available_skills
    list; the file is always removed afterwards. Triggering is detected
    early from --include-partial-messages stream events
    (content_block_start / content_block_delta) rather than waiting for
    the full assistant message, which only arrives after tool execution.

    Args:
        query: Raw user query passed to `claude -p`.
        skill_name: Base name of the skill under test.
        skill_description: Description installed in the temporary command file.
        timeout: Seconds to wait before killing the subprocess.
        project_root: Directory used both as cwd and as the .claude/ root.
        model: Optional model override for the CLI.

    Returns:
        True if a Skill/Read tool call referenced the temporary command
        name, False otherwise (including on timeout).
    """
    unique_id = uuid.uuid4().hex[:8]
    clean_name = f"{skill_name}-skill-{unique_id}"
    project_commands_dir = Path(project_root) / ".claude" / "commands"
    command_file = project_commands_dir / f"{clean_name}.md"

    try:
        project_commands_dir.mkdir(parents=True, exist_ok=True)
        # Use YAML block scalar to avoid breaking on quotes in description
        indented_desc = "\n ".join(skill_description.split("\n"))
        command_content = (
            f"---\n"
            f"description: |\n"
            f" {indented_desc}\n"
            f"---\n\n"
            f"# {skill_name}\n\n"
            f"This skill handles: {skill_description}\n"
        )
        command_file.write_text(command_content)

        cmd = [
            "claude",
            "-p", query,
            "--output-format", "stream-json",
            "--verbose",
            "--include-partial-messages",
        ]
        if model:
            cmd.extend(["--model", model])

        # Remove CLAUDECODE env var to allow nesting claude -p inside a
        # Claude Code session. The guard is for interactive terminal conflicts;
        # programmatic subprocess usage is safe.
        env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            cwd=project_root,
            env=env,
        )

        triggered = False
        start_time = time.time()
        buffer = ""
        # Track state for stream event detection
        pending_tool_name = None
        accumulated_json = ""
        eof = False  # set once the subprocess has no more output for us

        try:
            while time.time() - start_time < timeout:
                if process.poll() is not None:
                    # Process exited: drain remaining output, then fall
                    # through to the parse loop below. (Bug fix: the original
                    # broke out here immediately, dropping any events still
                    # sitting in the buffer or the final read.)
                    remaining = process.stdout.read()
                    if remaining:
                        buffer += remaining.decode("utf-8", errors="replace")
                    eof = True
                else:
                    ready, _, _ = select.select([process.stdout], [], [], 1.0)
                    if not ready:
                        continue
                    chunk = os.read(process.stdout.fileno(), 8192)
                    if not chunk:
                        # EOF on the pipe; parse whatever is buffered first.
                        eof = True
                    else:
                        buffer += chunk.decode("utf-8", errors="replace")

                # At EOF, make sure a trailing line without a newline still
                # gets picked up by the line loop below.
                if eof and buffer and not buffer.endswith("\n"):
                    buffer += "\n"

                while "\n" in buffer:
                    line, buffer = buffer.split("\n", 1)
                    line = line.strip()
                    if not line:
                        continue

                    try:
                        event = json.loads(line)
                    except json.JSONDecodeError:
                        continue

                    # Early detection via stream events
                    if event.get("type") == "stream_event":
                        se = event.get("event", {})
                        se_type = se.get("type", "")

                        if se_type == "content_block_start":
                            cb = se.get("content_block", {})
                            if cb.get("type") == "tool_use":
                                tool_name = cb.get("name", "")
                                if tool_name in ("Skill", "Read"):
                                    pending_tool_name = tool_name
                                    accumulated_json = ""
                                else:
                                    # Any other tool was chosen first:
                                    # count this run as not triggered.
                                    return False

                        elif se_type == "content_block_delta" and pending_tool_name:
                            delta = se.get("delta", {})
                            if delta.get("type") == "input_json_delta":
                                accumulated_json += delta.get("partial_json", "")
                                if clean_name in accumulated_json:
                                    return True

                        elif se_type in ("content_block_stop", "message_stop"):
                            if pending_tool_name:
                                return clean_name in accumulated_json
                            if se_type == "message_stop":
                                return False

                    # Fallback: full assistant message
                    elif event.get("type") == "assistant":
                        message = event.get("message", {})
                        for content_item in message.get("content", []):
                            if content_item.get("type") != "tool_use":
                                continue
                            tool_name = content_item.get("name", "")
                            tool_input = content_item.get("input", {})
                            if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
                                triggered = True
                            elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
                                triggered = True
                        return triggered

                    elif event.get("type") == "result":
                        return triggered

                if eof:
                    break
        finally:
            # Clean up process on any exit path (return, exception, timeout)
            if process.poll() is None:
                process.kill()
            process.wait()

        return triggered
    finally:
        if command_file.exists():
            command_file.unlink()
182
+
183
+
184
+ def run_eval(
185
+ eval_set: list[dict],
186
+ skill_name: str,
187
+ description: str,
188
+ num_workers: int,
189
+ timeout: int,
190
+ project_root: Path,
191
+ runs_per_query: int = 1,
192
+ trigger_threshold: float = 0.5,
193
+ model: str | None = None,
194
+ ) -> dict:
195
+ """Run the full eval set and return results."""
196
+ results = []
197
+
198
+ with ProcessPoolExecutor(max_workers=num_workers) as executor:
199
+ future_to_info = {}
200
+ for item in eval_set:
201
+ for run_idx in range(runs_per_query):
202
+ future = executor.submit(
203
+ run_single_query,
204
+ item["query"],
205
+ skill_name,
206
+ description,
207
+ timeout,
208
+ str(project_root),
209
+ model,
210
+ )
211
+ future_to_info[future] = (item, run_idx)
212
+
213
+ query_triggers: dict[str, list[bool]] = {}
214
+ query_items: dict[str, dict] = {}
215
+ for future in as_completed(future_to_info):
216
+ item, _ = future_to_info[future]
217
+ query = item["query"]
218
+ query_items[query] = item
219
+ if query not in query_triggers:
220
+ query_triggers[query] = []
221
+ try:
222
+ query_triggers[query].append(future.result())
223
+ except Exception as e:
224
+ print(f"Warning: query failed: {e}", file=sys.stderr)
225
+ query_triggers[query].append(False)
226
+
227
+ for query, triggers in query_triggers.items():
228
+ item = query_items[query]
229
+ trigger_rate = sum(triggers) / len(triggers)
230
+ should_trigger = item["should_trigger"]
231
+ if should_trigger:
232
+ did_pass = trigger_rate >= trigger_threshold
233
+ else:
234
+ did_pass = trigger_rate < trigger_threshold
235
+ results.append({
236
+ "query": query,
237
+ "should_trigger": should_trigger,
238
+ "trigger_rate": trigger_rate,
239
+ "triggers": sum(triggers),
240
+ "runs": len(triggers),
241
+ "pass": did_pass,
242
+ })
243
+
244
+ passed = sum(1 for r in results if r["pass"])
245
+ total = len(results)
246
+
247
+ return {
248
+ "skill_name": skill_name,
249
+ "description": description,
250
+ "results": results,
251
+ "summary": {
252
+ "total": total,
253
+ "passed": passed,
254
+ "failed": total - passed,
255
+ },
256
+ }
257
+
258
+
259
def main():
    """CLI entry point: evaluate a skill description and print JSON results."""
    parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override description to test")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    args = parser.parse_args()

    queries = json.loads(Path(args.eval_set).read_text())
    skill_dir = Path(args.skill_path)

    if not (skill_dir / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_dir}", file=sys.stderr)
        sys.exit(1)

    name, parsed_description, _content = parse_skill_md(skill_dir)
    # CLI override wins; otherwise test the description from SKILL.md.
    description = args.description or parsed_description

    if args.verbose:
        print(f"Evaluating: {description}", file=sys.stderr)

    report = run_eval(
        eval_set=queries,
        skill_name=name,
        description=description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        project_root=find_project_root(),
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        model=args.model,
    )

    if args.verbose:
        summary = report["summary"]
        print(f"Results: {summary['passed']}/{summary['total']} passed", file=sys.stderr)
        for entry in report["results"]:
            status = "PASS" if entry["pass"] else "FAIL"
            rate_str = f"{entry['triggers']}/{entry['runs']}"
            print(f" [{status}] rate={rate_str} expected={entry['should_trigger']}: {entry['query'][:70]}", file=sys.stderr)

    # Machine-readable result on stdout; all progress went to stderr.
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()
@@ -0,0 +1,328 @@
1
+ #!/usr/bin/env python3
2
+ """Run the eval + improve loop until all pass or max iterations reached.
3
+
4
+ Combines run_eval.py and improve_description.py in a loop, tracking history
5
+ and returning the best description found. Supports train/test split to prevent
6
+ overfitting.
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import random
12
+ import sys
13
+ import tempfile
14
+ import time
15
+ import webbrowser
16
+ from pathlib import Path
17
+
18
+ from scripts.generate_report import generate_html
19
+ from scripts.improve_description import improve_description
20
+ from scripts.run_eval import find_project_root, run_eval
21
+ from scripts.utils import parse_skill_md
22
+
23
+
24
def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]:
    """Split the eval set into (train, test), stratified by should_trigger.

    Fix: uses a private ``random.Random(seed)`` instance instead of calling
    ``random.seed()``, so the split no longer clobbers the global RNG state
    for every other consumer of the ``random`` module. The shuffle sequence
    for a given seed is identical to the previous global-seed behavior.

    Args:
        eval_set: Query dicts, each with a boolean "should_trigger" key.
        holdout: Fraction of each class to place in the test set. At least
            one item per non-empty class is always held out (``max(1, ...)``),
            so a class with a single example contributes nothing to train.
        seed: Seed for the deterministic shuffle.

    Returns:
        (train_set, test_set) — disjoint, covering eval_set exactly once.
    """
    rng = random.Random(seed)

    # Separate by should_trigger so both classes appear in both splits.
    trigger = [e for e in eval_set if e["should_trigger"]]
    no_trigger = [e for e in eval_set if not e["should_trigger"]]

    # Shuffle each group deterministically.
    rng.shuffle(trigger)
    rng.shuffle(no_trigger)

    # Calculate split points (at least one test item per non-empty class).
    n_trigger_test = max(1, int(len(trigger) * holdout))
    n_no_trigger_test = max(1, int(len(no_trigger) * holdout))

    # Split
    test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
    train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]

    return train_set, test_set
45
+
46
+
47
def run_loop(
    eval_set: list[dict],
    skill_path: Path,
    description_override: str | None,
    num_workers: int,
    timeout: int,
    max_iterations: int,
    runs_per_query: int,
    trigger_threshold: float,
    holdout: float,
    model: str,
    verbose: bool,
    live_report_path: Path | None = None,
    log_dir: Path | None = None,
) -> dict:
    """Run the eval + improvement loop.

    Each iteration evaluates the current description over train + test in a
    single parallel batch, records the outcome in ``history``, then — unless
    the train set fully passed or the iteration budget is spent — asks
    ``improve_description`` for a new candidate. Test scores are blinded
    from the improvement step so it cannot optimize against the holdout.

    Args:
        eval_set: Query dicts with "query" and "should_trigger" keys.
        skill_path: Directory containing the SKILL.md under optimization.
        description_override: Starting description; None means use the one
            parsed from SKILL.md.
        num_workers: Parallel workers forwarded to run_eval.
        timeout: Per-query timeout in seconds.
        max_iterations: Maximum eval/improve cycles.
        runs_per_query: Repeats per query (denominator of the trigger rate).
        trigger_threshold: Rate at/above which a query counts as triggered.
        holdout: Fraction of each class held out as a test split (0 disables).
        model: Model identifier passed to both evaluation and improvement.
        verbose: When True, print progress and statistics to stderr.
        live_report_path: If set, an auto-refreshing HTML report is rewritten
            after every iteration so it can be watched in a browser.
        log_dir: Forwarded to improve_description for its logs.

    Returns:
        Dict with exit reason, original/best/final descriptions, best
        train/test scores, split sizes, and the full per-iteration history.
    """
    project_root = find_project_root()
    name, original_description, content = parse_skill_md(skill_path)
    current_description = description_override or original_description

    # Split into train/test if holdout > 0
    if holdout > 0:
        train_set, test_set = split_eval_set(eval_set, holdout)
        if verbose:
            print(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})", file=sys.stderr)
    else:
        train_set = eval_set
        test_set = []

    history = []
    exit_reason = "unknown"

    for iteration in range(1, max_iterations + 1):
        if verbose:
            print(f"\n{'='*60}", file=sys.stderr)
            print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr)
            print(f"Description: {current_description}", file=sys.stderr)
            print(f"{'='*60}", file=sys.stderr)

        # Evaluate train + test together in one batch for parallelism
        all_queries = train_set + test_set
        t0 = time.time()
        all_results = run_eval(
            eval_set=all_queries,
            skill_name=name,
            description=current_description,
            num_workers=num_workers,
            timeout=timeout,
            project_root=project_root,
            runs_per_query=runs_per_query,
            trigger_threshold=trigger_threshold,
            model=model,
        )
        eval_elapsed = time.time() - t0

        # Split results back into train/test by matching queries
        # (assumes queries are unique across the eval set — TODO confirm;
        # a query string appearing in both splits would land in train only).
        train_queries_set = {q["query"] for q in train_set}
        train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set]
        test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set]

        train_passed = sum(1 for r in train_result_list if r["pass"])
        train_total = len(train_result_list)
        train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total}
        train_results = {"results": train_result_list, "summary": train_summary}

        if test_set:
            test_passed = sum(1 for r in test_result_list if r["pass"])
            test_total = len(test_result_list)
            test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total}
            test_results = {"results": test_result_list, "summary": test_summary}
        else:
            test_results = None
            test_summary = None

        # History entry: test_* keys are None when holdout is disabled.
        history.append({
            "iteration": iteration,
            "description": current_description,
            "train_passed": train_summary["passed"],
            "train_failed": train_summary["failed"],
            "train_total": train_summary["total"],
            "train_results": train_results["results"],
            "test_passed": test_summary["passed"] if test_summary else None,
            "test_failed": test_summary["failed"] if test_summary else None,
            "test_total": test_summary["total"] if test_summary else None,
            "test_results": test_results["results"] if test_results else None,
            # For backward compat with report generator
            "passed": train_summary["passed"],
            "failed": train_summary["failed"],
            "total": train_summary["total"],
            "results": train_results["results"],
        })

        # Write live report if path provided
        if live_report_path:
            partial_output = {
                "original_description": original_description,
                "best_description": current_description,
                "best_score": "in progress",
                "iterations_run": len(history),
                "holdout": holdout,
                "train_size": len(train_set),
                "test_size": len(test_set),
                "history": history,
            }
            live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))

        if verbose:
            # Confusion-matrix style summary, counted over individual runs
            # (runs_per_query per query), not over queries.
            def print_eval_stats(label, results, elapsed):
                pos = [r for r in results if r["should_trigger"]]
                neg = [r for r in results if not r["should_trigger"]]
                tp = sum(r["triggers"] for r in pos)
                pos_runs = sum(r["runs"] for r in pos)
                fn = pos_runs - tp
                fp = sum(r["triggers"] for r in neg)
                neg_runs = sum(r["runs"] for r in neg)
                tn = neg_runs - fp
                total = tp + tn + fp + fn
                precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
                recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
                accuracy = (tp + tn) / total if total > 0 else 0.0
                print(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)", file=sys.stderr)
                for r in results:
                    status = "PASS" if r["pass"] else "FAIL"
                    rate_str = f"{r['triggers']}/{r['runs']}"
                    print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}", file=sys.stderr)

            print_eval_stats("Train", train_results["results"], eval_elapsed)
            if test_summary:
                # Test ran in the same batch, so no separate timing.
                print_eval_stats("Test ", test_results["results"], 0)

        if train_summary["failed"] == 0:
            exit_reason = f"all_passed (iteration {iteration})"
            if verbose:
                print(f"\nAll train queries passed on iteration {iteration}!", file=sys.stderr)
            break

        if iteration == max_iterations:
            # Don't waste an improvement call whose result would never be evaluated.
            exit_reason = f"max_iterations ({max_iterations})"
            if verbose:
                print(f"\nMax iterations reached ({max_iterations}).", file=sys.stderr)
            break

        # Improve the description based on train results
        if verbose:
            print(f"\nImproving description...", file=sys.stderr)

        t0 = time.time()
        # Strip test scores from history so improvement model can't see them
        blinded_history = [
            {k: v for k, v in h.items() if not k.startswith("test_")}
            for h in history
        ]
        new_description = improve_description(
            skill_name=name,
            skill_content=content,
            current_description=current_description,
            eval_results=train_results,
            history=blinded_history,
            model=model,
            log_dir=log_dir,
            iteration=iteration,
        )
        improve_elapsed = time.time() - t0

        if verbose:
            print(f"Proposed ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)

        current_description = new_description

    # Find the best iteration by TEST score (or train if no test set)
    if test_set:
        # `or 0` is defensive: test_passed should never be None here.
        best = max(history, key=lambda h: h["test_passed"] or 0)
        best_score = f"{best['test_passed']}/{best['test_total']}"
    else:
        best = max(history, key=lambda h: h["train_passed"])
        best_score = f"{best['train_passed']}/{best['train_total']}"

    if verbose:
        print(f"\nExit reason: {exit_reason}", file=sys.stderr)
        print(f"Best score: {best_score} (iteration {best['iteration']})", file=sys.stderr)

    return {
        "exit_reason": exit_reason,
        "original_description": original_description,
        "best_description": best["description"],
        "best_score": best_score,
        "best_train_score": f"{best['train_passed']}/{best['train_total']}",
        "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
        "final_description": current_description,
        "iterations_run": len(history),
        "holdout": holdout,
        "train_size": len(train_set),
        "test_size": len(test_set),
        "history": history,
    }
242
+
243
+
244
def main():
    """CLI entry point for the eval + improve loop."""
    parser = argparse.ArgumentParser(description="Run eval + improve loop")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override starting description")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
    parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)")
    parser.add_argument("--model", required=True, help="Model for improvement")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)")
    parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here")
    args = parser.parse_args()

    queries = json.loads(Path(args.eval_set).read_text())
    skill_dir = Path(args.skill_path)

    if not (skill_dir / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_dir}", file=sys.stderr)
        sys.exit(1)

    name, _, _ = parse_skill_md(skill_dir)

    # Resolve where (if anywhere) the live HTML report lives.
    if args.report == "none":
        report_path = None
    elif args.report == "auto":
        stamp = time.strftime("%Y%m%d_%H%M%S")
        report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_dir.name}_{stamp}.html"
    else:
        report_path = Path(args.report)

    if report_path is not None:
        # Open the report immediately so the user can watch progress.
        report_path.write_text("<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>")
        webbrowser.open(str(report_path))

    # Output directory must exist before run_loop so logs can be written.
    out_dir = None
    if args.results_dir:
        out_dir = Path(args.results_dir) / time.strftime("%Y-%m-%d_%H%M%S")
        out_dir.mkdir(parents=True, exist_ok=True)

    output = run_loop(
        eval_set=queries,
        skill_path=skill_dir,
        description_override=args.description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        max_iterations=args.max_iterations,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        holdout=args.holdout,
        model=args.model,
        verbose=args.verbose,
        live_report_path=report_path,
        log_dir=out_dir / "logs" if out_dir else None,
    )

    # Machine-readable results on stdout, plus an on-disk copy if requested.
    payload = json.dumps(output, indent=2)
    print(payload)
    if out_dir:
        (out_dir / "results.json").write_text(payload)

    # Final HTML report without the auto-refresh meta tag.
    if report_path:
        final_html = generate_html(output, auto_refresh=False, skill_name=name)
        report_path.write_text(final_html)
        print(f"\nReport: {report_path}", file=sys.stderr)
        if out_dir:
            (out_dir / "report.html").write_text(final_html)

    if out_dir:
        print(f"Results saved to: {out_dir}", file=sys.stderr)


if __name__ == "__main__":
    main()
@@ -0,0 +1,47 @@
1
+ """Shared utilities for skill-creator scripts."""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+
7
def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
    """Parse a SKILL.md file, returning (name, description, full_content).

    Expects YAML-ish frontmatter delimited by `---` lines. Only the `name:`
    and `description:` keys are extracted; a description given as a YAML
    block scalar (>, |, >-, |-) has its indented continuation lines
    stripped and joined with single spaces.

    Raises:
        ValueError: if the opening or closing frontmatter fence is missing.
    """
    content = (skill_path / "SKILL.md").read_text()
    lines = content.split("\n")

    if lines[0].strip() != "---":
        raise ValueError("SKILL.md missing frontmatter (no opening ---)")

    end_idx = next(
        (i for i, ln in enumerate(lines[1:], start=1) if ln.strip() == "---"),
        None,
    )
    if end_idx is None:
        raise ValueError("SKILL.md missing frontmatter (no closing ---)")

    front = lines[1:end_idx]
    total = len(front)
    name = ""
    description = ""
    idx = 0
    while idx < total:
        current = front[idx]
        if current.startswith("name:"):
            name = current[len("name:"):].strip().strip('"').strip("'")
        elif current.startswith("description:"):
            raw = current[len("description:"):].strip()
            if raw in (">", "|", ">-", "|-"):
                # YAML block scalar: gather the indented continuation lines.
                idx += 1
                collected: list[str] = []
                while idx < total and front[idx][:1] in (" ", "\t"):
                    collected.append(front[idx].strip())
                    idx += 1
                description = " ".join(collected)
                # idx already points at the first non-continuation line.
                continue
            description = raw.strip('"').strip("'")
        idx += 1

    return name, description, content
+ return name, description, content