@intentsolutionsio/skill-creator 5.0.0 → 5.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/package.json +1 -1
  2. package/scripts/validate-skill.py +45 -22
  3. package/skills/agent-creator/SKILL.md +40 -14
  4. package/skills/agent-creator/references/anthropic-agent-spec.md +1 -0
  5. package/skills/skill-creator/SKILL.md +34 -9
  6. package/skills/skill-creator/agents/analyzer.md +11 -0
  7. package/skills/skill-creator/agents/comparator.md +3 -0
  8. package/skills/skill-creator/agents/grader.md +4 -0
  9. package/skills/skill-creator/eval-viewer/generate_review.py +45 -13
  10. package/skills/skill-creator/references/advanced-eval-workflow.md +16 -0
  11. package/skills/skill-creator/references/anthropic-comparison.md +3 -0
  12. package/skills/skill-creator/references/creation-guide.md +20 -1
  13. package/skills/skill-creator/references/errors-template.md +1 -0
  14. package/skills/skill-creator/references/examples-template.md +1 -0
  15. package/skills/skill-creator/references/frontmatter-spec.md +1 -0
  16. package/skills/skill-creator/references/implementation-template.md +1 -0
  17. package/skills/skill-creator/references/output-patterns.md +7 -0
  18. package/skills/skill-creator/references/schemas.md +5 -0
  19. package/skills/skill-creator/references/source-of-truth.md +40 -2
  20. package/skills/skill-creator/references/validation-rules.md +19 -1
  21. package/skills/skill-creator/scripts/__pycache__/__init__.cpython-312.pyc +0 -0
  22. package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-312.pyc +0 -0
  23. package/skills/skill-creator/scripts/__pycache__/utils.cpython-312.pyc +0 -0
  24. package/skills/skill-creator/scripts/aggregate_benchmark.py +46 -60
  25. package/skills/skill-creator/scripts/generate_report.py +29 -17
  26. package/skills/skill-creator/scripts/improve_description.py +18 -21
  27. package/skills/skill-creator/scripts/package_skill.py +2 -2
  28. package/skills/skill-creator/scripts/quick_validate.py +16 -15
  29. package/skills/skill-creator/scripts/run_eval.py +14 -10
  30. package/skills/skill-creator/scripts/run_loop.py +51 -31
  31. package/skills/skill-creator/scripts/utils.py +5 -4
  32. package/skills/skill-creator/templates/agent-template.md +3 -0
  33. package/skills/skill-creator/templates/skill-template.md +4 -0
@@ -41,9 +41,7 @@ def _call_claude(prompt: str, model: str | None, timeout: int = 300) -> str:
41
41
  timeout=timeout,
42
42
  )
43
43
  if result.returncode != 0:
44
- raise RuntimeError(
45
- f"claude -p exited {result.returncode}\nstderr: {result.stderr}"
46
- )
44
+ raise RuntimeError(f"claude -p exited {result.returncode}\nstderr: {result.stderr}")
47
45
  return result.stdout
48
46
 
49
47
 
@@ -59,14 +57,8 @@ def improve_description(
59
57
  iteration: int | None = None,
60
58
  ) -> str:
61
59
  """Call Claude to improve the description based on eval results."""
62
- failed_triggers = [
63
- r for r in eval_results["results"]
64
- if r["should_trigger"] and not r["pass"]
65
- ]
66
- false_triggers = [
67
- r for r in eval_results["results"]
68
- if not r["should_trigger"] and not r["pass"]
69
- ]
60
+ failed_triggers = [r for r in eval_results["results"] if r["should_trigger"] and not r["pass"]]
61
+ false_triggers = [r for r in eval_results["results"] if not r["should_trigger"] and not r["pass"]]
70
62
 
71
63
  # Build scores summary
72
64
  train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}"
@@ -104,9 +96,11 @@ Current scores ({scores_summary}):
104
96
  prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n"
105
97
  for h in history:
106
98
  train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}"
107
- test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None
99
+ test_s = (
100
+ f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get("test_passed") is not None else None
101
+ )
108
102
  score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "")
109
- prompt += f'<attempt {score_str}>\n'
103
+ prompt += f"<attempt {score_str}>\n"
110
104
  prompt += f'Description: "{h["description"]}"\n'
111
105
  if "results" in h:
112
106
  prompt += "Train results:\n"
@@ -114,7 +108,7 @@ Current scores ({scores_summary}):
114
108
  status = "PASS" if r["pass"] else "FAIL"
115
109
  prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n'
116
110
  if h.get("note"):
117
- prompt += f'Note: {h["note"]}\n'
111
+ prompt += f"Note: {h['note']}\n"
118
112
  prompt += "</attempt>\n\n"
119
113
 
120
114
  prompt += f"""</scores_summary>
@@ -232,13 +226,16 @@ def main():
232
226
  # Output as JSON with both the new description and updated history
233
227
  output = {
234
228
  "description": new_description,
235
- "history": history + [{
236
- "description": current_description,
237
- "passed": eval_results["summary"]["passed"],
238
- "failed": eval_results["summary"]["failed"],
239
- "total": eval_results["summary"]["total"],
240
- "results": eval_results["results"],
241
- }],
229
+ "history": history
230
+ + [
231
+ {
232
+ "description": current_description,
233
+ "passed": eval_results["summary"]["passed"],
234
+ "failed": eval_results["summary"]["failed"],
235
+ "total": eval_results["summary"]["total"],
236
+ "results": eval_results["results"],
237
+ }
238
+ ],
242
239
  }
243
240
  print(json.dumps(output, indent=2))
244
241
 
@@ -88,9 +88,9 @@ def package_skill(skill_path, output_dir=None):
88
88
 
89
89
  # Create the .skill file (zip format)
90
90
  try:
91
- with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
91
+ with zipfile.ZipFile(skill_filename, "w", zipfile.ZIP_DEFLATED) as zipf:
92
92
  # Walk through the skill directory, excluding build artifacts
93
- for file_path in skill_path.rglob('*'):
93
+ for file_path in skill_path.rglob("*"):
94
94
  if not file_path.is_file():
95
95
  continue
96
96
  arcname = file_path.relative_to(skill_path.parent)
@@ -4,27 +4,27 @@ Quick validation script for skills - minimal version
4
4
  """
5
5
 
6
6
  import sys
7
- import os
8
7
  import re
9
8
  import yaml
10
9
  from pathlib import Path
11
10
 
11
+
12
12
  def validate_skill(skill_path):
13
13
  """Basic validation of a skill"""
14
14
  skill_path = Path(skill_path)
15
15
 
16
16
  # Check SKILL.md exists
17
- skill_md = skill_path / 'SKILL.md'
17
+ skill_md = skill_path / "SKILL.md"
18
18
  if not skill_md.exists():
19
19
  return False, "SKILL.md not found"
20
20
 
21
21
  # Read and validate frontmatter
22
22
  content = skill_md.read_text()
23
- if not content.startswith('---'):
23
+ if not content.startswith("---"):
24
24
  return False, "No YAML frontmatter found"
25
25
 
26
26
  # Extract frontmatter
27
- match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL)
27
+ match = re.match(r"^---\n(.*?)\n---", content, re.DOTALL)
28
28
  if not match:
29
29
  return False, "Invalid frontmatter format"
30
30
 
@@ -39,7 +39,7 @@ def validate_skill(skill_path):
39
39
  return False, f"Invalid YAML in frontmatter: {e}"
40
40
 
41
41
  # Define allowed properties
42
- ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'}
42
+ ALLOWED_PROPERTIES = {"name", "description", "license", "allowed-tools", "metadata", "compatibility"}
43
43
 
44
44
  # Check for unexpected properties (excluding nested keys under metadata)
45
45
  unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES
@@ -50,41 +50,41 @@ def validate_skill(skill_path):
50
50
  )
51
51
 
52
52
  # Check required fields
53
- if 'name' not in frontmatter:
53
+ if "name" not in frontmatter:
54
54
  return False, "Missing 'name' in frontmatter"
55
- if 'description' not in frontmatter:
55
+ if "description" not in frontmatter:
56
56
  return False, "Missing 'description' in frontmatter"
57
57
 
58
58
  # Extract name for validation
59
- name = frontmatter.get('name', '')
59
+ name = frontmatter.get("name", "")
60
60
  if not isinstance(name, str):
61
61
  return False, f"Name must be a string, got {type(name).__name__}"
62
62
  name = name.strip()
63
63
  if name:
64
64
  # Check naming convention (kebab-case: lowercase with hyphens)
65
- if not re.match(r'^[a-z0-9-]+$', name):
65
+ if not re.match(r"^[a-z0-9-]+$", name):
66
66
  return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)"
67
- if name.startswith('-') or name.endswith('-') or '--' in name:
67
+ if name.startswith("-") or name.endswith("-") or "--" in name:
68
68
  return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens"
69
69
  # Check name length (max 64 characters per spec)
70
70
  if len(name) > 64:
71
71
  return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters."
72
72
 
73
73
  # Extract and validate description
74
- description = frontmatter.get('description', '')
74
+ description = frontmatter.get("description", "")
75
75
  if not isinstance(description, str):
76
76
  return False, f"Description must be a string, got {type(description).__name__}"
77
77
  description = description.strip()
78
78
  if description:
79
79
  # Check for angle brackets
80
- if '<' in description or '>' in description:
80
+ if "<" in description or ">" in description:
81
81
  return False, "Description cannot contain angle brackets (< or >)"
82
82
  # Check description length (max 1024 characters per spec)
83
83
  if len(description) > 1024:
84
84
  return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters."
85
85
 
86
86
  # Validate compatibility field if present (optional)
87
- compatibility = frontmatter.get('compatibility', '')
87
+ compatibility = frontmatter.get("compatibility", "")
88
88
  if compatibility:
89
89
  if not isinstance(compatibility, str):
90
90
  return False, f"Compatibility must be a string, got {type(compatibility).__name__}"
@@ -93,11 +93,12 @@ def validate_skill(skill_path):
93
93
 
94
94
  return True, "Skill is valid!"
95
95
 
96
+
96
97
  if __name__ == "__main__":
97
98
  if len(sys.argv) != 2:
98
99
  print("Usage: python quick_validate.py <skill_directory>")
99
100
  sys.exit(1)
100
-
101
+
101
102
  valid, message = validate_skill(sys.argv[1])
102
103
  print(message)
103
- sys.exit(0 if valid else 1)
104
+ sys.exit(0 if valid else 1)
@@ -101,8 +101,10 @@ def run_single_query(
101
101
 
102
102
  cmd = [
103
103
  "claude",
104
- "-p", query,
105
- "--output-format", "stream-json",
104
+ "-p",
105
+ query,
106
+ "--output-format",
107
+ "stream-json",
106
108
  "--verbose",
107
109
  "--include-partial-messages",
108
110
  ]
@@ -265,14 +267,16 @@ def run_eval(
265
267
  did_pass = trigger_rate >= trigger_threshold
266
268
  else:
267
269
  did_pass = trigger_rate < trigger_threshold
268
- results.append({
269
- "query": query,
270
- "should_trigger": should_trigger,
271
- "trigger_rate": trigger_rate,
272
- "triggers": sum(triggers),
273
- "runs": len(triggers),
274
- "pass": did_pass,
275
- })
270
+ results.append(
271
+ {
272
+ "query": query,
273
+ "should_trigger": should_trigger,
274
+ "trigger_rate": trigger_rate,
275
+ "triggers": sum(triggers),
276
+ "runs": len(triggers),
277
+ "pass": did_pass,
278
+ }
279
+ )
276
280
 
277
281
  passed = sum(1 for r in results if r["pass"])
278
282
  total = len(results)
@@ -78,10 +78,10 @@ def run_loop(
78
78
 
79
79
  for iteration in range(1, max_iterations + 1):
80
80
  if verbose:
81
- print(f"\n{'='*60}", file=sys.stderr)
81
+ print(f"\n{'=' * 60}", file=sys.stderr)
82
82
  print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr)
83
83
  print(f"Description: {current_description}", file=sys.stderr)
84
- print(f"{'='*60}", file=sys.stderr)
84
+ print(f"{'=' * 60}", file=sys.stderr)
85
85
 
86
86
  # Evaluate train + test together in one batch for parallelism
87
87
  all_queries = train_set + test_set
@@ -119,23 +119,25 @@ def run_loop(
119
119
  test_results = None
120
120
  test_summary = None
121
121
 
122
- history.append({
123
- "iteration": iteration,
124
- "description": current_description,
125
- "train_passed": train_summary["passed"],
126
- "train_failed": train_summary["failed"],
127
- "train_total": train_summary["total"],
128
- "train_results": train_results["results"],
129
- "test_passed": test_summary["passed"] if test_summary else None,
130
- "test_failed": test_summary["failed"] if test_summary else None,
131
- "test_total": test_summary["total"] if test_summary else None,
132
- "test_results": test_results["results"] if test_results else None,
133
- # For backward compat with report generator
134
- "passed": train_summary["passed"],
135
- "failed": train_summary["failed"],
136
- "total": train_summary["total"],
137
- "results": train_results["results"],
138
- })
122
+ history.append(
123
+ {
124
+ "iteration": iteration,
125
+ "description": current_description,
126
+ "train_passed": train_summary["passed"],
127
+ "train_failed": train_summary["failed"],
128
+ "train_total": train_summary["total"],
129
+ "train_results": train_results["results"],
130
+ "test_passed": test_summary["passed"] if test_summary else None,
131
+ "test_failed": test_summary["failed"] if test_summary else None,
132
+ "test_total": test_summary["total"] if test_summary else None,
133
+ "test_results": test_results["results"] if test_results else None,
134
+ # For backward compat with report generator
135
+ "passed": train_summary["passed"],
136
+ "failed": train_summary["failed"],
137
+ "total": train_summary["total"],
138
+ "results": train_results["results"],
139
+ }
140
+ )
139
141
 
140
142
  # Write live report if path provided
141
143
  if live_report_path:
@@ -152,6 +154,7 @@ def run_loop(
152
154
  live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
153
155
 
154
156
  if verbose:
157
+
155
158
  def print_eval_stats(label, results, elapsed):
156
159
  pos = [r for r in results if r["should_trigger"]]
157
160
  neg = [r for r in results if not r["should_trigger"]]
@@ -165,11 +168,17 @@ def run_loop(
165
168
  precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
166
169
  recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
167
170
  accuracy = (tp + tn) / total if total > 0 else 0.0
168
- print(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)", file=sys.stderr)
171
+ print(
172
+ f"{label}: {tp + tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)",
173
+ file=sys.stderr,
174
+ )
169
175
  for r in results:
170
176
  status = "PASS" if r["pass"] else "FAIL"
171
177
  rate_str = f"{r['triggers']}/{r['runs']}"
172
- print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}", file=sys.stderr)
178
+ print(
179
+ f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}",
180
+ file=sys.stderr,
181
+ )
173
182
 
174
183
  print_eval_stats("Train", train_results["results"], eval_elapsed)
175
184
  if test_summary:
@@ -189,14 +198,11 @@ def run_loop(
189
198
 
190
199
  # Improve the description based on train results
191
200
  if verbose:
192
- print(f"\nImproving description...", file=sys.stderr)
201
+ print("\nImproving description...", file=sys.stderr)
193
202
 
194
203
  t0 = time.time()
195
204
  # Strip test scores from history so improvement model can't see them
196
- blinded_history = [
197
- {k: v for k, v in h.items() if not k.startswith("test_")}
198
- for h in history
199
- ]
205
+ blinded_history = [{k: v for k, v in h.items() if not k.startswith("test_")} for h in history]
200
206
  new_description = improve_description(
201
207
  skill_name=name,
202
208
  skill_content=content,
@@ -252,11 +258,21 @@ def main():
252
258
  parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations")
253
259
  parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
254
260
  parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
255
- parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)")
261
+ parser.add_argument(
262
+ "--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)"
263
+ )
256
264
  parser.add_argument("--model", required=True, help="Model for improvement")
257
265
  parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
258
- parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)")
259
- parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here")
266
+ parser.add_argument(
267
+ "--report",
268
+ default="auto",
269
+ help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)",
270
+ )
271
+ parser.add_argument(
272
+ "--results-dir",
273
+ default=None,
274
+ help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here",
275
+ )
260
276
  args = parser.parse_args()
261
277
 
262
278
  eval_set = json.loads(Path(args.eval_set).read_text())
@@ -272,11 +288,15 @@ def main():
272
288
  if args.report != "none":
273
289
  if args.report == "auto":
274
290
  timestamp = time.strftime("%Y%m%d_%H%M%S")
275
- live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
291
+ live_report_path = (
292
+ Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
293
+ )
276
294
  else:
277
295
  live_report_path = Path(args.report)
278
296
  # Open the report immediately so the user can watch
279
- live_report_path.write_text("<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>")
297
+ live_report_path.write_text(
298
+ "<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>"
299
+ )
280
300
  webbrowser.open(str(live_report_path))
281
301
  else:
282
302
  live_report_path = None
@@ -3,7 +3,6 @@
3
3
  from pathlib import Path
4
4
 
5
5
 
6
-
7
6
  def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
8
7
  """Parse a SKILL.md file, returning (name, description, full_content)."""
9
8
  content = (skill_path / "SKILL.md").read_text()
@@ -28,14 +27,16 @@ def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
28
27
  while i < len(frontmatter_lines):
29
28
  line = frontmatter_lines[i]
30
29
  if line.startswith("name:"):
31
- name = line[len("name:"):].strip().strip('"').strip("'")
30
+ name = line[len("name:") :].strip().strip('"').strip("'")
32
31
  elif line.startswith("description:"):
33
- value = line[len("description:"):].strip()
32
+ value = line[len("description:") :].strip()
34
33
  # Handle YAML multiline indicators (>, |, >-, |-)
35
34
  if value in (">", "|", ">-", "|-"):
36
35
  continuation_lines: list[str] = []
37
36
  i += 1
38
- while i < len(frontmatter_lines) and (frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")):
37
+ while i < len(frontmatter_lines) and (
38
+ frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")
39
+ ):
39
40
  continuation_lines.append(frontmatter_lines[i].strip())
40
41
  i += 1
41
42
  description = " ".join(continuation_lines)
@@ -76,6 +76,7 @@ You receive these parameters in your prompt:
76
76
  ## When Activated
77
77
 
78
78
  You activate when:
79
+
79
80
  - {{ACTIVATION_CONDITION_1}}
80
81
  - {{ACTIVATION_CONDITION_2}}
81
82
  - {{ACTIVATION_CONDITION_3}}
@@ -89,11 +90,13 @@ You activate when:
89
90
  ## Success Criteria
90
91
 
91
92
  Good output includes:
93
+
92
94
  - {{QUALITY_MARKER_1}}
93
95
  - {{QUALITY_MARKER_2}}
94
96
  - {{QUALITY_MARKER_3}}
95
97
 
96
98
  Poor output is:
99
+
97
100
  - {{ANTI_PATTERN_1}}
98
101
  - {{ANTI_PATTERN_2}}
99
102
  - {{ANTI_PATTERN_3}}
@@ -68,11 +68,13 @@ model: inherit
68
68
  ### {{EXAMPLE_1_TITLE}}
69
69
 
70
70
  **Input:**
71
+
71
72
  ```
72
73
  {{EXAMPLE_1_INPUT}}
73
74
  ```
74
75
 
75
76
  **Output:**
77
+
76
78
  ```
77
79
  {{EXAMPLE_1_OUTPUT}}
78
80
  ```
@@ -80,11 +82,13 @@ model: inherit
80
82
  ### {{EXAMPLE_2_TITLE}}
81
83
 
82
84
  **Input:**
85
+
83
86
  ```
84
87
  {{EXAMPLE_2_INPUT}}
85
88
  ```
86
89
 
87
90
  **Output:**
91
+
88
92
  ```
89
93
  {{EXAMPLE_2_OUTPUT}}
90
94
  ```