@intentsolutionsio/skill-creator 5.0.0 → 5.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30):
  1. package/package.json +1 -1
  2. package/scripts/validate-skill.py +61 -1100
  3. package/skills/agent-creator/SKILL.md +40 -14
  4. package/skills/agent-creator/references/anthropic-agent-spec.md +1 -0
  5. package/skills/skill-creator/SKILL.md +34 -9
  6. package/skills/skill-creator/agents/analyzer.md +39 -1
  7. package/skills/skill-creator/agents/comparator.md +31 -1
  8. package/skills/skill-creator/agents/grader.md +32 -1
  9. package/skills/skill-creator/eval-viewer/generate_review.py +45 -13
  10. package/skills/skill-creator/references/advanced-eval-workflow.md +16 -0
  11. package/skills/skill-creator/references/anthropic-comparison.md +3 -0
  12. package/skills/skill-creator/references/creation-guide.md +20 -1
  13. package/skills/skill-creator/references/errors-template.md +1 -0
  14. package/skills/skill-creator/references/examples-template.md +1 -0
  15. package/skills/skill-creator/references/frontmatter-spec.md +1 -0
  16. package/skills/skill-creator/references/implementation-template.md +1 -0
  17. package/skills/skill-creator/references/output-patterns.md +7 -0
  18. package/skills/skill-creator/references/schemas.md +5 -0
  19. package/skills/skill-creator/references/source-of-truth.md +40 -2
  20. package/skills/skill-creator/references/validation-rules.md +19 -1
  21. package/skills/skill-creator/scripts/aggregate_benchmark.py +46 -60
  22. package/skills/skill-creator/scripts/generate_report.py +29 -17
  23. package/skills/skill-creator/scripts/improve_description.py +18 -21
  24. package/skills/skill-creator/scripts/package_skill.py +2 -2
  25. package/skills/skill-creator/scripts/quick_validate.py +16 -15
  26. package/skills/skill-creator/scripts/run_eval.py +14 -10
  27. package/skills/skill-creator/scripts/run_loop.py +51 -31
  28. package/skills/skill-creator/scripts/utils.py +5 -4
  29. package/skills/skill-creator/templates/agent-template.md +3 -0
  30. package/skills/skill-creator/templates/skill-template.md +4 -0
@@ -78,10 +78,10 @@ def run_loop(
78
78
 
79
79
  for iteration in range(1, max_iterations + 1):
80
80
  if verbose:
81
- print(f"\n{'='*60}", file=sys.stderr)
81
+ print(f"\n{'=' * 60}", file=sys.stderr)
82
82
  print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr)
83
83
  print(f"Description: {current_description}", file=sys.stderr)
84
- print(f"{'='*60}", file=sys.stderr)
84
+ print(f"{'=' * 60}", file=sys.stderr)
85
85
 
86
86
  # Evaluate train + test together in one batch for parallelism
87
87
  all_queries = train_set + test_set
@@ -119,23 +119,25 @@ def run_loop(
119
119
  test_results = None
120
120
  test_summary = None
121
121
 
122
- history.append({
123
- "iteration": iteration,
124
- "description": current_description,
125
- "train_passed": train_summary["passed"],
126
- "train_failed": train_summary["failed"],
127
- "train_total": train_summary["total"],
128
- "train_results": train_results["results"],
129
- "test_passed": test_summary["passed"] if test_summary else None,
130
- "test_failed": test_summary["failed"] if test_summary else None,
131
- "test_total": test_summary["total"] if test_summary else None,
132
- "test_results": test_results["results"] if test_results else None,
133
- # For backward compat with report generator
134
- "passed": train_summary["passed"],
135
- "failed": train_summary["failed"],
136
- "total": train_summary["total"],
137
- "results": train_results["results"],
138
- })
122
+ history.append(
123
+ {
124
+ "iteration": iteration,
125
+ "description": current_description,
126
+ "train_passed": train_summary["passed"],
127
+ "train_failed": train_summary["failed"],
128
+ "train_total": train_summary["total"],
129
+ "train_results": train_results["results"],
130
+ "test_passed": test_summary["passed"] if test_summary else None,
131
+ "test_failed": test_summary["failed"] if test_summary else None,
132
+ "test_total": test_summary["total"] if test_summary else None,
133
+ "test_results": test_results["results"] if test_results else None,
134
+ # For backward compat with report generator
135
+ "passed": train_summary["passed"],
136
+ "failed": train_summary["failed"],
137
+ "total": train_summary["total"],
138
+ "results": train_results["results"],
139
+ }
140
+ )
139
141
 
140
142
  # Write live report if path provided
141
143
  if live_report_path:
@@ -152,6 +154,7 @@ def run_loop(
152
154
  live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
153
155
 
154
156
  if verbose:
157
+
155
158
  def print_eval_stats(label, results, elapsed):
156
159
  pos = [r for r in results if r["should_trigger"]]
157
160
  neg = [r for r in results if not r["should_trigger"]]
@@ -165,11 +168,17 @@ def run_loop(
165
168
  precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
166
169
  recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
167
170
  accuracy = (tp + tn) / total if total > 0 else 0.0
168
- print(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)", file=sys.stderr)
171
+ print(
172
+ f"{label}: {tp + tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)",
173
+ file=sys.stderr,
174
+ )
169
175
  for r in results:
170
176
  status = "PASS" if r["pass"] else "FAIL"
171
177
  rate_str = f"{r['triggers']}/{r['runs']}"
172
- print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}", file=sys.stderr)
178
+ print(
179
+ f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}",
180
+ file=sys.stderr,
181
+ )
173
182
 
174
183
  print_eval_stats("Train", train_results["results"], eval_elapsed)
175
184
  if test_summary:
@@ -189,14 +198,11 @@ def run_loop(
189
198
 
190
199
  # Improve the description based on train results
191
200
  if verbose:
192
- print(f"\nImproving description...", file=sys.stderr)
201
+ print("\nImproving description...", file=sys.stderr)
193
202
 
194
203
  t0 = time.time()
195
204
  # Strip test scores from history so improvement model can't see them
196
- blinded_history = [
197
- {k: v for k, v in h.items() if not k.startswith("test_")}
198
- for h in history
199
- ]
205
+ blinded_history = [{k: v for k, v in h.items() if not k.startswith("test_")} for h in history]
200
206
  new_description = improve_description(
201
207
  skill_name=name,
202
208
  skill_content=content,
@@ -252,11 +258,21 @@ def main():
252
258
  parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations")
253
259
  parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
254
260
  parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
255
- parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)")
261
+ parser.add_argument(
262
+ "--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)"
263
+ )
256
264
  parser.add_argument("--model", required=True, help="Model for improvement")
257
265
  parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
258
- parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)")
259
- parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here")
266
+ parser.add_argument(
267
+ "--report",
268
+ default="auto",
269
+ help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)",
270
+ )
271
+ parser.add_argument(
272
+ "--results-dir",
273
+ default=None,
274
+ help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here",
275
+ )
260
276
  args = parser.parse_args()
261
277
 
262
278
  eval_set = json.loads(Path(args.eval_set).read_text())
@@ -272,11 +288,15 @@ def main():
272
288
  if args.report != "none":
273
289
  if args.report == "auto":
274
290
  timestamp = time.strftime("%Y%m%d_%H%M%S")
275
- live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
291
+ live_report_path = (
292
+ Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
293
+ )
276
294
  else:
277
295
  live_report_path = Path(args.report)
278
296
  # Open the report immediately so the user can watch
279
- live_report_path.write_text("<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>")
297
+ live_report_path.write_text(
298
+ "<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>"
299
+ )
280
300
  webbrowser.open(str(live_report_path))
281
301
  else:
282
302
  live_report_path = None
@@ -3,7 +3,6 @@
3
3
  from pathlib import Path
4
4
 
5
5
 
6
-
7
6
  def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
8
7
  """Parse a SKILL.md file, returning (name, description, full_content)."""
9
8
  content = (skill_path / "SKILL.md").read_text()
@@ -28,14 +27,16 @@ def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
28
27
  while i < len(frontmatter_lines):
29
28
  line = frontmatter_lines[i]
30
29
  if line.startswith("name:"):
31
- name = line[len("name:"):].strip().strip('"').strip("'")
30
+ name = line[len("name:") :].strip().strip('"').strip("'")
32
31
  elif line.startswith("description:"):
33
- value = line[len("description:"):].strip()
32
+ value = line[len("description:") :].strip()
34
33
  # Handle YAML multiline indicators (>, |, >-, |-)
35
34
  if value in (">", "|", ">-", "|-"):
36
35
  continuation_lines: list[str] = []
37
36
  i += 1
38
- while i < len(frontmatter_lines) and (frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")):
37
+ while i < len(frontmatter_lines) and (
38
+ frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")
39
+ ):
39
40
  continuation_lines.append(frontmatter_lines[i].strip())
40
41
  i += 1
41
42
  description = " ".join(continuation_lines)
@@ -76,6 +76,7 @@ You receive these parameters in your prompt:
76
76
  ## When Activated
77
77
 
78
78
  You activate when:
79
+
79
80
  - {{ACTIVATION_CONDITION_1}}
80
81
  - {{ACTIVATION_CONDITION_2}}
81
82
  - {{ACTIVATION_CONDITION_3}}
@@ -89,11 +90,13 @@ You activate when:
89
90
  ## Success Criteria
90
91
 
91
92
  Good output includes:
93
+
92
94
  - {{QUALITY_MARKER_1}}
93
95
  - {{QUALITY_MARKER_2}}
94
96
  - {{QUALITY_MARKER_3}}
95
97
 
96
98
  Poor output is:
99
+
97
100
  - {{ANTI_PATTERN_1}}
98
101
  - {{ANTI_PATTERN_2}}
99
102
  - {{ANTI_PATTERN_3}}
@@ -68,11 +68,13 @@ model: inherit
68
68
  ### {{EXAMPLE_1_TITLE}}
69
69
 
70
70
  **Input:**
71
+
71
72
  ```
72
73
  {{EXAMPLE_1_INPUT}}
73
74
  ```
74
75
 
75
76
  **Output:**
77
+
76
78
  ```
77
79
  {{EXAMPLE_1_OUTPUT}}
78
80
  ```
@@ -80,11 +82,13 @@ model: inherit
80
82
  ### {{EXAMPLE_2_TITLE}}
81
83
 
82
84
  **Input:**
85
+
83
86
  ```
84
87
  {{EXAMPLE_2_INPUT}}
85
88
  ```
86
89
 
87
90
  **Output:**
91
+
88
92
  ```
89
93
  {{EXAMPLE_2_OUTPUT}}
90
94
  ```