@intentsolutionsio/skill-creator 5.0.0 → 5.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/validate-skill.py +61 -1100
- package/skills/agent-creator/SKILL.md +40 -14
- package/skills/agent-creator/references/anthropic-agent-spec.md +1 -0
- package/skills/skill-creator/SKILL.md +34 -9
- package/skills/skill-creator/agents/analyzer.md +39 -1
- package/skills/skill-creator/agents/comparator.md +31 -1
- package/skills/skill-creator/agents/grader.md +32 -1
- package/skills/skill-creator/eval-viewer/generate_review.py +45 -13
- package/skills/skill-creator/references/advanced-eval-workflow.md +16 -0
- package/skills/skill-creator/references/anthropic-comparison.md +3 -0
- package/skills/skill-creator/references/creation-guide.md +20 -1
- package/skills/skill-creator/references/errors-template.md +1 -0
- package/skills/skill-creator/references/examples-template.md +1 -0
- package/skills/skill-creator/references/frontmatter-spec.md +1 -0
- package/skills/skill-creator/references/implementation-template.md +1 -0
- package/skills/skill-creator/references/output-patterns.md +7 -0
- package/skills/skill-creator/references/schemas.md +5 -0
- package/skills/skill-creator/references/source-of-truth.md +40 -2
- package/skills/skill-creator/references/validation-rules.md +19 -1
- package/skills/skill-creator/scripts/aggregate_benchmark.py +46 -60
- package/skills/skill-creator/scripts/generate_report.py +29 -17
- package/skills/skill-creator/scripts/improve_description.py +18 -21
- package/skills/skill-creator/scripts/package_skill.py +2 -2
- package/skills/skill-creator/scripts/quick_validate.py +16 -15
- package/skills/skill-creator/scripts/run_eval.py +14 -10
- package/skills/skill-creator/scripts/run_loop.py +51 -31
- package/skills/skill-creator/scripts/utils.py +5 -4
- package/skills/skill-creator/templates/agent-template.md +3 -0
- package/skills/skill-creator/templates/skill-template.md +4 -0
|
@@ -78,10 +78,10 @@ def run_loop(
|
|
|
78
78
|
|
|
79
79
|
for iteration in range(1, max_iterations + 1):
|
|
80
80
|
if verbose:
|
|
81
|
-
print(f"\n{'='*60}", file=sys.stderr)
|
|
81
|
+
print(f"\n{'=' * 60}", file=sys.stderr)
|
|
82
82
|
print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr)
|
|
83
83
|
print(f"Description: {current_description}", file=sys.stderr)
|
|
84
|
-
print(f"{'='*60}", file=sys.stderr)
|
|
84
|
+
print(f"{'=' * 60}", file=sys.stderr)
|
|
85
85
|
|
|
86
86
|
# Evaluate train + test together in one batch for parallelism
|
|
87
87
|
all_queries = train_set + test_set
|
|
@@ -119,23 +119,25 @@ def run_loop(
|
|
|
119
119
|
test_results = None
|
|
120
120
|
test_summary = None
|
|
121
121
|
|
|
122
|
-
history.append(
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
122
|
+
history.append(
|
|
123
|
+
{
|
|
124
|
+
"iteration": iteration,
|
|
125
|
+
"description": current_description,
|
|
126
|
+
"train_passed": train_summary["passed"],
|
|
127
|
+
"train_failed": train_summary["failed"],
|
|
128
|
+
"train_total": train_summary["total"],
|
|
129
|
+
"train_results": train_results["results"],
|
|
130
|
+
"test_passed": test_summary["passed"] if test_summary else None,
|
|
131
|
+
"test_failed": test_summary["failed"] if test_summary else None,
|
|
132
|
+
"test_total": test_summary["total"] if test_summary else None,
|
|
133
|
+
"test_results": test_results["results"] if test_results else None,
|
|
134
|
+
# For backward compat with report generator
|
|
135
|
+
"passed": train_summary["passed"],
|
|
136
|
+
"failed": train_summary["failed"],
|
|
137
|
+
"total": train_summary["total"],
|
|
138
|
+
"results": train_results["results"],
|
|
139
|
+
}
|
|
140
|
+
)
|
|
139
141
|
|
|
140
142
|
# Write live report if path provided
|
|
141
143
|
if live_report_path:
|
|
@@ -152,6 +154,7 @@ def run_loop(
|
|
|
152
154
|
live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
|
|
153
155
|
|
|
154
156
|
if verbose:
|
|
157
|
+
|
|
155
158
|
def print_eval_stats(label, results, elapsed):
|
|
156
159
|
pos = [r for r in results if r["should_trigger"]]
|
|
157
160
|
neg = [r for r in results if not r["should_trigger"]]
|
|
@@ -165,11 +168,17 @@ def run_loop(
|
|
|
165
168
|
precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
|
|
166
169
|
recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
|
|
167
170
|
accuracy = (tp + tn) / total if total > 0 else 0.0
|
|
168
|
-
print(
|
|
171
|
+
print(
|
|
172
|
+
f"{label}: {tp + tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)",
|
|
173
|
+
file=sys.stderr,
|
|
174
|
+
)
|
|
169
175
|
for r in results:
|
|
170
176
|
status = "PASS" if r["pass"] else "FAIL"
|
|
171
177
|
rate_str = f"{r['triggers']}/{r['runs']}"
|
|
172
|
-
print(
|
|
178
|
+
print(
|
|
179
|
+
f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}",
|
|
180
|
+
file=sys.stderr,
|
|
181
|
+
)
|
|
173
182
|
|
|
174
183
|
print_eval_stats("Train", train_results["results"], eval_elapsed)
|
|
175
184
|
if test_summary:
|
|
@@ -189,14 +198,11 @@ def run_loop(
|
|
|
189
198
|
|
|
190
199
|
# Improve the description based on train results
|
|
191
200
|
if verbose:
|
|
192
|
-
print(
|
|
201
|
+
print("\nImproving description...", file=sys.stderr)
|
|
193
202
|
|
|
194
203
|
t0 = time.time()
|
|
195
204
|
# Strip test scores from history so improvement model can't see them
|
|
196
|
-
blinded_history = [
|
|
197
|
-
{k: v for k, v in h.items() if not k.startswith("test_")}
|
|
198
|
-
for h in history
|
|
199
|
-
]
|
|
205
|
+
blinded_history = [{k: v for k, v in h.items() if not k.startswith("test_")} for h in history]
|
|
200
206
|
new_description = improve_description(
|
|
201
207
|
skill_name=name,
|
|
202
208
|
skill_content=content,
|
|
@@ -252,11 +258,21 @@ def main():
|
|
|
252
258
|
parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations")
|
|
253
259
|
parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
|
|
254
260
|
parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
|
|
255
|
-
parser.add_argument(
|
|
261
|
+
parser.add_argument(
|
|
262
|
+
"--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)"
|
|
263
|
+
)
|
|
256
264
|
parser.add_argument("--model", required=True, help="Model for improvement")
|
|
257
265
|
parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
|
|
258
|
-
parser.add_argument(
|
|
259
|
-
|
|
266
|
+
parser.add_argument(
|
|
267
|
+
"--report",
|
|
268
|
+
default="auto",
|
|
269
|
+
help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)",
|
|
270
|
+
)
|
|
271
|
+
parser.add_argument(
|
|
272
|
+
"--results-dir",
|
|
273
|
+
default=None,
|
|
274
|
+
help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here",
|
|
275
|
+
)
|
|
260
276
|
args = parser.parse_args()
|
|
261
277
|
|
|
262
278
|
eval_set = json.loads(Path(args.eval_set).read_text())
|
|
@@ -272,11 +288,15 @@ def main():
|
|
|
272
288
|
if args.report != "none":
|
|
273
289
|
if args.report == "auto":
|
|
274
290
|
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
|
275
|
-
live_report_path =
|
|
291
|
+
live_report_path = (
|
|
292
|
+
Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
|
|
293
|
+
)
|
|
276
294
|
else:
|
|
277
295
|
live_report_path = Path(args.report)
|
|
278
296
|
# Open the report immediately so the user can watch
|
|
279
|
-
live_report_path.write_text(
|
|
297
|
+
live_report_path.write_text(
|
|
298
|
+
"<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>"
|
|
299
|
+
)
|
|
280
300
|
webbrowser.open(str(live_report_path))
|
|
281
301
|
else:
|
|
282
302
|
live_report_path = None
|
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
|
|
7
6
|
def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
|
|
8
7
|
"""Parse a SKILL.md file, returning (name, description, full_content)."""
|
|
9
8
|
content = (skill_path / "SKILL.md").read_text()
|
|
@@ -28,14 +27,16 @@ def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
|
|
|
28
27
|
while i < len(frontmatter_lines):
|
|
29
28
|
line = frontmatter_lines[i]
|
|
30
29
|
if line.startswith("name:"):
|
|
31
|
-
name = line[len("name:"):].strip().strip('"').strip("'")
|
|
30
|
+
name = line[len("name:") :].strip().strip('"').strip("'")
|
|
32
31
|
elif line.startswith("description:"):
|
|
33
|
-
value = line[len("description:"):].strip()
|
|
32
|
+
value = line[len("description:") :].strip()
|
|
34
33
|
# Handle YAML multiline indicators (>, |, >-, |-)
|
|
35
34
|
if value in (">", "|", ">-", "|-"):
|
|
36
35
|
continuation_lines: list[str] = []
|
|
37
36
|
i += 1
|
|
38
|
-
while i < len(frontmatter_lines) and (
|
|
37
|
+
while i < len(frontmatter_lines) and (
|
|
38
|
+
frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")
|
|
39
|
+
):
|
|
39
40
|
continuation_lines.append(frontmatter_lines[i].strip())
|
|
40
41
|
i += 1
|
|
41
42
|
description = " ".join(continuation_lines)
|
|
@@ -76,6 +76,7 @@ You receive these parameters in your prompt:
|
|
|
76
76
|
## When Activated
|
|
77
77
|
|
|
78
78
|
You activate when:
|
|
79
|
+
|
|
79
80
|
- {{ACTIVATION_CONDITION_1}}
|
|
80
81
|
- {{ACTIVATION_CONDITION_2}}
|
|
81
82
|
- {{ACTIVATION_CONDITION_3}}
|
|
@@ -89,11 +90,13 @@ You activate when:
|
|
|
89
90
|
## Success Criteria
|
|
90
91
|
|
|
91
92
|
Good output includes:
|
|
93
|
+
|
|
92
94
|
- {{QUALITY_MARKER_1}}
|
|
93
95
|
- {{QUALITY_MARKER_2}}
|
|
94
96
|
- {{QUALITY_MARKER_3}}
|
|
95
97
|
|
|
96
98
|
Poor output is:
|
|
99
|
+
|
|
97
100
|
- {{ANTI_PATTERN_1}}
|
|
98
101
|
- {{ANTI_PATTERN_2}}
|
|
99
102
|
- {{ANTI_PATTERN_3}}
|
|
@@ -68,11 +68,13 @@ model: inherit
|
|
|
68
68
|
### {{EXAMPLE_1_TITLE}}
|
|
69
69
|
|
|
70
70
|
**Input:**
|
|
71
|
+
|
|
71
72
|
```
|
|
72
73
|
{{EXAMPLE_1_INPUT}}
|
|
73
74
|
```
|
|
74
75
|
|
|
75
76
|
**Output:**
|
|
77
|
+
|
|
76
78
|
```
|
|
77
79
|
{{EXAMPLE_1_OUTPUT}}
|
|
78
80
|
```
|
|
@@ -80,11 +82,13 @@ model: inherit
|
|
|
80
82
|
### {{EXAMPLE_2_TITLE}}
|
|
81
83
|
|
|
82
84
|
**Input:**
|
|
85
|
+
|
|
83
86
|
```
|
|
84
87
|
{{EXAMPLE_2_INPUT}}
|
|
85
88
|
```
|
|
86
89
|
|
|
87
90
|
**Output:**
|
|
91
|
+
|
|
88
92
|
```
|
|
89
93
|
{{EXAMPLE_2_OUTPUT}}
|
|
90
94
|
```
|