@cremini/skillpack 1.1.8 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. package/README.md +5 -1
  2. package/dist/cli.js +342 -91
  3. package/dist/runtime/registry.js +244 -0
  4. package/package.json +2 -2
  5. package/templates/builtin-skills/skill-creator/LICENSE.txt +202 -0
  6. package/templates/builtin-skills/skill-creator/SKILL.md +171 -0
  7. package/templates/builtin-skills/skill-creator/agents/analyzer.md +274 -0
  8. package/templates/builtin-skills/skill-creator/agents/comparator.md +202 -0
  9. package/templates/builtin-skills/skill-creator/agents/grader.md +223 -0
  10. package/templates/builtin-skills/skill-creator/assets/eval_review.html +146 -0
  11. package/templates/builtin-skills/skill-creator/eval-viewer/generate_review.py +471 -0
  12. package/templates/builtin-skills/skill-creator/eval-viewer/viewer.html +1325 -0
  13. package/templates/builtin-skills/skill-creator/references/schemas.md +430 -0
  14. package/templates/builtin-skills/skill-creator/scripts/__init__.py +0 -0
  15. package/templates/builtin-skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  16. package/templates/builtin-skills/skill-creator/scripts/generate_report.py +326 -0
  17. package/templates/builtin-skills/skill-creator/scripts/improve_description.py +247 -0
  18. package/templates/builtin-skills/skill-creator/scripts/package_skill.py +136 -0
  19. package/templates/builtin-skills/skill-creator/scripts/quick_validate.py +103 -0
  20. package/templates/builtin-skills/skill-creator/scripts/run_eval.py +310 -0
  21. package/templates/builtin-skills/skill-creator/scripts/run_loop.py +328 -0
  22. package/templates/builtin-skills/skill-creator/scripts/utils.py +47 -0
  23. package/web/js/api-key-dialog.js +3 -5
  24. package/web/js/chat-apps-dialog.js +4 -10
  25. package/web/js/chat.js +8 -8
  26. package/web/js/settings.js +3 -8
@@ -0,0 +1,328 @@
1
+ #!/usr/bin/env python3
2
+ """Run the eval + improve loop until all pass or max iterations reached.
3
+
4
+ Combines run_eval.py and improve_description.py in a loop, tracking history
5
+ and returning the best description found. Supports train/test split to prevent
6
+ overfitting.
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import random
12
+ import sys
13
+ import tempfile
14
+ import time
15
+ import webbrowser
16
+ from pathlib import Path
17
+
18
+ from scripts.generate_report import generate_html
19
+ from scripts.improve_description import improve_description
20
+ from scripts.run_eval import find_project_root, run_eval
21
+ from scripts.utils import parse_skill_md
22
+
23
+
24
def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]:
    """Split eval set into train and test sets, stratified by should_trigger.

    Args:
        eval_set: Eval entries; each dict must have a boolean "should_trigger" key.
        holdout: Fraction of each stratum to reserve for the test set.
        seed: Shuffle seed, fixed so repeated runs produce the same split.

    Returns:
        (train_set, test_set) tuple. Each non-empty stratum contributes at
        least one entry to the test set, so with tiny strata the train side
        of that stratum may be empty.
    """
    # Use a local RNG instance instead of random.seed() so this function has
    # no side effect on the process-wide random state. Random(seed) yields the
    # same shuffle sequence as seeding the global generator, so results are
    # identical to the previous behavior.
    rng = random.Random(seed)

    # Stratify: separate positives (should trigger) from negatives.
    trigger = [e for e in eval_set if e["should_trigger"]]
    no_trigger = [e for e in eval_set if not e["should_trigger"]]

    # Shuffle each stratum independently (consuming the same RNG stream).
    rng.shuffle(trigger)
    rng.shuffle(no_trigger)

    # Hold out at least one entry per stratum, even for very small holdout
    # fractions, so the test set is never empty for a non-empty stratum.
    n_trigger_test = max(1, int(len(trigger) * holdout))
    n_no_trigger_test = max(1, int(len(no_trigger) * holdout))

    # Front of each shuffled stratum goes to test, the remainder to train.
    test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
    train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]

    return train_set, test_set
45
+
46
+
47
def run_loop(
    eval_set: list[dict],
    skill_path: Path,
    description_override: str | None,
    num_workers: int,
    timeout: int,
    max_iterations: int,
    runs_per_query: int,
    trigger_threshold: float,
    holdout: float,
    model: str,
    verbose: bool,
    live_report_path: Path | None = None,
    log_dir: Path | None = None,
) -> dict:
    """Run the eval + improvement loop.

    Each iteration evaluates the current description against the eval set via
    run_eval, records the result in `history`, and (unless all train queries
    pass or the iteration budget is spent) asks improve_description to propose
    a new description for the next iteration.

    Args:
        eval_set: Query entries with "query" and "should_trigger" keys.
        skill_path: Directory containing the skill's SKILL.md.
        description_override: Starting description; falls back to the one in
            SKILL.md when None/empty.
        num_workers: Parallel workers passed through to run_eval.
        timeout: Per-query timeout in seconds for run_eval.
        max_iterations: Maximum number of eval/improve rounds.
        runs_per_query: Repeated runs per query for trigger-rate estimation.
        trigger_threshold: Trigger-rate cutoff for a query to count as triggered.
        holdout: Fraction of the eval set held out as a test split (0 disables).
        model: Model name used for the improvement step.
        verbose: When True, print progress to stderr.
        live_report_path: If given, an HTML report is rewritten here after
            every iteration so progress can be watched in a browser.
        log_dir: Optional directory forwarded to improve_description for logs.

    Returns:
        Dict with exit reason, best/final descriptions, scores, split sizes,
        and the full per-iteration history.
    """
    project_root = find_project_root()
    name, original_description, content = parse_skill_md(skill_path)
    current_description = description_override or original_description

    # Split into train/test if holdout > 0
    if holdout > 0:
        train_set, test_set = split_eval_set(eval_set, holdout)
        if verbose:
            print(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})", file=sys.stderr)
    else:
        # No holdout: everything is train; best iteration is picked by train score.
        train_set = eval_set
        test_set = []

    history = []
    exit_reason = "unknown"

    for iteration in range(1, max_iterations + 1):
        if verbose:
            print(f"\n{'='*60}", file=sys.stderr)
            print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr)
            print(f"Description: {current_description}", file=sys.stderr)
            print(f"{'='*60}", file=sys.stderr)

        # Evaluate train + test together in one batch for parallelism
        all_queries = train_set + test_set
        t0 = time.time()
        all_results = run_eval(
            eval_set=all_queries,
            skill_name=name,
            description=current_description,
            num_workers=num_workers,
            timeout=timeout,
            project_root=project_root,
            runs_per_query=runs_per_query,
            trigger_threshold=trigger_threshold,
            model=model,
        )
        eval_elapsed = time.time() - t0

        # Split results back into train/test by matching queries
        # NOTE(review): this assumes query strings are unique across the whole
        # eval set — a test query whose text also appears in the train set
        # would be misclassified as train here.
        train_queries_set = {q["query"] for q in train_set}
        train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set]
        test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set]

        train_passed = sum(1 for r in train_result_list if r["pass"])
        train_total = len(train_result_list)
        train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total}
        train_results = {"results": train_result_list, "summary": train_summary}

        if test_set:
            test_passed = sum(1 for r in test_result_list if r["pass"])
            test_total = len(test_result_list)
            test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total}
            test_results = {"results": test_result_list, "summary": test_summary}
        else:
            test_results = None
            test_summary = None

        history.append({
            "iteration": iteration,
            "description": current_description,
            "train_passed": train_summary["passed"],
            "train_failed": train_summary["failed"],
            "train_total": train_summary["total"],
            "train_results": train_results["results"],
            # Test fields are None when there is no holdout split.
            "test_passed": test_summary["passed"] if test_summary else None,
            "test_failed": test_summary["failed"] if test_summary else None,
            "test_total": test_summary["total"] if test_summary else None,
            "test_results": test_results["results"] if test_results else None,
            # For backward compat with report generator
            "passed": train_summary["passed"],
            "failed": train_summary["failed"],
            "total": train_summary["total"],
            "results": train_results["results"],
        })

        # Write live report if path provided
        if live_report_path:
            # Partial payload mirrors the final return value so the report
            # generator can render in-progress state with auto-refresh enabled.
            partial_output = {
                "original_description": original_description,
                "best_description": current_description,
                "best_score": "in progress",
                "iterations_run": len(history),
                "holdout": holdout,
                "train_size": len(train_set),
                "test_size": len(test_set),
                "history": history,
            }
            live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))

        if verbose:
            # Per-run confusion-matrix stats: counts are accumulated over
            # individual runs (r["runs"]/r["triggers"]), not over queries.
            def print_eval_stats(label, results, elapsed):
                pos = [r for r in results if r["should_trigger"]]
                neg = [r for r in results if not r["should_trigger"]]
                tp = sum(r["triggers"] for r in pos)
                pos_runs = sum(r["runs"] for r in pos)
                fn = pos_runs - tp
                fp = sum(r["triggers"] for r in neg)
                neg_runs = sum(r["runs"] for r in neg)
                tn = neg_runs - fp
                total = tp + tn + fp + fn
                # Degenerate denominators (no positives/negatives) report 1.0
                # rather than dividing by zero.
                precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
                recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
                accuracy = (tp + tn) / total if total > 0 else 0.0
                print(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)", file=sys.stderr)
                for r in results:
                    status = "PASS" if r["pass"] else "FAIL"
                    rate_str = f"{r['triggers']}/{r['runs']}"
                    print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}", file=sys.stderr)

            print_eval_stats("Train", train_results["results"], eval_elapsed)
            if test_summary:
                # Test queries ran in the same batch, so no separate timing.
                print_eval_stats("Test ", test_results["results"], 0)

        # Early exit: train set fully passing (test score is not a stop criterion).
        if train_summary["failed"] == 0:
            exit_reason = f"all_passed (iteration {iteration})"
            if verbose:
                print(f"\nAll train queries passed on iteration {iteration}!", file=sys.stderr)
            break

        # Budget exhausted: break before running a final (wasted) improve step.
        if iteration == max_iterations:
            exit_reason = f"max_iterations ({max_iterations})"
            if verbose:
                print(f"\nMax iterations reached ({max_iterations}).", file=sys.stderr)
            break

        # Improve the description based on train results
        if verbose:
            print(f"\nImproving description...", file=sys.stderr)

        t0 = time.time()
        # Strip test scores from history so improvement model can't see them
        blinded_history = [
            {k: v for k, v in h.items() if not k.startswith("test_")}
            for h in history
        ]
        new_description = improve_description(
            skill_name=name,
            skill_content=content,
            current_description=current_description,
            eval_results=train_results,
            history=blinded_history,
            model=model,
            log_dir=log_dir,
            iteration=iteration,
        )
        improve_elapsed = time.time() - t0

        if verbose:
            print(f"Proposed ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)

        current_description = new_description

    # Find the best iteration by TEST score (or train if no test set)
    # max() keeps the earliest iteration on ties; "or 0" guards None scores.
    if test_set:
        best = max(history, key=lambda h: h["test_passed"] or 0)
        best_score = f"{best['test_passed']}/{best['test_total']}"
    else:
        best = max(history, key=lambda h: h["train_passed"])
        best_score = f"{best['train_passed']}/{best['train_total']}"

    if verbose:
        print(f"\nExit reason: {exit_reason}", file=sys.stderr)
        print(f"Best score: {best_score} (iteration {best['iteration']})", file=sys.stderr)

    return {
        "exit_reason": exit_reason,
        "original_description": original_description,
        "best_description": best["description"],
        "best_score": best_score,
        "best_train_score": f"{best['train_passed']}/{best['train_total']}",
        "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
        "final_description": current_description,
        "iterations_run": len(history),
        "holdout": holdout,
        "train_size": len(train_set),
        "test_size": len(test_set),
        "history": history,
    }
242
+
243
+
244
def main():
    """CLI entry point: parse arguments, run the loop, write JSON/HTML outputs."""
    parser = argparse.ArgumentParser(description="Run eval + improve loop")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override starting description")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
    parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)")
    parser.add_argument("--model", required=True, help="Model for improvement")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)")
    parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here")
    args = parser.parse_args()

    # Eval set is a JSON array of {"query": ..., "should_trigger": ...} entries.
    eval_set = json.loads(Path(args.eval_set).read_text())
    skill_path = Path(args.skill_path)

    # Fail fast before doing any work if the skill directory is invalid.
    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    # Only the skill name is needed here (for report titles); run_loop
    # re-parses SKILL.md itself for the description and content.
    name, _, _ = parse_skill_md(skill_path)

    # Set up live report path
    if args.report != "none":
        if args.report == "auto":
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
        else:
            live_report_path = Path(args.report)
        # Open the report immediately so the user can watch
        live_report_path.write_text("<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>")
        webbrowser.open(str(live_report_path))
    else:
        live_report_path = None

    # Determine output directory (create before run_loop so logs can be written)
    if args.results_dir:
        timestamp = time.strftime("%Y-%m-%d_%H%M%S")
        results_dir = Path(args.results_dir) / timestamp
        results_dir.mkdir(parents=True, exist_ok=True)
    else:
        results_dir = None

    # NOTE(review): the "logs" subdirectory itself is not created here —
    # presumably improve_description creates it on demand; verify.
    log_dir = results_dir / "logs" if results_dir else None

    output = run_loop(
        eval_set=eval_set,
        skill_path=skill_path,
        description_override=args.description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        max_iterations=args.max_iterations,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        holdout=args.holdout,
        model=args.model,
        verbose=args.verbose,
        live_report_path=live_report_path,
        log_dir=log_dir,
    )

    # Save JSON output
    # Full results always go to stdout; the same payload is persisted when a
    # results directory was requested.
    json_output = json.dumps(output, indent=2)
    print(json_output)
    if results_dir:
        (results_dir / "results.json").write_text(json_output)

    # Write final HTML report (without auto-refresh)
    if live_report_path:
        live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name))
        print(f"\nReport: {live_report_path}", file=sys.stderr)

    # Also archive a copy of the final report alongside results.json.
    if results_dir and live_report_path:
        (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name))

    if results_dir:
        print(f"Results saved to: {results_dir}", file=sys.stderr)
325
+
326
+
327
# Standard script guard: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
@@ -0,0 +1,47 @@
1
+ """Shared utilities for skill-creator scripts."""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+
7
def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
    """Parse a SKILL.md file, returning (name, description, full_content).

    Reads ``SKILL.md`` under *skill_path* and extracts the ``name`` and
    ``description`` fields from its YAML frontmatter. Block-scalar
    descriptions (``>``, ``|``, ``>-``, ``|-``) are flattened by joining the
    indented continuation lines with single spaces.

    Raises:
        ValueError: if the opening or closing ``---`` fence is missing.
    """
    raw = (skill_path / "SKILL.md").read_text()
    all_lines = raw.split("\n")

    # The very first line must open the frontmatter fence.
    if all_lines[0].strip() != "---":
        raise ValueError("SKILL.md missing frontmatter (no opening ---)")

    # Locate the closing fence, scanning from line 1 onward.
    close_idx = next(
        (idx for idx, ln in enumerate(all_lines[1:], start=1) if ln.strip() == "---"),
        None,
    )
    if close_idx is None:
        raise ValueError("SKILL.md missing frontmatter (no closing ---)")

    skill_name = ""
    skill_description = ""
    frontmatter = all_lines[1:close_idx]

    pos = 0
    while pos < len(frontmatter):
        current = frontmatter[pos]
        if current.startswith("name:"):
            skill_name = current[len("name:"):].strip().strip('"').strip("'")
        elif current.startswith("description:"):
            rest = current[len("description:"):].strip()
            if rest in (">", "|", ">-", "|-"):
                # YAML block scalar: gather the indented lines that follow
                # and collapse them into a single space-joined string.
                pieces: list[str] = []
                pos += 1
                while pos < len(frontmatter) and frontmatter[pos][:1] in (" ", "\t"):
                    pieces.append(frontmatter[pos].strip())
                    pos += 1
                skill_description = " ".join(pieces)
                continue  # pos already points past the block scalar
            skill_description = rest.strip('"').strip("'")
        pos += 1

    return skill_name, skill_description, raw
@@ -136,19 +136,17 @@ async function handleSave() {
136
136
  apiKeyInput.value = "";
137
137
  }
138
138
 
139
- state.config.runtimeControl = res.runtimeControl;
139
+
140
140
  state.restartRequired = !!res.requiresRestart;
141
141
 
142
142
  updateApiKeyButton();
143
143
 
144
144
  if (res.requiresRestart) {
145
145
  setStatus(
146
- res.runtimeControl?.canManagedRestart
147
- ? "API key saved. Restart service to apply changes."
148
- : "API key saved. Restart the service manually to apply changes.",
146
+ "API key saved. Restart service to apply changes.",
149
147
  "warning",
150
148
  );
151
- updateRestartButton(!!res.runtimeControl?.canManagedRestart);
149
+ updateRestartButton(true);
152
150
  } else {
153
151
  setStatus("API key saved successfully", "success");
154
152
  // 延迟关闭让用户看到成功消息
@@ -101,14 +101,11 @@ function populateForm() {
101
101
 
102
102
  // Restart required status
103
103
  if (state.restartRequired) {
104
- const canRestart = config.runtimeControl?.canManagedRestart;
105
104
  setStatus(
106
- canRestart
107
- ? "Settings changed. Restart service to apply."
108
- : "Settings changed. Restart the service manually to apply.",
105
+ "Settings changed. Restart service to apply.",
109
106
  "warning",
110
107
  );
111
- updateRestartButton(canRestart);
108
+ updateRestartButton(true);
112
109
  } else {
113
110
  setStatus("", "");
114
111
  updateRestartButton(false);
@@ -139,17 +136,14 @@ async function handleSave() {
139
136
  const res = await saveConfigData(updates);
140
137
 
141
138
  state.config.adapters = res.adapters;
142
- state.config.runtimeControl = res.runtimeControl;
143
139
  state.restartRequired = !!res.requiresRestart;
144
140
 
145
141
  if (res.requiresRestart) {
146
142
  setStatus(
147
- res.runtimeControl?.canManagedRestart
148
- ? "Settings saved. Restart service to apply changes."
149
- : "Settings saved. Restart the service manually to apply changes.",
143
+ "Settings saved. Restart service to apply changes.",
150
144
  "warning",
151
145
  );
152
- updateRestartButton(!!res.runtimeControl?.canManagedRestart);
146
+ updateRestartButton(true);
153
147
  } else {
154
148
  close();
155
149
  }
package/web/js/chat.js CHANGED
@@ -49,14 +49,14 @@ export function showWelcome(config) {
49
49
  promptsHtml = `
50
50
  <div class="prompt-cards">
51
51
  ${config.prompts
52
- .map(
53
- (u, i) => `
52
+ .map(
53
+ (u, i) => `
54
54
  <div class="prompt-card" data-index="${i}" title="${u}">
55
55
  ${u.length > 60 ? u.substring(0, 60) + "..." : u}
56
56
  </div>
57
57
  `,
58
- )
59
- .join("")}
58
+ )
59
+ .join("")}
60
60
  </div>
61
61
  `;
62
62
  }
@@ -64,8 +64,8 @@ export function showWelcome(config) {
64
64
  if (welcomeContent) {
65
65
  welcomeContent.innerHTML = `
66
66
  <div class="welcome-message">
67
- <h2>Turn Skills into a Standalone App with UI</h2>
68
- <p>One command to orchestrate skills into a standalone app users can download and use on their computer</p>
67
+ <h2>Pack and deploy local AI agents for your team in minutes</h2>
68
+ <p>Deploy verified AI Skillpacks locally and use them directly from Slack and Telegram</p>
69
69
  ${promptsHtml}
70
70
  </div>
71
71
  `;
@@ -403,8 +403,8 @@ function handleAgentEvent(event) {
403
403
 
404
404
  const mdText =
405
405
  event.result &&
406
- typeof event.result === "string" &&
407
- (event.result.includes("\n") || event.result.length > 50)
406
+ typeof event.result === "string" &&
407
+ (event.result.includes("\n") || event.result.length > 50)
408
408
  ? "\`\`\`bash\n" + safeResult + "\n\`\`\`"
409
409
  : "\`\`\`json\n" + safeResult + "\n\`\`\`";
410
410
 
@@ -72,9 +72,7 @@ function populateForm() {
72
72
 
73
73
  if (state.restartRequired) {
74
74
  setStatus(
75
- config.runtimeControl?.canManagedRestart
76
- ? "Settings saved. Restart service to apply changes."
77
- : "Settings saved. Restart the service manually to apply changes.",
75
+ "Settings saved. Restart service to apply changes.",
78
76
  "warning",
79
77
  );
80
78
  updateRestartButton(true);
@@ -139,7 +137,6 @@ async function handleSave() {
139
137
  // Update local config
140
138
  state.config.provider = res.provider;
141
139
  state.config.adapters = res.adapters;
142
- state.config.runtimeControl = res.runtimeControl;
143
140
  if (updates.key) {
144
141
  state.config.hasApiKey = true;
145
142
  state.config.apiKey = updates.key;
@@ -156,12 +153,10 @@ async function handleSave() {
156
153
 
157
154
  if (res.requiresRestart) {
158
155
  setStatus(
159
- res.runtimeControl.canManagedRestart
160
- ? "Settings saved. Restart service to apply changes."
161
- : "Settings saved. Restart the service manually to apply changes.",
156
+ "Settings saved. Restart service to apply changes.",
162
157
  "warning",
163
158
  );
164
- updateRestartButton(res.runtimeControl.canManagedRestart);
159
+ updateRestartButton(true);
165
160
  return;
166
161
  }
167
162