@rudderhq/agent-runtime-gemini-local 0.2.1 → 0.2.2-canary.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/package.json +2 -2
  2. package/skills/conversation-to-skill/LICENSE.txt +202 -0
  3. package/skills/conversation-to-skill/SKILL.md +428 -0
  4. package/skills/conversation-to-skill/agents/analyzer.md +274 -0
  5. package/skills/conversation-to-skill/agents/comparator.md +202 -0
  6. package/skills/conversation-to-skill/agents/grader.md +223 -0
  7. package/skills/conversation-to-skill/assets/eval_review.html +146 -0
  8. package/skills/conversation-to-skill/eval-viewer/generate_review.py +471 -0
  9. package/skills/conversation-to-skill/eval-viewer/viewer.html +1325 -0
  10. package/skills/conversation-to-skill/references/compatibility.md +36 -0
  11. package/skills/conversation-to-skill/references/description-optimization.md +113 -0
  12. package/skills/conversation-to-skill/references/evaluation-suite.md +410 -0
  13. package/skills/conversation-to-skill/references/schemas.md +431 -0
  14. package/skills/conversation-to-skill/scripts/__init__.py +0 -0
  15. package/skills/conversation-to-skill/scripts/aggregate_benchmark.py +401 -0
  16. package/skills/conversation-to-skill/scripts/generate_report.py +335 -0
  17. package/skills/conversation-to-skill/scripts/improve_description.py +197 -0
  18. package/skills/conversation-to-skill/scripts/model_backends.py +115 -0
  19. package/skills/conversation-to-skill/scripts/package_skill.py +136 -0
  20. package/skills/conversation-to-skill/scripts/quick_validate.py +103 -0
  21. package/skills/conversation-to-skill/scripts/run_eval.py +363 -0
  22. package/skills/conversation-to-skill/scripts/run_loop.py +319 -0
  23. package/skills/conversation-to-skill/scripts/utils.py +223 -0
  24. package/skills/rudder/references/organization-skills.md +1 -1
  25. package/skills/skill-creator/SKILL.md +9 -0
  26. package/skills/skill-optimizer/CHANGELOG.md +29 -0
  27. package/skills/skill-optimizer/SKILL.md +205 -0
  28. package/skills/skill-optimizer/references/adapters/creative-brand-content.md +30 -0
  29. package/skills/skill-optimizer/references/adapters/customer-support-sales.md +30 -0
  30. package/skills/skill-optimizer/references/adapters/document-data-processing.md +31 -0
  31. package/skills/skill-optimizer/references/adapters/education-training.md +31 -0
  32. package/skills/skill-optimizer/references/adapters/finance-accounting.md +31 -0
  33. package/skills/skill-optimizer/references/adapters/healthcare-operations.md +30 -0
  34. package/skills/skill-optimizer/references/adapters/hr-people-ops.md +31 -0
  35. package/skills/skill-optimizer/references/adapters/legal-compliance.md +31 -0
  36. package/skills/skill-optimizer/references/adapters/operations-supply-chain.md +31 -0
  37. package/skills/skill-optimizer/references/adapters/personal-productivity.md +29 -0
  38. package/skills/skill-optimizer/references/adapters/research-knowledge.md +31 -0
  39. package/skills/skill-optimizer/references/adapters/software-ai.md +31 -0
  40. package/skills/skill-optimizer/references/domain-adapter-patterns.md +66 -0
  41. package/skills/skill-optimizer/references/eval-method.md +17 -0
  42. package/skills/skill-optimizer/references/universal-optimization-lens.md +73 -0
@@ -0,0 +1,319 @@
1
+ #!/usr/bin/env python3
2
+ """Run the eval + improve loop until all pass or max iterations reached."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import random
9
+ import sys
10
+ import tempfile
11
+ import time
12
+ import webbrowser
13
+ from pathlib import Path
14
+
15
+ if __package__ in (None, ""):
16
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
17
+
18
+ from scripts.generate_report import generate_html
19
+ from scripts.improve_description import improve_description
20
+ from scripts.model_backends import detect_backend
21
+ from scripts.run_eval import find_project_root, run_eval
22
+ from scripts.utils import parse_skill_md
23
+
24
+
25
def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]:
    """Split an eval set into train and test subsets, stratified by ``should_trigger``.

    Positive (``should_trigger`` truthy) and negative items are shuffled
    separately, and the first ``max(1, int(count * holdout))`` items of each
    group go to the test set; the rest form the train set.

    Args:
        eval_set: Items that each carry a ``"should_trigger"`` key.
        holdout: Fraction of each group to hold out for testing.
        seed: Seed for the shuffle, so the split is reproducible.

    Returns:
        ``(train_set, test_set)``.
    """
    # A private generator yields the same shuffles as seeding the module-level
    # one, but does not reset the random state of the rest of the process.
    rng = random.Random(seed)

    trigger = [item for item in eval_set if item["should_trigger"]]
    no_trigger = [item for item in eval_set if not item["should_trigger"]]
    rng.shuffle(trigger)
    rng.shuffle(no_trigger)

    # At least one item per group is held out; slicing keeps this safe even
    # when a group is empty.
    n_trigger_test = max(1, int(len(trigger) * holdout))
    n_no_trigger_test = max(1, int(len(no_trigger) * holdout))

    test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
    train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]
    return train_set, test_set
36
+
37
+
38
def _summarize_results(results: list[dict]) -> dict:
    """Return passed/failed/total counts for a list of per-query result dicts."""
    passed = sum(1 for item in results if item["pass"])
    total = len(results)
    return {"passed": passed, "failed": total - passed, "total": total}


def _print_eval_stats(label: str, results: list[dict], elapsed: float) -> None:
    """Print run-level precision/recall/accuracy and one line per query to stderr."""
    pos = [item for item in results if item["should_trigger"]]
    neg = [item for item in results if not item["should_trigger"]]
    # Counts are over individual runs, not queries: each item reports how many
    # of its runs triggered.
    tp = sum(item["triggers"] for item in pos)
    pos_runs = sum(item["runs"] for item in pos)
    fn = pos_runs - tp
    fp = sum(item["triggers"] for item in neg)
    neg_runs = sum(item["runs"] for item in neg)
    tn = neg_runs - fp
    total = tp + tn + fp + fn
    precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
    accuracy = (tp + tn) / total if total > 0 else 0.0
    print(
        f"{label}: {tp + tn}/{total} correct, precision={precision:.0%} "
        f"recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)",
        file=sys.stderr,
    )
    for item in results:
        status = "PASS" if item["pass"] else "FAIL"
        rate_str = f"{item['triggers']}/{item['runs']}"
        print(f" [{status}] rate={rate_str} expected={item['should_trigger']}: {item['query'][:60]}", file=sys.stderr)


def run_loop(
    *,
    eval_set: list[dict],
    skill_path: Path,
    description_override: str | None,
    num_workers: int,
    timeout: int,
    max_iterations: int,
    runs_per_query: int,
    trigger_threshold: float,
    holdout: float,
    model: str | None,
    backend: str,
    verbose: bool,
    live_report_path: Path | None = None,
    log_dir: Path | None = None,
) -> dict:
    """Alternate evaluation and description improvement until the train set passes.

    Each iteration evaluates the current description on train and test
    queries together, records the outcome in ``history``, and, unless every
    train query passed or ``max_iterations`` was reached, asks
    ``improve_description`` for a new description. Test results are stripped
    from the history handed to the improver.

    Args:
        eval_set: Items with ``"query"`` and ``"should_trigger"`` keys.
        skill_path: Skill directory passed to ``parse_skill_md``.
        description_override: Starting description; falls back to the one in
            the skill file when empty or ``None``.
        num_workers, timeout, runs_per_query, trigger_threshold, model:
            Forwarded unchanged to ``run_eval``.
        max_iterations: Maximum number of evaluation rounds; must be >= 1.
        holdout: Fraction held out as a test set; ``0`` disables the split.
        backend: Backend name, resolved through ``detect_backend``.
        verbose: Print progress to stderr.
        live_report_path: If given, rewritten with an auto-refreshing HTML
            report after every evaluation.
        log_dir: Forwarded to ``improve_description``.

    Returns:
        A dict holding the best and final descriptions, their scores, the
        exit reason, split sizes, and the full per-iteration history.

    Raises:
        ValueError: If ``max_iterations`` is smaller than 1.
    """
    # Without at least one iteration there is no history to pick a best entry
    # from; fail before any work is done instead of inside ``max()`` later.
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    project_root = find_project_root()
    name, original_description, content = parse_skill_md(skill_path)
    current_description = description_override or original_description
    backend = detect_backend(backend)

    if holdout > 0:
        train_set, test_set = split_eval_set(eval_set, holdout)
        if verbose:
            print(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})", file=sys.stderr)
    else:
        train_set = eval_set
        test_set = []

    # Neither split changes between iterations, so build these once.
    all_queries = train_set + test_set
    train_queries = {item["query"] for item in train_set}

    history = []
    exit_reason = "unknown"

    for iteration in range(1, max_iterations + 1):
        if verbose:
            print(f"\n{'=' * 60}", file=sys.stderr)
            print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr)
            print(f"Backend: {backend}", file=sys.stderr)
            print(f"Description: {current_description}", file=sys.stderr)
            print(f"{'=' * 60}", file=sys.stderr)

        # Train and test queries are evaluated in a single batch and split
        # back apart by query text afterwards.
        t0 = time.time()
        all_results = run_eval(
            eval_set=all_queries,
            skill_name=name,
            description=current_description,
            num_workers=num_workers,
            timeout=timeout,
            project_root=project_root,
            runs_per_query=runs_per_query,
            trigger_threshold=trigger_threshold,
            model=model,
            backend=backend,
        )
        eval_elapsed = time.time() - t0

        train_result_list = [item for item in all_results["results"] if item["query"] in train_queries]
        test_result_list = [item for item in all_results["results"] if item["query"] not in train_queries]

        train_summary = _summarize_results(train_result_list)
        train_results = {"results": train_result_list, "summary": train_summary}

        if test_set:
            test_summary = _summarize_results(test_result_list)
            test_results = {"results": test_result_list, "summary": test_summary}
        else:
            test_summary = None
            test_results = None

        history.append({
            "iteration": iteration,
            "description": current_description,
            "train_passed": train_summary["passed"],
            "train_failed": train_summary["failed"],
            "train_total": train_summary["total"],
            "train_results": train_results["results"],
            "test_passed": test_summary["passed"] if test_summary else None,
            "test_failed": test_summary["failed"] if test_summary else None,
            "test_total": test_summary["total"] if test_summary else None,
            "test_results": test_results["results"] if test_results else None,
            # Unprefixed keys repeat the train numbers.
            "passed": train_summary["passed"],
            "failed": train_summary["failed"],
            "total": train_summary["total"],
            "results": train_results["results"],
        })

        if live_report_path:
            partial_output = {
                "backend": backend,
                "original_description": original_description,
                "best_description": current_description,
                "best_score": "in progress",
                "iterations_run": len(history),
                "holdout": holdout,
                "train_size": len(train_set),
                "test_size": len(test_set),
                "history": history,
            }
            live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))

        if verbose:
            _print_eval_stats("Train", train_results["results"], eval_elapsed)
            if test_summary:
                _print_eval_stats("Test ", test_results["results"], 0)

        if train_summary["failed"] == 0:
            exit_reason = f"all_passed (iteration {iteration})"
            if verbose:
                print(f"\nAll train queries passed on iteration {iteration}.", file=sys.stderr)
            break

        if iteration == max_iterations:
            exit_reason = f"max_iterations ({max_iterations})"
            if verbose:
                print(f"\nMax iterations reached ({max_iterations}).", file=sys.stderr)
            break

        if verbose:
            print("\nImproving description...", file=sys.stderr)

        t0 = time.time()
        # Hide every test_* field so the improver cannot tune against the
        # held-out set.
        blinded_history = [
            {k: v for k, v in item.items() if not k.startswith("test_")}
            for item in history
        ]
        new_description = improve_description(
            backend=backend,
            skill_name=name,
            skill_content=content,
            current_description=current_description,
            eval_results=train_results,
            history=blinded_history,
            model=model,
            log_dir=log_dir,
            iteration=iteration,
        )
        improve_elapsed = time.time() - t0

        if verbose:
            print(f"Proposed ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)

        current_description = new_description

    # The best iteration is judged on the held-out set when there is one;
    # ``max`` keeps the earliest iteration on ties.
    if test_set:
        best = max(history, key=lambda item: item["test_passed"] or 0)
        best_score = f"{best['test_passed']}/{best['test_total']}"
    else:
        best = max(history, key=lambda item: item["train_passed"])
        best_score = f"{best['train_passed']}/{best['train_total']}"

    if verbose:
        print(f"\nExit reason: {exit_reason}", file=sys.stderr)
        print(f"Best score: {best_score} (iteration {best['iteration']})", file=sys.stderr)

    return {
        "backend": backend,
        "exit_reason": exit_reason,
        "original_description": original_description,
        "best_description": best["description"],
        "best_score": best_score,
        "best_train_score": f"{best['train_passed']}/{best['train_total']}",
        "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
        "final_description": current_description,
        "iterations_run": len(history),
        "holdout": holdout,
        "train_size": len(train_set),
        "test_size": len(test_set),
        "history": history,
    }
236
+
237
+
238
def main():
    """Parse CLI arguments, run the eval/improve loop, and write its outputs.

    Prints the result JSON to stdout. Depending on the flags it also keeps a
    live HTML report up to date (opened in the browser at start) and saves
    ``results.json``, ``report.html`` and logs under a timestamped
    subdirectory of ``--results-dir``.
    """
    parser = argparse.ArgumentParser(description="Run eval + improve loop")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override starting description")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=60, help="Timeout per query in seconds")
    parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)")
    parser.add_argument("--model", default=None, help="Optional backend model identifier")
    parser.add_argument("--backend", default="auto", choices=["auto", "claude", "codex"], help="Optimization backend")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto', 'none' to disable)")
    parser.add_argument("--results-dir", default=None, help="Save outputs to a timestamped subdirectory here")
    args = parser.parse_args()

    # JSON text is UTF-8; do not let the platform locale decide the decoding.
    eval_set = json.loads(Path(args.eval_set).read_text(encoding="utf-8"))
    skill_path = Path(args.skill_path)

    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    name, _, _ = parse_skill_md(skill_path)

    if args.report != "none":
        if args.report == "auto":
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
        else:
            live_report_path = Path(args.report)
        # Placeholder page that reloads itself until the first real report is written.
        live_report_path.write_text("<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>")
        # Opening a bare filename is documented as unsupported and non-portable;
        # hand the browser a proper file:// URL instead.
        webbrowser.open(live_report_path.resolve().as_uri())
    else:
        live_report_path = None

    if args.results_dir:
        timestamp = time.strftime("%Y-%m-%d_%H%M%S")
        results_dir = Path(args.results_dir) / timestamp
        results_dir.mkdir(parents=True, exist_ok=True)
    else:
        results_dir = None

    log_dir = results_dir / "logs" if results_dir else None

    output = run_loop(
        eval_set=eval_set,
        skill_path=skill_path,
        description_override=args.description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        max_iterations=args.max_iterations,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        holdout=args.holdout,
        model=args.model,
        backend=args.backend,
        verbose=args.verbose,
        live_report_path=live_report_path,
        log_dir=log_dir,
    )

    json_output = json.dumps(output, indent=2)
    print(json_output)
    if results_dir:
        (results_dir / "results.json").write_text(json_output)

    if live_report_path:
        # Render the final report once and reuse it for both destinations.
        final_html = generate_html(output, auto_refresh=False, skill_name=name)
        live_report_path.write_text(final_html)
        print(f"\nReport: {live_report_path}", file=sys.stderr)
        if results_dir:
            (results_dir / "report.html").write_text(final_html)

    if results_dir:
        print(f"Results saved to: {results_dir}", file=sys.stderr)


if __name__ == "__main__":
    main()
@@ -0,0 +1,223 @@
1
+ """Shared utilities for skill-creator scripts."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from pathlib import Path
7
+
8
+
9
+ FRONTMATTER_PATTERN = re.compile(r"^---\s*\n(.*?)\n---\s*(?:\n|$)", re.DOTALL)
10
+
11
+
12
def extract_frontmatter_text(content: str) -> str:
    """Return the text between the opening and closing ``---`` of a SKILL.md.

    Raises:
        ValueError: If ``content`` does not begin with a frontmatter block.
    """
    found = FRONTMATTER_PATTERN.match(content)
    if found is not None:
        return found.group(1)
    raise ValueError("SKILL.md missing frontmatter (expected opening and closing ---)")
18
+
19
+
20
+ def _count_indent(line: str) -> int:
21
+ return len(line) - len(line.lstrip(" "))
22
+
23
+
24
+ def _parse_scalar(value: str):
25
+ value = value.strip()
26
+ if not value:
27
+ return ""
28
+
29
+ if value[0] == value[-1] and value[0] in {'"', "'"}:
30
+ return value[1:-1]
31
+
32
+ lowered = value.lower()
33
+ if lowered in {"true", "false"}:
34
+ return lowered == "true"
35
+ if lowered in {"null", "~"}:
36
+ return None
37
+
38
+ if re.fullmatch(r"-?\d+", value):
39
+ return int(value)
40
+ if re.fullmatch(r"-?\d+\.\d+", value):
41
+ return float(value)
42
+
43
+ if value.startswith("[") and value.endswith("]"):
44
+ inner = value[1:-1].strip()
45
+ if not inner:
46
+ return []
47
+ return [_parse_scalar(part.strip()) for part in inner.split(",")]
48
+
49
+ return value
50
+
51
+
52
+ def _fold_lines(lines: list[str]) -> str:
53
+ paragraphs: list[list[str]] = [[]]
54
+ for line in lines:
55
+ if line == "":
56
+ if paragraphs[-1]:
57
+ paragraphs.append([])
58
+ continue
59
+ paragraphs[-1].append(line)
60
+
61
+ folded = [" ".join(paragraph).strip() for paragraph in paragraphs if paragraph]
62
+ return "\n\n".join(part for part in folded if part)
63
+
64
+
65
+ def _parse_block_scalar(lines: list[str], start: int, indent: int, style: str):
66
+ collected: list[str] = []
67
+ index = start
68
+ while index < len(lines):
69
+ raw = lines[index]
70
+ if not raw.strip():
71
+ collected.append("")
72
+ index += 1
73
+ continue
74
+
75
+ current_indent = _count_indent(raw)
76
+ if current_indent < indent:
77
+ break
78
+ collected.append(raw[indent:])
79
+ index += 1
80
+
81
+ if style.startswith("|"):
82
+ value = "\n".join(collected)
83
+ else:
84
+ value = _fold_lines(collected)
85
+ return value, index
86
+
87
+
88
+ def _looks_like_mapping_entry(text: str) -> bool:
89
+ if text.startswith("- "):
90
+ return False
91
+ return bool(re.match(r"^[A-Za-z0-9_-]+:\s*.*$", text))
92
+
93
+
94
+ def _next_nonempty_index(lines: list[str], start: int, min_indent: int):
95
+ index = start
96
+ while index < len(lines):
97
+ raw = lines[index]
98
+ if not raw.strip():
99
+ index += 1
100
+ continue
101
+ indent = _count_indent(raw)
102
+ if indent < min_indent:
103
+ return None
104
+ return index
105
+ return None
106
+
107
+
108
+ def _parse_list(lines: list[str], start: int, indent: int):
109
+ items = []
110
+ index = start
111
+ while index < len(lines):
112
+ raw = lines[index]
113
+ if not raw.strip():
114
+ index += 1
115
+ continue
116
+
117
+ current_indent = _count_indent(raw)
118
+ if current_indent < indent:
119
+ break
120
+ if current_indent != indent or not raw[indent:].startswith("- "):
121
+ raise ValueError(f"Invalid list entry near line: {raw}")
122
+
123
+ remainder = raw[indent + 2 :].strip()
124
+ index += 1
125
+
126
+ if remainder:
127
+ items.append(_parse_scalar(remainder))
128
+ continue
129
+
130
+ next_index = _next_nonempty_index(lines, index, indent + 2)
131
+ if next_index is None:
132
+ items.append("")
133
+ continue
134
+
135
+ nested_indent = _count_indent(lines[next_index])
136
+ nested_text = lines[next_index][nested_indent:]
137
+ if nested_text.startswith("- "):
138
+ value, index = _parse_list(lines, next_index, nested_indent)
139
+ elif _looks_like_mapping_entry(nested_text):
140
+ value, index = _parse_mapping(lines, next_index, nested_indent)
141
+ else:
142
+ value, index = _parse_block_scalar(lines, next_index, nested_indent, ">")
143
+ items.append(value)
144
+
145
+ return items, index
146
+
147
+
148
+ def _parse_mapping(lines: list[str], start: int, indent: int):
149
+ mapping = {}
150
+ index = start
151
+
152
+ while index < len(lines):
153
+ raw = lines[index]
154
+ if not raw.strip():
155
+ index += 1
156
+ continue
157
+
158
+ current_indent = _count_indent(raw)
159
+ if current_indent < indent:
160
+ break
161
+ if current_indent != indent:
162
+ raise ValueError(f"Unexpected indentation near line: {raw}")
163
+
164
+ text = raw[indent:]
165
+ if text.startswith("- "):
166
+ raise ValueError(f"Unexpected list item near line: {raw}")
167
+ if ":" not in text:
168
+ raise ValueError(f"Invalid mapping entry near line: {raw}")
169
+
170
+ key, remainder = text.split(":", 1)
171
+ key = key.strip()
172
+ remainder = remainder.strip()
173
+ index += 1
174
+
175
+ if remainder in {"|", ">", "|-", ">-"}:
176
+ value, index = _parse_block_scalar(lines, index, indent + 2, remainder)
177
+ elif remainder:
178
+ value = _parse_scalar(remainder)
179
+ else:
180
+ next_index = _next_nonempty_index(lines, index, indent + 2)
181
+ if next_index is None:
182
+ value = ""
183
+ else:
184
+ nested_indent = _count_indent(lines[next_index])
185
+ nested_text = lines[next_index][nested_indent:]
186
+ if nested_text.startswith("- "):
187
+ value, index = _parse_list(lines, next_index, nested_indent)
188
+ elif _looks_like_mapping_entry(nested_text):
189
+ value, index = _parse_mapping(lines, next_index, nested_indent)
190
+ else:
191
+ value, index = _parse_block_scalar(lines, next_index, nested_indent, ">")
192
+ mapping[key] = value
193
+
194
+ return mapping, index
195
+
196
+
197
def parse_frontmatter(frontmatter_text: str) -> dict:
    """Parse the small YAML subset used in SKILL.md frontmatter, without PyYAML.

    Raises:
        ValueError: If the text is malformed, or if non-blank lines remain
            after the top-level mapping has been parsed.
    """
    rows = frontmatter_text.splitlines()
    parsed, stop = _parse_mapping(rows, 0, 0)

    # The first non-blank leftover line is reported as the offender.
    for leftover in rows[stop:]:
        if leftover.strip():
            raise ValueError(f"Unexpected trailing content in frontmatter: {leftover}")
    return parsed
206
+
207
+
208
def load_skill_frontmatter(skill_path: Path) -> tuple[dict, str]:
    """Read ``SKILL.md`` from a skill directory and parse its frontmatter.

    Args:
        skill_path: Directory that contains ``SKILL.md``.

    Returns:
        ``(frontmatter, content)``: the parsed mapping and the full file text.

    Raises:
        ValueError: If the frontmatter is missing, malformed, or not a mapping.
    """
    # Decode explicitly instead of relying on the platform locale. The "-sig"
    # variant also drops a leading byte-order mark, which would otherwise
    # stop the frontmatter pattern from matching at the start of the file.
    content = (skill_path / "SKILL.md").read_text(encoding="utf-8-sig")
    frontmatter_text = extract_frontmatter_text(content)
    frontmatter = parse_frontmatter(frontmatter_text)
    if not isinstance(frontmatter, dict):
        raise ValueError("Frontmatter must be a mapping")
    return frontmatter, content
216
+
217
+
218
def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
    """Return ``(name, description, full_content)`` for the skill at ``skill_path``.

    ``name`` and ``description`` come from the frontmatter; a missing or
    falsy value becomes an empty string, anything else is converted with
    ``str``.
    """
    metadata, full_text = load_skill_frontmatter(skill_path)
    raw_name = metadata.get("name", "")
    raw_description = metadata.get("description", "")
    name = str(raw_name) if raw_name else ""
    description = str(raw_description) if raw_description else ""
    return name, description, full_text
@@ -152,7 +152,7 @@ the full optional enabled-skill set intentionally.
152
152
 
153
153
  ## Notes
154
154
 
155
- - Built-in Rudder skills live in the organization library but are not auto-enabled.
155
+ - Built-in Rudder skills live in the organization library and are always loaded for agent runs.
156
156
  - New organizations also seed optional community preset skills into the organization library. They stay organization-managed and default-off for agents.
157
157
  - If a skill reference is missing or ambiguous, Rudder returns `422`.
158
158
  - Prefer linking back to the relevant issue, approval, and agent when commenting about skill changes.
@@ -0,0 +1,9 @@
1
+ ---
2
+ name: skill-creator
3
+ description: |
4
+ Create new skills, improve existing skills, and evaluate whether a skill definition is actually doing useful work.
5
+ ---
6
+
7
+ # Skill Creator
8
+
9
+ Use this skill when the task is to create a skill, refine a skill, or judge whether a skill definition should be changed.
@@ -0,0 +1,29 @@
1
+ # Changelog
2
+
3
+ ## v4.3 framing evidence hardening
4
+
5
+ - Added sequencing guidance for requests that ask to optimize a skill after another live task: complete and verify the primary task first, then optimize from evidence.
6
+ - Added explicit treatment of strong user corrections as high-signal evidence, especially framing corrections and wrong-abstraction-level failures.
7
+ - Added framing checks for user outcome vs UI surface, scenario spine vs fixture rows, source of truth vs derivative signal, and product intent vs local convenience.
8
+ - Relaxed the final response contract for larger workflows so the primary task result can be reported before concise skill changes.
9
+
10
+ ## v4.2 open-source package
11
+
12
+ - Added package mode and open-source project structure guidance.
13
+ - Added explicit adapter file lookup under `references/adapters/`.
14
+ - Added packaging expectations for README, examples, evals, changelog, and distributable skill zip.
15
+ - Preserved the generic analysis framework: core optimizer plus modular domain adapters.
16
+
17
+ ## v4.1 adapter hardening
18
+
19
+ - Added explicit domain adapter use rule: source of truth, required inputs, review owner, authority gates, privacy, output template, validation cases, and must-not behaviors.
20
+ - Added benchmark reporting split for trigger accuracy, patch-quality coverage, and downstream transfer.
21
+ - Added warning that synthetic verifier scores are regression signals, not official leaderboard results.
22
+
23
+ ## v4.0 generic
24
+
25
+ - Reframed Skill Optimizer from a software-focused hardening checklist into a domain-general analysis framework.
26
+ - Added universal optimization lens covering purpose, triggers, inputs, workflow, tools, outputs, quality, safety, failure, and maintainability.
27
+ - Moved domain-specific checks into modular adapter patterns.
28
+ - Added trigger optimization guidance and benchmark mode.
29
+ - Preserved strict patch safety around high-impact actions.