@shirayner/ace 0.1.0-snapshot.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ace.js +39 -0
- package/package.json +42 -0
- package/src/commands/doctor.js +86 -0
- package/src/commands/init.js +98 -0
- package/src/commands/list.js +67 -0
- package/src/core/constants.js +106 -0
- package/src/core/installer.js +206 -0
- package/src/core/merger.js +103 -0
- package/templates/CLAUDE.md +16 -0
- package/templates/commands/report.md +63 -0
- package/templates/hookify/hookify.block-dangerous-ops.local.md +16 -0
- package/templates/hookify/hookify.protect-secrets.local.md +17 -0
- package/templates/hookify/hookify.require-verification.local.md +13 -0
- package/templates/hooks/java-compile-check.sh +106 -0
- package/templates/memory/MEMORY.md +4 -0
- package/templates/memory/roles/backend.md +11 -0
- package/templates/memory/roles/client.md +11 -0
- package/templates/memory/roles/frontend.md +11 -0
- package/templates/memory/roles/fullstack.md +11 -0
- package/templates/rules/clean-code.md +33 -0
- package/templates/rules/code-quality.md +74 -0
- package/templates/rules/context-hygiene.md +29 -0
- package/templates/rules/memory-policy.md +30 -0
- package/templates/rules/reporting.md +9 -0
- package/templates/rules/task-recovery.md +13 -0
- package/templates/rules/thinking.md +19 -0
- package/templates/settings.json +11 -0
- package/templates/skills/auto-goal/SKILL.md +188 -0
- package/templates/skills/coding/SKILL.md +251 -0
- package/templates/skills/coding/references/code-review-guide.md +137 -0
- package/templates/skills/coding/references/code-smells.md +201 -0
- package/templates/skills/coding/references/implement-guide.md +123 -0
- package/templates/skills/coding/references/unit-test-guide.md +211 -0
- package/templates/skills/skill-creator/LICENSE.txt +202 -0
- package/templates/skills/skill-creator/SKILL.md +479 -0
- package/templates/skills/skill-creator/agents/analyzer.md +274 -0
- package/templates/skills/skill-creator/agents/comparator.md +202 -0
- package/templates/skills/skill-creator/agents/grader.md +223 -0
- package/templates/skills/skill-creator/assets/eval_review.html +146 -0
- package/templates/skills/skill-creator/eval-viewer/generate_review.py +471 -0
- package/templates/skills/skill-creator/eval-viewer/viewer.html +1325 -0
- package/templates/skills/skill-creator/references/schemas.md +430 -0
- package/templates/skills/skill-creator/scripts/__init__.py +0 -0
- package/templates/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/templates/skills/skill-creator/scripts/generate_report.py +326 -0
- package/templates/skills/skill-creator/scripts/improve_description.py +248 -0
- package/templates/skills/skill-creator/scripts/package_skill.py +136 -0
- package/templates/skills/skill-creator/scripts/quick_validate.py +103 -0
- package/templates/skills/skill-creator/scripts/run_eval.py +310 -0
- package/templates/skills/skill-creator/scripts/run_loop.py +332 -0
- package/templates/skills/skill-creator/scripts/utils.py +47 -0
- package/templates/skills/skill-optimize/SKILL.md +287 -0
- package/templates/skills/skill-optimize/references/.claude/settings.local.json +7 -0
- package/templates/skills/skill-optimize/references/anthropic-design-philosophy.md +250 -0
- package/templates/skills/skill-optimize/references/auto-goal-optimization-directions.md +130 -0
- package/templates/skills/skill-optimize/references/cross-disciplinary-insights.md +211 -0
- package/templates/skills/skill-optimize/references/quality-checklist.md +170 -0
- package/templates/skills/skill-optimize/references/theory-foundations.md +201 -0
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Run the eval + improve loop until all pass or max iterations reached.
|
|
3
|
+
|
|
4
|
+
Combines run_eval.py and improve_description.py in a loop, tracking history
|
|
5
|
+
and returning the best description found. Supports train/test split to prevent
|
|
6
|
+
overfitting.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import json
|
|
11
|
+
import random
|
|
12
|
+
import sys
|
|
13
|
+
import tempfile
|
|
14
|
+
import time
|
|
15
|
+
import webbrowser
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
import anthropic
|
|
19
|
+
|
|
20
|
+
from scripts.generate_report import generate_html
|
|
21
|
+
from scripts.improve_description import improve_description
|
|
22
|
+
from scripts.run_eval import find_project_root, run_eval
|
|
23
|
+
from scripts.utils import parse_skill_md
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]:
    """Split eval set into train and test sets, stratified by should_trigger.

    Args:
        eval_set: Eval entries; each dict has a boolean "should_trigger" key.
        holdout: Fraction of each stratum to hold out for the test set.
        seed: RNG seed so the split is reproducible across runs.

    Returns:
        (train_set, test_set); together they contain every entry exactly once.
    """
    # Use a private RNG instance instead of seeding the global `random`
    # module, so this helper does not clobber RNG state for other code in
    # the process. Output is unchanged for a given seed (same algorithm).
    rng = random.Random(seed)

    # Stratify by should_trigger so both splits contain positive and
    # negative queries.
    trigger = [e for e in eval_set if e["should_trigger"]]
    no_trigger = [e for e in eval_set if not e["should_trigger"]]

    # Shuffle each group
    rng.shuffle(trigger)
    rng.shuffle(no_trigger)

    # Calculate split points: at least one example of each non-empty
    # stratum lands in the test set, otherwise round down.
    n_trigger_test = max(1, int(len(trigger) * holdout))
    n_no_trigger_test = max(1, int(len(no_trigger) * holdout))

    # Split
    test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
    train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]

    return train_set, test_set
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def run_loop(
    eval_set: list[dict],
    skill_path: Path,
    description_override: str | None,
    num_workers: int,
    timeout: int,
    max_iterations: int,
    runs_per_query: int,
    trigger_threshold: float,
    holdout: float,
    model: str,
    verbose: bool,
    live_report_path: Path | None = None,
    log_dir: Path | None = None,
) -> dict:
    """Run the eval + improvement loop.

    Each iteration evaluates the current skill description against the eval
    set, records the result in ``history``, and (unless the train set fully
    passes or the iteration budget is exhausted) asks the model to propose an
    improved description for the next iteration.

    Args:
        eval_set: Eval entries with "query" and "should_trigger" keys.
        skill_path: Directory containing the SKILL.md under evaluation.
        description_override: Starting description; falls back to the one
            parsed from SKILL.md when None/empty.
        num_workers: Parallel workers passed through to run_eval.
        timeout: Per-query timeout in seconds.
        max_iterations: Maximum improvement iterations.
        runs_per_query: Repeated runs per query (trigger-rate estimation).
        trigger_threshold: Trigger-rate threshold for a query to count as
            triggered.
        holdout: Fraction held out as a test set (0 disables the split).
        model: Model name used for the improvement step.
        verbose: When True, print progress to stderr.
        live_report_path: If given, an HTML report is rewritten here after
            every iteration so it can be watched in a browser.
        log_dir: If given, forwarded to improve_description for logging.

    Returns:
        Summary dict: exit reason, best/final descriptions, best scores,
        split sizes, and the full per-iteration ``history``.
    """
    project_root = find_project_root()
    name, original_description, content = parse_skill_md(skill_path)
    # The override (if any) wins; note `or` also skips an empty-string override.
    current_description = description_override or original_description

    # Split into train/test if holdout > 0
    if holdout > 0:
        train_set, test_set = split_eval_set(eval_set, holdout)
        if verbose:
            print(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})", file=sys.stderr)
    else:
        train_set = eval_set
        test_set = []

    # Anthropic client is created once and reused for every improvement call.
    client = anthropic.Anthropic()
    history = []
    exit_reason = "unknown"

    for iteration in range(1, max_iterations + 1):
        if verbose:
            print(f"\n{'='*60}", file=sys.stderr)
            print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr)
            print(f"Description: {current_description}", file=sys.stderr)
            print(f"{'='*60}", file=sys.stderr)

        # Evaluate train + test together in one batch for parallelism
        all_queries = train_set + test_set
        t0 = time.time()
        all_results = run_eval(
            eval_set=all_queries,
            skill_name=name,
            description=current_description,
            num_workers=num_workers,
            timeout=timeout,
            project_root=project_root,
            runs_per_query=runs_per_query,
            trigger_threshold=trigger_threshold,
            model=model,
        )
        eval_elapsed = time.time() - t0

        # Split results back into train/test by matching queries.
        # NOTE(review): this assumes query strings are unique across the
        # train/test split — a duplicate query would land in train for both
        # copies. Verify eval sets never contain duplicate queries.
        train_queries_set = {q["query"] for q in train_set}
        train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set]
        test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set]

        train_passed = sum(1 for r in train_result_list if r["pass"])
        train_total = len(train_result_list)
        train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total}
        train_results = {"results": train_result_list, "summary": train_summary}

        if test_set:
            test_passed = sum(1 for r in test_result_list if r["pass"])
            test_total = len(test_result_list)
            test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total}
            test_results = {"results": test_result_list, "summary": test_summary}
        else:
            test_results = None
            test_summary = None

        # Record this iteration. test_* keys are None when there is no
        # holdout; the duplicated unprefixed keys mirror the train values.
        history.append({
            "iteration": iteration,
            "description": current_description,
            "train_passed": train_summary["passed"],
            "train_failed": train_summary["failed"],
            "train_total": train_summary["total"],
            "train_results": train_results["results"],
            "test_passed": test_summary["passed"] if test_summary else None,
            "test_failed": test_summary["failed"] if test_summary else None,
            "test_total": test_summary["total"] if test_summary else None,
            "test_results": test_results["results"] if test_results else None,
            # For backward compat with report generator
            "passed": train_summary["passed"],
            "failed": train_summary["failed"],
            "total": train_summary["total"],
            "results": train_results["results"],
        })

        # Write live report if path provided
        if live_report_path:
            partial_output = {
                "original_description": original_description,
                "best_description": current_description,
                "best_score": "in progress",
                "iterations_run": len(history),
                "holdout": holdout,
                "train_size": len(train_set),
                "test_size": len(test_set),
                "history": history,
            }
            live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))

        if verbose:
            # Per-run confusion-matrix stats: counts are over individual
            # runs (r["triggers"] out of r["runs"]), not over queries.
            def print_eval_stats(label, results, elapsed):
                pos = [r for r in results if r["should_trigger"]]
                neg = [r for r in results if not r["should_trigger"]]
                tp = sum(r["triggers"] for r in pos)
                pos_runs = sum(r["runs"] for r in pos)
                fn = pos_runs - tp
                fp = sum(r["triggers"] for r in neg)
                neg_runs = sum(r["runs"] for r in neg)
                tn = neg_runs - fp
                total = tp + tn + fp + fn
                # Degenerate denominators count as perfect (nothing to get wrong).
                precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
                recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
                accuracy = (tp + tn) / total if total > 0 else 0.0
                print(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)", file=sys.stderr)
                for r in results:
                    status = "PASS" if r["pass"] else "FAIL"
                    rate_str = f"{r['triggers']}/{r['runs']}"
                    print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}", file=sys.stderr)

            print_eval_stats("Train", train_results["results"], eval_elapsed)
            if test_summary:
                # Test ran in the same batch, so no separate timing to report.
                print_eval_stats("Test ", test_results["results"], 0)

        # Early exit: only the TRAIN score gates termination; the test set
        # stays blind and is used later to pick the best iteration.
        if train_summary["failed"] == 0:
            exit_reason = f"all_passed (iteration {iteration})"
            if verbose:
                print(f"\nAll train queries passed on iteration {iteration}!", file=sys.stderr)
            break

        if iteration == max_iterations:
            # Break before improving — a new description would never be evaluated.
            exit_reason = f"max_iterations ({max_iterations})"
            if verbose:
                print(f"\nMax iterations reached ({max_iterations}).", file=sys.stderr)
            break

        # Improve the description based on train results
        if verbose:
            print(f"\nImproving description...", file=sys.stderr)

        t0 = time.time()
        # Strip test scores from history so improvement model can't see them
        blinded_history = [
            {k: v for k, v in h.items() if not k.startswith("test_")}
            for h in history
        ]
        new_description = improve_description(
            client=client,
            skill_name=name,
            skill_content=content,
            current_description=current_description,
            eval_results=train_results,
            history=blinded_history,
            model=model,
            log_dir=log_dir,
            iteration=iteration,
        )
        improve_elapsed = time.time() - t0

        if verbose:
            print(f"Proposed ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)

        current_description = new_description

    # Find the best iteration by TEST score (or train if no test set).
    # Ties resolve to the earliest iteration (max keeps the first maximum).
    if test_set:
        best = max(history, key=lambda h: h["test_passed"] or 0)
        best_score = f"{best['test_passed']}/{best['test_total']}"
    else:
        best = max(history, key=lambda h: h["train_passed"])
        best_score = f"{best['train_passed']}/{best['train_total']}"

    if verbose:
        print(f"\nExit reason: {exit_reason}", file=sys.stderr)
        print(f"Best score: {best_score} (iteration {best['iteration']})", file=sys.stderr)

    return {
        "exit_reason": exit_reason,
        "original_description": original_description,
        "best_description": best["description"],
        "best_score": best_score,
        "best_train_score": f"{best['train_passed']}/{best['train_total']}",
        "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
        "final_description": current_description,
        "iterations_run": len(history),
        "holdout": holdout,
        "train_size": len(train_set),
        "test_size": len(test_set),
        "history": history,
    }
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def main():
    """CLI entry point: parse args, run the loop, emit JSON + HTML reports.

    Side effects: may open a browser tab on the live report, writes the
    report HTML file, and optionally saves results.json / report.html under
    a timestamped subdirectory of --results-dir. The final results JSON is
    always printed to stdout (progress goes to stderr).
    """
    parser = argparse.ArgumentParser(description="Run eval + improve loop")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override starting description")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
    parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)")
    parser.add_argument("--model", required=True, help="Model for improvement")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)")
    parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here")
    args = parser.parse_args()

    eval_set = json.loads(Path(args.eval_set).read_text())
    skill_path = Path(args.skill_path)

    # Fail fast with a readable message instead of a traceback from parse_skill_md.
    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    # Only the skill name is needed here (for report titles); run_loop
    # re-parses the description and content itself.
    name, _, _ = parse_skill_md(skill_path)

    # Set up live report path
    if args.report != "none":
        if args.report == "auto":
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
        else:
            live_report_path = Path(args.report)
        # Open the report immediately so the user can watch; the placeholder
        # page auto-refreshes every 5 seconds until real content is written.
        live_report_path.write_text("<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>")
        webbrowser.open(str(live_report_path))
    else:
        live_report_path = None

    # Determine output directory (create before run_loop so logs can be written)
    if args.results_dir:
        timestamp = time.strftime("%Y-%m-%d_%H%M%S")
        results_dir = Path(args.results_dir) / timestamp
        results_dir.mkdir(parents=True, exist_ok=True)
    else:
        results_dir = None

    # NOTE(review): the logs/ subdirectory is not created here — presumably
    # improve_description creates it on first write; confirm.
    log_dir = results_dir / "logs" if results_dir else None

    output = run_loop(
        eval_set=eval_set,
        skill_path=skill_path,
        description_override=args.description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        max_iterations=args.max_iterations,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        holdout=args.holdout,
        model=args.model,
        verbose=args.verbose,
        live_report_path=live_report_path,
        log_dir=log_dir,
    )

    # Save JSON output (stdout is the machine-readable channel).
    json_output = json.dumps(output, indent=2)
    print(json_output)
    if results_dir:
        (results_dir / "results.json").write_text(json_output)

    # Write final HTML report (without auto-refresh)
    if live_report_path:
        live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name))
        print(f"\nReport: {live_report_path}", file=sys.stderr)

    # A copy of the final report also lands next to results.json when both
    # a results dir and a live report were requested.
    if results_dir and live_report_path:
        (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name))

    if results_dir:
        print(f"Results saved to: {results_dir}", file=sys.stderr)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
# Script entry point.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Shared utilities for skill-creator scripts."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
    """Parse a SKILL.md file, returning (name, description, full_content).

    Reads the YAML-style frontmatter delimited by `---` lines and extracts
    the `name:` and `description:` fields. Block-scalar descriptions
    (`>`, `|`, `>-`, `|-`) are supported by joining the indented
    continuation lines with single spaces.

    Raises:
        ValueError: if the opening or closing `---` delimiter is missing.
    """
    content = (skill_path / "SKILL.md").read_text()
    lines = content.split("\n")

    if lines[0].strip() != "---":
        raise ValueError("SKILL.md missing frontmatter (no opening ---)")

    # Locate the closing frontmatter delimiter (first '---' after line 0).
    end_idx = next(
        (pos for pos in range(1, len(lines)) if lines[pos].strip() == "---"),
        None,
    )
    if end_idx is None:
        raise ValueError("SKILL.md missing frontmatter (no closing ---)")

    name = ""
    description = ""
    frontmatter = lines[1:end_idx]
    total = len(frontmatter)
    idx = 0
    while idx < total:
        current = frontmatter[idx]
        if current.startswith("name:"):
            name = current[len("name:"):].strip().strip('"').strip("'")
            idx += 1
        elif current.startswith("description:"):
            value = current[len("description:"):].strip()
            if value in (">", "|", ">-", "|-"):
                # YAML block scalar: collect the indented continuation lines
                # that follow and fold them into a single space-joined string.
                pieces: list[str] = []
                idx += 1
                while idx < total and frontmatter[idx][:1] in (" ", "\t"):
                    pieces.append(frontmatter[idx].strip())
                    idx += 1
                description = " ".join(pieces)
            else:
                # Inline scalar: strip surrounding quotes if present.
                description = value.strip('"').strip("'")
                idx += 1
        else:
            idx += 1

    return name, description, content
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: skill-optimize
|
|
3
|
+
description: >-
|
|
4
|
+
深度优化现有 Claude Code Skill 的方法论工具。基于认知科学、信息论、控制论提炼的七条永恒优化原则,
|
|
5
|
+
对 skill 进行系统性诊断和重构。
|
|
6
|
+
当用户想要改进、优化、精简现有 skill 时使用——无论目标是提升触发准确率、改善指令清晰度、
|
|
7
|
+
降低 token 消耗还是优化架构结构。
|
|
8
|
+
触发场景包括:"优化这个 skill"、"skill 效果不好"、"skill 太长了"、"提升 skill 质量"、
|
|
9
|
+
"skill 触发不准"、"让这个 skill 更好"、"精简指令"、"optimize skill"、"improve skill"、
|
|
10
|
+
"refine skill instructions"、"skill not triggering correctly",
|
|
11
|
+
或用户描述了某个现有 skill 的具体行为问题。
|
|
12
|
+
当用户想将结构化优化方法论应用于任何 Claude 使用的 skill 或指令集时也应触发。
|
|
13
|
+
不要在从零创建新 skill 或运行 eval benchmark 时触发——那些属于 skill-creator 的职责。
|
|
14
|
+
不要在与 Claude Code skill 无关的通用代码审查或重构时触发。
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
# Skill Optimize
|
|
18
|
+
|
|
19
|
+
让现有 skill 变得更好的方法论。核心洞察:skill 质量来自理解 AI 如何处理指令,而非堆砌更多规则。
|
|
20
|
+
|
|
21
|
+
> 一句话总结:在有限的认知带宽内,用最高信噪比的表达,传递让 AI 能够自主判断的认知协议。
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## 本 Skill 与 skill-creator 的边界
|
|
26
|
+
|
|
27
|
+
| 需求 | 使用 |
|
|
28
|
+
|------|------|
|
|
29
|
+
| 从零创建新 skill | skill-creator |
|
|
30
|
+
| 运行 eval、benchmark、description 优化 | skill-creator |
|
|
31
|
+
| 诊断 skill 为什么表现不佳 | **skill-optimize** |
|
|
32
|
+
| 用原则方法论提升 skill 质量 | **skill-optimize** |
|
|
33
|
+
| 重构臃肿或低效的 skill | **skill-optimize** |
|
|
34
|
+
|
|
35
|
+
两者互补:用 skill-optimize 搞清楚"改什么、为什么改",再用 skill-creator 的 eval 基础设施验证改进效果。
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## 四步优化流程
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
诊断现状 → 选择原则 → 重构实施 → 验证效果
|
|
43
|
+
↑ |
|
|
44
|
+
└────── 不满意?回到诊断 ─────────────┘
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
每次只改一个层面(触发 / 认知 / 资源),改完立即验证。不要批量修改——否则无法定位哪个改动起效。
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## 第一步:诊断
|
|
52
|
+
|
|
53
|
+
读取目标 skill 的 SKILL.md 和 references/,从四个维度评估:
|
|
54
|
+
|
|
55
|
+
### 四维诊断
|
|
56
|
+
|
|
57
|
+
**1. 触发准确率**
|
|
58
|
+
- 该触发时触发了吗?(召回率)
|
|
59
|
+
- 不该触发时安静吗?(精确率)
|
|
60
|
+
- description 是否足够具体?是否与其他 skill 有边界混淆?
|
|
61
|
+
|
|
62
|
+
**2. 任务完成质量**
|
|
63
|
+
- 运行后输出是否满足需求?
|
|
64
|
+
- 有没有反复出现的错误模式?
|
|
65
|
+
- AI 是否误解了指令意图?
|
|
66
|
+
|
|
67
|
+
**3. Token 效率**
|
|
68
|
+
- 有没有指令导致 AI 做无用功?
|
|
69
|
+
- 是否加载了不需要的上下文?
|
|
70
|
+
- transcript 中有没有明显的浪费(长时间犹豫、重复尝试)?
|
|
71
|
+
|
|
72
|
+
**4. 结构健康度**
|
|
73
|
+
- SKILL.md 是否超过 500 行?
|
|
74
|
+
- 原则与细节是否分层清晰?
|
|
75
|
+
- references/ 是否被正确使用(深度知识在资源层,非堆在主体中)?
|
|
76
|
+
|
|
77
|
+
基于诊断结果,识别最关键的 1-3 个问题。不要试图一次修复所有问题。
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## 第二步:选择原则
|
|
82
|
+
|
|
83
|
+
七条原则指导所有优化决策。每条解决一类失败模式。
|
|
84
|
+
|
|
85
|
+
### 原则一:传意不传形
|
|
86
|
+
|
|
87
|
+
**解决的问题**:AI 执行了指令的字面意思,却偏离了真实意图。
|
|
88
|
+
|
|
89
|
+
解释*为什么*某事重要,而不只是*做什么*。Claude 具备心智理论——给出规则背后的原因,它能泛化到新场景。刚性规则(ALWAYS/NEVER)限制了这种泛化能力。
|
|
90
|
+
|
|
91
|
+
**变换模式**:
|
|
92
|
+
- 变换前:"ALWAYS 用 bullet 列表"
|
|
93
|
+
- 变换后:"用 bullet 列表因为它们易于扫描——但当顺序重要时改用有序列表"
|
|
94
|
+
|
|
95
|
+
**诊断信号**:统计 skill 中的 ALWAYS/NEVER 数量。每个都是黄旗——问问解释原因是否能替代硬性规则。
|
|
96
|
+
|
|
97
|
+
### 原则二:渐进披露
|
|
98
|
+
|
|
99
|
+
**解决的问题**:上下文窗口浪费、认知过载。
|
|
100
|
+
|
|
101
|
+
在正确的时机加载正确的信息。三层模型:
|
|
102
|
+
|
|
103
|
+
| 层级 | 内容 | 预算 | 何时加载 |
|
|
104
|
+
|------|------|------|----------|
|
|
105
|
+
| 触发层(description) | 何时激活 | ~100 词 | 始终在上下文中 |
|
|
106
|
+
| 认知层(SKILL.md body) | 如何思考 | <500 行 | 触发时加载 |
|
|
107
|
+
| 资源层(references/) | 深度细节 | 不限 | 按需加载 |
|
|
108
|
+
|
|
109
|
+
**诊断信号**:SKILL.md 超过 500 行时,内容放错了层级。将实现细节、检查清单、领域特定知识移到 references/。
|
|
110
|
+
|
|
111
|
+
**自检**:只用 SKILL.md body(不看 references),AI 能完成 80% 的任务吗?如果不能,关键知识埋在了错误的层级。
|
|
112
|
+
|
|
113
|
+
### 原则三:信噪比优化
|
|
114
|
+
|
|
115
|
+
**解决的问题**:重要指令被填充内容稀释。
|
|
116
|
+
|
|
117
|
+
每句话都必须证明自己存在的必要性。信号 = 改变 AI 行为的信息。噪声 = 消耗上下文却无效果的填充。
|
|
118
|
+
|
|
119
|
+
**常见噪声模式**:
|
|
120
|
+
- 仪式性语言("请注意……"、"重要的是……"、"需要强调……")
|
|
121
|
+
- 不增加新信息的冗余重述
|
|
122
|
+
- 不增加精确度的对冲用词
|
|
123
|
+
|
|
124
|
+
**精简方法**:删掉一句话,如果行为不变,它就是噪声。
|
|
125
|
+
|
|
126
|
+
**悖论**:适度冗余是信号——在不同上下文中重复核心原则能强化记忆。区别在于每次重复是否增加了新的上下文。
|
|
127
|
+
|
|
128
|
+
### 原则四:单一职责
|
|
129
|
+
|
|
130
|
+
**解决的问题**:触发混淆、维护负担、代码膨胀。
|
|
131
|
+
|
|
132
|
+
一个 skill,一件事。如果描述 skill 职责时需要用"和"连接两个动作,考虑拆分。
|
|
133
|
+
|
|
134
|
+
**拆分信号**:
|
|
135
|
+
- 指令超过 500 行
|
|
136
|
+
- 条件分支超过 5 个
|
|
137
|
+
- 无法用"动词+名词"一句话描述
|
|
138
|
+
- AI 频繁选错工具或跳过步骤
|
|
139
|
+
|
|
140
|
+
**合并条件**:
|
|
141
|
+
- 两个 skill 共享大部分工具和指令
|
|
142
|
+
- 拆分后 handoff 开销超过收益
|
|
143
|
+
|
|
144
|
+
### 原则五:闭环验证
|
|
145
|
+
|
|
146
|
+
**解决的问题**:错误在多步骤任务中静默累积。
|
|
147
|
+
|
|
148
|
+
在关键决策点设计验证环节——不要把所有检查都推迟到最后。
|
|
149
|
+
|
|
150
|
+
| 验证类型 | 时机 | 示例 |
|
|
151
|
+
|----------|------|------|
|
|
152
|
+
| 微验证 | 每步完成后 | 编译通过?文件存在? |
|
|
153
|
+
| 阶段验证 | 里程碑后 | 满足阶段目标? |
|
|
154
|
+
| 终验证 | 任务完成时 | 满足原始需求? |
|
|
155
|
+
|
|
156
|
+
**诊断信号**:如果 skill 没有任何验证点,它是开环系统——简单任务可行,复杂任务必不可靠。
|
|
157
|
+
|
|
158
|
+
### 原则六:复杂度适配
|
|
159
|
+
|
|
160
|
+
**解决的问题**:简单任务用了重型框架,复杂任务用了轻量指令。
|
|
161
|
+
|
|
162
|
+
让框架的重量匹配问题的重量:
|
|
163
|
+
|
|
164
|
+
```
|
|
165
|
+
简单 → 快速路径:直接执行 + 验证
|
|
166
|
+
中等 → 标准路径:规划 → 执行 → 验证
|
|
167
|
+
复杂 → 深度路径:澄清 → 探索 → 规划 → 执行 → 验证
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
一个 skill 应在内部支持不同复杂度路径,而非强制所有输入走最重的流程。
|
|
171
|
+
|
|
172
|
+
**诊断信号**:如果一个 5 分钟的任务也要走完整的分析-规划-执行-验证循环,说明 skill 过度设计了。
|
|
173
|
+
|
|
174
|
+
### 原则七:抗过拟合
|
|
175
|
+
|
|
176
|
+
**解决的问题**:skill 在测试用例上完美,在真实使用中失败。
|
|
177
|
+
|
|
178
|
+
写传授推理模式的指令,而非输入-输出映射。如果一个修改只修复了某个特定测试用例,它很可能是过拟合。
|
|
179
|
+
|
|
180
|
+
**健康的示例**:3-5 个多样化示例,展示*推理过程*而非仅展示最终输出。
|
|
181
|
+
|
|
182
|
+
**诊断信号**:换一组完全不同的测试输入,skill 能表现一样好吗?如果不能,它过拟合了。
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## 第三步:重构
|
|
187
|
+
|
|
188
|
+
基于诊断结果和选定原则,重构 skill。
|
|
189
|
+
|
|
190
|
+
### 触发层优化(description)
|
|
191
|
+
|
|
192
|
+
description 决定 skill 是否被激活。同时优化精确率和召回率:
|
|
193
|
+
|
|
194
|
+
**结构公式**:`[做什么] + [何时触发] + [何时不触发] + [与相邻 skill 的边界]`
|
|
195
|
+
|
|
196
|
+
**常见修复**:
|
|
197
|
+
- 触发不足 → 增加触发场景、替代措辞、非显式用例
|
|
198
|
+
- 过度触发 → 增加排除条件、与相似 skill 的显式边界
|
|
199
|
+
- 两者皆有 → skill 的职责范围不清——回到单一职责原则
|
|
200
|
+
|
|
201
|
+
### 认知层优化(SKILL.md body)
|
|
202
|
+
|
|
203
|
+
**结构优先级**:
|
|
204
|
+
1. 开头声明使命——1-2 句话说明目的和核心理念
|
|
205
|
+
2. 原则先于流程——先建思维框架,再给具体流程
|
|
206
|
+
3. 决策树优于步骤列表——"在此情境下判断 X"优于"按顺序做 1-2-3"
|
|
207
|
+
4. 正面指引与反模式并存——展示什么是对的 AND 什么是错的(及为什么错)
|
|
208
|
+
5. 指向资源层——清晰的"何时阅读什么"指引
|
|
209
|
+
|
|
210
|
+
**写作风格**:
|
|
211
|
+
- 祈使语气("分析依赖图"而非"你需要分析依赖图")
|
|
212
|
+
- 为每条规则解释原因
|
|
213
|
+
- 最小化 ALWAYS/NEVER——尽量用推理来替代
|
|
214
|
+
|
|
215
|
+
**长度检查**:
|
|
216
|
+
- 理想:200-400 行
|
|
217
|
+
- 上限:500 行
|
|
218
|
+
- 超过 500 行:移内容到 references/
|
|
219
|
+
- 超过 1000 行:skill 职责过于宽泛,考虑拆分
|
|
220
|
+
|
|
221
|
+
### 资源层优化(references/)
|
|
222
|
+
|
|
223
|
+
- 按领域/主题组织,而非按功能
|
|
224
|
+
- 每个参考文件应独立可用
|
|
225
|
+
- 大文件(>300 行)包含目录
|
|
226
|
+
- 脚本可以不加载到上下文中直接执行
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## 第四步:验证
|
|
231
|
+
|
|
232
|
+
### 快速验证(每次都做)
|
|
233
|
+
|
|
234
|
+
- [ ] description 清晰传达了何时触发、何时不触发
|
|
235
|
+
- [ ] SKILL.md 在 500 行以内
|
|
236
|
+
- [ ] 无孤立的 references(所有引用都从 SKILL.md 指向)
|
|
237
|
+
- [ ] 核心原则在 SKILL.md 中,未埋在 references 里
|
|
238
|
+
- [ ] ALWAYS/NEVER 数量减少,或每个都附有原因说明
|
|
239
|
+
|
|
240
|
+
### 深度验证(重大变更时)
|
|
241
|
+
|
|
242
|
+
- 以"第一次看这个 skill"的视角阅读。3 分钟内能理解意图吗?
|
|
243
|
+
- 脑中推演 2-3 个现实场景。skill 能引导正确行为吗?
|
|
244
|
+
- 回归检查:之前有效的功能是否会被影响?
|
|
245
|
+
|
|
246
|
+
### 成熟度评估
|
|
247
|
+
|
|
248
|
+
| 等级 | 特征 | 优化重点 |
|
|
249
|
+
|------|------|----------|
|
|
250
|
+
| L0 草稿 | 能工作但不稳定,需要人工干预 | 触发准确率 + 基本功能 |
|
|
251
|
+
| L1 可用 | 多数场景正确,偶有失误 | 边界情况 + 错误恢复 |
|
|
252
|
+
| L2 可靠 | 稳定、高效、可预测 | Token 效率 + 精简 |
|
|
253
|
+
| L3 优雅 | 正确、简洁、灵活、可组合 | 泛化能力 + 架构简洁 |
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## 优化纪律
|
|
258
|
+
|
|
259
|
+
**每次只改一处**:修改触发 OR 认知 OR 资源层。改完验证。不要累积修改。
|
|
260
|
+
|
|
261
|
+
**读 transcript,不只看输出**:执行过程暴露了最终输出隐藏的浪费和困惑。如果 AI 花了 500 token 在某个本应由 skill 明确的事情上犹豫,那就是优化目标。
|
|
262
|
+
|
|
263
|
+
**知道何时停止**:开始前定义"足够好"。边际收益递减是真实的——L0→L1 的跃升通常比 L2→L3 更有价值。
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## 反模式速查
|
|
268
|
+
|
|
269
|
+
| 反模式 | 症状 | 修复方向 |
|
|
270
|
+
|--------|------|----------|
|
|
271
|
+
| 规则堆砌 | ALWAYS/NEVER 满天飞,>1000 行 | 减少规则,增加解释 |
|
|
272
|
+
| 过拟合 | 测试用例完美,真实使用失败 | 泛化原则,增加示例多样性 |
|
|
273
|
+
| 噪声膨胀 | "请注意"、"重要"等仪式性填充 | 删除后检查行为是否变化 |
|
|
274
|
+
| 抽象过度 | 原则太抽象无法指导行动 | 增加判断标准和边界案例 |
|
|
275
|
+
| 步骤固化 | 对多样化输入用死板的 1-2-3 | 决策树替代步骤列表 |
|
|
276
|
+
| 巨石 Skill | 一个 skill 做所有事 | 按职责拆分 |
|
|
277
|
+
| 缺少闭环 | 执行完不验证 | 在关键节点插入验证 |
|
|
278
|
+
| 批量优化 | 一次改 5 处,不知道哪个生效 | 一次改一处,改完验证 |
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## 深度参考
|
|
283
|
+
|
|
284
|
+
按需阅读,不必每次都加载:
|
|
285
|
+
|
|
286
|
+
- `references/theory-foundations.md` — 每条原则背后的认知科学、哲学和信息论基础。当你想理解原则*为什么*有效(而非仅仅*是什么*)时阅读。
|
|
287
|
+
- `references/quality-checklist.md` — 详细的诊断检查清单、transcript 分析指南和精简方法论。做深度优化时阅读。
|