@rudderhq/agent-runtime-gemini-local 0.2.1 → 0.2.2-canary.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/skills/conversation-to-skill/LICENSE.txt +202 -0
- package/skills/conversation-to-skill/SKILL.md +428 -0
- package/skills/conversation-to-skill/agents/analyzer.md +274 -0
- package/skills/conversation-to-skill/agents/comparator.md +202 -0
- package/skills/conversation-to-skill/agents/grader.md +223 -0
- package/skills/conversation-to-skill/assets/eval_review.html +146 -0
- package/skills/conversation-to-skill/eval-viewer/generate_review.py +471 -0
- package/skills/conversation-to-skill/eval-viewer/viewer.html +1325 -0
- package/skills/conversation-to-skill/references/compatibility.md +36 -0
- package/skills/conversation-to-skill/references/description-optimization.md +113 -0
- package/skills/conversation-to-skill/references/evaluation-suite.md +410 -0
- package/skills/conversation-to-skill/references/schemas.md +431 -0
- package/skills/conversation-to-skill/scripts/__init__.py +0 -0
- package/skills/conversation-to-skill/scripts/aggregate_benchmark.py +401 -0
- package/skills/conversation-to-skill/scripts/generate_report.py +335 -0
- package/skills/conversation-to-skill/scripts/improve_description.py +197 -0
- package/skills/conversation-to-skill/scripts/model_backends.py +115 -0
- package/skills/conversation-to-skill/scripts/package_skill.py +136 -0
- package/skills/conversation-to-skill/scripts/quick_validate.py +103 -0
- package/skills/conversation-to-skill/scripts/run_eval.py +363 -0
- package/skills/conversation-to-skill/scripts/run_loop.py +319 -0
- package/skills/conversation-to-skill/scripts/utils.py +223 -0
- package/skills/rudder/references/organization-skills.md +1 -1
- package/skills/skill-creator/SKILL.md +9 -0
- package/skills/skill-optimizer/CHANGELOG.md +29 -0
- package/skills/skill-optimizer/SKILL.md +205 -0
- package/skills/skill-optimizer/references/adapters/creative-brand-content.md +30 -0
- package/skills/skill-optimizer/references/adapters/customer-support-sales.md +30 -0
- package/skills/skill-optimizer/references/adapters/document-data-processing.md +31 -0
- package/skills/skill-optimizer/references/adapters/education-training.md +31 -0
- package/skills/skill-optimizer/references/adapters/finance-accounting.md +31 -0
- package/skills/skill-optimizer/references/adapters/healthcare-operations.md +30 -0
- package/skills/skill-optimizer/references/adapters/hr-people-ops.md +31 -0
- package/skills/skill-optimizer/references/adapters/legal-compliance.md +31 -0
- package/skills/skill-optimizer/references/adapters/operations-supply-chain.md +31 -0
- package/skills/skill-optimizer/references/adapters/personal-productivity.md +29 -0
- package/skills/skill-optimizer/references/adapters/research-knowledge.md +31 -0
- package/skills/skill-optimizer/references/adapters/software-ai.md +31 -0
- package/skills/skill-optimizer/references/domain-adapter-patterns.md +66 -0
- package/skills/skill-optimizer/references/eval-method.md +17 -0
- package/skills/skill-optimizer/references/universal-optimization-lens.md +73 -0
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Generate an HTML report from run_loop.py output.
|
|
3
|
+
|
|
4
|
+
Takes the JSON output from run_loop.py and generates a visual HTML report
|
|
5
|
+
showing each description attempt with check/x for each test case.
|
|
6
|
+
Distinguishes between train and test queries.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import html
|
|
11
|
+
import json
|
|
12
|
+
import sys
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _esc(value, default: str = "") -> str:
    """Return *value* as HTML-escaped text, substituting *default* for ``None``."""
    return html.escape(str(default if value is None else value))


def _aggregate_runs(results) -> tuple:
    """Return ``(correct, total)`` run counts summed over per-query results.

    A run is correct when the skill triggered for a query that should trigger,
    or stayed quiet for a query that should not. Missing or ``None`` counters
    are treated as zero.
    """
    correct = 0
    total = 0
    for r in results or []:
        runs = r.get("runs") or 0
        triggers = r.get("triggers") or 0
        total += runs
        correct += triggers if r.get("should_trigger", True) else runs - triggers
    return correct, total


def _score_class(correct: int, total: int) -> str:
    """Map a correct/total ratio to the CSS class used for the score badge."""
    if total > 0:
        ratio = correct / total
        if ratio >= 0.8:
            return "score-good"
        if ratio >= 0.5:
            return "score-ok"
    # Also covers total == 0 (nothing was run).
    return "score-bad"


def _result_cell(result: dict, extra_class: str = "") -> str:
    """Render one pass/fail table cell with its triggers/runs rate."""
    did_pass = result.get("pass", False)
    icon = "✓" if did_pass else "✗"
    classes = " ".join(
        part for part in ("result", extra_class, "pass" if did_pass else "fail") if part
    )
    triggers = _esc(result.get("triggers"), "0")
    runs = _esc(result.get("runs"), "0")
    return f'                <td class="{classes}">{icon}<span class="rate">{triggers}/{runs}</span></td>\n'


def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str:
    """Generate the HTML report for a description-optimization run.

    Args:
        data: Parsed JSON from the optimization loop. Keys read here:
            ``history``, ``original_description``, ``best_description``,
            ``best_score``, ``best_test_score``, ``iterations_run``,
            ``train_size`` and ``test_size``.
        auto_refresh: When true, a ``<meta http-equiv="refresh">`` tag makes
            the page reload every 5 seconds.
        skill_name: Optional skill name prefixed to the title and heading.

    Returns:
        A complete HTML document as a string. If ``history`` is empty a short
        placeholder page is returned instead of the table.

    Every value taken from ``data`` is HTML-escaped before interpolation, and
    ``None`` values are tolerated rather than crashing ``html.escape``.
    """
    history = data.get("history") or []
    title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else ""

    if not history:
        return f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>{title_prefix}Skill Description Optimization</title>
</head>
<body>
    <h1>{title_prefix}Skill Description Optimization</h1>
    <p>No optimization history was found in the provided JSON.</p>
</body>
</html>
"""

    # The column set is fixed by the first iteration's queries. `or []` guards
    # against explicit nulls, matching how later rows read the same keys.
    first = history[0]
    train_queries = [
        {"query": r["query"], "should_trigger": r.get("should_trigger", True)}
        for r in (first.get("train_results", first.get("results", [])) or [])
    ]
    test_queries = [
        {"query": r["query"], "should_trigger": r.get("should_trigger", True)}
        for r in (first.get("test_results") or [])
    ]

    refresh_tag = '    <meta http-equiv="refresh" content="5">\n' if auto_refresh else ""

    # Plain concatenation (not an f-string) so the CSS braces need no escaping.
    html_parts = ["""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
""" + refresh_tag + """    <title>""" + title_prefix + """Skill Description Optimization</title>
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
    <style>
        body {
            font-family: 'Lora', Georgia, serif;
            max-width: 100%;
            margin: 0 auto;
            padding: 20px;
            background: #faf9f5;
            color: #141413;
        }
        h1 { font-family: 'Poppins', sans-serif; color: #141413; }
        .explainer {
            background: white;
            padding: 15px;
            border-radius: 6px;
            margin-bottom: 20px;
            border: 1px solid #e8e6dc;
            color: #b0aea5;
            font-size: 0.875rem;
            line-height: 1.6;
        }
        .summary {
            background: white;
            padding: 15px;
            border-radius: 6px;
            margin-bottom: 20px;
            border: 1px solid #e8e6dc;
        }
        .summary p { margin: 5px 0; }
        .best { color: #788c5d; font-weight: bold; }
        .table-container {
            overflow-x: auto;
            width: 100%;
        }
        table {
            border-collapse: collapse;
            background: white;
            border: 1px solid #e8e6dc;
            border-radius: 6px;
            font-size: 12px;
            min-width: 100%;
        }
        th, td {
            padding: 8px;
            text-align: left;
            border: 1px solid #e8e6dc;
            white-space: normal;
            word-wrap: break-word;
        }
        th {
            font-family: 'Poppins', sans-serif;
            background: #141413;
            color: #faf9f5;
            font-weight: 500;
        }
        th.test-col {
            background: #6a9bcc;
        }
        th.query-col { min-width: 200px; }
        td.description {
            font-family: monospace;
            font-size: 11px;
            word-wrap: break-word;
            max-width: 400px;
        }
        td.result {
            text-align: center;
            font-size: 16px;
            min-width: 40px;
        }
        td.test-result {
            background: #f0f6fc;
        }
        .pass { color: #788c5d; }
        .fail { color: #c44; }
        .rate {
            font-size: 9px;
            color: #b0aea5;
            display: block;
        }
        tr:hover { background: #faf9f5; }
        .score {
            display: inline-block;
            padding: 2px 6px;
            border-radius: 4px;
            font-weight: bold;
            font-size: 11px;
        }
        .score-good { background: #eef2e8; color: #788c5d; }
        .score-ok { background: #fef3c7; color: #d97706; }
        .score-bad { background: #fceaea; color: #c44; }
        .train-label { color: #b0aea5; font-size: 10px; }
        .test-label { color: #6a9bcc; font-size: 10px; font-weight: bold; }
        .best-row { background: #f5f8f2; }
        th.positive-col { border-bottom: 3px solid #788c5d; }
        th.negative-col { border-bottom: 3px solid #c44; }
        th.test-col.positive-col { border-bottom: 3px solid #788c5d; }
        th.test-col.negative-col { border-bottom: 3px solid #c44; }
        .legend { font-family: 'Poppins', sans-serif; display: flex; gap: 20px; margin-bottom: 10px; font-size: 13px; align-items: center; }
        .legend-item { display: flex; align-items: center; gap: 6px; }
        .legend-swatch { width: 16px; height: 16px; border-radius: 3px; display: inline-block; }
        .swatch-positive { background: #141413; border-bottom: 3px solid #788c5d; }
        .swatch-negative { background: #141413; border-bottom: 3px solid #c44; }
        .swatch-test { background: #6a9bcc; }
        .swatch-train { background: #141413; }
    </style>
</head>
<body>
    <h1>""" + title_prefix + """Skill Description Optimization</h1>
    <div class="explainer">
        <strong>Optimizing your skill's description.</strong> This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill.
    </div>
"""]

    # Summary section. Values are pre-escaped so nothing from the JSON reaches
    # the page as raw markup.
    original_html = _esc(data.get("original_description"), "N/A")
    best_html = _esc(data.get("best_description"), "N/A")
    best_score_html = _esc(data.get("best_score"), "N/A")
    score_source = "(test)" if data.get("best_test_score") else "(train)"
    iterations_html = _esc(data.get("iterations_run"), "0")
    train_size_html = _esc(data.get("train_size"), "?")
    test_size_html = _esc(data.get("test_size"), "?")
    html_parts.append(f"""
    <div class="summary">
        <p><strong>Original:</strong> {original_html}</p>
        <p class="best"><strong>Best:</strong> {best_html}</p>
        <p><strong>Best Score:</strong> {best_score_html} {score_source}</p>
        <p><strong>Iterations:</strong> {iterations_html} | <strong>Train:</strong> {train_size_html} | <strong>Test:</strong> {test_size_html}</p>
    </div>
""")

    # Legend
    html_parts.append("""
    <div class="legend">
        <span style="font-weight:600">Query columns:</span>
        <span class="legend-item"><span class="legend-swatch swatch-positive"></span> Should trigger</span>
        <span class="legend-item"><span class="legend-swatch swatch-negative"></span> Should NOT trigger</span>
        <span class="legend-item"><span class="legend-swatch swatch-train"></span> Train</span>
        <span class="legend-item"><span class="legend-swatch swatch-test"></span> Test</span>
    </div>
""")

    # Table header
    html_parts.append("""
    <div class="table-container">
    <table>
        <thead>
            <tr>
                <th>Iter</th>
                <th>Train</th>
                <th>Test</th>
                <th class="query-col">Description</th>
""")

    # One column per train query, then per test query (test columns are tinted).
    for qinfo in train_queries:
        polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f'                <th class="{polarity}">{_esc(qinfo["query"])}</th>\n')
    for qinfo in test_queries:
        polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f'                <th class="test-col {polarity}">{_esc(qinfo["query"])}</th>\n')

    html_parts.append("""            </tr>
        </thead>
        <tbody>
""")

    # Best iteration for highlighting: by test passes when a test set exists,
    # otherwise by train passes. `or 0` keeps max() comparable on null counts.
    if test_queries:
        best_entry = max(history, key=lambda h: h.get("test_passed") or 0)
    else:
        best_entry = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0)) or 0)
    best_iter = best_entry.get("iteration")

    # One row per iteration.
    for h in history:
        iteration = h.get("iteration", "?")
        train_results = h.get("train_results", h.get("results", [])) or []
        test_results = h.get("test_results") or []

        train_by_query = {r["query"]: r for r in train_results}
        test_by_query = {r["query"]: r for r in test_results}

        train_correct, train_runs = _aggregate_runs(train_results)
        test_correct, test_runs = _aggregate_runs(test_results)
        train_class = _score_class(train_correct, train_runs)
        test_class = _score_class(test_correct, test_runs)
        row_class = "best-row" if iteration == best_iter else ""

        html_parts.append(f"""            <tr class="{row_class}">
                <td>{_esc(iteration)}</td>
                <td><span class="score {train_class}">{train_correct}/{train_runs}</span></td>
                <td><span class="score {test_class}">{test_correct}/{test_runs}</span></td>
                <td class="description">{_esc(h.get("description"))}</td>
""")

        # A query missing from this iteration renders as a failed 0/0 cell.
        for qinfo in train_queries:
            html_parts.append(_result_cell(train_by_query.get(qinfo["query"], {})))
        for qinfo in test_queries:
            html_parts.append(_result_cell(test_by_query.get(qinfo["query"], {}), "test-result"))

        html_parts.append("            </tr>\n")

    html_parts.append("""        </tbody>
    </table>
    </div>
""")

    html_parts.append("""
</body>
</html>
""")

    return "".join(html_parts)
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def main():
    """Command-line entry point: render run_loop JSON as an HTML report.

    Reads the JSON from the path given as ``input`` (or from stdin when the
    argument is ``-``), builds the page with ``generate_html`` and writes it
    to ``--output`` if given, otherwise to stdout.
    """
    parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output")
    parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)")
    parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)")
    parser.add_argument("--skill-name", default="", help="Skill name to include in the report title")
    args = parser.parse_args()

    if args.input == "-":
        data = json.load(sys.stdin)
    else:
        # JSON interchange is UTF-8; do not depend on the platform locale.
        data = json.loads(Path(args.input).read_text(encoding="utf-8"))

    html_output = generate_html(data, skill_name=args.skill_name)

    if args.output:
        # The page declares <meta charset="utf-8"> and contains non-ASCII
        # glyphs, so the file must be written as UTF-8 regardless of the
        # platform's default encoding.
        Path(args.output).write_text(html_output, encoding="utf-8")
        print(f"Report written to {args.output}", file=sys.stderr)
    else:
        print(html_output)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Improve a skill description based on eval results."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import json
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
if __package__ in (None, ""):
|
|
12
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
13
|
+
|
|
14
|
+
from scripts.model_backends import detect_backend, extract_tagged_text, generate_text
|
|
15
|
+
from scripts.utils import parse_skill_md
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def improve_description(
    *,
    backend: str,
    skill_name: str,
    skill_content: str,
    current_description: str,
    eval_results: dict,
    history: list[dict],
    model: str | None = None,
    test_results: dict | None = None,
    log_dir: Path | None = None,
    iteration: int | None = None,
) -> str:
    """Ask the selected backend for an improved skill description.

    Args:
        backend: Backend name, resolved through ``detect_backend``.
        skill_name: Name of the skill, quoted in the prompt.
        skill_content: Skill body included in the prompt for context.
        current_description: The description that was just evaluated.
        eval_results: Train evaluation; reads ``results`` (items with
            ``query``, ``should_trigger``, ``pass``, ``triggers``, ``runs``)
            and ``summary`` (``passed``, ``total``).
        history: Earlier attempts, summarised in the prompt.
        model: Optional model identifier forwarded to the backend.
        test_results: Optional held-out evaluation; only its ``summary`` is used.
        log_dir: If set, a JSON transcript of the exchange is written there.
        iteration: Iteration number recorded in the transcript and used in the
            log file name.

    Returns:
        The new description extracted from the ``<new_description>`` tags of
        the backend response. If it exceeds 1024 characters the backend is
        asked once to shorten it; the result of that second call is returned
        without re-checking its length.
    """
    backend = detect_backend(backend)
    results = eval_results["results"]
    failed_triggers = [item for item in results if item["should_trigger"] and not item["pass"]]
    false_triggers = [item for item in results if not item["should_trigger"] and not item["pass"]]

    train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}"
    if test_results:
        test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}"
        scores_summary = f"Train: {train_score}, Test: {test_score}"
    else:
        scores_summary = f"Train: {train_score}"

    # Collect prompt fragments and join once instead of repeated `+=`.
    sections = [f"""You are optimizing a reusable AI skill description.

The skill system uses progressive disclosure:
- The model first sees only the skill name and description.
- If it chooses the skill, it reads the full SKILL.md and bundled resources.

Your job is to improve only the description so that the skill triggers for the right requests and stays quiet for adjacent or simpler ones.

Skill name: {skill_name}

Current description:
<current_description>
{current_description}
</current_description>

Current scores: {scores_summary}
"""]

    for heading, items in (
        ("FAILED TO TRIGGER:\n", failed_triggers),
        ("FALSE TRIGGERS:\n", false_triggers),
    ):
        if not items:
            continue
        sections.append(heading)
        sections.extend(
            f' - "{item["query"]}" (triggered {item["triggers"]}/{item["runs"]} times)\n'
            for item in items
        )
        sections.append("\n")

    if history:
        sections.append("PREVIOUS ATTEMPTS. Avoid repeating the same structure:\n\n")
        for attempt in history:
            train_s = f"{attempt.get('train_passed', attempt.get('passed', 0))}/{attempt.get('train_total', attempt.get('total', 0))}"
            test_s = f"{attempt.get('test_passed', '?')}/{attempt.get('test_total', '?')}" if attempt.get('test_passed') is not None else None
            score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "")
            sections.append(f"<attempt {score_str}>\n")
            sections.append(f'Description: "{attempt["description"]}"\n')
            # A missing or null "results" entry simply contributes no lines.
            for result in attempt.get("results") or ():
                status = "PASS" if result["pass"] else "FAIL"
                sections.append(
                    f' [{status}] "{result["query"][:80]}" (triggered {result["triggers"]}/{result["runs"]})\n'
                )
            sections.append("</attempt>\n\n")

    sections.append(f"""Skill content for context:
<skill_content>
{skill_content}
</skill_content>

Write a new description that generalizes from the failures instead of listing lots of exact prompts.

Constraints:
- 100 to 200 words preferred
- Phrase it in terms of user intent, not internal implementation
- Make it distinctive enough to win against nearby skills
- Keep it portable across agents like Codex, Claude Code, and other tool-using assistants

Return only the description wrapped in <new_description> tags.
""")
    prompt = "".join(sections)

    response_text = generate_text(prompt, backend=backend, model=model, timeout=300)
    description = extract_tagged_text(response_text, "new_description")

    transcript = {
        "iteration": iteration,
        "backend": backend,
        "prompt": prompt,
        "response": response_text,
        "final_description": description,
        "char_count": len(description),
    }

    if len(description) > 1024:
        # One follow-up request that replays the exchange and asks for a
        # shorter rewrite.
        shorten_prompt = (
            f"The description is {len(description)} characters. Rewrite it below 1024 characters without "
            "losing the key trigger intent. Return only <new_description>...</new_description>."
        )
        shortened_response = generate_text(
            "\n\n".join([prompt, response_text, shorten_prompt]),
            backend=backend,
            model=model,
            timeout=300,
        )
        description = extract_tagged_text(shortened_response, "new_description")
        transcript["rewrite_prompt"] = shorten_prompt
        transcript["rewrite_response"] = shortened_response
        transcript["rewrite_char_count"] = len(description)
        transcript["final_description"] = description

    if log_dir:
        log_dir.mkdir(parents=True, exist_ok=True)
        # Test against None explicitly: `iteration or 'unknown'` would file
        # iteration 0 under "unknown".
        label = iteration if iteration is not None else "unknown"
        log_file = log_dir / f"improve_iter_{label}.json"
        log_file.write_text(json.dumps(transcript, indent=2), encoding="utf-8")

    return description
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def main():
    """Command-line entry point for a single description-improvement step.

    Loads the eval results (and optional history) from JSON, asks
    ``improve_description`` for a new description and prints a JSON document
    to stdout holding the new description and the history extended with the
    attempt that was just evaluated. Exits with status 1 when the skill
    directory has no SKILL.md.
    """
    cli = argparse.ArgumentParser(description="Improve a skill description based on eval results")
    cli.add_argument("--eval-results", required=True, help="Path to eval results JSON")
    cli.add_argument("--skill-path", required=True, help="Path to skill directory")
    cli.add_argument("--history", default=None, help="Path to history JSON")
    cli.add_argument("--model", default=None, help="Optional backend model identifier")
    cli.add_argument("--backend", default="auto", choices=["auto", "claude", "codex"], help="Generation backend")
    cli.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    args = cli.parse_args()

    skill_path = Path(args.skill_path)
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    eval_results = json.loads(Path(args.eval_results).read_text())
    if args.history:
        history = json.loads(Path(args.history).read_text())
    else:
        history = []

    # Only the skill name and body are needed; the middle element is unused.
    name, _, content = parse_skill_md(skill_path)
    current_description = eval_results["description"]

    if args.verbose:
        print(f"Current: {current_description}", file=sys.stderr)
        print(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}", file=sys.stderr)

    new_description = improve_description(
        backend=args.backend,
        skill_name=name,
        skill_content=content,
        current_description=current_description,
        eval_results=eval_results,
        history=history,
        model=args.model,
    )

    if args.verbose:
        print(f"Improved: {new_description}", file=sys.stderr)

    # Record the attempt that was just evaluated so the next round can see it.
    summary = eval_results["summary"]
    evaluated_attempt = {
        "description": current_description,
        "passed": summary["passed"],
        "failed": summary["failed"],
        "total": summary["total"],
        "results": eval_results["results"],
    }
    output = {
        "description": new_description,
        "history": history + [evaluated_attempt],
    }
    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Helpers for running host-specific model backends.
|
|
3
|
+
|
|
4
|
+
This skill can be used from several agent environments. Some environments expose
|
|
5
|
+
real skill triggering, others only expose a model CLI. This module centralizes
|
|
6
|
+
those differences so the higher-level workflow can stay mostly host-neutral.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import os
|
|
13
|
+
import re
|
|
14
|
+
import shutil
|
|
15
|
+
import subprocess
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def command_exists(name: str) -> bool:
    """Report whether ``shutil.which`` can resolve *name* to an executable."""
    resolved = shutil.which(name)
    return resolved is not None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def detect_backend(preferred: str | None = None) -> str:
    """Resolve which model backend to use.

    An explicit *preferred* value other than ``"auto"`` is returned unchanged
    (it is not validated here). Otherwise the first of ``claude`` and
    ``codex`` found by ``command_exists`` wins.

    Raises:
        RuntimeError: If neither command is available.
    """
    wants_auto = not preferred or preferred == "auto"
    if not wants_auto:
        return preferred

    for candidate in ("claude", "codex"):
        if command_exists(candidate):
            return candidate

    raise RuntimeError("No supported backend found. Install `claude` or `codex`, or pass --backend explicitly.")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _run(
|
|
34
|
+
cmd: list[str],
|
|
35
|
+
*,
|
|
36
|
+
cwd: Path | None = None,
|
|
37
|
+
timeout: int = 300,
|
|
38
|
+
strip_claudecode: bool = False,
|
|
39
|
+
retries: int = 0,
|
|
40
|
+
retry_timeout_scale: float = 1.5,
|
|
41
|
+
) -> str:
|
|
42
|
+
env = dict(os.environ)
|
|
43
|
+
if strip_claudecode:
|
|
44
|
+
env.pop("CLAUDECODE", None)
|
|
45
|
+
|
|
46
|
+
attempt = 0
|
|
47
|
+
current_timeout = timeout
|
|
48
|
+
while True:
|
|
49
|
+
try:
|
|
50
|
+
completed = subprocess.run(
|
|
51
|
+
cmd,
|
|
52
|
+
cwd=str(cwd) if cwd else None,
|
|
53
|
+
env=env,
|
|
54
|
+
capture_output=True,
|
|
55
|
+
text=True,
|
|
56
|
+
timeout=current_timeout,
|
|
57
|
+
check=True,
|
|
58
|
+
)
|
|
59
|
+
return completed.stdout.strip()
|
|
60
|
+
except subprocess.TimeoutExpired:
|
|
61
|
+
if attempt >= retries:
|
|
62
|
+
raise
|
|
63
|
+
attempt += 1
|
|
64
|
+
current_timeout = max(current_timeout + 1, int(current_timeout * retry_timeout_scale))
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def generate_text(
    prompt: str,
    *,
    backend: str,
    model: str | None = None,
    cwd: Path | None = None,
    timeout: int = 300,
) -> str:
    """Send *prompt* to the selected backend CLI and return its stdout.

    Args:
        prompt: Prompt text, passed to the CLI as a single argument.
        backend: ``"claude"``, ``"codex"`` or ``"auto"`` (resolved through
            ``detect_backend``).
        model: Optional model identifier forwarded to the CLI.
        cwd: Optional working directory for the child process.
        timeout: Per-attempt timeout in seconds.

    Returns:
        The backend's stdout with surrounding whitespace stripped.

    Raises:
        RuntimeError: If the resolved backend is neither claude nor codex.
    """
    backend = detect_backend(backend)
    if backend == "claude":
        cmd = ["claude", "-p", prompt]
        if model:
            cmd.extend(["--model", model])
        return _run(cmd, cwd=cwd, timeout=timeout, strip_claudecode=True)

    if backend == "codex":
        cmd = ["codex", "exec", "-s", "read-only"]
        if model:
            cmd.extend(["-m", model])
        if cwd:
            cmd.extend(["-C", str(cwd)])
        # Probe the directory the child will actually run in. Resolving makes
        # a relative cwd check its real ancestors, and falling back to the
        # process cwd keeps cwd=None from dereferencing None.
        workdir = Path(cwd).resolve() if cwd else Path.cwd()
        if not any((parent / ".git").exists() for parent in (workdir, *workdir.parents)):
            cmd.append("--skip-git-repo-check")
        cmd.append(prompt)
        # Codex routing judgments can occasionally run close to the timeout.
        return _run(cmd, cwd=cwd, timeout=timeout, retries=1)

    raise RuntimeError(f"Unsupported backend: {backend}")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def extract_tagged_text(text: str, tag: str) -> str:
    """Return the content of the first ``<tag>...</tag>`` pair in *text*.

    The content is stripped of surrounding whitespace and then of surrounding
    double quotes. If no such pair is found, the whole of *text* is returned
    with the same stripping applied.
    """
    # Escape the tag so regex metacharacters in it are matched literally.
    literal_tag = re.escape(tag)
    match = re.search(rf"<{literal_tag}>(.*?)</{literal_tag}>", text, re.DOTALL)
    payload = match.group(1) if match else text
    return payload.strip().strip('"')
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def extract_first_json_object(text: str) -> dict:
    """Return the first JSON object that can be decoded from *text*.

    Each ``{`` is tried in order as the start of a JSON value; text before and
    after the object is ignored.

    Raises:
        ValueError: If no ``{`` in *text* starts a decodable JSON object. The
            message includes the first 200 characters of *text*.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # Decode in place from `start`; slicing would copy the tail of
            # the text for every candidate brace.
            obj, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(obj, dict):
                return obj
        start = text.find("{", start + 1)
    raise ValueError(f"No JSON object found in backend output: {text[:200]}")
|