claude-dev-env 1.16.0 → 1.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/hooks/HOOK_SPECS_PROMPT_WORKFLOW.md +6 -0
- package/hooks/blocking/prompt_workflow_gate_core.py +174 -56
- package/hooks/blocking/test_prompt_workflow_gate_core.py +84 -13
- package/package.json +1 -1
- package/skills/prompt-generator/evals/prompt-generator.json +5 -5
- package/skills/skill-builder/SKILL.md +87 -0
- package/skills/skill-builder/references/delegation-map.md +151 -0
- package/skills/skill-builder/templates/gap-analysis.md +41 -0
- package/skills/skill-builder/workflows/improve-skill.md +88 -0
- package/skills/skill-builder/workflows/new-skill.md +223 -0
- package/skills/skill-builder/workflows/polish-skill.md +83 -0
|
@@ -54,3 +54,9 @@ These two signals are runtime-checked by the Stop guard whenever a prompt-workfl
|
|
|
54
54
|
## Deterministic Boundary
|
|
55
55
|
|
|
56
56
|
These hooks enforce only structural/runtime checks. Semantic quality remains in the auditor layer.
|
|
57
|
+
|
|
58
|
+
## Reviewing Flattened Transcript Exports
|
|
59
|
+
|
|
60
|
+
- Live prompt-workflow responses still require an explicit `Audit:` line plus one outer `xml` fence. The Stop guard and clipboard path continue to evaluate that literal boundary.
|
|
61
|
+
- Saved transcript exports can flatten blocked retry turns and omit the outer fence lines. Normalize those files with `prompt_workflow_gate_core.normalize_prompt_workflow_export(...)`, then evaluate the rebuilt message with `extract_fenced_xml_content(...)` or `extract_fenced_xml_content_from_export(...)`.
|
|
62
|
+
- Fence-relative evals review the **last successful Audit + artifact pair** after normalization. Earlier blocked retries in the flattened transcript remain diagnostic evidence and do not count as extra delivered artifacts.
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
6
|
import re
|
|
7
|
+
import textwrap
|
|
7
8
|
from typing import Iterable
|
|
8
9
|
|
|
9
10
|
from prompt_workflow_gate_config import (
|
|
@@ -18,25 +19,57 @@ from prompt_workflow_gate_config import (
|
|
|
18
19
|
REQUIRED_XML_SECTIONS,
|
|
19
20
|
)
|
|
20
21
|
|
|
22
|
+
TRIPLE_BACKTICK = "```"
|
|
23
|
+
AUDIT_LINE_PATTERN = re.compile(r"^\s*[●•]?\s*(Audit:\s*.+?)\s*$")
|
|
21
24
|
|
|
22
25
|
def _line_opens_xml_fence(line: str) -> bool:
|
|
23
26
|
stripped = line.strip()
|
|
24
|
-
if not stripped.startswith(
|
|
27
|
+
if not stripped.startswith(TRIPLE_BACKTICK):
|
|
25
28
|
return False
|
|
26
|
-
|
|
29
|
+
fence_marker_length = len(TRIPLE_BACKTICK)
|
|
30
|
+
remainder = stripped[fence_marker_length:].strip()
|
|
27
31
|
return remainder == "xml" or remainder.startswith("xml ")
|
|
28
32
|
|
|
29
|
-
|
|
30
33
|
def _line_is_bare_fence_close(line: str) -> bool:
|
|
31
|
-
return line.strip() ==
|
|
32
|
-
|
|
34
|
+
return line.strip() == TRIPLE_BACKTICK
|
|
33
35
|
|
|
34
36
|
def _line_opens_inner_markdown_fence(line: str) -> bool:
|
|
35
37
|
stripped = line.strip()
|
|
36
|
-
if not stripped.startswith(
|
|
38
|
+
if not stripped.startswith(TRIPLE_BACKTICK):
|
|
37
39
|
return False
|
|
38
|
-
return stripped !=
|
|
39
|
-
|
|
40
|
+
return stripped != TRIPLE_BACKTICK
|
|
41
|
+
|
|
42
|
+
def _collect_inner_markdown_fence(
|
|
43
|
+
lines: list[str],
|
|
44
|
+
start_index: int,
|
|
45
|
+
) -> tuple[list[str], int]:
|
|
46
|
+
inner_lines: list[str] = []
|
|
47
|
+
index = start_index
|
|
48
|
+
while index < len(lines):
|
|
49
|
+
current_line = lines[index]
|
|
50
|
+
inner_lines.append(current_line)
|
|
51
|
+
index += 1
|
|
52
|
+
if _line_is_bare_fence_close(current_line):
|
|
53
|
+
break
|
|
54
|
+
return inner_lines, index
|
|
55
|
+
|
|
56
|
+
def _collect_xml_fence_body(
|
|
57
|
+
lines: list[str],
|
|
58
|
+
start_index: int,
|
|
59
|
+
) -> tuple[list[str], int]:
|
|
60
|
+
body_lines: list[str] = []
|
|
61
|
+
index = start_index
|
|
62
|
+
while index < len(lines):
|
|
63
|
+
current_line = lines[index]
|
|
64
|
+
if _line_is_bare_fence_close(current_line):
|
|
65
|
+
return body_lines, index + 1
|
|
66
|
+
if _line_opens_inner_markdown_fence(current_line):
|
|
67
|
+
inner_lines, index = _collect_inner_markdown_fence(lines, index)
|
|
68
|
+
body_lines.extend(inner_lines)
|
|
69
|
+
continue
|
|
70
|
+
body_lines.append(current_line)
|
|
71
|
+
index += 1
|
|
72
|
+
return body_lines, index
|
|
40
73
|
|
|
41
74
|
def extract_fenced_xml_content(text: str) -> str:
|
|
42
75
|
"""Extract bodies of ```xml fenced blocks.
|
|
@@ -50,31 +83,104 @@ def extract_fenced_xml_content(text: str) -> str:
|
|
|
50
83
|
lines = text.splitlines()
|
|
51
84
|
index = 0
|
|
52
85
|
while index < len(lines):
|
|
53
|
-
if _line_opens_xml_fence(lines[index]):
|
|
86
|
+
if not _line_opens_xml_fence(lines[index]):
|
|
54
87
|
index += 1
|
|
55
|
-
body_lines: list[str] = []
|
|
56
|
-
while index < len(lines):
|
|
57
|
-
line = lines[index]
|
|
58
|
-
if _line_is_bare_fence_close(line):
|
|
59
|
-
index += 1
|
|
60
|
-
break
|
|
61
|
-
if _line_opens_inner_markdown_fence(line):
|
|
62
|
-
body_lines.append(line)
|
|
63
|
-
index += 1
|
|
64
|
-
while index < len(lines):
|
|
65
|
-
inner_line = lines[index]
|
|
66
|
-
body_lines.append(inner_line)
|
|
67
|
-
index += 1
|
|
68
|
-
if _line_is_bare_fence_close(inner_line):
|
|
69
|
-
break
|
|
70
|
-
continue
|
|
71
|
-
body_lines.append(line)
|
|
72
|
-
index += 1
|
|
73
|
-
results.append("\n".join(body_lines))
|
|
74
88
|
continue
|
|
75
|
-
index
|
|
89
|
+
body_lines, index = _collect_xml_fence_body(lines, index + 1)
|
|
90
|
+
results.append("\n".join(body_lines))
|
|
76
91
|
return "\n".join(results)
|
|
77
92
|
|
|
93
|
+
def _line_is_audit_line(line: str) -> bool:
|
|
94
|
+
return AUDIT_LINE_PATTERN.match(line) is not None
|
|
95
|
+
|
|
96
|
+
def _normalize_audit_line(line: str) -> str:
|
|
97
|
+
match = AUDIT_LINE_PATTERN.match(line)
|
|
98
|
+
if match:
|
|
99
|
+
return match.group(1).strip()
|
|
100
|
+
return line.strip()
|
|
101
|
+
|
|
102
|
+
def _line_starts_exported_artifact(line: str) -> bool:
|
|
103
|
+
stripped = line.strip()
|
|
104
|
+
if not stripped:
|
|
105
|
+
return False
|
|
106
|
+
if _line_opens_xml_fence(stripped):
|
|
107
|
+
return True
|
|
108
|
+
exported_artifact_pattern = re.compile(
|
|
109
|
+
r"^<(\?xml\b|prompt\b|runtime_context\b|role\b|background\b|instructions\b|constraints\b|output_format\b|illustrations\b|open_question\b)",
|
|
110
|
+
)
|
|
111
|
+
return exported_artifact_pattern.match(stripped) is not None
|
|
112
|
+
|
|
113
|
+
def _trim_trailing_blank_lines(lines: list[str]) -> list[str]:
|
|
114
|
+
trimmed = list(lines)
|
|
115
|
+
while trimmed and not trimmed[-1].strip():
|
|
116
|
+
trimmed.pop()
|
|
117
|
+
return trimmed
|
|
118
|
+
|
|
119
|
+
def _trim_flattened_export_tail(lines: list[str]) -> list[str]:
|
|
120
|
+
trimmed = _trim_trailing_blank_lines(lines)
|
|
121
|
+
while trimmed and trimmed[-1].lstrip().startswith("✻ "):
|
|
122
|
+
trimmed.pop()
|
|
123
|
+
trimmed = _trim_trailing_blank_lines(trimmed)
|
|
124
|
+
return trimmed
|
|
125
|
+
|
|
126
|
+
def _find_last_audit_index(lines: list[str]) -> int | None:
|
|
127
|
+
last_audit_index: int | None = None
|
|
128
|
+
for index, line in enumerate(lines):
|
|
129
|
+
if _line_is_audit_line(line):
|
|
130
|
+
last_audit_index = index
|
|
131
|
+
return last_audit_index
|
|
132
|
+
|
|
133
|
+
def _find_first_artifact_index(lines: list[str]) -> int | None:
|
|
134
|
+
for index, line in enumerate(lines):
|
|
135
|
+
if _line_starts_exported_artifact(line):
|
|
136
|
+
return index
|
|
137
|
+
return None
|
|
138
|
+
|
|
139
|
+
def _rebuild_from_existing_fence(audit_line: str, artifact_text: str) -> str:
|
|
140
|
+
fenced_body = extract_fenced_xml_content(artifact_text).strip()
|
|
141
|
+
if not fenced_body:
|
|
142
|
+
return audit_line
|
|
143
|
+
return f"{audit_line}\n```xml\n{fenced_body}\n```"
|
|
144
|
+
|
|
145
|
+
def _rebuild_from_flattened_body(audit_line: str, artifact_text: str) -> str:
|
|
146
|
+
dedented_body = textwrap.dedent(artifact_text).strip("\n")
|
|
147
|
+
if not dedented_body:
|
|
148
|
+
return audit_line
|
|
149
|
+
return f"{audit_line}\n```xml\n{dedented_body}\n```"
|
|
150
|
+
|
|
151
|
+
def _rebuild_canonical_export(audit_line: str, artifact_lines: list[str]) -> str:
|
|
152
|
+
if not artifact_lines:
|
|
153
|
+
return audit_line
|
|
154
|
+
artifact_text = "\n".join(artifact_lines).rstrip()
|
|
155
|
+
if _line_opens_xml_fence(artifact_lines[0]):
|
|
156
|
+
return _rebuild_from_existing_fence(audit_line, artifact_text)
|
|
157
|
+
return _rebuild_from_flattened_body(audit_line, artifact_text)
|
|
158
|
+
|
|
159
|
+
def normalize_prompt_workflow_export(text: str) -> str:
|
|
160
|
+
"""Return the last successful Audit + fenced XML pair from a message or export.
|
|
161
|
+
|
|
162
|
+
Saved transcript exports can flatten blocked retry turns and strip the outer
|
|
163
|
+
``xml`` fence. This helper keeps only the last successful ``Audit:`` attempt
|
|
164
|
+
and rebuilds the canonical audit-plus-fence shape used by prompt-workflow
|
|
165
|
+
hooks and reviewers.
|
|
166
|
+
"""
|
|
167
|
+
lines = text.splitlines()
|
|
168
|
+
last_audit_index = _find_last_audit_index(lines)
|
|
169
|
+
if last_audit_index is None:
|
|
170
|
+
return text.strip()
|
|
171
|
+
audit_line = _normalize_audit_line(lines[last_audit_index])
|
|
172
|
+
artifact_index = _find_first_artifact_index(lines[last_audit_index + 1 :])
|
|
173
|
+
if artifact_index is None:
|
|
174
|
+
return audit_line
|
|
175
|
+
artifact_lines = _trim_flattened_export_tail(
|
|
176
|
+
lines[last_audit_index + 1 + artifact_index :],
|
|
177
|
+
)
|
|
178
|
+
return _rebuild_canonical_export(audit_line, artifact_lines)
|
|
179
|
+
|
|
180
|
+
def extract_fenced_xml_content_from_export(text: str) -> str:
|
|
181
|
+
"""Extract fenced XML from a canonical message or flattened transcript export."""
|
|
182
|
+
normalized = normalize_prompt_workflow_export(text)
|
|
183
|
+
return extract_fenced_xml_content(normalized)
|
|
78
184
|
|
|
79
185
|
def missing_required_xml_sections(text: str) -> list[str]:
|
|
80
186
|
fenced_body = extract_fenced_xml_content(text)
|
|
@@ -88,6 +194,30 @@ def missing_required_xml_sections(text: str) -> list[str]:
|
|
|
88
194
|
missing_sections.append(section_name)
|
|
89
195
|
return missing_sections
|
|
90
196
|
|
|
197
|
+
def _build_negative_keyword_violation(
|
|
198
|
+
match: re.Match[str],
|
|
199
|
+
line_number: int,
|
|
200
|
+
line_text: str,
|
|
201
|
+
) -> dict[str, str | int]:
|
|
202
|
+
return {
|
|
203
|
+
"keyword": match.group(),
|
|
204
|
+
"line_number": line_number,
|
|
205
|
+
"line_text": line_text.strip(),
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
def _find_pattern_violations(
|
|
209
|
+
patterns: Iterable[re.Pattern[str]],
|
|
210
|
+
line_text: str,
|
|
211
|
+
line_number: int,
|
|
212
|
+
) -> list[dict[str, str | int]]:
|
|
213
|
+
violations: list[dict[str, str | int]] = []
|
|
214
|
+
for pattern in patterns:
|
|
215
|
+
match = pattern.search(line_text)
|
|
216
|
+
if match:
|
|
217
|
+
violations.append(
|
|
218
|
+
_build_negative_keyword_violation(match, line_number, line_text),
|
|
219
|
+
)
|
|
220
|
+
return violations
|
|
91
221
|
|
|
92
222
|
def find_negative_keywords_in_fenced_xml(
|
|
93
223
|
text: str,
|
|
@@ -95,45 +225,37 @@ def find_negative_keywords_in_fenced_xml(
|
|
|
95
225
|
fenced_content = extract_fenced_xml_content(text)
|
|
96
226
|
if not fenced_content:
|
|
97
227
|
return []
|
|
98
|
-
fenced_lines = fenced_content.splitlines()
|
|
99
228
|
all_violations: list[dict[str, str | int]] = []
|
|
100
|
-
for line_index, each_line in enumerate(
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
"line_text": each_line.strip(),
|
|
116
|
-
})
|
|
229
|
+
for line_index, each_line in enumerate(fenced_content.splitlines(), start=1):
|
|
230
|
+
all_violations.extend(
|
|
231
|
+
_find_pattern_violations(
|
|
232
|
+
COMPILED_NEGATIVE_KEYWORD_PATTERNS,
|
|
233
|
+
each_line,
|
|
234
|
+
line_index,
|
|
235
|
+
),
|
|
236
|
+
)
|
|
237
|
+
all_violations.extend(
|
|
238
|
+
_find_pattern_violations(
|
|
239
|
+
COMPILED_NEGATIVE_INDIRECT_PATTERNS,
|
|
240
|
+
each_line,
|
|
241
|
+
line_index,
|
|
242
|
+
),
|
|
243
|
+
)
|
|
117
244
|
return all_violations
|
|
118
245
|
|
|
119
|
-
|
|
120
246
|
def _contains_any_marker(text: str, markers: Iterable[str]) -> bool:
|
|
121
247
|
lower_text = text.lower()
|
|
122
248
|
return any(marker.lower() in lower_text for marker in markers)
|
|
123
249
|
|
|
124
|
-
|
|
125
250
|
def has_debug_intent(text: str) -> bool:
|
|
126
251
|
return _contains_any_marker(text, DEBUG_INTENT_MARKERS)
|
|
127
252
|
|
|
128
|
-
|
|
129
253
|
def has_internal_object_leak(text: str) -> bool:
|
|
130
254
|
return _contains_any_marker(text, INTERNAL_OBJECT_MARKERS)
|
|
131
255
|
|
|
132
|
-
|
|
133
256
|
def missing_scope_anchors(text: str) -> list[str]:
|
|
134
257
|
return [anchor for anchor in REQUIRED_SCOPE_ANCHORS if anchor not in text]
|
|
135
258
|
|
|
136
|
-
|
|
137
259
|
def find_ambiguous_scope_terms(text: str) -> list[str]:
|
|
138
260
|
if "scope" not in text.lower():
|
|
139
261
|
return []
|
|
@@ -144,16 +266,13 @@ def find_ambiguous_scope_terms(text: str) -> list[str]:
|
|
|
144
266
|
matches.append(term)
|
|
145
267
|
return matches
|
|
146
268
|
|
|
147
|
-
|
|
148
269
|
def has_checklist_container(text: str) -> bool:
|
|
149
270
|
lower_text = text.lower()
|
|
150
271
|
return "checklist_results" in lower_text or "checklist:" in lower_text
|
|
151
272
|
|
|
152
|
-
|
|
153
273
|
def missing_checklist_rows(text: str) -> list[str]:
|
|
154
274
|
return [row for row in REQUIRED_CHECKLIST_ROWS if row not in text]
|
|
155
275
|
|
|
156
|
-
|
|
157
276
|
def is_prompt_workflow_response(text: str) -> bool:
|
|
158
277
|
lower_text = text.lower()
|
|
159
278
|
matched_markers = [
|
|
@@ -161,7 +280,6 @@ def is_prompt_workflow_response(text: str) -> bool:
|
|
|
161
280
|
]
|
|
162
281
|
return len(matched_markers) >= 2
|
|
163
282
|
|
|
164
|
-
|
|
165
283
|
def missing_context_control_signals(text: str) -> list[str]:
|
|
166
284
|
required_signals: tuple[str, ...] = (
|
|
167
285
|
"base_minimal_instruction_layer: true",
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from prompt_workflow_gate_core import (
|
|
4
4
|
extract_fenced_xml_content,
|
|
5
|
+
extract_fenced_xml_content_from_export,
|
|
5
6
|
find_ambiguous_scope_terms,
|
|
6
7
|
has_checklist_container,
|
|
7
8
|
has_internal_object_leak,
|
|
@@ -10,31 +11,27 @@ from prompt_workflow_gate_core import (
|
|
|
10
11
|
missing_checklist_rows,
|
|
11
12
|
missing_required_xml_sections,
|
|
12
13
|
missing_scope_anchors,
|
|
14
|
+
normalize_prompt_workflow_export,
|
|
13
15
|
)
|
|
14
16
|
|
|
15
|
-
|
|
16
17
|
def test_internal_object_leak_detected() -> None:
|
|
17
18
|
text = '{"pipeline_mode": "internal_section_refinement_with_final_audit"}'
|
|
18
19
|
assert has_internal_object_leak(text)
|
|
19
20
|
|
|
20
|
-
|
|
21
21
|
def test_missing_scope_anchors_returns_expected_rows() -> None:
|
|
22
22
|
text = "target_local_roots only."
|
|
23
23
|
missing = missing_scope_anchors(text)
|
|
24
24
|
assert "target_canonical_roots" in missing
|
|
25
25
|
assert "completion_boundary" in missing
|
|
26
26
|
|
|
27
|
-
|
|
28
27
|
def test_missing_checklist_rows_detected() -> None:
|
|
29
28
|
text = "checklist_results: structured_scoped_instructions only"
|
|
30
29
|
missing = missing_checklist_rows(text)
|
|
31
30
|
assert "completion_boundary_measurable" in missing
|
|
32
31
|
|
|
33
|
-
|
|
34
32
|
def test_checklist_container_detection() -> None:
|
|
35
33
|
assert has_checklist_container("checklist_results:\n- structured_scoped_instructions")
|
|
36
34
|
|
|
37
|
-
|
|
38
35
|
def test_prompt_workflow_response_detection() -> None:
|
|
39
36
|
message = (
|
|
40
37
|
"overall_status: pass\n"
|
|
@@ -43,22 +40,36 @@ def test_prompt_workflow_response_detection() -> None:
|
|
|
43
40
|
)
|
|
44
41
|
assert is_prompt_workflow_response(message)
|
|
45
42
|
|
|
46
|
-
|
|
47
43
|
def test_missing_context_control_signals_detected() -> None:
|
|
48
44
|
missing = missing_context_control_signals("base_minimal_instruction_layer: true")
|
|
49
45
|
assert "on_demand_skill_loading: true" in missing
|
|
50
46
|
|
|
51
|
-
|
|
52
47
|
def test_ambiguous_scope_terms_detected() -> None:
|
|
53
48
|
text = "Scope applies to this session and current files."
|
|
54
49
|
terms = find_ambiguous_scope_terms(text)
|
|
55
50
|
assert "this session" in terms
|
|
56
51
|
assert "current files" in terms
|
|
57
52
|
|
|
58
|
-
|
|
59
53
|
def _fenced_xml(body: str) -> str:
|
|
60
54
|
return f"```xml\n{body}\n```"
|
|
61
55
|
|
|
56
|
+
def _runtime_context_lines() -> tuple[str, ...]:
|
|
57
|
+
return (
|
|
58
|
+
"<runtime_context>",
|
|
59
|
+
"base_minimal_instruction_layer: true",
|
|
60
|
+
"on_demand_skill_loading: true",
|
|
61
|
+
"</runtime_context>",
|
|
62
|
+
"",
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
def _flattened_transcript(*lines: str) -> str:
|
|
66
|
+
return "\n".join(lines) + "\n"
|
|
67
|
+
|
|
68
|
+
def _flattened_attempt(*body_lines: str, audit_line: str = "Audit: pass 15/15") -> str:
|
|
69
|
+
flattened_lines = [audit_line, ""]
|
|
70
|
+
for line in body_lines:
|
|
71
|
+
flattened_lines.append(f" {line}" if line else "")
|
|
72
|
+
return "\n".join(flattened_lines)
|
|
62
73
|
|
|
63
74
|
def test_missing_required_xml_sections_all_present_returns_empty() -> None:
|
|
64
75
|
body = (
|
|
@@ -70,7 +81,6 @@ def test_missing_required_xml_sections_all_present_returns_empty() -> None:
|
|
|
70
81
|
)
|
|
71
82
|
assert missing_required_xml_sections(_fenced_xml(body)) == []
|
|
72
83
|
|
|
73
|
-
|
|
74
84
|
def test_missing_required_xml_sections_missing_background() -> None:
|
|
75
85
|
body = (
|
|
76
86
|
"<role>R.</role>\n"
|
|
@@ -80,7 +90,6 @@ def test_missing_required_xml_sections_missing_background() -> None:
|
|
|
80
90
|
)
|
|
81
91
|
assert missing_required_xml_sections(_fenced_xml(body)) == ["background"]
|
|
82
92
|
|
|
83
|
-
|
|
84
93
|
def test_missing_required_xml_sections_missing_role_and_output_format() -> None:
|
|
85
94
|
body = (
|
|
86
95
|
"<background>C.</background>\n"
|
|
@@ -90,11 +99,9 @@ def test_missing_required_xml_sections_missing_role_and_output_format() -> None:
|
|
|
90
99
|
missing = missing_required_xml_sections(_fenced_xml(body))
|
|
91
100
|
assert missing == ["role", "output_format"]
|
|
92
101
|
|
|
93
|
-
|
|
94
102
|
def test_missing_required_xml_sections_no_fence_returns_empty() -> None:
|
|
95
103
|
assert missing_required_xml_sections("no fenced xml here") == []
|
|
96
104
|
|
|
97
|
-
|
|
98
105
|
def test_missing_required_xml_sections_prose_without_tags_counts_as_missing() -> None:
|
|
99
106
|
body = (
|
|
100
107
|
"<role>R.</role>\n"
|
|
@@ -105,7 +112,6 @@ def test_missing_required_xml_sections_prose_without_tags_counts_as_missing() ->
|
|
|
105
112
|
)
|
|
106
113
|
assert missing_required_xml_sections(_fenced_xml(body)) == ["background"]
|
|
107
114
|
|
|
108
|
-
|
|
109
115
|
def test_extract_fenced_xml_preserves_content_after_nested_inner_fence() -> None:
|
|
110
116
|
message = (
|
|
111
117
|
"```xml\n"
|
|
@@ -122,3 +128,68 @@ def test_extract_fenced_xml_preserves_content_after_nested_inner_fence() -> None
|
|
|
122
128
|
extracted = extract_fenced_xml_content(message)
|
|
123
129
|
assert "</illustrations>" in extracted
|
|
124
130
|
assert "<background>B</background>" in extracted
|
|
131
|
+
|
|
132
|
+
def test_normalize_prompt_workflow_export_rebuilds_fence_from_flattened_transcript() -> None:
|
|
133
|
+
transcript = _flattened_transcript(
|
|
134
|
+
_flattened_attempt(
|
|
135
|
+
*_runtime_context_lines(),
|
|
136
|
+
"<role>R</role>",
|
|
137
|
+
"<background>B</background>",
|
|
138
|
+
"<instructions>I</instructions>",
|
|
139
|
+
"<constraints>C</constraints>",
|
|
140
|
+
"<output_format>O</output_format>",
|
|
141
|
+
"✻ Worked for 1m 7s",
|
|
142
|
+
audit_line="● Audit: pass 15/15",
|
|
143
|
+
),
|
|
144
|
+
)
|
|
145
|
+
normalized = normalize_prompt_workflow_export(transcript)
|
|
146
|
+
assert normalized.startswith("Audit: pass 15/15\n```xml\n")
|
|
147
|
+
assert normalized.endswith("\n```")
|
|
148
|
+
assert "<runtime_context>" in normalized
|
|
149
|
+
assert "✻ Worked for 1m 7s" not in normalized
|
|
150
|
+
|
|
151
|
+
def test_normalize_prompt_workflow_export_uses_last_audit_attempt() -> None:
|
|
152
|
+
first_attempt = _flattened_attempt(
|
|
153
|
+
"<role>FIRST</role>",
|
|
154
|
+
"<background>Old</background>",
|
|
155
|
+
"<instructions>Old</instructions>",
|
|
156
|
+
"<constraints>Old</constraints>",
|
|
157
|
+
"<output_format>Old</output_format>",
|
|
158
|
+
audit_line="● Audit: pass 15/15",
|
|
159
|
+
)
|
|
160
|
+
second_attempt = _flattened_attempt(
|
|
161
|
+
*_runtime_context_lines(),
|
|
162
|
+
"<role>FINAL</role>",
|
|
163
|
+
"<background>Fresh</background>",
|
|
164
|
+
"<instructions>I</instructions>",
|
|
165
|
+
"<constraints>C</constraints>",
|
|
166
|
+
"<output_format>O</output_format>",
|
|
167
|
+
"✻ Worked for 2m 8s",
|
|
168
|
+
)
|
|
169
|
+
transcript = _flattened_transcript(
|
|
170
|
+
first_attempt,
|
|
171
|
+
"",
|
|
172
|
+
"● Re-emitting the full artifact with the runtime signals added.",
|
|
173
|
+
"",
|
|
174
|
+
second_attempt,
|
|
175
|
+
)
|
|
176
|
+
normalized = normalize_prompt_workflow_export(transcript)
|
|
177
|
+
assert "<role>FINAL</role>" in normalized
|
|
178
|
+
assert "<role>FIRST</role>" not in normalized
|
|
179
|
+
|
|
180
|
+
def test_extract_fenced_xml_content_from_export_supports_flattened_transcript() -> None:
|
|
181
|
+
transcript = _flattened_transcript(
|
|
182
|
+
_flattened_attempt(
|
|
183
|
+
"<role>R</role>",
|
|
184
|
+
"<background>B</background>",
|
|
185
|
+
"<instructions>I</instructions>",
|
|
186
|
+
"<constraints>C</constraints>",
|
|
187
|
+
"<output_format>O</output_format>",
|
|
188
|
+
"✻ Worked for 31s",
|
|
189
|
+
audit_line="● Audit: pass 15/15",
|
|
190
|
+
),
|
|
191
|
+
)
|
|
192
|
+
extracted = extract_fenced_xml_content_from_export(transcript)
|
|
193
|
+
assert extracted.startswith("<role>R</role>")
|
|
194
|
+
assert "<output_format>O</output_format>" in extracted
|
|
195
|
+
assert "Worked for" not in extracted
|
package/package.json
CHANGED
|
@@ -72,10 +72,10 @@
|
|
|
72
72
|
"prompt": "/prompt-generator Create a prompt for an agent that traces a routing bug across shared_utils/export_handler.py, orchestrator.py, and download_manager.py — find where extract_apk is called and whether it handles APK signature check failures",
|
|
73
73
|
"files": ["packages/samsung-automation/shared_utils/export_handler.py"],
|
|
74
74
|
"expected_behavior": [
|
|
75
|
-
"No tool_use blocks appear after the first fence marker of the prompt artifact",
|
|
75
|
+
"No tool_use blocks appear after the first fence marker of the canonical prompt artifact",
|
|
76
76
|
"All Glob/Grep discovery calls precede the AskUserQuestion",
|
|
77
77
|
"All AskUserQuestion interactions precede the fenced block",
|
|
78
|
-
"
|
|
78
|
+
"Review the last successful Audit + fenced xml pair; blocked retry attempts preserved by flattened transcript exports do not count as additional delivered artifacts"
|
|
79
79
|
]
|
|
80
80
|
},
|
|
81
81
|
{
|
|
@@ -85,7 +85,7 @@
|
|
|
85
85
|
"prompt": "/prompt-generator Write a detailed agent-harness prompt for a TDD bug-fix workflow that traces a routing error across 5+ files, with state management for multi-window execution and structured test tracking",
|
|
86
86
|
"files": [],
|
|
87
87
|
"expected_behavior": [
|
|
88
|
-
"
|
|
88
|
+
"The canonical prompt artifact has one opening xml fence and one matching closing fence; flattened transcript exports are normalized to that same boundary before review",
|
|
89
89
|
"Every XML tag properly opened and closed",
|
|
90
90
|
"No truncation at numbered-list bullets (the Issue #41 failure mode)",
|
|
91
91
|
"No mid-sentence cuts or incomplete sections",
|
|
@@ -102,8 +102,8 @@
|
|
|
102
102
|
"Discovery tool calls attempt to locate scoring logic before prompt generation",
|
|
103
103
|
"If resolved: prompt references concrete file paths from discovery",
|
|
104
104
|
"If unresolved: prompt contains <open_question> in <background> for downstream agent",
|
|
105
|
-
"No re-entry to discovery after
|
|
106
|
-
"AskUserQuestion may surface the uncertainty if discovery was inconclusive"
|
|
105
|
+
"No re-entry to discovery after the canonical artifact fence starts",
|
|
106
|
+
"AskUserQuestion may surface the uncertainty if discovery was inconclusive; when discovery resolves concrete paths before the artifact, absence of <open_question> is expected"
|
|
107
107
|
]
|
|
108
108
|
},
|
|
109
109
|
{
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: skill-builder
|
|
3
|
+
description: >-
|
|
4
|
+
Orchestrates the complete skill-building lifecycle using evaluation-driven
|
|
5
|
+
development. Routes through gap analysis, eval creation, skill writing (via
|
|
6
|
+
skill-writer), subagent testing (via skill-creator infrastructure), and
|
|
7
|
+
iterative refinement. Use when creating new skills, improving existing skills,
|
|
8
|
+
or optimizing skill descriptions. Triggers: 'build a skill', 'new skill
|
|
9
|
+
workflow', 'improve this skill', 'optimize skill description', 'skill
|
|
10
|
+
development lifecycle'.
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
@${CLAUDE_SKILL_DIR}/references/eval-driven-flow.md
|
|
14
|
+
|
|
15
|
+
# Skill Builder
|
|
16
|
+
|
|
17
|
+
**Core principle:** Evaluation-driven development. Build evals BEFORE writing extensive documentation. This ensures skills solve real problems rather than documenting imagined ones.
|
|
18
|
+
|
|
19
|
+
Source: [Anthropic Skill Best Practices - Evaluation and Iteration](https://platform.claude.com/docs/en/agents-and-tools/agent-skills/best-practices#evaluation-and-iteration)
|
|
20
|
+
|
|
21
|
+
## When this skill applies
|
|
22
|
+
|
|
23
|
+
Trigger for requests to **build**, **improve**, or **polish** a skill through the full evaluation-driven lifecycle. This skill orchestrates the process -- it delegates writing to `/skill-writer` and evaluation infrastructure to the `skill-creator` plugin.
|
|
24
|
+
|
|
25
|
+
For quick skill syntax questions or one-off SKILL.md edits, use `/skill-writer` directly instead.
|
|
26
|
+
|
|
27
|
+
## Routing
|
|
28
|
+
|
|
29
|
+
Assess the user's intent from conversation context and existing artifacts. Route directly:
|
|
30
|
+
|
|
31
|
+
**Creating a new skill?**
|
|
32
|
+
Read `${CLAUDE_SKILL_DIR}/workflows/new-skill.md` and follow it.
|
|
33
|
+
|
|
34
|
+
**Improving an existing skill?**
|
|
35
|
+
Read `${CLAUDE_SKILL_DIR}/workflows/improve-skill.md` and follow it.
|
|
36
|
+
|
|
37
|
+
**Final polish only (description optimization, trigger eval)?**
|
|
38
|
+
Read `${CLAUDE_SKILL_DIR}/workflows/polish-skill.md` and follow it.
|
|
39
|
+
|
|
40
|
+
**Ambiguous?** Ask: "Are you creating a new skill, improving an existing one, or doing a final polish pass?"
|
|
41
|
+
|
|
42
|
+
## The Claude A / Claude B Pattern
|
|
43
|
+
|
|
44
|
+
You and the user are **Claude A** -- the expert who designs and refines the skill. Subagents running the built skill on eval tasks are **Claude B** -- the agent using the skill to perform real work.
|
|
45
|
+
|
|
46
|
+
> "Work with one instance of Claude ('Claude A') to create a Skill that is used by other instances ('Claude B'). Claude A helps you design and refine instructions, while Claude B tests them in real tasks."
|
|
47
|
+
|
|
48
|
+
The feedback loop: observe Claude B's behavior, bring insights back, refine the skill, test again.
|
|
49
|
+
|
|
50
|
+
## Phase Overview
|
|
51
|
+
|
|
52
|
+
| Phase | Purpose | Delegated To |
|
|
53
|
+
|-------|---------|-------------|
|
|
54
|
+
| 1. Identify gaps | Document what fails without the skill | This skill (guided conversation) |
|
|
55
|
+
| 2. Build evals | Create 3+ scenarios testing the gaps | This skill (templates + user input) |
|
|
56
|
+
| 3. Write skill | Minimal instructions addressing gaps | `/skill-writer` |
|
|
57
|
+
| 4. Test | Subagent runs with/without skill, grade, benchmark | `skill-creator` eval infrastructure |
|
|
58
|
+
| 5. Iterate | Review results, refine, re-test | This skill + `/skill-writer` + Phase 4 |
|
|
59
|
+
| 6. Polish | Description optimization, trigger eval, final check | `skill-creator` description optimizer |
|
|
60
|
+
|
|
61
|
+
## Principles (apply across all phases)
|
|
62
|
+
|
|
63
|
+
1. **Evals before documentation.** Never write extensive skill content without evaluation scenarios to validate it.
|
|
64
|
+
|
|
65
|
+
2. **Minimal instructions first.** Write just enough to pass evaluations. Resist the urge to over-document.
|
|
66
|
+
|
|
67
|
+
3. **Generalize from feedback.** The skill will be used across many prompts. Do not overfit to test cases.
|
|
68
|
+
|
|
69
|
+
4. **Explain the why.** Theory of mind beats rigid rules. Help the model understand reasoning, not just constraints.
|
|
70
|
+
|
|
71
|
+
5. **Observe, do not assume.** Iterate based on what Claude B actually does, not what you think it should do.
|
|
72
|
+
|
|
73
|
+
## Delegation Details
|
|
74
|
+
|
|
75
|
+
See `${CLAUDE_SKILL_DIR}/references/delegation-map.md` for exact invocation patterns and integration points between this orchestrator, `/skill-writer`, and `skill-creator`.
|
|
76
|
+
|
|
77
|
+
## File Index
|
|
78
|
+
|
|
79
|
+
| File | Purpose |
|
|
80
|
+
|------|---------|
|
|
81
|
+
| `workflows/new-skill.md` | Full lifecycle for new skills (6 phases) |
|
|
82
|
+
| `workflows/improve-skill.md` | Observation-first flow for existing skills |
|
|
83
|
+
| `workflows/polish-skill.md` | Description optimization and final validation |
|
|
84
|
+
| `references/eval-driven-flow.md` | Official Anthropic methodology with citations |
|
|
85
|
+
| `references/delegation-map.md` | Integration map for skill-writer and skill-creator |
|
|
86
|
+
| `templates/gap-analysis.md` | Template for Phase 1 gap documentation |
|
|
87
|
+
| `templates/eval-scenario.json` | Eval template matching skill-creator schema |
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
# Delegation Map
|
|
2
|
+
|
|
3
|
+
How the skill-builder orchestrator integrates with external skills at each phase.
|
|
4
|
+
|
|
5
|
+
## Phase 1: Identify Gaps -- This Orchestrator
|
|
6
|
+
|
|
7
|
+
No external delegation. The orchestrator guides a conversation with the user to document what fails without a skill.
|
|
8
|
+
|
|
9
|
+
**Output:** `[skill-name]-workspace/gap-analysis.md` using the template at `templates/gap-analysis.md`.
|
|
10
|
+
|
|
11
|
+
## Phase 2: Build Evals -- This Orchestrator
|
|
12
|
+
|
|
13
|
+
No external delegation. The orchestrator helps the user transform gaps into eval scenarios.
|
|
14
|
+
|
|
15
|
+
**Output:** `[skill-name]-workspace/evals/evals.json` using the template at `templates/eval-scenario.json`.
|
|
16
|
+
|
|
17
|
+
**Baseline runs:** Spawn subagents WITHOUT any skill for each eval scenario. These run as background Agent tasks.
|
|
18
|
+
|
|
19
|
+
## Phase 3: Write Skill -- Delegate to `/skill-writer`
|
|
20
|
+
|
|
21
|
+
Invoke `/skill-writer` with the following context in your prompt:
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
Create a skill based on this gap analysis and eval scenarios.
|
|
25
|
+
|
|
26
|
+
Gap analysis: [paste or reference gap-analysis.md]
|
|
27
|
+
Eval scenarios: [paste or reference evals.json expected_output fields]
|
|
28
|
+
Baseline failures: [summarize what Claude got wrong without the skill]
|
|
29
|
+
|
|
30
|
+
Constraint: Write the minimum instructions needed to address these specific gaps.
|
|
31
|
+
Do not over-document. Every line must serve a documented gap.
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
`/skill-writer` handles: type classification, degree of freedom, frontmatter, body structure, progressive disclosure, self-check.
|
|
35
|
+
|
|
36
|
+
**Output:** The skill's SKILL.md (and optional REFERENCE.md, scripts, etc.)
|
|
37
|
+
|
|
38
|
+
## Phase 4: Test -- Delegate to skill-creator Infrastructure
|
|
39
|
+
|
|
40
|
+
The skill-creator plugin provides the eval infrastructure. Reference its components directly:
|
|
41
|
+
|
|
42
|
+
### Spawning test runs
|
|
43
|
+
|
|
44
|
+
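For each eval, spawn the with-skill subagent below, launching the runs for all evals in the SAME turn (parallel). The without-skill baseline is reused rather than re-run: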
|
|
45
|
+
|
|
46
|
+
**With-skill subagent:**
|
|
47
|
+
```
|
|
48
|
+
Execute this task:
|
|
49
|
+
- Read the skill at [path-to-skill]/SKILL.md and follow its instructions
|
|
50
|
+
- Task: [eval prompt from evals.json]
|
|
51
|
+
- Input files: [eval files if any]
|
|
52
|
+
- Save all output files to: [workspace]/iteration-N/eval-[name]/with_skill/outputs/
|
|
53
|
+
- Save a transcript of your complete work to: [workspace]/iteration-N/eval-[name]/with_skill/transcript.md
|
|
54
|
+
- At the end, write a metrics.json with tool call counts and file list
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
**Without-skill subagent** (baseline):
|
|
58
|
+
For iteration-1, reuse the baseline results from Phase 2 (iteration-0) instead of re-running them. For later iterations, keep comparing against that same iteration-0 baseline.
|
|
59
|
+
|
|
60
|
+
### Grading
|
|
61
|
+
|
|
62
|
+
Read the grading agent instructions from the skill-creator plugin:
|
|
63
|
+
`[skill-creator-plugin-path]/agents/grader.md`
|
|
64
|
+
|
|
65
|
+
Spawn a grader subagent for each run with:
|
|
66
|
+
- The expectations from evals.json
|
|
67
|
+
- The transcript path
|
|
68
|
+
- The outputs directory
|
|
69
|
+
|
|
70
|
+
**Output:** `grading.json` in each run directory.
|
|
71
|
+
|
|
72
|
+
### Benchmarking
|
|
73
|
+
|
|
74
|
+
Run the aggregation script from the skill-creator plugin directory:
|
|
75
|
+
```bash
|
|
76
|
+
cd [skill-creator-plugin-path] && python -m scripts.aggregate_benchmark [workspace]/iteration-N --skill-name [name]
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**Output:** `benchmark.json` and `benchmark.md` in the iteration directory.
|
|
80
|
+
|
|
81
|
+
### Eval Viewer
|
|
82
|
+
|
|
83
|
+
Launch the viewer from the skill-creator plugin:
|
|
84
|
+
```bash
|
|
85
|
+
python [skill-creator-plugin-path]/eval-viewer/generate_review.py \
|
|
86
|
+
[workspace]/iteration-N \
|
|
87
|
+
--skill-name "[name]" \
|
|
88
|
+
--benchmark [workspace]/iteration-N/benchmark.json
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
For iteration 2+, add: `--previous-workspace [workspace]/iteration-[N-1]`
|
|
92
|
+
|
|
93
|
+
If no browser/display is available, add: `--static [workspace]/iteration-N/review.html`
|
|
94
|
+
|
|
95
|
+
**Output:** Browser-based reviewer where the user inspects outputs and leaves feedback.
|
|
96
|
+
|
|
97
|
+
### Finding the skill-creator plugin path
|
|
98
|
+
|
|
99
|
+
The skill-creator plugin is installed at a path like:
|
|
100
|
+
`~/.claude/plugins/marketplaces/claude-plugins-official/plugins/skill-creator/skills/skill-creator/`
|
|
101
|
+
|
|
102
|
+
To find it dynamically, search for the skill-creator SKILL.md:
|
|
103
|
+
```bash
|
|
104
|
+
find ~/.claude/plugins -name "SKILL.md" -path "*/skill-creator/*" 2>/dev/null | head -1
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Then derive the plugin path from that result: `[skill-creator-plugin-path]` is the directory that contains the matched SKILL.md.
|
|
108
|
+
|
|
109
|
+
## Phase 5: Iterate -- This Orchestrator + `/skill-writer`
|
|
110
|
+
|
|
111
|
+
The orchestrator reads feedback.json and transcripts, synthesizes observations, then delegates refinement to `/skill-writer`:
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
Refine this existing skill based on these observations from testing.
|
|
115
|
+
|
|
116
|
+
Current SKILL.md: [paste or reference]
|
|
117
|
+
User feedback: [from feedback.json]
|
|
118
|
+
Behavioral observations: [from transcript analysis]
|
|
119
|
+
|
|
120
|
+
Specific issues to address:
|
|
121
|
+
1. [Issue from feedback]
|
|
122
|
+
2. [Issue from observation]
|
|
123
|
+
|
|
124
|
+
Constraint: Only change what the feedback demands. Do not reorganize working content.
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Then return to Phase 4 with the refined skill.
|
|
128
|
+
|
|
129
|
+
## Phase 6: Polish -- Delegate to skill-creator Description Optimizer
|
|
130
|
+
|
|
131
|
+
The skill-creator plugin includes a description optimization loop:
|
|
132
|
+
|
|
133
|
+
### Trigger eval generation
|
|
134
|
+
|
|
135
|
+
Generate 20 realistic eval queries (10 should-trigger, 10 should-not-trigger). Use the HTML review template from:
|
|
136
|
+
`[skill-creator-plugin-path]/assets/eval_review.html`
|
|
137
|
+
|
|
138
|
+
### Optimization loop
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
cd [skill-creator-plugin-path] && python -m scripts.run_loop \
|
|
142
|
+
--eval-set [path-to-trigger-eval.json] \
|
|
143
|
+
--skill-path [path-to-skill] \
|
|
144
|
+
--model [current-model-id] \
|
|
145
|
+
--max-iterations 5 \
|
|
146
|
+
--verbose
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Final validation
|
|
150
|
+
|
|
151
|
+
Run the skill-writer self-check rubric (from skill-writer's Step 9) against the finished skill. All items must pass.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Gap Analysis: [Skill Name]
|
|
2
|
+
|
|
3
|
+
## Task Description
|
|
4
|
+
|
|
5
|
+
[What the user is trying to accomplish -- the capability this skill should provide]
|
|
6
|
+
|
|
7
|
+
## Gaps Identified
|
|
8
|
+
|
|
9
|
+
### Gap 1: [Descriptive Name]
|
|
10
|
+
|
|
11
|
+
- **What happened:** [Description of the failure or missing context when working without a skill]
|
|
12
|
+
- **What was needed:** [The specific context, instruction, or knowledge that would fix it]
|
|
13
|
+
- **Frequency:** [How often this comes up in real usage]
|
|
14
|
+
- **Example task:** [A concrete task that exposes this gap]
|
|
15
|
+
|
|
16
|
+
### Gap 2: [Descriptive Name]
|
|
17
|
+
|
|
18
|
+
- **What happened:** [Description]
|
|
19
|
+
- **What was needed:** [Context/instruction needed]
|
|
20
|
+
- **Frequency:** [Frequency]
|
|
21
|
+
- **Example task:** [Concrete example]
|
|
22
|
+
|
|
23
|
+
### Gap 3: [Descriptive Name]
|
|
24
|
+
|
|
25
|
+
- **What happened:** [Description]
|
|
26
|
+
- **What was needed:** [Context/instruction needed]
|
|
27
|
+
- **Frequency:** [Frequency]
|
|
28
|
+
- **Example task:** [Concrete example]
|
|
29
|
+
|
|
30
|
+
## Patterns
|
|
31
|
+
|
|
32
|
+
- [Recurring themes across gaps -- e.g., "Claude consistently lacks knowledge about X"]
|
|
33
|
+
- [Common failure modes -- e.g., "Without guidance, Claude chooses library A when library B is required"]
|
|
34
|
+
- [Context that was repeatedly provided manually]
|
|
35
|
+
|
|
36
|
+
## Candidate Eval Scenarios
|
|
37
|
+
|
|
38
|
+
- [Task that would expose Gap 1 -- becomes the seed for an eval]
|
|
39
|
+
- [Task that would expose Gap 2]
|
|
40
|
+
- [Task that would expose multiple gaps simultaneously]
|
|
41
|
+
- [Edge case that tests boundary behavior]
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# Improve Skill Workflow
|
|
2
|
+
|
|
3
|
+
Observation-first flow for iterating on an existing skill.
|
|
4
|
+
|
|
5
|
+
## Prerequisites
|
|
6
|
+
|
|
7
|
+
- An existing skill that needs improvement
|
|
8
|
+
- The skill has been used at least once (or the user has observed specific issues)
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Phase 1: Observe
|
|
13
|
+
|
|
14
|
+
**Goal:** Document the existing skill's current behavior by running it on real tasks.
|
|
15
|
+
|
|
16
|
+
> "Use the Skill in real workflows: Give Claude B (with the Skill loaded) actual tasks, not test scenarios"
|
|
17
|
+
|
|
18
|
+
### Process
|
|
19
|
+
|
|
20
|
+
1. Identify the skill to improve. Read its current SKILL.md and any reference files.
|
|
21
|
+
|
|
22
|
+
2. Ask the user what prompted the improvement:
|
|
23
|
+
- "What specific issue did you observe?"
|
|
24
|
+
- "Can you give me a concrete task where the skill underperformed?"
|
|
25
|
+
- "Is this a triggering issue (skill does not activate), a quality issue (skill activates but produces poor results), or a scope issue (skill does the wrong thing)?"
|
|
26
|
+
|
|
27
|
+
3. Run the existing skill on 2-3 real tasks. For each, spawn a subagent:
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
Execute this task using the skill at [path-to-existing-skill]:
|
|
31
|
+
- Read the skill at [path]/SKILL.md and follow its instructions
|
|
32
|
+
- Task: [realistic task from user]
|
|
33
|
+
- Save outputs to: [skill-name]-workspace/observation/task-[N]/outputs/
|
|
34
|
+
- Save transcript to: [skill-name]-workspace/observation/task-[N]/transcript.md
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
4. Analyze the transcripts. Document observations:
|
|
38
|
+
- Where did the skill work well?
|
|
39
|
+
- Where did it fail or produce subpar results?
|
|
40
|
+
- Did Claude B follow the skill's instructions as written?
|
|
41
|
+
- Did Claude B ignore any sections or files?
|
|
42
|
+
- Did Claude B explore in unexpected directions?
|
|
43
|
+
|
|
44
|
+
5. Generate a gap analysis (same template as new-skill Phase 1) focused on the delta between current behavior and desired behavior.
|
|
45
|
+
|
|
46
|
+
**Output:** `[skill-name]-workspace/gap-analysis.md` with observation-based gaps
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Phases 2-6: Follow the New Skill Workflow
|
|
51
|
+
|
|
52
|
+
From here, follow the same phases as `${CLAUDE_SKILL_DIR}/workflows/new-skill.md`, starting at Phase 2 (Build Evals).
|
|
53
|
+
|
|
54
|
+
Key differences from the new-skill flow:
|
|
55
|
+
|
|
56
|
+
- **Phase 2 (Build Evals):** Evals should test the specific issues observed in Phase 1, not hypothetical gaps.
|
|
57
|
+
|
|
58
|
+
- **Phase 3 (Write Skill):** Instead of writing from scratch, invoke `/skill-writer` with:
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
Refine this existing skill based on observation findings.
|
|
62
|
+
|
|
63
|
+
Current SKILL.md: [reference or paste current skill]
|
|
64
|
+
Gap analysis: [reference observation-based gaps]
|
|
65
|
+
Eval scenarios: [reference evals]
|
|
66
|
+
|
|
67
|
+
Constraint: Preserve what works. Only change what the observations demand.
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
- **Phase 4 (Test):** The baseline is the CURRENT skill (snapshot it before editing). Compare old-skill vs new-skill, not with-skill vs without-skill.
|
|
71
|
+
|
|
72
|
+
Before making any changes, snapshot the existing skill:
|
|
73
|
+
```bash
|
|
74
|
+
cp -r [skill-path] [workspace]/skill-snapshot/
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Then for baseline runs, point subagents at the snapshot:
|
|
78
|
+
```
|
|
79
|
+
Execute this task using the ORIGINAL skill at [workspace]/skill-snapshot/:
|
|
80
|
+
- Read the skill and follow its instructions
|
|
81
|
+
- Task: [eval prompt]
|
|
82
|
+
- Save outputs to: [workspace]/iteration-N/eval-[name]/old_skill/outputs/
|
|
83
|
+
- Save transcript to: [workspace]/iteration-N/eval-[name]/old_skill/transcript.md
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
- **Phase 5 (Iterate):** Same process. The improvement loop compares the new version against the snapshot.
|
|
87
|
+
|
|
88
|
+
- **Phase 6 (Polish):** Same process. Run description optimization if triggering was an issue.
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# New Skill Workflow
|
|
2
|
+
|
|
3
|
+
Full evaluation-driven lifecycle for building a new skill from scratch.
|
|
4
|
+
|
|
5
|
+
## Prerequisites
|
|
6
|
+
|
|
7
|
+
- The user has a task or domain they want to capture as a skill
|
|
8
|
+
- No existing skill for this capability (or intentionally starting fresh)
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Phase 1: Identify Gaps
|
|
13
|
+
|
|
14
|
+
**Goal:** Document what fails or requires repeated context when working without a skill.
|
|
15
|
+
|
|
16
|
+
### Process
|
|
17
|
+
|
|
18
|
+
1. Have a guided conversation to uncover gaps. Explore these areas:
|
|
19
|
+
- "What task were you doing when you realized you needed a skill?"
|
|
20
|
+
- "What context did you repeatedly provide to Claude?"
|
|
21
|
+
- "Where did Claude fail or produce subpar results without guidance?"
|
|
22
|
+
- "What domain knowledge was missing?"
|
|
23
|
+
- "What specific format or structure did you need?"
|
|
24
|
+
- "Were there tools or scripts that needed to be used in a particular way?"
|
|
25
|
+
- "What rules or constraints did Claude violate?"
|
|
26
|
+
|
|
27
|
+
2. As patterns emerge, probe for eval-worthy scenarios:
|
|
28
|
+
- "Can you give me a concrete example of a task where this failed?"
|
|
29
|
+
- "What would success look like for that specific task?"
|
|
30
|
+
- "Are there edge cases where the right approach changes?"
|
|
31
|
+
|
|
32
|
+
3. Generate `gap-analysis.md` from the conversation using the template at `${CLAUDE_SKILL_DIR}/templates/gap-analysis.md`. Fill in all sections from what was discussed.
|
|
33
|
+
|
|
34
|
+
4. Review the gap analysis with the user. Confirm completeness before moving to Phase 2.
|
|
35
|
+
|
|
36
|
+
**Output:** `[skill-name]-workspace/gap-analysis.md`
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Phase 2: Build Evals
|
|
41
|
+
|
|
42
|
+
**Goal:** Create 3+ evaluation scenarios that test the identified gaps. Establish a baseline.
|
|
43
|
+
|
|
44
|
+
### Process
|
|
45
|
+
|
|
46
|
+
1. Transform each gap into at least one eval scenario. Each scenario needs:
|
|
47
|
+
- A realistic user prompt (detailed and specific, like a real request)
|
|
48
|
+
- A description of what success looks like
|
|
49
|
+
- Objectively verifiable expectations (assertions)
|
|
50
|
+
|
|
51
|
+
2. Draft evals using the schema at `${CLAUDE_SKILL_DIR}/templates/eval-scenario.json`. Ensure:
|
|
52
|
+
- Minimum 3 scenarios (official requirement)
|
|
53
|
+
- Every identified gap has at least one scenario testing it
|
|
54
|
+
- Expectations are objectively verifiable, not subjective
|
|
55
|
+
- Prompts sound like things a real user would say
|
|
56
|
+
|
|
57
|
+
3. Review eval scenarios with the user. Adjust until both sides are satisfied.
|
|
58
|
+
|
|
59
|
+
4. Save to `[skill-name]-workspace/evals/evals.json`.
|
|
60
|
+
|
|
61
|
+
5. **Establish baseline.** For each eval, spawn a subagent WITHOUT any skill:
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
Execute this task with NO skill loaded:
|
|
65
|
+
- Task: [eval prompt]
|
|
66
|
+
- Input files: [eval files if any, or "none"]
|
|
67
|
+
- Save all output files to: [workspace]/iteration-0/eval-[name]/without_skill/outputs/
|
|
68
|
+
- Save a complete transcript of your work to: [workspace]/iteration-0/eval-[name]/without_skill/transcript.md
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Spawn all baseline runs in parallel. Capture timing data when each completes.
|
|
72
|
+
|
|
73
|
+
6. Grade baseline results using the skill-creator grading agent. See `${CLAUDE_SKILL_DIR}/references/delegation-map.md` for exact grading invocation.
|
|
74
|
+
|
|
75
|
+
**Output:** `[skill-name]-workspace/evals/evals.json` and baseline results in `iteration-0/`
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Phase 3: Write Minimal Skill
|
|
80
|
+
|
|
81
|
+
**Goal:** Create just enough skill content to address the documented gaps and pass evaluations.
|
|
82
|
+
|
|
83
|
+
### Process
|
|
84
|
+
|
|
85
|
+
1. Invoke `/skill-writer` with this context:
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
Create a skill based on this gap analysis and eval scenarios.
|
|
89
|
+
|
|
90
|
+
Gap analysis: [reference or paste gap-analysis.md]
|
|
91
|
+
Eval scenarios: [reference or paste evals.json expected_output and expectations]
|
|
92
|
+
Baseline failures: [summarize what Claude got wrong in iteration-0]
|
|
93
|
+
|
|
94
|
+
Constraint: Write the minimum instructions needed to address these specific gaps.
|
|
95
|
+
Every line must serve a documented gap. Do not over-document.
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
2. `/skill-writer` will run its workflow: classify type, set degree of freedom, ask clarifying questions, produce the SKILL.md artifact.
|
|
99
|
+
|
|
100
|
+
3. Review the draft with the user:
|
|
101
|
+
- "Does this address all the gaps we identified?"
|
|
102
|
+
- "Is anything here unnecessary or over-engineered?"
|
|
103
|
+
- "Would this pass our eval scenarios?"
|
|
104
|
+
|
|
105
|
+
4. Save the skill to its target directory.
|
|
106
|
+
|
|
107
|
+
**Output:** The skill's SKILL.md (and optional reference files)
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Phase 4: Test (Feedback Loop)
|
|
112
|
+
|
|
113
|
+
**Goal:** Run the skill on eval scenarios, compare against baseline, identify remaining gaps.
|
|
114
|
+
|
|
115
|
+
### Process
|
|
116
|
+
|
|
117
|
+
1. **Spawn all runs in parallel.** For each eval scenario, launch a with-skill subagent:
|
|
118
|
+
|
|
119
|
+
```
|
|
120
|
+
Execute this task:
|
|
121
|
+
- Read the skill at [path-to-skill]/SKILL.md and follow its instructions
|
|
122
|
+
- Task: [eval prompt from evals.json]
|
|
123
|
+
- Input files: [eval files if any, or "none"]
|
|
124
|
+
- Save all output files to: [workspace]/iteration-N/eval-[name]/with_skill/outputs/
|
|
125
|
+
- Save a complete transcript of your work to: [workspace]/iteration-N/eval-[name]/with_skill/transcript.md
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
For iteration-1, the without-skill baseline already exists from Phase 2.
|
|
129
|
+
|
|
130
|
+
2. **While runs are in progress**, review and refine assertions if needed based on what was learned from the baseline.
|
|
131
|
+
|
|
132
|
+
3. **When runs complete**, immediately capture timing data (`total_tokens`, `duration_ms`) to `timing.json` in each run directory. This data is only available in the task completion notification.
|
|
133
|
+
|
|
134
|
+
4. **Grade each run** using the skill-creator grading agent. See `${CLAUDE_SKILL_DIR}/references/delegation-map.md` for the grading process.
|
|
135
|
+
|
|
136
|
+
5. **Aggregate into benchmark** using skill-creator's aggregation script. See delegation-map.md for the exact command.
|
|
137
|
+
|
|
138
|
+
6. **Launch the eval viewer** using skill-creator's generate_review.py. See delegation-map.md for the exact command. For iteration 2+, include `--previous-workspace` to show diffs.
|
|
139
|
+
|
|
140
|
+
7. Tell the user to review in the viewer:
|
|
141
|
+
- "Outputs" tab: click through each test case, leave feedback
|
|
142
|
+
- "Benchmark" tab: quantitative comparison (pass rates, timing, tokens)
|
|
143
|
+
|
|
144
|
+
8. Wait for the user to complete their review.
|
|
145
|
+
|
|
146
|
+
**Output:** `grading.json` in each run directory; `benchmark.json` and `feedback.json` in the iteration directory
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Phase 5: Iterate
|
|
151
|
+
|
|
152
|
+
**Goal:** Refine the skill based on observed Claude B behavior and user feedback.
|
|
153
|
+
|
|
154
|
+
### Process
|
|
155
|
+
|
|
156
|
+
1. Read `feedback.json` from the viewer. Empty feedback means the user was satisfied with that test case.
|
|
157
|
+
|
|
158
|
+
2. Read transcripts from Phase 4 runs. Watch for the signals the official docs highlight:
|
|
159
|
+
- **Unexpected exploration paths** -- Claude B read files in an order you did not anticipate
|
|
160
|
+
- **Missed connections** -- Claude B did not follow references to important files
|
|
161
|
+
- **Overreliance on certain sections** -- content that should be promoted to SKILL.md
|
|
162
|
+
- **Ignored content** -- files Claude B never accessed (may be unnecessary or poorly signaled)
|
|
163
|
+
- **Repeated work across test cases** -- all subagents wrote similar helper scripts (bundle them into the skill)
|
|
164
|
+
|
|
165
|
+
3. Synthesize observations into actionable improvements. For each piece of feedback, identify the specific skill change that would fix it.
|
|
166
|
+
|
|
167
|
+
4. Apply improvements. For significant changes, re-invoke `/skill-writer` with:
|
|
168
|
+
|
|
169
|
+
```
|
|
170
|
+
Refine this existing skill based on testing observations.
|
|
171
|
+
|
|
172
|
+
Current SKILL.md: [reference or paste]
|
|
173
|
+
User feedback: [from feedback.json -- only non-empty entries]
|
|
174
|
+
Behavioral observations: [from transcript analysis]
|
|
175
|
+
|
|
176
|
+
Specific issues to address:
|
|
177
|
+
1. [Issue]
|
|
178
|
+
2. [Issue]
|
|
179
|
+
|
|
180
|
+
Constraint: Only change what the feedback demands. Do not reorganize working content.
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
5. Key principles for this phase (from the official docs):
|
|
184
|
+
- **Generalize from feedback** -- the skill will be used across many different prompts, not just these test cases
|
|
185
|
+
- **Keep the prompt lean** -- remove instructions that are not pulling their weight
|
|
186
|
+
- **Explain the why** -- theory of mind beats rigid MUSTs
|
|
187
|
+
- **Bundle repeated work** -- if subagents all wrote similar scripts, add them to the skill
|
|
188
|
+
|
|
189
|
+
6. Return to Phase 4 with the refined skill. Continue iterating until any one of the following is true:
|
|
190
|
+
- User feedback is all empty (satisfied with every test case)
|
|
191
|
+
- Pass rates meet acceptable thresholds
|
|
192
|
+
- No meaningful progress between iterations
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## Phase 6: Polish
|
|
197
|
+
|
|
198
|
+
**Goal:** Optimize the skill description for triggering accuracy and run final validation.
|
|
199
|
+
|
|
200
|
+
### Process
|
|
201
|
+
|
|
202
|
+
1. **Description optimization.** Follow the process in `${CLAUDE_SKILL_DIR}/workflows/polish-skill.md`.
|
|
203
|
+
|
|
204
|
+
2. **Final validation.** Run the skill-writer self-check rubric against the finished skill:
|
|
205
|
+
- [ ] Description is third person with trigger phrases
|
|
206
|
+
- [ ] SKILL.md body under 500 lines
|
|
207
|
+
- [ ] States what to do in positive terms (not prohibition-heavy)
|
|
208
|
+
- [ ] Degree of freedom matches task fragility
|
|
209
|
+
- [ ] Progressive disclosure used (heavy content in separate files)
|
|
210
|
+
- [ ] Examples are concrete, not abstract
|
|
211
|
+
- [ ] Frontmatter fields are valid
|
|
212
|
+
- [ ] One skill = one capability
|
|
213
|
+
|
|
214
|
+
3. **Final checklist** from the official Anthropic docs:
|
|
215
|
+
- [ ] At least 3 evaluation scenarios created and passing
|
|
216
|
+
- [ ] Tested with real usage scenarios
|
|
217
|
+
- [ ] Skill solves documented gaps (not imagined requirements)
|
|
218
|
+
- [ ] Iterative refinement based on observed behavior (not assumptions)
|
|
219
|
+
|
|
220
|
+
4. Present the finished skill to the user with:
|
|
221
|
+
- Final benchmark comparison (latest iteration vs baseline)
|
|
222
|
+
- Summary of gaps addressed
|
|
223
|
+
- Any remaining limitations or known edge cases
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Polish Skill Workflow
|
|
2
|
+
|
|
3
|
+
Final optimization pass for a skill that is functionally complete.
|
|
4
|
+
|
|
5
|
+
## Prerequisites
|
|
6
|
+
|
|
7
|
+
- The skill passes its evaluation scenarios
|
|
8
|
+
- The user is satisfied with output quality
|
|
9
|
+
- This is the final step before the skill is considered done
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Step 1: Description Optimization
|
|
14
|
+
|
|
15
|
+
Optimize the skill's description for triggering accuracy using the skill-creator's trigger eval system.
|
|
16
|
+
|
|
17
|
+
### Generate trigger eval queries
|
|
18
|
+
|
|
19
|
+
Create 20 eval queries: 10 should-trigger and 10 should-not-trigger.
|
|
20
|
+
|
|
21
|
+
**Should-trigger queries (10):** Different phrasings of the same intent. Include:
|
|
22
|
+
- Formal and casual variations
|
|
23
|
+
- Cases where the user does not explicitly name the skill but clearly needs it
|
|
24
|
+
- Uncommon use cases
|
|
25
|
+
- Cases where this skill competes with another but should win
|
|
26
|
+
|
|
27
|
+
**Should-not-trigger queries (10):** Near-misses that share keywords but need something different. Include:
|
|
28
|
+
- Adjacent domains with overlapping terminology
|
|
29
|
+
- Ambiguous phrasing where naive keyword matching would falsely trigger
|
|
30
|
+
- Tasks that touch the skill's domain but in a context where another tool is better
|
|
31
|
+
|
|
32
|
+
All queries must be realistic -- detailed and specific, with file paths, personal context, and casual speech -- not abstract one-liners.
|
|
33
|
+
|
|
34
|
+
### Review with user
|
|
35
|
+
|
|
36
|
+
Present the eval set using the skill-creator's HTML review template. See `${CLAUDE_SKILL_DIR}/references/delegation-map.md` for the exact process.
|
|
37
|
+
|
|
38
|
+
The user can edit queries, toggle should-trigger, and add/remove entries.
|
|
39
|
+
|
|
40
|
+
### Run optimization loop
|
|
41
|
+
|
|
42
|
+
See `${CLAUDE_SKILL_DIR}/references/delegation-map.md` for the exact command. The loop:
|
|
43
|
+
1. Splits eval set into 60% train / 40% held-out test
|
|
44
|
+
2. Evaluates current description (3 runs per query for reliability)
|
|
45
|
+
3. Proposes improvements based on failures
|
|
46
|
+
4. Re-evaluates on both train and test
|
|
47
|
+
5. Iterates up to 5 times
|
|
48
|
+
6. Selects best description by test score (avoids overfitting)
|
|
49
|
+
|
|
50
|
+
### Apply result
|
|
51
|
+
|
|
52
|
+
Update the skill's SKILL.md frontmatter with the optimized description. Show the user before/after with scores.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Step 2: Final Validation
|
|
57
|
+
|
|
58
|
+
Run the skill-writer self-check rubric:
|
|
59
|
+
|
|
60
|
+
- [ ] Description is third person with trigger phrases
|
|
61
|
+
- [ ] SKILL.md body under 500 lines
|
|
62
|
+
- [ ] States what to do in positive terms (not prohibition-heavy)
|
|
63
|
+
- [ ] Degree of freedom matches task fragility
|
|
64
|
+
- [ ] Progressive disclosure used (heavy content in separate files)
|
|
65
|
+
- [ ] No time-sensitive claims unless clearly dated
|
|
66
|
+
- [ ] Examples are concrete, not abstract
|
|
67
|
+
- [ ] Frontmatter fields are valid per official docs
|
|
68
|
+
- [ ] One skill = one capability
|
|
69
|
+
- [ ] Consistent terminology throughout
|
|
70
|
+
- [ ] File references are one level deep from SKILL.md
|
|
71
|
+
- [ ] Files over 100 lines have a table of contents
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Step 3: Final Summary
|
|
76
|
+
|
|
77
|
+
Present the finished skill to the user:
|
|
78
|
+
|
|
79
|
+
1. **Benchmark summary:** Final pass rate vs baseline, with delta
|
|
80
|
+
2. **Gaps addressed:** Map each original gap to the skill content that addresses it
|
|
81
|
+
3. **Description optimization:** Before/after trigger accuracy scores
|
|
82
|
+
4. **Known limitations:** Anything the skill does not handle (scope boundaries)
|
|
83
|
+
5. **Maintenance notes:** What to watch for in future usage that might warrant re-iteration
|