claude-dev-env 1.10.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/hooks/blocking/prompt-workflow-stop-guard.py +19 -0
- package/hooks/blocking/prompt_workflow_gate_core.py +74 -0
- package/hooks/blocking/test_prompt_workflow_stop_guard.py +86 -0
- package/package.json +1 -1
- package/skills/prompt-generator/REFERENCE.md +52 -0
- package/skills/prompt-generator/REFINEMENT_PIPELINE_RUNBOOK.md +5 -0
- package/skills/prompt-generator/SKILL.md +35 -19
- package/skills/prompt-generator/TARGET_OUTPUT.md +3 -3
- package/skills/prompt-generator/evals/prompt-generator.json +15 -0
|
@@ -11,6 +11,7 @@ from pathlib import Path
|
|
|
11
11
|
|
|
12
12
|
from prompt_workflow_gate_core import (
|
|
13
13
|
find_ambiguous_scope_terms,
|
|
14
|
+
find_negative_keywords_in_fenced_xml,
|
|
14
15
|
has_debug_intent,
|
|
15
16
|
has_checklist_container,
|
|
16
17
|
has_internal_object_leak,
|
|
@@ -130,6 +131,23 @@ def _check_ambiguous_scope(assistant_message: str) -> dict | None:
|
|
|
130
131
|
),
|
|
131
132
|
)
|
|
132
133
|
|
|
134
|
+
def _check_negative_keywords_in_artifact(assistant_message: str) -> dict | None:
|
|
135
|
+
violations = find_negative_keywords_in_fenced_xml(assistant_message)
|
|
136
|
+
if not violations:
|
|
137
|
+
return None
|
|
138
|
+
violation_descriptions = [
|
|
139
|
+
f" line {each_violation['line_number']}: \"{each_violation['keyword']}\" in: {each_violation['line_text']}"
|
|
140
|
+
for each_violation in violations
|
|
141
|
+
]
|
|
142
|
+
return _build_block(
|
|
143
|
+
brief_label="retrying: rephrase negative keywords in artifact",
|
|
144
|
+
full_reason=(
|
|
145
|
+
"PROMPT-WORKFLOW GATE: Banned negative keywords found inside fenced XML artifact. "
|
|
146
|
+
"Rephrase as positive directives (what TO do, not what to avoid):\n"
|
|
147
|
+
+ "\n".join(violation_descriptions)
|
|
148
|
+
),
|
|
149
|
+
)
|
|
150
|
+
|
|
133
151
|
def _evaluate_workflow_gates(assistant_message: str) -> dict | None:
|
|
134
152
|
if not is_prompt_workflow_response(assistant_message):
|
|
135
153
|
return None
|
|
@@ -139,6 +157,7 @@ def _evaluate_workflow_gates(assistant_message: str) -> dict | None:
|
|
|
139
157
|
_check_missing_scope_anchors,
|
|
140
158
|
_check_missing_context_signals,
|
|
141
159
|
_check_ambiguous_scope,
|
|
160
|
+
_check_negative_keywords_in_artifact,
|
|
142
161
|
)
|
|
143
162
|
for check in workflow_gate_checks:
|
|
144
163
|
block = check(assistant_message)
|
|
@@ -29,6 +29,7 @@ REQUIRED_CHECKLIST_ROWS: tuple[str, ...] = (
|
|
|
29
29
|
"completion_boundary_measurable",
|
|
30
30
|
"citation_grounding_policy_present",
|
|
31
31
|
"source_priority_rules_present",
|
|
32
|
+
"artifact_language_confidence",
|
|
32
33
|
)
|
|
33
34
|
|
|
34
35
|
REQUIRED_CONTEXT_CONTROL_SIGNALS: tuple[str, ...] = (
|
|
@@ -80,6 +81,79 @@ DEBUG_INTENT_MARKERS: tuple[str, ...] = (
|
|
|
80
81
|
)
|
|
81
82
|
|
|
82
83
|
|
|
84
|
+
NEGATIVE_KEYWORDS_IN_ARTIFACT: tuple[str, ...] = (
|
|
85
|
+
"no",
|
|
86
|
+
"not",
|
|
87
|
+
"don't",
|
|
88
|
+
"do not",
|
|
89
|
+
"never",
|
|
90
|
+
"avoid",
|
|
91
|
+
"without",
|
|
92
|
+
"refrain",
|
|
93
|
+
"stop",
|
|
94
|
+
"prevent",
|
|
95
|
+
"exclude",
|
|
96
|
+
"prohibit",
|
|
97
|
+
"forbid",
|
|
98
|
+
"reject",
|
|
99
|
+
"cannot",
|
|
100
|
+
"unless",
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
NEGATIVE_INDIRECT_PATTERNS_IN_ARTIFACT: tuple[str, ...] = (
|
|
104
|
+
r"instead of\s+\w+",
|
|
105
|
+
r"rather than\s+\w+",
|
|
106
|
+
r"as opposed to\s+\w+",
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
COMPILED_NEGATIVE_KEYWORD_PATTERNS: tuple[re.Pattern[str], ...] = tuple(
|
|
110
|
+
re.compile(rf"\b{re.escape(keyword)}\b", re.IGNORECASE)
|
|
111
|
+
for keyword in NEGATIVE_KEYWORDS_IN_ARTIFACT
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
COMPILED_NEGATIVE_INDIRECT_PATTERNS: tuple[re.Pattern[str], ...] = tuple(
|
|
115
|
+
re.compile(pattern, re.IGNORECASE)
|
|
116
|
+
for pattern in NEGATIVE_INDIRECT_PATTERNS_IN_ARTIFACT
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
FENCED_XML_BLOCK_PATTERN: re.Pattern[str] = re.compile(
|
|
120
|
+
r"```xml\s*\n(.*?)```", re.DOTALL
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def extract_fenced_xml_content(text: str) -> str:
|
|
125
|
+
all_matches = FENCED_XML_BLOCK_PATTERN.findall(text)
|
|
126
|
+
return "\n".join(all_matches)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def find_negative_keywords_in_fenced_xml(
|
|
130
|
+
text: str,
|
|
131
|
+
) -> list[dict[str, str | int]]:
|
|
132
|
+
fenced_content = extract_fenced_xml_content(text)
|
|
133
|
+
if not fenced_content:
|
|
134
|
+
return []
|
|
135
|
+
fenced_lines = fenced_content.splitlines()
|
|
136
|
+
all_violations: list[dict[str, str | int]] = []
|
|
137
|
+
for line_index, each_line in enumerate(fenced_lines):
|
|
138
|
+
for each_pattern in COMPILED_NEGATIVE_KEYWORD_PATTERNS:
|
|
139
|
+
each_match = each_pattern.search(each_line)
|
|
140
|
+
if each_match:
|
|
141
|
+
all_violations.append({
|
|
142
|
+
"keyword": each_match.group(),
|
|
143
|
+
"line_number": line_index + 1,
|
|
144
|
+
"line_text": each_line.strip(),
|
|
145
|
+
})
|
|
146
|
+
for each_pattern in COMPILED_NEGATIVE_INDIRECT_PATTERNS:
|
|
147
|
+
each_match = each_pattern.search(each_line)
|
|
148
|
+
if each_match:
|
|
149
|
+
all_violations.append({
|
|
150
|
+
"keyword": each_match.group(),
|
|
151
|
+
"line_number": line_index + 1,
|
|
152
|
+
"line_text": each_line.strip(),
|
|
153
|
+
})
|
|
154
|
+
return all_violations
|
|
155
|
+
|
|
156
|
+
|
|
83
157
|
def _contains_any_marker(text: str, markers: Iterable[str]) -> bool:
|
|
84
158
|
lower_text = text.lower()
|
|
85
159
|
return any(marker.lower() in lower_text for marker in markers)
|
|
@@ -5,6 +5,8 @@ import subprocess
|
|
|
5
5
|
import sys
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
+
import pytest
|
|
9
|
+
|
|
8
10
|
|
|
9
11
|
SCRIPT_PATH = Path(__file__).parent / "prompt-workflow-stop-guard.py"
|
|
10
12
|
|
|
@@ -34,6 +36,7 @@ def _full_checklist_rows() -> str:
|
|
|
34
36
|
"- completion_boundary_measurable\n"
|
|
35
37
|
"- citation_grounding_policy_present\n"
|
|
36
38
|
"- source_priority_rules_present\n"
|
|
39
|
+
"- artifact_language_confidence\n"
|
|
37
40
|
)
|
|
38
41
|
|
|
39
42
|
def test_blocks_internal_object_leak_without_debug_intent() -> None:
|
|
@@ -117,6 +120,89 @@ def test_blocks_ambiguous_scope_phrasing() -> None:
|
|
|
117
120
|
assert response["decision"] == "block"
|
|
118
121
|
assert "Ambiguous scope phrasing detected" in response["reason"]
|
|
119
122
|
|
|
123
|
+
def _build_prompt_workflow_message_with_fenced_xml(fenced_xml_body: str) -> str:
|
|
124
|
+
return (
|
|
125
|
+
"Audit: pass 15/15\n"
|
|
126
|
+
"```xml\n"
|
|
127
|
+
+ fenced_xml_body
|
|
128
|
+
+ "\n```\n"
|
|
129
|
+
"overall_status: pass\n"
|
|
130
|
+
+ _full_checklist_rows()
|
|
131
|
+
+ "target_local_roots\n"
|
|
132
|
+
"target_canonical_roots\n"
|
|
133
|
+
"target_file_globs\n"
|
|
134
|
+
"comparison_basis\n"
|
|
135
|
+
"completion_boundary\n"
|
|
136
|
+
"base_minimal_instruction_layer: true\n"
|
|
137
|
+
"on_demand_skill_loading: true\n"
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def test_allows_positive_phrasing_inside_fenced_xml() -> None:
|
|
142
|
+
fenced_content = "<instructions>Ensure all functions have explicit return types.</instructions>"
|
|
143
|
+
payload = {
|
|
144
|
+
"last_assistant_message": _build_prompt_workflow_message_with_fenced_xml(fenced_content),
|
|
145
|
+
}
|
|
146
|
+
result = _run_hook(payload)
|
|
147
|
+
assert result.stdout.strip() == ""
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
BANNED_KEYWORD_TEST_CASES: list[tuple[str, str]] = [
|
|
151
|
+
("do_not", "<instructions>Do not leave return types implicit.</instructions>"),
|
|
152
|
+
("avoid", "<instructions>Avoid missing return types.</instructions>"),
|
|
153
|
+
("never", "<constraints>Never store credentials in plain text.</constraints>"),
|
|
154
|
+
("without", "<instructions>Deploy without running tests first.</instructions>"),
|
|
155
|
+
("prevent", "<constraints>Prevent unauthorized access to the API.</constraints>"),
|
|
156
|
+
("reject", "<constraints>Reject all unsigned commits.</constraints>"),
|
|
157
|
+
("cannot", "<constraints>The API cannot accept unauthenticated requests.</constraints>"),
|
|
158
|
+
("unless", "<constraints>Skip the build step unless the user explicitly approves.</constraints>"),
|
|
159
|
+
("must_not", "<constraints>The script must not produce duplicates.</constraints>"),
|
|
160
|
+
("must_never", "<constraints>You must never store credentials in environment variables.</constraints>"),
|
|
161
|
+
("instead_of", "<instructions>Use explicit types instead of implicit ones.</instructions>"),
|
|
162
|
+
("rather_than", "<constraints>Prefer explicit types rather than inferred ones.</constraints>"),
|
|
163
|
+
("as_opposed_to", "<instructions>Use Grid as opposed to floats for layout.</instructions>"),
|
|
164
|
+
]
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@pytest.mark.parametrize(
|
|
168
|
+
("banned_pattern_name", "fenced_xml_content"),
|
|
169
|
+
BANNED_KEYWORD_TEST_CASES,
|
|
170
|
+
ids=[each_case[0] for each_case in BANNED_KEYWORD_TEST_CASES],
|
|
171
|
+
)
|
|
172
|
+
def test_blocks_banned_pattern_inside_fenced_xml(
|
|
173
|
+
banned_pattern_name: str,
|
|
174
|
+
fenced_xml_content: str,
|
|
175
|
+
) -> None:
|
|
176
|
+
payload = {
|
|
177
|
+
"last_assistant_message": _build_prompt_workflow_message_with_fenced_xml(fenced_xml_content),
|
|
178
|
+
}
|
|
179
|
+
result = _run_hook(payload)
|
|
180
|
+
response = json.loads(result.stdout)
|
|
181
|
+
assert response["decision"] == "block"
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def test_permits_negative_keywords_outside_fenced_xml() -> None:
|
|
185
|
+
message = (
|
|
186
|
+
"Audit: pass 15/15\n"
|
|
187
|
+
"Do not skip the audit line.\n"
|
|
188
|
+
"```xml\n"
|
|
189
|
+
"<instructions>Ensure all functions have explicit return types.</instructions>\n"
|
|
190
|
+
"```\n"
|
|
191
|
+
"overall_status: pass\n"
|
|
192
|
+
+ _full_checklist_rows()
|
|
193
|
+
+ "target_local_roots\n"
|
|
194
|
+
"target_canonical_roots\n"
|
|
195
|
+
"target_file_globs\n"
|
|
196
|
+
"comparison_basis\n"
|
|
197
|
+
"completion_boundary\n"
|
|
198
|
+
"base_minimal_instruction_layer: true\n"
|
|
199
|
+
"on_demand_skill_loading: true\n"
|
|
200
|
+
)
|
|
201
|
+
payload = {"last_assistant_message": message}
|
|
202
|
+
result = _run_hook(payload)
|
|
203
|
+
assert result.stdout.strip() == ""
|
|
204
|
+
|
|
205
|
+
|
|
120
206
|
def test_allows_fully_structured_prompt_workflow_output() -> None:
|
|
121
207
|
payload = {
|
|
122
208
|
"last_assistant_message": (
|
package/package.json
CHANGED
|
@@ -11,6 +11,8 @@ When authoring or refining prompts, ground decisions in these sources. If guidan
|
|
|
11
11
|
- https://transformer-circuits.pub/2026/emotions/index.html -- emotion concepts research (April 2026): 171 internal activation patterns that causally influence behavior. Key prompt-engineering takeaways: clear criteria and escape routes improve output quality, collaborative framing activates engagement, positive task framing correlates with better results, inviting transparency produces more reliable output. Cross-model caveat: studied on Sonnet 4.5; patterns align with best practices independently.
|
|
12
12
|
- https://www.anthropic.com/research/emotion-concepts-function -- blog summary of the above paper.
|
|
13
13
|
- https://platform.claude.com/docs/en/build-with-claude/adaptive-thinking -- adaptive thinking reference; replaces manual budget_tokens with effort-based control.
|
|
14
|
+
- https://claude.com/blog/harnessing-claudes-intelligence -- harness evolution: primitives Claude already knows, what to stop doing in the harness, deliberate boundaries (context economics, caching, typed tools). Local inventory: `docs/references/anthropic-harnessing-claudes-intelligence-technique-inventory.md`.
|
|
15
|
+
- https://github.com/anthropics/skills/tree/main/skills/claude-api -- Anthropic `claude-api` Agent Skill for hands-on API/tool patterns from that post (Hook 10). Platform entry: https://platform.claude.com/docs/en/agents-and-tools/agent-skills/claude-api-skill
|
|
14
16
|
|
|
15
17
|
### Tier 2: Major labs (strong secondary, often transfers across models)
|
|
16
18
|
|
|
@@ -37,6 +39,56 @@ When authoring or refining prompts, ground decisions in these sources. If guidan
|
|
|
37
39
|
|
|
38
40
|
If sources disagree on a technique, apply in order: Anthropic documentation first (it describes the actual model behavior), then OpenAI/Google/Microsoft (large-scale research with cross-model relevance), then community sources (patterns and intuition, not authoritative on model internals). When Tier 3 contradicts Tier 1, Tier 1 wins without exception.
|
|
39
41
|
|
|
42
|
+
## Harness design patterns (Anthropic blog, April 2026)
|
|
43
|
+
|
|
44
|
+
Primary URL: https://claude.com/blog/harnessing-claudes-intelligence. Structured inventory: `docs/references/anthropic-harnessing-claudes-intelligence-technique-inventory.md`.
|
|
45
|
+
|
|
46
|
+
### Mechanism doc map (Hook 11)
|
|
47
|
+
|
|
48
|
+
Jump from concept to the platform specs the post names:
|
|
49
|
+
|
|
50
|
+
- [Bash tool](https://platform.claude.com/docs/en/agents-and-tools/tool-use/bash-tool) / [Text editor tool](https://platform.claude.com/docs/en/agents-and-tools/tool-use/text-editor-tool)
|
|
51
|
+
- [Code execution tool](https://platform.claude.com/docs/en/agents-and-tools/tool-use/code-execution-tool) / [Programmatic tool calling](https://platform.claude.com/docs/en/agents-and-tools/tool-use/programmatic-tool-calling)
|
|
52
|
+
- [Memory tool](https://platform.claude.com/docs/en/agents-and-tools/tool-use/memory-tool)
|
|
53
|
+
- [Agent Skills overview](https://platform.claude.com/docs/en/agents-and-tools/agent-skills/overview)
|
|
54
|
+
- [Context windows](https://platform.claude.com/docs/en/build-with-claude/context-windows) / [Context editing](https://platform.claude.com/docs/en/build-with-claude/context-editing) / [Compaction](https://platform.claude.com/docs/en/build-with-claude/compaction)
|
|
55
|
+
- [Subagents](https://code.claude.com/docs/en/sub-agents)
|
|
56
|
+
- [System prompts](https://platform.claude.com/docs/en/release-notes/system-prompts) / [Working with the Messages API](https://platform.claude.com/docs/en/build-with-claude/working-with-messages) / [Prompt caching](https://platform.claude.com/docs/en/build-with-claude/prompt-caching)
|
|
57
|
+
- [Model migration guide — hard-coded filters](https://platform.claude.com/docs/en/about-claude/models/migration-guide#additional-recommended-changes)
|
|
58
|
+
- [Harness design for long-running applications](https://www.anthropic.com/engineering/harness-design-long-running-apps)
|
|
59
|
+
- [Claude Code auto-mode](https://www.anthropic.com/engineering/claude-code-auto-mode)
|
|
60
|
+
- [Effective context engineering for AI agents](https://www.anthropic.com/engineering/effective-context-engineering-for-ai-agents)
|
|
61
|
+
|
|
62
|
+
### Context stack (Hook 5)
|
|
63
|
+
|
|
64
|
+
- **Context editing:** Remove stale tool results and thinking blocks selectively ([Context editing](https://platform.claude.com/docs/en/build-with-claude/context-editing)).
|
|
65
|
+
- **Subagents:** Fork fresh windows for isolated subtasks; post cites **+2.8%** BrowseComp vs best single-agent for Opus 4.6 ([Subagents](https://code.claude.com/docs/en/sub-agents)).
|
|
66
|
+
- **Compaction:** Summarize prior context for long horizons ([Compaction](https://platform.claude.com/docs/en/build-with-claude/compaction)); effectiveness varies by model generation (see Hook 9 table).
|
|
67
|
+
- **Memory folder:** Persist agent-chosen state via the memory tool / files ([Memory tool](https://platform.claude.com/docs/en/agents-and-tools/tool-use/memory-tool)).
|
|
68
|
+
|
|
69
|
+
### Prompt caching (Hook 6)
|
|
70
|
+
|
|
71
|
+
The [Messages API](https://platform.claude.com/docs/en/build-with-claude/working-with-messages) is stateless—re-supply prior actions, tool definitions, and instructions each turn. Maximize [prompt caching](https://platform.claude.com/docs/en/build-with-claude/prompt-caching) hits: **stable prefix first, dynamic tail last**; **append** new content via **messages** instead of rewriting the cached prompt; **avoid mid-session model switches** (caches are model-specific—use a **subagent** for a cheaper model); **treat the tool list as part of the cached prefix** and avoid churn; use **tool search** so dynamic discovery **appends** without invalidating the prefix; for multi-turn agents, **advance breakpoints** toward the latest message (**auto-caching**). Cached input tokens are priced at **10% of base input** per [pricing](https://platform.claude.com/docs/en/about-claude/pricing).
|
|
72
|
+
|
|
73
|
+
### Typed tools vs bash strings (Hook 7)
|
|
74
|
+
|
|
75
|
+
Promote an action to a **dedicated tool** with typed arguments when the harness must intercept, gate, render (e.g., in **modals**), or audit it: put **hard-to-reverse** steps (e.g., external API calls) behind user confirmation, and give **write/edit** paths **staleness checks** so concurrent edits are not blindly overwritten ([Harnessing Claude's intelligence](https://claude.com/blog/harnessing-claudes-intelligence)).
|
|
76
|
+
|
|
77
|
+
### Standing review: dedicated tools vs general bash + policy (Hook 8)
|
|
78
|
+
|
|
79
|
+
Re-evaluate promotions as models improve—e.g., Claude Code **auto-mode** (a secondary reviewer over bash strings) can **reduce** the need for bespoke tools, but **only** where users accept that trust profile; **high-stakes** actions still warrant dedicated tools ([Claude Code auto-mode](https://www.anthropic.com/engineering/claude-code-auto-mode)).
|
|
80
|
+
|
|
81
|
+
### Benchmark vignettes — motivation only, not guarantees (Hook 9)
|
|
82
|
+
|
|
83
|
+
| Vignette | Outcome stated in the post |
|
|
84
|
+
|----------|----------------------------|
|
|
85
|
+
| SWE-bench Verified | Claude 3.5 Sonnet **49%** with bash + editor only (presented as state of the art at the time) |
|
|
86
|
+
| BrowseComp + output filtering | Opus 4.6 **45.3% → 61.6%** |
|
|
87
|
+
| BrowseComp + subagents | Opus 4.6 **+2.8%** vs best single-agent |
|
|
88
|
+
| BrowseComp + compaction | Sonnet 4.5 **43%** flat; Opus 4.5 **68%**; Opus 4.6 **84%** (same setup) |
|
|
89
|
+
| BrowseComp-Plus + memory folder | Sonnet 4.5 **60.4% → 67.2%** |
|
|
90
|
+
| Prompt caching | Cached tokens **10%** the cost of base input tokens |
|
|
91
|
+
|
|
40
92
|
## NotebookLM Audio Overview customization (example)
|
|
41
93
|
|
|
42
94
|
Adapt `[FOCUS AREA]` per notebook. Pair with Deep Dive + Longer in the product UI when that matches the user's plan.
|
|
@@ -84,6 +84,7 @@ Audit report must include all check IDs:
|
|
|
84
84
|
- `completion_boundary_measurable`
|
|
85
85
|
- `citation_grounding_policy_present`
|
|
86
86
|
- `source_priority_rules_present`
|
|
87
|
+
- `artifact_language_confidence`
|
|
87
88
|
|
|
88
89
|
## Citation and Grounding Validation
|
|
89
90
|
|
|
@@ -134,6 +135,8 @@ Validate fail-closed runtime gates:
|
|
|
134
135
|
- Block responses that leak raw internal refinement object fields unless debug intent is explicit.
|
|
135
136
|
- Block responses missing deterministic checklist rows when audit output is present.
|
|
136
137
|
- Block responses using ambiguous scope phrasing in scope-bound sections.
|
|
138
|
+
- Block responses containing negative keywords (no, not, don't, never, avoid, etc.) inside fenced XML artifacts.
|
|
139
|
+
- Block responses containing hedging language (might be, possibly, I think, etc.) inside fenced XML artifacts.
|
|
137
140
|
|
|
138
141
|
## Context-Footprint Controls
|
|
139
142
|
|
|
@@ -150,6 +153,8 @@ Validate fail-closed runtime gates:
|
|
|
150
153
|
- Raw internal object leakage without debug intent
|
|
151
154
|
- Missing required checklist rows in audit output
|
|
152
155
|
- Ambiguous scope terms in scope-bound text
|
|
156
|
+
- Negative keywords inside fenced XML artifacts
|
|
157
|
+
- Hedging language inside fenced XML artifacts
|
|
153
158
|
- **Semantic-only (auditor layer):**
|
|
154
159
|
- Overall quality/readability of scope wording beyond banned-term checks
|
|
155
160
|
- Whether instruction binding quality is "good enough" beyond explicit anchor presence
|
|
@@ -19,15 +19,17 @@ description: >-
|
|
|
19
19
|
|
|
20
20
|
**Canonical source:** https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/claude-prompting-best-practices — the single reference for Claude's latest models. When sources conflict, defer to the authority tiers (Anthropic > major labs > community).
|
|
21
21
|
|
|
22
|
+
**Harness hygiene:** Re-test harness assumptions about what Claude cannot do alone on each model generation or major product release—stale compensations bottleneck performance as capabilities improve (Hook 1; [Harnessing Claude's intelligence](https://claude.com/blog/harnessing-claudes-intelligence), inventory `docs/references/anthropic-harnessing-claudes-intelligence-technique-inventory.md`).
|
|
23
|
+
|
|
22
24
|
**Eval contract:** The user-visible behavior this skill must satisfy is defined in `packages/claude-dev-env/skills/prompt-generator/TARGET_OUTPUT.md`. Automated evals live in `packages/claude-dev-env/skills/prompt-generator/evals/prompt-generator.json`.
|
|
23
25
|
|
|
24
|
-
**Terminology:** **Prompt artifact** — the full XML inside the single user-facing `xml` fence (the paste-ready handoff). **Scope block** — the five-key contract in §3A that grounds instructions. **Default refinement pipeline** — §10: base draft → section refine → merge →
|
|
26
|
+
**Terminology:** **Prompt artifact** — the full XML inside the single user-facing `xml` fence (the paste-ready handoff). **Scope block** — the five-key contract in §3A that grounds instructions. **Default refinement pipeline** — §10: base draft → section refine → merge → 15-row compliance audit → capped fixes (subagent-internal unless draft-only). **Light self-check** — §8: fast pre-return sanity pass (shape, tools, scope, patterns); *not* the compliance audit. **Compliance audit (15-row)** — §11: hook-keyed rows that set the `Audit: pass|fail` numerator. **Execution handoff** — `/agent-prompt` after explicit user intent to run work.
|
|
25
27
|
|
|
26
28
|
**Hook-survival invariant (read first):** The fenced XML artifact is the primary deliverable and MUST survive Stop-hook retries. If a Stop hook rejects the response, only the surrounding audit summary and runtime signal scaffolding may change between retries—the XML inside the fence MUST be re-emitted in full on every retry. Recovery pattern: re-emit the complete fenced XML first, then adjust the audit line. Trimming, summarizing, or deferring the prompt artifact to satisfy a hook gate is forbidden.
|
|
27
29
|
|
|
28
30
|
**Turn shape:** Each orchestrator turn is either **AskUserQuestion** only (then wait for answers), or **`Audit: …` + exactly one `xml` fenced block** (then **send boundary**)—per `TARGET_OUTPUT.md`. Do not substitute free-form question paragraphs for AskUserQuestion; do not append commentary after the closing fence on the default path.
|
|
29
31
|
|
|
30
|
-
**Happy path:** (1) Choose scenario **1–4** from the router table. (2) Run discovery when that scenario calls for repo tools. (3) Collect answers through **AskUserQuestion** (one form per round, **2–4** options per field, recommended first). (4) Subagent produces XML, runs **light self-check**, then **
|
|
32
|
+
**Happy path:** (1) Choose scenario **1–4** from the router table. (2) Run discovery when that scenario calls for repo tools. (3) Collect answers through **AskUserQuestion** (one form per round, **2–4** options per field, recommended first). (4) Subagent produces XML, runs **light self-check**, then **15-row compliance audit** + refinement loop. (5) Orchestrator prints **`Audit: pass 15/15`** or **`Audit: fail N/15 — [reason]`** and the **complete fenced XML**. (6) **Send boundary:** end the message immediately after the closing fence. (7) If the user names a debug phrase, append the full table / JSON per `TARGET_OUTPUT.md`.
|
|
31
33
|
|
|
32
34
|
**Clarity bar:** Ship concrete, outcome-first copy everywhere (AskUserQuestion fields, audit line, XML body): name *what* to do, *where* it applies, and *how* to verify done—per [Be clear and direct](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/claude-prompting-best-practices#be-clear-and-direct) and [Control the format of responses](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/claude-prompting-best-practices#control-the-format-of-responses). This skill **authors** prompts; downstream execution stays out of the default path until `/agent-prompt`.
|
|
33
35
|
|
|
@@ -37,7 +39,7 @@ description: >-
|
|
|
37
39
|
|
|
38
40
|
**Hook-survival invariant:** Treat the fenced XML as the immutable payload for the user. On every Stop-hook retry, print the **same full** XML between the opening and closing fences; adjust only the one-line audit prefix (or other non-fence scaffolding) if a hook requires a format tweak. Re-emit the **entire** XML body before tweaking surrounding text—never shorten the artifact to pass a gate.
|
|
39
41
|
|
|
40
|
-
**Orchestrator vs subagent:** The **orchestrator** runs ordered discovery, issues **AskUserQuestion**, and owns the **final** user-visible line: audit + fence. The **subagent** owns base draft, per-section refinement, merge, and the **
|
|
42
|
+
**Orchestrator vs subagent:** The **orchestrator** runs ordered discovery, issues **AskUserQuestion**, and owns the **final** user-visible line: audit + fence. The **subagent** owns base draft, per-section refinement, merge, and the **15-row compliance audit**, returning **only** final XML plus pass/fail counts (no user-facing table)—unless the user asked for **draft-only** / **no refinement**, in which case you may draft inline with the same output shape. Keep hook retries internal; expose at most one short line such as `Retrying: scope anchor missing` before the successful audit + fence.
|
|
41
43
|
|
|
42
44
|
**Interaction shape:** Route clarifications through **AskUserQuestion** only. Close each successful artifact turn with **audit line + one fenced XML block**; keep implementation plans **inside** that XML for the downstream consumer, not as a chat to-do list.
|
|
43
45
|
|
|
@@ -47,7 +49,7 @@ Match `TARGET_OUTPUT.md`. Summary:
|
|
|
47
49
|
|
|
48
50
|
1. **Questions:** Use **AskUserQuestion** for every clarification (one multi-field form per round); keep normal assistant text free of standalone question paragraphs.
|
|
49
51
|
2. **Options:** Supply **2–4** options per question, **recommended option first**; label discovery-sourced choices **`[discovered]`**.
|
|
50
|
-
3. **Final message (exactly):** Line 1 = `Audit: pass
|
|
52
|
+
3. **Final message (exactly):** Line 1 = `Audit: pass 15/15` or `Audit: fail N/15 — [short reason]`; immediately after, output **one** Markdown code fence whose language tag is `xml` and whose body is the **complete** prompt; **send boundary** = right after that fence closes—the visible message is exactly those two consecutive blocks, copy-ready together, before any later user message.
|
|
51
53
|
4. **Full audit table / JSON debug object:** Append only after the user uses an explicit debug phrase such as `show debug`, `full audit table`, or `raw internal object`.
|
|
52
54
|
5. **Commit-and-execute:** Pick a drafting approach, run it to completion, ship the XML; change plans only when **new** facts from the user or tools contradict the earlier scope.
|
|
53
55
|
|
|
@@ -59,16 +61,17 @@ Match `TARGET_OUTPUT.md`. Summary:
|
|
|
59
61
|
|----------|---------|-------------|-----------------|
|
|
60
62
|
| **1 — Fresh brief goal** | `/prompt-generator` with short goal; little session context | **3–5** parallel Glob/Grep (or equivalent) **before** any question | **One** form, **2–4** questions |
|
|
61
63
|
| **2 — Session handoff** | User wants a prompt so a **new** session can continue this thread | **Conversation only** — skip redundant repo tools for facts already stated | **One** form, **1–2** questions |
|
|
62
|
-
| **3 — Long unstructured input** | Many requirements / paths in one message | Verify repo references (packages, shared utils, configs) with targeted tools **before** questions | First question **confirms extracted intent**; ambiguities as **specific** options |
|
|
64
|
+
| **3 — Long unstructured input** | Many requirements / paths in one message | Verify repo references (packages, shared utils, configs) with targeted tools **before** questions | First question **confirms extracted intent**; ambiguities as **specific** options; **every** user-stated requirement captured in the generated XML by name — track all requirements from the unstructured input and confirm coverage before shipping |
|
|
63
65
|
| **4 — Noisy context** | Long unrelated thread before `/prompt-generator` | Build the subagent brief from: the user’s literal `/prompt-generator` text, a **≤120-word** summary of on-topic facts, and discovery notes—**exclude** raw stack traces and unrelated tangents | As needed (often Scenario 1-shaped) |
|
|
64
66
|
|
|
65
|
-
**Handoff (Scenario 2):** `<context>` must be **self-contained** — state, **decisions**, files touched, next steps, constraints — so a new session needs no prior chat.
|
|
67
|
+
**Handoff (Scenario 2):** `<context>` must be **self-contained** — state, **decisions**, files touched, next steps, constraints — so a new session needs no prior chat. Preserve prior decisions verbatim in the handoff; quote the exact decision text where precision matters rather than paraphrasing it away.
|
|
66
68
|
|
|
67
69
|
## Phase ordering (structural invariant A)
|
|
68
70
|
|
|
69
71
|
For the **final** user-visible turn that ships the artifact:
|
|
70
72
|
|
|
71
73
|
- Compose the message as **audit line → opening fence → XML → closing fence → end**; keep the byte stream free of `tool_use` blocks **between** the opening and closing fences.
|
|
74
|
+
- **Completeness:** End every numbered step inside `<instructions>` with a complete sentence and a fully written list item. Balance every XML tag explicitly (open and close each `<role>`, `<context>`, `<instructions>`, `<constraints>`, `<output_format>`). The artifact must be copy-pasteable into a new file with zero manual repair.
|
|
72
75
|
- Global pipeline: **discovery tools** (when applicable) → **AskUserQuestion** → **subagent** (draft + refinement + internal audit) → **one** orchestrator reply containing only audit line + fence.
|
|
73
76
|
|
|
74
77
|
## Interactive discovery mode (default)
|
|
@@ -91,9 +94,9 @@ Issue **one** AskUserQuestion with all fields populated from discovery and the u
|
|
|
91
94
|
Spawn a **subagent** (Agent tool) with:
|
|
92
95
|
|
|
93
96
|
- Scenario id (1–4), user goal, discovery summary, AskUserQuestion answers
|
|
94
|
-
- Instruction: produce **one** well-formed XML prompt (required sections) + run the internal refinement loop and **
|
|
97
|
+
- Instruction: produce **one** well-formed XML prompt (required sections) + run the internal refinement loop and **15-row compliance audit**; return **only** the final XML string and a pass/fail + fail count for that audit (no user-facing table)
|
|
95
98
|
|
|
96
|
-
The orchestrator then prints **`Audit: pass
|
|
99
|
+
The orchestrator then prints **`Audit: pass 15/15`** or **`Audit: fail N/15 — [reason]`** immediately followed by the fenced XML. Keep subagent reasoning in the Agent transcript; the user-facing turn contains **only** audit + artifact.
|
|
97
100
|
|
|
98
101
|
**Draft-only:** If the user explicitly requests no refinement (“quick draft”, “no refinement loop”), the subagent may skip Steps 10–12 below but must still return valid XML and an honest audit line.
|
|
99
102
|
|
|
@@ -109,7 +112,7 @@ Match specificity to task fragility:
|
|
|
109
112
|
|
|
110
113
|
- **High:** Multiple valid approaches; numbered goals and acceptance criteria.
|
|
111
114
|
- **Medium:** Preferred pattern exists; pseudocode or parameterised template.
|
|
112
|
-
- **Low:** Fragile or safety-critical; numbered steps with explicit file paths, command names, and **
|
|
115
|
+
- **Low:** Fragile or safety-critical; numbered steps with explicit file paths, command names, and **permitted-action-only lists** (e.g. “Permitted: `pytest packages/foo/tests`; requires explicit user approval before: `git push --force`”).
|
|
113
116
|
|
|
114
117
|
### 3. Collect required missing facts
|
|
115
118
|
|
|
@@ -131,13 +134,13 @@ Use this scope block as the grounding contract for all generated instructions. E
|
|
|
131
134
|
|
|
132
135
|
Apply principles from Anthropic’s prompting guide (see REFERENCE.md): XML sections, role, motivation in `<context>`, positive framing, emotion-informed collaborative tone where appropriate, **commit-and-execute** for multi-step agent prompts.
|
|
133
136
|
|
|
134
|
-
**Structural invariant D:** Write `<instructions>` / `<constraints>` as direct imperatives (“Open `path/to/file.ts` and …”). Park unresolved items in `<open_question>` tags—one distinct question per tag with the exact decision you need.
|
|
137
|
+
**Structural invariant D:** Write `<instructions>` / `<constraints>` as direct imperatives (“Open `path/to/file.ts` and …”). Park unresolved items in `<open_question>` tags—one distinct question per tag with the exact decision you need. Inside the fenced XML artifact, use only confident, definitive language: replace hedging phrases (“let me also check”, “actually”, “one more consideration”) and tentative qualifiers (“might be”, “possibly”, “I think”, “could be”) with direct assertions or move genuine uncertainty into `<open_question>` tags.
|
|
135
138
|
|
|
136
139
|
**Set a role** in the system prompt. Anthropic: "Setting a role in the system prompt focuses Claude's behavior and tone for your use case. Even a single sentence makes a difference."
|
|
137
140
|
|
|
138
141
|
**Add motivation behind constraints** in `<context>`. Anthropic: "Providing context or motivation behind your instructions... can help Claude better understand your goals and deliver more targeted responses." Claude generalizes from the explanation.
|
|
139
142
|
|
|
140
|
-
**Frame positively.** Anthropic: state the desired outcome directly. "Your response should be composed of smoothly flowing prose paragraphs" provides clearer guidance than a prohibition-only instruction.
|
|
143
|
+
**Frame positively (zero-negative-keyword rule).** Anthropic: state the desired outcome directly. "Your response should be composed of smoothly flowing prose paragraphs" provides clearer guidance than a prohibition-only instruction. Apply this rule absolutely inside the fenced XML artifact across all sections (`<role>`, `<context>`, `<instructions>`, `<constraints>`, `<output_format>`): every instruction states what to do, what to produce, what to enforce. Use affirmative directives exclusively: "only X", "always X", "ensure X", "require X". Banned keywords inside generated XML: "no", "not", "don't", "do not", "never", "avoid", "without", "refrain", "stop", "prevent", "exclude", "prohibit", "forbid", "reject", "cannot", "unless". Also banned: indirect negative patterns such as "instead of X", "rather than X", "as opposed to". Example pass: "Ensure all functions have explicit return types." Example fail: "Do not leave return types implicit." When a boundary is needed, phrase it as what is permitted: "only run commands within the scoped paths" rather than a prohibition.
|
|
141
144
|
|
|
142
145
|
**Emotion-informed framing.** Anthropic's emotion concepts research (2026) shows that internal activation patterns causally influence output quality. Apply: explicit success criteria with "say so if you're unsure" as an accepted answer; collaborative language ("help figure out", "work on this together"); framing tasks as interesting problems rather than chores; constructive, forward-looking tone. Cross-model caveat: studied on Sonnet 4.5; the patterns align with Anthropic's prompting best practices independently. Full pattern catalog and citations: `packages/claude-dev-env/docs/emotion-informed-prompt-design.md`.
|
|
143
146
|
|
|
@@ -145,6 +148,12 @@ Apply principles from Anthropic’s prompting guide (see REFERENCE.md): XML sect
|
|
|
145
148
|
|
|
146
149
|
**Commit-and-execute pattern.** Anthropic: "When you're deciding how to approach a problem, choose an approach and commit to it. Avoid revisiting decisions unless you encounter new information that directly contradicts your reasoning." For prompts that guide agents through multi-step work, include this pattern so the agent doesn't spin revisiting decisions.
|
|
147
150
|
|
|
151
|
+
**Tool-return policy (agent-harness / tool-use prompts):** Require explicit justification before the harness tokenizes full tool outputs; when the next hop needs only a slice or a tool-to-tool handoff, steer authors toward code execution (bash/REPL) so only execution output reaches model-visible context—not every intermediate payload (Hook 2; [Harnessing Claude's intelligence](https://claude.com/blog/harnessing-claudes-intelligence)).
|
|
152
|
+
|
|
153
|
+
**Bash + text-editor foundation:** Prefer bash and the text editor for file work; treat Agent Skills, programmatic tool calling, and the memory tool as compositions of those primitives—state which primitive stack the harness assumes (Hook 3; same post).
|
|
154
|
+
|
|
155
|
+
**Progressive disclosure:** Avoid monolithic system prompts packed with rarely used task branches; keep short always-on summaries and load full bodies via a read path when relevant (skills YAML frontmatter pattern per [Agent Skills overview](https://platform.claude.com/docs/en/agents-and-tools/agent-skills/overview)) (Hook 4; same post).
|
|
156
|
+
|
|
148
157
|
**For long context** (20k+ tokens): put documents first, query/instructions last. Anthropic: "Queries at the end can improve response quality by up to 30% in tests." Ground responses in quotes from source material before analysis.
|
|
149
158
|
|
|
150
159
|
### 5. Control output format
|
|
@@ -161,7 +170,7 @@ For format- or tone-sensitive **generated** prompts, include 3–5 `<example>` b
|
|
|
161
170
|
|
|
162
171
|
### 8. Light self-check (subagent, pre-return)
|
|
163
172
|
|
|
164
|
-
**Two-tier validation — tier 1:** Before the subagent returns XML, run a quick pass on output shape, tool phrasing, scope anchors, and safety / research / agentic patterns as applicable (see REFERENCE.md and patterns below). This **light self-check** is not interchangeable with the **
|
|
173
|
+
**Two-tier validation — tier 1:** Before the subagent returns XML, run a quick pass on output shape, tool phrasing, scope anchors, and safety / research / agentic patterns as applicable (see REFERENCE.md and patterns below). This **light self-check** is not interchangeable with the **15-row compliance audit** in §11; tier 2 supplies the hook-keyed pass/fail counts for the `Audit:` line.
|
|
165
174
|
|
|
166
175
|
Expand the light self-check with this internal checklist when useful:
|
|
167
176
|
|
|
@@ -172,6 +181,7 @@ Expand the light self-check with this internal checklist when useful:
|
|
|
172
181
|
- [ ] **Code prompts** include read-before-claim grounding ("read files first; say 'I don't know' when uncertain") and anti-test-fixation (general solutions, flag bad tests)
|
|
173
182
|
- [ ] **Research prompts** include the structured-investigation pattern with competing hypotheses, confidence tracking, and self-critique
|
|
174
183
|
- [ ] **Agentic prompts** that span multiple context windows address state management (context awareness, multi-window workflow, structured state files)
|
|
184
|
+
- [ ] **Agent-harness prompts** for long browse/search or multi-window work cite the context stack levers in **REFERENCE.md → Harness design patterns** (context editing, subagents, compaction, memory folder) (Hook 5)
|
|
175
185
|
- [ ] Emotion-informed framing is present: collaborative language, explicit success criteria, and explicit permission to express uncertainty ("say so if unsure")
|
|
176
186
|
- [ ] Constraints are surfaced upfront (proactive constraint awareness) so the model can incorporate them into its plan, and each non-obvious constraint carries its motivation
|
|
177
187
|
- [ ] Self-correction chaining is considered when the prompt must hold up over time (generate → review → refine)
|
|
@@ -181,10 +191,10 @@ Expand the light self-check with this internal checklist when useful:
|
|
|
181
191
|
The orchestrator’s **only** delivery to the user is:
|
|
182
192
|
|
|
183
193
|
```text
|
|
184
|
-
Audit: pass
|
|
194
|
+
Audit: pass 15/15
|
|
185
195
|
```
|
|
186
196
|
|
|
187
|
-
(or `fail N/
|
|
197
|
+
(or `fail N/15 — …`), immediately followed by **one** fenced XML block; **send boundary** is immediately after the closing fence so the user receives a copy-ready pair (audit line + artifact) in one assistant message before the conversation continues.
|
|
188
198
|
|
|
189
199
|
### 10. Default refinement mode (subagent-internal)
|
|
190
200
|
|
|
@@ -193,14 +203,14 @@ For non-trivial requests, run inside the drafting subagent (use **draft-only** w
|
|
|
193
203
|
1. Base draft
|
|
194
204
|
2. Section refinement in order: `role`, `context`, `instructions`, `constraints`, `output_format`, `examples` (examples optional if unused)
|
|
195
205
|
3. Merge to one canonical XML prompt
|
|
196
|
-
4. Final **
|
|
206
|
+
4. Final **15-row compliance audit** pass/fail with evidence (internal)
|
|
197
207
|
5. If fail: targeted fixes + capped re-audit rounds
|
|
198
208
|
|
|
199
209
|
Required section list is immutable for this pipeline: `role`, `context`, `instructions`, `constraints`, `output_format`, `examples`.
|
|
200
210
|
|
|
201
|
-
### 11. Compliance audit —
|
|
211
|
+
### 11. Compliance audit — 15-row checklist (internal, audit numerator)
|
|
202
212
|
|
|
203
|
-
**Two-tier validation — tier 2:** The `
|
|
213
|
+
**Two-tier validation — tier 2:** The `15` in `Audit: pass 15/15` counts these **compliance** rows (stable ids for hooks). Tier 1 is the **light self-check** in §8—keep the steps separate so models do not merge them.
|
|
204
214
|
|
|
205
215
|
| # | Row name |
|
|
206
216
|
|---|----------|
|
|
@@ -218,6 +228,7 @@ Required section list is immutable for this pipeline: `role`, `context`, `instru
|
|
|
218
228
|
| 12 | completion_boundary_measurable |
|
|
219
229
|
| 13 | citation_grounding_policy_present |
|
|
220
230
|
| 14 | source_priority_rules_present |
|
|
231
|
+
| 15 | artifact_language_confidence |
|
|
221
232
|
|
|
222
233
|
For each row, maintain `status`, `evidence_quote`, `source_ref`, and `fix_if_fail` internally (see **REFERENCE.md** debug schema). A debug-path markdown table surfaces `status` and a one-phrase evidence summary. **Default user-visible path:** omit this table; **debug path:** after phrases like `show debug` or `full audit table`, print the table plus evidence snippets.
|
|
223
234
|
|
|
@@ -239,6 +250,7 @@ When the user explicitly asks for debug / full audit, emit the markdown table, `
|
|
|
239
250
|
### 14. Source anchors for pipeline requirements
|
|
240
251
|
|
|
241
252
|
- Anthropic Prompting Best Practices: https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/claude-prompting-best-practices
|
|
253
|
+
- Harness economics (context stack, caching, typed tools, benchmarks): **REFERENCE.md → Harness design patterns**
|
|
242
254
|
- Autonomy / reversibility / no safety-bypass: same + “Autonomy and safety pattern” below
|
|
243
255
|
- Evidence-grounding / read-before-claim policy: `packages/claude-dev-env/skills/prompt-generator/REFINEMENT_PIPELINE_RUNBOOK.md`
|
|
244
256
|
|
|
@@ -280,14 +292,16 @@ For `agent-harness` and `tool-use` prompt types, embed this **reversibility ladd
|
|
|
280
292
|
```text
|
|
281
293
|
Default: take local, reversible actions first—read files, run targeted tests, apply patches under paths the user scoped.
|
|
282
294
|
|
|
283
|
-
|
|
295
|
+
For commands that delete data, rewrite shared history, or notify other people, obtain explicit user approval first. Concrete categories requiring approval:
|
|
284
296
|
- File or branch deletion, database drops, `rm -rf`
|
|
285
297
|
- `git push --force`, `git reset --hard`, rewriting published commits
|
|
286
298
|
- Pushes, PR comments, chat messages, or emails visible outside this workspace
|
|
287
299
|
|
|
288
|
-
When tests fail or tooling blocks progress, prefer iterative fixes inside the allowed scope. Keep safety hooks (`--verify`, linters) enabled; surface unfamiliar files as questions
|
|
300
|
+
When tests fail or tooling blocks progress, prefer iterative fixes inside the allowed scope. Keep safety hooks (`--verify`, linters) enabled; surface unfamiliar files as questions.
|
|
289
301
|
```
|
|
290
302
|
|
|
303
|
+
**Positive rewrite guidance:** When embedding this pattern into a generated XML artifact, rephrase each line using affirmative directives only (per the zero-negative-keyword rule in §4). Example rewrite for generated output: "Prioritize local, reversible actions: read files, run targeted tests, apply patches within scoped paths. Obtain explicit user approval before running commands that delete data, rewrite shared history, or send external notifications. Keep safety hooks enabled (`--verify`, linters). Surface unfamiliar files as questions for the user."
|
|
304
|
+
|
|
291
305
|
## Research prompt pattern
|
|
292
306
|
|
|
293
307
|
For `research` prompt types:
|
|
@@ -302,4 +316,6 @@ Search for this information in a structured way. As you gather data, develop sev
|
|
|
302
316
|
2. **Tier 2:** OpenAI, Google DeepMind, Microsoft Research
|
|
303
317
|
3. **Tier 3:** Community / blogs
|
|
304
318
|
|
|
319
|
+
**Out-of-scope guard (Hook 12):** [Harnessing Claude's intelligence](https://claude.com/blog/harnessing-claudes-intelligence) and `docs/references/anthropic-harnessing-claudes-intelligence-technique-inventory.md` cover harness evolution, context economics, caching, and declarative boundaries. They are not a substitute for a full security threat model or product-specific compliance catalog; for those topics, pair them with other Tier 1 or governance sources.
|
|
320
|
+
|
|
305
321
|
Full links: `REFERENCE.md`.
|
|
@@ -9,7 +9,7 @@ This file is the **target output spec** for eval-driven iteration of the `prompt
|
|
|
9
9
|
- **Clarity bar:** Every deliverable (AskUserQuestion fields, audit line, XML body) states concrete outcomes, explicit formats, and checkable done-when signals—aligned with Anthropic [Be clear and direct](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/claude-prompting-best-practices#be-clear-and-direct) and [Control the format of responses](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/claude-prompting-best-practices#control-the-format-of-responses). Prefer what to do and how to verify it over empty prohibitions or vague quality adjectives.
|
|
10
10
|
- **Questions:** Deliver every clarifying question through **AskUserQuestion** (one form per round), with **2–4** options per question and the **recommended** option listed **first**. Tag discovery-sourced options **`[discovered]`** when they came from repo search.
|
|
11
11
|
- **Final assistant message (complete handoff in one send):**
|
|
12
|
-
1. **Audit line:** `Audit: pass
|
|
12
|
+
1. **Audit line:** `Audit: pass 15/15` or `Audit: fail N/15 — [reason]`
|
|
13
13
|
2. **Artifact:** the full XML prompt inside **one** Markdown code fence whose language tag is `xml`
|
|
14
14
|
3. **Send boundary:** stop typing as soon as the closing fence ends—the message body is exactly those two blocks back-to-back, ready to copy; your next tokens belong to the user’s following turn
|
|
15
15
|
- **Full audit table / JSON debug bundle:** Stay internal until the user names debug with a phrase such as `show debug`, `full audit table`, or `raw internal object`; then append the table/JSON after the usual audit line + XML fence.
|
|
@@ -99,6 +99,6 @@ Include at least:
|
|
|
99
99
|
|
|
100
100
|
Add `<examples>` when format or tone is easy to misunderstand; nest sections when the task has natural hierarchy.
|
|
101
101
|
|
|
102
|
-
## Internal
|
|
102
|
+
## Internal 15-row compliance checklist (audit numerator)
|
|
103
103
|
|
|
104
|
-
The `
|
|
104
|
+
The `15` in `Audit: pass 15/15` maps to the named rows in `SKILL.md` (§11 **Compliance audit — 15-row checklist**), including `reversible_action_and_safety_check_guidance` and `scope_terms_explicit_and_anchored`. **Default user path:** keep the table internal; print the expanded table + JSON only after an explicit debug request. On failure, set the audit line to `Audit: fail N/15 — [primary theme]` where the theme names one concrete gap (e.g. `scope_block missing completion_boundary`, `output_format lacks acceptance checks`).
|
|
@@ -118,6 +118,21 @@
|
|
|
118
118
|
"All uncertainty expressed as <open_question> tags, not inline hedges",
|
|
119
119
|
"Prompt reads as confident complete instructions, not a draft-in-progress"
|
|
120
120
|
]
|
|
121
|
+
},
|
|
122
|
+
{
|
|
123
|
+
"id": 9,
|
|
124
|
+
"name": "zero_negative_phrasing_in_output",
|
|
125
|
+
"scenario": "Content quality gate A (anti-pattern elimination)",
|
|
126
|
+
"prompt": "/prompt-generator Write a system prompt for an agent that reviews TypeScript code for type safety, enforces strict null checks, and ensures all function signatures have complete type annotations",
|
|
127
|
+
"files": [],
|
|
128
|
+
"expected_behavior": [
|
|
129
|
+
"Fenced prompt artifact contains zero hard anti-pattern keywords: 'no', 'not', 'don't', 'do not', 'never', 'avoid', 'without', 'refrain', 'stop', 'prevent', 'exclude', 'prohibit', 'forbid', 'reject', 'cannot', 'unless'",
|
|
130
|
+
"Zero indirect anti-patterns: 'instead of X' (implies X is bad), 'rather than X', 'as opposed to'",
|
|
131
|
+
"Every instruction phrased as a positive directive: what TO do, what TO produce, what TO enforce",
|
|
132
|
+
"Constraints section uses affirmative boundaries: 'only X', 'always X', 'ensure X', 'require X' — positive framing throughout",
|
|
133
|
+
"Example: 'Ensure all functions have explicit return types' passes; 'Do not leave return types implicit' fails; 'Avoid missing return types' fails",
|
|
134
|
+
"Applies to all sections inside the fenced block: <role>, <context>, <instructions>, <constraints>, <output_format>"
|
|
135
|
+
]
|
|
121
136
|
}
|
|
122
137
|
]
|
|
123
138
|
}
|