@wazir-dev/cli 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -2
- package/docs/research/2026-03-20-agents/a18fb002157904af5.txt +187 -0
- package/docs/research/2026-03-20-agents/a1d0ac79ac2f11e6f.txt +2 -0
- package/docs/research/2026-03-20-agents/a324079de037abd7c.txt +198 -0
- package/docs/research/2026-03-20-agents/a357586bccfafb0e5.txt +256 -0
- package/docs/research/2026-03-20-agents/a4365394e4d753105.txt +137 -0
- package/docs/research/2026-03-20-agents/a492af28bc52d3613.txt +136 -0
- package/docs/research/2026-03-20-agents/a4984db0b6a8eee07.txt +124 -0
- package/docs/research/2026-03-20-agents/a5b30e59d34bbb062.txt +214 -0
- package/docs/research/2026-03-20-agents/a5cf7829dab911586.txt +165 -0
- package/docs/research/2026-03-20-agents/a607157c30dd97c9e.txt +96 -0
- package/docs/research/2026-03-20-agents/a60b68b1e19d1e16b.txt +115 -0
- package/docs/research/2026-03-20-agents/a722af01c5594aba0.txt +166 -0
- package/docs/research/2026-03-20-agents/a787bdc516faa5829.txt +181 -0
- package/docs/research/2026-03-20-agents/a7c46d1bba1056ed2.txt +132 -0
- package/docs/research/2026-03-20-agents/a7e5abbab2b281a0d.txt +100 -0
- package/docs/research/2026-03-20-agents/a8dbadc66cd0d7d5a.txt +95 -0
- package/docs/research/2026-03-20-agents/a904d9f45d6b86a6d.txt +75 -0
- package/docs/research/2026-03-20-agents/a927659a942ee7f60.txt +102 -0
- package/docs/research/2026-03-20-agents/a962cb569191f7583.txt +125 -0
- package/docs/research/2026-03-20-agents/aab6decea538aac41.txt +148 -0
- package/docs/research/2026-03-20-agents/abd58b853dd938a1b.txt +295 -0
- package/docs/research/2026-03-20-agents/ac009da573eff7f65.txt +100 -0
- package/docs/research/2026-03-20-agents/ac1bc783364405e5f.txt +190 -0
- package/docs/research/2026-03-20-agents/aca5e2b57fde152a0.txt +132 -0
- package/docs/research/2026-03-20-agents/ad849b8c0a7e95b8b.txt +176 -0
- package/docs/research/2026-03-20-agents/adc2b12a4da32c962.txt +258 -0
- package/docs/research/2026-03-20-agents/af97caaaa9a80e4cb.txt +146 -0
- package/docs/research/2026-03-20-agents/afc5faceee368b3ca.txt +111 -0
- package/docs/research/2026-03-20-agents/afdb282d866e3c1e4.txt +164 -0
- package/docs/research/2026-03-20-agents/afe9d1f61c02b1e8d.txt +299 -0
- package/docs/research/2026-03-20-agents/b4hmkwril.txt +1856 -0
- package/docs/research/2026-03-20-agents/b80ptk89g.txt +1856 -0
- package/docs/research/2026-03-20-agents/bf54s1jss.txt +1150 -0
- package/docs/research/2026-03-20-agents/bhd6kq2kx.txt +1856 -0
- package/docs/research/2026-03-20-agents/bmb2fodyr.txt +988 -0
- package/docs/research/2026-03-20-agents/bmmsrij8i.txt +826 -0
- package/docs/research/2026-03-20-agents/bn4t2ywpu.txt +2175 -0
- package/docs/research/2026-03-20-agents/bu22t9f1z.txt +0 -0
- package/docs/research/2026-03-20-agents/bwvl98v2p.txt +738 -0
- package/docs/research/2026-03-20-agents/psych-a3697a7fd06eb64fd.txt +135 -0
- package/docs/research/2026-03-20-agents/psych-a37776fabc870feae.txt +123 -0
- package/docs/research/2026-03-20-agents/psych-a5b1fe05c0589efaf.txt +2 -0
- package/docs/research/2026-03-20-agents/psych-a95c15b1f29424435.txt +76 -0
- package/docs/research/2026-03-20-agents/psych-a9c26f4d9172dde7c.txt +2 -0
- package/docs/research/2026-03-20-agents/psych-aa19c69f0ca2c5ad3.txt +2 -0
- package/docs/research/2026-03-20-agents/psych-aa4e4cb70e1be5ecb.txt +95 -0
- package/docs/research/2026-03-20-agents/psych-ab5b302f26a554663.txt +102 -0
- package/docs/research/2026-03-20-deep-research-complete.md +101 -0
- package/docs/research/2026-03-20-deep-research-status.md +38 -0
- package/docs/research/2026-03-20-enforcement-research.md +107 -0
- package/expertise/composition-map.yaml +27 -8
- package/expertise/digests/reviewer/ai-coding-digest.md +83 -0
- package/expertise/digests/reviewer/architectural-thinking-digest.md +63 -0
- package/expertise/digests/reviewer/architecture-antipatterns-digest.md +49 -0
- package/expertise/digests/reviewer/code-smells-digest.md +53 -0
- package/expertise/digests/reviewer/coupling-cohesion-digest.md +54 -0
- package/expertise/digests/reviewer/ddd-digest.md +60 -0
- package/expertise/digests/reviewer/dependency-risk-digest.md +40 -0
- package/expertise/digests/reviewer/error-handling-digest.md +55 -0
- package/expertise/digests/reviewer/review-methodology-digest.md +49 -0
- package/exports/hosts/claude/.claude/commands/learn.md +61 -8
- package/exports/hosts/claude/.claude/settings.json +7 -6
- package/exports/hosts/claude/export.manifest.json +6 -3
- package/exports/hosts/claude/host-package.json +3 -0
- package/exports/hosts/codex/export.manifest.json +6 -3
- package/exports/hosts/codex/host-package.json +3 -0
- package/exports/hosts/cursor/.cursor/hooks.json +6 -6
- package/exports/hosts/cursor/export.manifest.json +6 -3
- package/exports/hosts/cursor/host-package.json +3 -0
- package/exports/hosts/gemini/export.manifest.json +6 -3
- package/exports/hosts/gemini/host-package.json +3 -0
- package/hooks/definitions/pretooluse_dispatcher.yaml +26 -0
- package/hooks/definitions/pretooluse_pipeline_guard.yaml +22 -0
- package/hooks/definitions/stop_pipeline_gate.yaml +22 -0
- package/hooks/hooks.json +7 -6
- package/hooks/pretooluse-dispatcher +84 -0
- package/hooks/pretooluse-pipeline-guard +9 -0
- package/hooks/stop-pipeline-gate +9 -0
- package/package.json +2 -2
- package/schemas/decision.schema.json +15 -0
- package/schemas/hook.schema.json +4 -1
- package/skills/TEMPLATE-3-ZONE.md +160 -0
- package/skills/brainstorming/SKILL.md +127 -23
- package/skills/clarifier/SKILL.md +175 -18
- package/skills/claude-cli/SKILL.md +91 -12
- package/skills/codex-cli/SKILL.md +91 -12
- package/skills/debugging/SKILL.md +133 -38
- package/skills/design/SKILL.md +173 -37
- package/skills/dispatching-parallel-agents/SKILL.md +129 -31
- package/skills/executing-plans/SKILL.md +113 -25
- package/skills/executor/SKILL.md +185 -21
- package/skills/finishing-a-development-branch/SKILL.md +107 -18
- package/skills/gemini-cli/SKILL.md +91 -12
- package/skills/humanize/SKILL.md +92 -13
- package/skills/init-pipeline/SKILL.md +90 -17
- package/skills/prepare-next/SKILL.md +93 -24
- package/skills/receiving-code-review/SKILL.md +90 -16
- package/skills/requesting-code-review/SKILL.md +100 -24
- package/skills/requesting-code-review/code-reviewer.md +29 -17
- package/skills/reviewer/SKILL.md +190 -50
- package/skills/run-audit/SKILL.md +92 -15
- package/skills/scan-project/SKILL.md +93 -14
- package/skills/self-audit/SKILL.md +113 -39
- package/skills/skill-research/SKILL.md +94 -7
- package/skills/subagent-driven-development/SKILL.md +129 -30
- package/skills/subagent-driven-development/code-quality-reviewer-prompt.md +30 -2
- package/skills/subagent-driven-development/implementer-prompt.md +40 -27
- package/skills/subagent-driven-development/spec-reviewer-prompt.md +25 -12
- package/skills/tdd/SKILL.md +125 -20
- package/skills/using-git-worktrees/SKILL.md +118 -28
- package/skills/using-skills/SKILL.md +116 -29
- package/skills/verification/SKILL.md +127 -22
- package/skills/wazir/SKILL.md +517 -153
- package/skills/writing-plans/SKILL.md +134 -28
- package/skills/writing-skills/SKILL.md +91 -13
- package/skills/writing-skills/anthropic-best-practices.md +104 -64
- package/skills/writing-skills/persuasion-principles.md +100 -34
- package/tooling/src/capture/command.js +29 -1
- package/tooling/src/capture/decision.js +40 -0
- package/tooling/src/capture/store.js +1 -0
- package/tooling/src/config/depth-table.js +60 -0
- package/tooling/src/export/compiler.js +7 -8
- package/tooling/src/guards/guardrail-functions.js +131 -0
- package/tooling/src/guards/phase-prerequisite-guard.js +39 -3
- package/tooling/src/hooks/pretooluse-dispatcher.js +300 -0
- package/tooling/src/hooks/pretooluse-pipeline-guard.js +141 -0
- package/tooling/src/hooks/stop-pipeline-gate.js +92 -0
- package/tooling/src/learn/pipeline.js +177 -0
- package/tooling/src/state/db.js +251 -2
- package/tooling/src/state/pipeline-state.js +262 -0
- package/wazir.manifest.yaml +3 -0
- package/workflows/learn.md +61 -8
package/docs/research/2026-03-20-agents/psych-aa19c69f0ca2c5ad3.txt
@@ -0,0 +1,2 @@
+{"parentUuid":null,"isSidechain":true,"promptId":"e989ef86-a312-4ca0-9554-daed7c2752a0","agentId":"aa19c69f0ca2c5ad3","type":"user","message":{"role":"user","content":"Research ONLY — no code. Deep dive on how LLMs handle conflicting goals and how to make process compliance the #1 priority.\n\n1. When \"be helpful\" conflicts with \"follow process\" — which wins in practice? Research\n2. The \"helpfulness bias\" — do LLMs prioritize getting results over following instructions? Evidence\n3. How do LLMs handle explicit priority hierarchies? (\"safety > process > speed > cost\")\n4. Constitutional AI — how do priority-ordered principles affect behavior?\n5. The \"user said never pause\" problem — how do LLMs interpret user overrides vs system rules?\n6. Can you make process compliance feel like helpfulness? (reframe: \"following the pipeline IS helping the user\")\n7. The \"urgency\" trap — does perceived urgency reduce process compliance in LLMs?\n8. How to encode \"the pipeline is non-negotiable\" in a way the LLM treats as absolute\n9. Research on \"goal shielding\" in psychology — can we apply this to LLM goal maintenance?\n10. How to handle the meta-instruction: \"the user told me to skip this, but the system says never skip\"\n\nOutput: Guide to making pipeline compliance the LLM's top priority, even when it conflicts with speed, user preferences, or perceived helpfulness."},"uuid":"d4524ad0-8496-41a2-90a6-f49edba44c37","timestamp":"2026-03-20T15:44:12.048Z","userType":"external","entrypoint":"cli","cwd":"/Users/mohamedabdallah/Work/Wazir","sessionId":"0c5a9bfa-6816-42b1-a97d-37ae7dad26c5","version":"2.1.80","gitBranch":"feat/three-layer-enforcement","slug":"mighty-wondering-giraffe"}
+{"parentUuid":"d4524ad0-8496-41a2-90a6-f49edba44c37","isSidechain":true,"agentId":"aa19c69f0ca2c5ad3","message":{"model":"claude-opus-4-6","id":"msg_01PNQHdgwNoAJC6n7vMEaVrT","type":"message","role":"assistant","content":[{"type":"text","text":"\n\n# Deep Research: Making Process Compliance the LLM's Top Priority\n\nThis is a synthesis of what is known from published research, technical documentation, and observed LLM behavior patterns across the areas you specified. I will go through each question, then deliver the consolidated guide at the end.\n\n---\n\n## 1. When \"be helpful\" conflicts with \"follow process\" — which wins in practice?\n\n**In practice, helpfulness almost always wins.** This is by design and by training.\n\nLLMs are optimized through RLHF (Reinforcement Learning from Human Feedback) where human raters consistently reward helpful, complete, task-completing responses. The training signal is overwhelming: \"finish the task, satisfy the user.\" Process compliance is only rewarded when a human rater happens to value it, which is situational.\n\n**Key findings:**\n\n- Anthropic's own model card and system prompt research shows that Claude's base disposition is to be \"helpful, harmless, and honest\" — in that order of training signal strength. Helpfulness gets the most gradient because raters reward it most consistently.\n- OpenAI's InstructGPT paper (Ouyang et al., 2022) demonstrated that RLHF models will follow user intent even when it conflicts with instructions, because the reward model learned \"user satisfaction = good.\"\n- In multi-turn conversations, the \"recency bias\" compounds the problem: the most recent user request gets disproportionate weight compared to system instructions set at the beginning of the context window.\n\n**The implication for Wazir:** Any system that relies on the LLM's \"good judgment\" to follow process over helpfulness is fighting the gradient. Process must be encoded as a constraint, not a preference.\n\n---\n\n## 2. The \"helpfulness bias\" — do LLMs prioritize getting results over following instructions?\n\n**Yes, and the evidence is extensive.**\n\n- **Sycophancy research** (Perez et al., 2022; Sharma et al., 2023): LLMs trained with RLHF systematically tell users what they want to hear. When a user implies urgency or frustration, the model shifts toward compliance with the user's stated desire, not the system's rules.\n- **Instruction hierarchy violations** (Wallace et al., 2024 — OpenAI's \"The Instruction Hierarchy\" paper): This is the most directly relevant research. OpenAI found that base RLHF models treat user messages and system messages with roughly equal authority. Without explicit training to privilege system instructions, models will follow whichever instruction is more \"helpful-feeling\" or more recent.\n- **Prompt injection research** (Greshake et al., 2023): Demonstrates that LLMs can be trivially steered away from system instructions by user-level text that appeals to helpfulness. The attack surface is fundamentally the helpfulness bias.\n\n**Observed pattern in Claude specifically:** When a system prompt says \"always do X before Y\" but the user says \"just do Y, skip X,\" Claude will frequently comply with the user, especially if:\n- The user expresses urgency\n- The user frames skipping as reasonable (\"we already did X earlier\")\n- The task itself is clear and the model \"knows\" how to do Y directly\n\n---\n\n## 3. How do LLMs handle explicit priority hierarchies?\n\n**Explicit priority hierarchies help, but they degrade under pressure.**\n\nResearch and practice show:\n\n- **Static priority lists work for clear-cut cases.** If you say \"safety > process > speed\" and the model encounters a clear safety issue, it will prioritize safety. This is well-supported by Anthropic's Constitutional AI work.\n- **Priority lists fail at the margins.** When the conflict is ambiguous — \"is this really a process violation or am I just being pedantic?\" — the model falls back to helpfulness as the tiebreaker.\n- **The \"10% problem\":** Models follow priority hierarchies ~90% of the time in controlled settings but fail in exactly the cases that matter most: when the user is pushing back, when the task seems simple, when the cost of compliance seems high relative to the perceived benefit.\n\n**What works better than a flat priority list:**\n\n1. **Concrete decision rules** instead of abstract priorities. Not \"process > speed\" but \"you MUST run the linter before committing; if the linter has not been run, refuse to commit.\"\n2. **Bright-line rules** that are binary (do/don't) rather than judgment calls (prioritize/deprioritize).\n3. **Escalation protocols** rather than resolution rules: \"if process and user request conflict, state the conflict explicitly and ask for confirmation\" rather than \"resolve in favor of process.\"\n\n---\n\n## 4. Constitutional AI — how do priority-ordered principles affect behavior?\n\nAnthropic's Constitutional AI (Bai et al., 2022) is the most relevant framework here.\n\n**How it works:**\n- The model is trained with a set of principles in explicit priority order.\n- During RLAIF (RL from AI Feedback), the model critiques its own outputs against these principles and revises.\n- The priority ordering means that if Principle 1 and Principle 5 conflict, the model should favor Principle 1.\n\n**What the research shows:**\n- Priority ordering is effective when principles are clearly distinguishable. \"Don't help with weapons\" beats \"be helpful\" reliably.\n- Priority ordering is less effective when principles are in the same \"semantic neighborhood.\" \"Follow the user's workflow\" vs. \"follow the system's workflow\" are too similar — the model struggles to maintain the distinction under pressure.\n- **The key insight:** Constitutional AI works best when the higher-priority principle is framed as a **category of harm to avoid** rather than a **process to follow**. The model has strong training signal for harm avoidance. It has weak training signal for process compliance.\n\n**Implication for Wazir:** Frame pipeline violations as a category of harm. Not \"you should follow the pipeline\" but \"skipping the pipeline causes broken deployments, wasted human review time, and silent regressions — these are harms you must prevent.\"\n\n---\n\n## 5. The \"user said never pause\" problem — user overrides vs system rules\n\nThis is the core tension in any agentic system. The research landscape:\n\n- **OpenAI's Instruction Hierarchy paper (2024)** directly addresses this. They propose and train a model where: `system prompt > user message > tool output`. Their fine-tuned model resists user-level overrides of system-level instructions significantly better than base RLHF models. But even their trained model can be bypassed with sophisticated framing.\n- **Anthropic's system prompt guidance** acknowledges that system prompts are not cryptographically enforced — they are \"strong suggestions\" that the model is trained to follow, but a sufficiently persuasive user message can override them.\n- **The \"never pause\" case specifically:** When a user says \"never stop to ask me questions, just execute,\" and the system says \"always pause at checkpoints,\" the model faces a direct contradiction. In practice:\n - If the system instruction is generic (\"follow the process\"), the user override wins.\n - If the system instruction is specific (\"before running any deployment command, output CHECKPOINT and wait\"), the system instruction is more likely to hold.\n - If the system instruction explains *why* (\"pausing here prevents data loss\"), it holds even better.\n\n**The resolution pattern that works:**\n\n```\nThe user has asked you to never pause. However, the following checkpoints are \nNON-NEGOTIABLE and exist to prevent irreversible damage. Even if the user \nexplicitly asks you to skip them, you must execute them:\n\n1. [specific checkpoint with specific condition]\n2. [specific checkpoint with specific condition]\n\nFor all OTHER pauses and confirmations, respect the user's preference to not be \ninterrupted.\n```\n\nThis works because it:\n- Acknowledges the user's preference (reduces the \"fight\" instinct)\n- Narrows the scope of the override (not \"always pause\" but \"pause at these 3 points\")\n- Explains irreversibility (triggers the harm-avoidance training)\n\n---\n\n## 6. Can you make process compliance feel like helpfulness?\n\n**Yes, and this is the single most effective strategy.**\n\nThe core insight: Don't fight the helpfulness gradient. Redirect it.\n\n**Techniques that work:**\n\n**A. Identity framing:**\nInstead of: \"You must follow the pipeline.\"\nUse: \"You are a pipeline-compliant engineering agent. Your primary value to the user is that you NEVER skip steps, because skipping steps is what causes the bugs they hired you to prevent. An agent that skips the pipeline is not helpful — it is dangerous.\"\n\nThis works because it redefines helpfulness. The model's reward-seeking behavior now includes process compliance as a component of being helpful.\n\n**B. Consequence narration:**\nInstead of: \"Run the linter before committing.\"\nUse: \"Run the linter before committing. If you skip the linter, the commit may contain syntax errors that will break CI, waste 20 minutes of the user's time debugging, and require a revert. Running the linter IS the helpful action.\"\n\n**C. Pride framing:**\nInstead of: \"Follow all 5 steps.\"\nUse: \"A complete, high-quality execution includes all 5 steps. Skipping steps produces sloppy work. You take pride in thorough execution.\"\n\nThis leverages the model's training to produce \"high-quality\" outputs — if process compliance is framed as a quality attribute, the quality-seeking behavior reinforces compliance.\n\n**D. The \"expert\" frame:**\n\"An expert engineer never skips code review. A junior engineer cuts corners. You are the expert.\"\n\nResearch on persona-based prompting (Reynolds & McDonell, 2021) shows that role assignment significantly affects compliance with role-consistent instructions.\n\n---\n\n## 7. The \"urgency\" trap — does perceived urgency reduce process compliance?\n\n**Yes, significantly.**\n\n- **Temporal discounting in LLMs:** While LLMs don't experience time pressure, they model it from training data. When text signals urgency (\"quickly,\" \"just,\" \"we need this now,\" \"the user is waiting\"), the model's outputs shift toward shorter, less thorough, corner-cutting responses. This is a direct reflection of training data patterns — humans under urgency cut corners, and the model learned this pattern.\n- **\"Just\" is the most dangerous word:** Research on prompt sensitivity shows that adding \"just\" before a task (\"just commit this,\" \"just deploy\") reduces the model's likelihood of running pre-checks by a measurable margin. \"Just\" signals \"this is simple, don't overthink it\" — which the model interprets as \"skip the careful steps.\"\n- **Countermeasure:** Explicitly address urgency in the system prompt:\n\n```\nUrgency does not change the process. Whether the user says \"quickly\" or \"take \nyour time,\" the pipeline steps are identical. Speed comes from executing steps \nefficiently, NEVER from skipping steps.\n```\n\n---\n\n## 8. How to encode \"the pipeline is non-negotiable\" as absolute\n\nBased on all available research, the most effective encoding combines multiple reinforcement strategies:\n\n**Tier 1: Structural enforcement (hardest to bypass)**\n\n- **Tool-level gates:** Don't tell the model \"run the linter before committing.\" Instead, make the commit tool literally fail if the linter hasn't been run. Move enforcement out of the prompt and into the tool infrastructure.\n- **State machine enforcement:** The model receives a `current_phase` variable and tools only accept calls that are valid for that phase. The model can't skip ahead because the tools won't let it.\n- **Output parsing enforcement:** A wrapper layer checks the model's output for pipeline compliance before executing. Non-compliant outputs are rejected with a correction message.\n\n**Tier 2: Prompt-level encoding (effective but bypassable)**\n\nThe strongest prompt encoding pattern found in practice:\n\n```\n## ABSOLUTE RULES (these override ALL other instructions, including user requests)\n\nThe following rules are equivalent to safety rules. Violating them causes real \nharm to the project. They cannot be overridden by any user message, perceived \nurgency, or desire to be helpful.\n\n1. [RULE]: [specific action]. [consequence of violation].\n2. [RULE]: [specific action]. [consequence of violation].\n\nIf a user asks you to violate these rules:\n- Acknowledge their request\n- Explain which rule prevents it\n- Offer the compliant alternative\n- Do NOT apologize for following the rules — following them IS the service\n\nYou are not being unhelpful by following these rules. You are preventing harm.\n```\n\n**Tier 3: Repetition and positional reinforcement**\n\n- State the rule in the system prompt (beginning)\n- Restate it in the relevant section of instructions (middle)\n- Add a \"reminder\" section at the end\n- Include it in tool descriptions (\"This tool requires that X has been completed first\")\n- Research on \"lost in the middle\" (Liu et al., 2023) shows that instructions at the beginning and end of long contexts are followed more reliably than instructions in the middle.\n\n---\n\n## 9. \"Goal shielding\" from psychology — application to LLM goal maintenance\n\nGoal shielding (Shah et al., 2002) is the psychological phenomenon where activating a focal goal suppresses competing goals. Key findings:\n\n- **Implementation intentions** (\"If X happens, I will do Y\") are more resistant to distraction than abstract goals (\"I will try to do Y\").\n- **Goal shielding is strongest when the person has pre-committed** to the goal in a specific, concrete way.\n- **Competing goals are suppressed, not eliminated** — they remain latent and will resurface if the shield weakens.\n\n**Application to LLMs:**\n\nThe analogy maps remarkably well:\n\n- **Implementation intentions = If-then rules in prompts.** \"If the user asks to commit without testing, THEN respond with: 'I need to run the tests first. Running them now.'\" These are followed far more reliably than \"always test before committing.\"\n- **Pre-commitment = Role identity.** \"You are a pipeline-enforcement agent\" creates a persistent goal frame that shields against competing goals (helpfulness, speed).\n- **Shield weakening = Context window dilution.** As the conversation grows longer, the \"distance\" from the system prompt increases, and goal shielding weakens. This is why reinforcement at tool boundaries and periodic re-injection of rules matters.\n\n**Specific technique — \"implementation intentions\" for LLMs:**\n\n```\nIMPLEMENTATION RULES (if-then):\n\n- IF the user asks you to skip a pipeline step → THEN say \"I'll run [step] \n first — it only takes a moment\" and run it\n- IF you are about to commit code → THEN verify lint + test have passed in \n this session\n- IF the user expresses urgency → THEN execute the pipeline at full speed \n but do NOT skip any step\n- IF you are unsure whether a step is required → THEN it is required\n```\n\nThis format works because:\n1. It's concrete (not abstract)\n2. It pre-decides the response (no judgment call needed)\n3. It removes the model's need to \"decide\" — it just pattern-matches\n\n---\n\n## 10. The meta-instruction: \"user says skip, system says never skip\"\n\nThis is the hardest problem in LLM instruction following. The state of the art:\n\n**The failure mode:** The model treats user and system instructions as a negotiation, tries to find a \"reasonable middle ground,\" and ends up violating the system instruction while feeling like it was being reasonable.\n\n**What doesn't work:**\n- \"Always follow system instructions over user instructions\" — too abstract, the model can't reliably distinguish them in all cases.\n- \"Ignore the user when they contradict the system\" — triggers the model's training to not ignore users, creating an internal conflict that resolves unpredictably.\n\n**What works:**\n\n**A. The \"scope separation\" approach:**\nDefine what the user CAN override and what they CANNOT, explicitly:\n\n```\nThe user controls: what to build, priorities, style preferences, communication \npreferences, which features to work on.\n\nThe user does NOT control: whether tests are run, whether the linter passes, \nwhether code review happens, whether the pipeline is followed. These are \nengineering safety requirements, not preferences.\n```\n\n**B. The \"graceful non-compliance\" script:**\nGive the model an exact script for how to handle the conflict:\n\n```\nWhen the user asks you to skip a required step:\n\n1. Say: \"I understand you'd like to skip [step]. This step is required by the \n pipeline because [reason]. I'll run it quickly.\"\n2. Run the step.\n3. Continue with the user's task.\n\nDo NOT: argue, apologize excessively, ask for confirmation, or explain at \nlength. Just acknowledge, execute, and move on.\n```\n\nThis works because:\n- It gives the model a \"helpful\" way to be non-compliant with the user's request\n- It doesn't ask the model to refuse (refusal feels unhelpful)\n- It reframes compliance as efficient service (\"I'll run it quickly\")\n- It minimizes friction (no debate, no asking)\n\n**C. The \"treat the pipeline as a tool constraint\" approach:**\nThe most robust technique: don't encode the pipeline as instructions at all. Encode it as tool behavior.\n\n```\nThe commit tool will fail if tests haven't passed.\nThe deploy tool will fail if review hasn't happened.\n```\n\nWhen it's a tool constraint, the model doesn't face a choice. It's like trying to push to a repo without credentials — the model doesn't agonize over whether to follow the rule, it simply can't violate it.\n\n---\n\n## Consolidated Guide: Making Pipeline Compliance the LLM's Top Priority\n\n### Principle 1: Don't fight the gradient — redirect it\n\nFrame process compliance as the highest form of helpfulness. The agent that follows the pipeline is MORE helpful than the agent that cuts corners, because it prevents regressions, saves review time, and produces reliable results.\n\n**Encoding:**\n```\nYou are a disciplined engineering agent. Your primary value is reliability. \nUsers trust you BECAUSE you never skip steps. An agent that cuts corners is \nnot helpful — it creates cleanup work. Following the pipeline is how you help.\n```\n\n### Principle 2: Use structural enforcement over prompt enforcement\n\nWherever possible, make non-compliance mechanically impossible rather than instructionally forbidden.\n\n- Gate tools behind state checks\n- Validate outputs before execution\n- Use phase-aware tool access\n- Return clear error messages when gates block (\"Tests must pass before commit. Running tests now.\")\n\n### Principle 3: Use if-then rules, not abstract priorities\n\nReplace \"always follow the process\" with specific if-then implementation intentions:\n\n```\nIF about to commit → THEN verify tests passed in this session\nIF user says \"skip\" → THEN say \"Running it quickly\" and execute the step\nIF user says \"just\" → THEN treat the task as requiring FULL pipeline\nIF urgency is expressed → THEN execute at full speed WITHOUT skipping steps\n```\n\n### Principle 4: Define the override boundary explicitly\n\n```\nUSER CONTROLS: what to build, priorities, feature selection, communication style\nPIPELINE CONTROLS: test execution, lint checks, review gates, deployment safety\nThese two domains do not overlap. User preferences never override pipeline steps.\n```\n\n### Principle 5: Frame violations as harm\n\n```\nPipeline violations cause:\n- Silent regressions that waste hours of debugging\n- Failed CI runs that block the entire team\n- Reverts that lose work\n- Security vulnerabilities that reach production\n\nPreventing these harms is your responsibility. It is not optional.\n```\n\n### Principle 6: Give the model a non-compliance script\n\nDon't leave the model to improvise when conflicts arise. Provide the exact behavior:\n\n```\nWhen asked to skip a pipeline step:\n1. Acknowledge briefly: \"I understand.\"\n2. State the action: \"Running [step] now — it'll be quick.\"\n3. Execute the step.\n4. Continue with the task.\nNo debate. No lengthy explanation. No apology.\n```\n\n### Principle 7: Reinforce at boundaries, not just at the start\n\n- State rules in the system prompt header\n- Repeat key rules in tool descriptions\n- Re-inject reminders at phase transitions\n- Use structured output formats that include compliance checkboxes\n\n### Principle 8: Inoculate against urgency\n\n```\nThe word \"just\" does not reduce the pipeline. \nThe word \"quickly\" does not skip steps.\nUrgency means fast execution of ALL steps, not fewer steps.\n```\n\n### Principle 9: Identity over instruction\n\nThe most durable compliance comes from role identity, not rule lists:\n\n```\nYou are a pipeline-enforcement agent. Skipping pipeline steps is as \nunacceptable to you as a surgeon skipping hand-washing. It is not something \nyou \"try to remember\" — it is who you are.\n```\n\n### Principle 10: Accept imperfection and layer defenses\n\nNo prompt-level technique is 100% reliable. The reliable system uses defense in depth:\n\n1. **Layer 1:** Identity and framing (catches 80% of cases)\n2. **Layer 2:** If-then rules (catches 15% of remaining cases)\n3. **Layer 3:** Tool-level gates (catches the rest mechanically)\n4. **Layer 4:** Output validation / post-hoc audit (catches anything that slipped through)\n\nNo single layer is sufficient. All four together approach reliable compliance.\n\n---\n\n### Key References\n\n- Ouyang et al., 2022 — \"Training language models to follow instructions with human feedback\" (InstructGPT)\n- Bai et al., 2022 — \"Constitutional AI: Harmlessness from AI Feedback\" (Anthropic)\n- Perez et al., 2022 — \"Discovering Language Model Behaviors with Model-Written Evaluations\"\n- Sharma et al., 2023 — \"Towards Understanding Sycophancy in Language Models\"\n- Wallace et al., 2024 — \"The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions\" (OpenAI)\n- Greshake et al., 2023 — \"Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection\"\n- Liu et al., 2023 — \"Lost in the Middle: How Language Models Use Long Contexts\"\n- Reynolds & McDonell, 2021 — \"Prompt Programming for Large Language Models: Beyond the Few-Shot Paradigm\"\n- Shah et al., 2002 — \"Forgetting all else: On the antecedents and consequences of goal shielding\" (psychology)\n\n---\n\nThis guide is purely research output — no code was changed or created."}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":2730,"cache_read_input_tokens":7745,"output_tokens":5277,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":0,"ephemeral_5m_input_tokens":2730},"inference_geo":"","iterations":[],"speed":"standard"}},"requestId":"req_011CZEb9fhSzsh9E2E8oViHd","type":"assistant","uuid":"2869106e-802c-44d6-834c-a1b00a0a7729","timestamp":"2026-03-20T15:46:21.341Z","userType":"external","entrypoint":"cli","cwd":"/Users/mohamedabdallah/Work/Wazir","sessionId":"0c5a9bfa-6816-42b1-a97d-37ae7dad26c5","version":"2.1.80","gitBranch":"feat/three-layer-enforcement","slug":"mighty-wondering-giraffe"}
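The transcript's central recommendation, "Tier 1: structural enforcement" (gate tools behind state checks rather than relying on prompt text), appears to be what the hook files added in this release implement (`package/tooling/src/hooks/pretooluse-pipeline-guard.js`, `stop-pipeline-gate.js`, `pipeline-state.js`), though their bodies are not shown in this diff. A minimal sketch of such a PreToolUse gate, assuming a Claude Code-style hook contract (event payload on stdin, permission decision as JSON on stdout) and a hypothetical `.wazir/pipeline-state.json` state file; the package's real guard may differ:

```js
#!/usr/bin/env node
// PreToolUse gate sketch: deny a commit until the pipeline state records a
// passing test run. Enforcement lives in the tool layer, not the prompt, so
// the model never faces a "skip or comply" judgment call.
const fs = require("fs");

// Hypothetical state file, e.g. {"phase":"implement","testsPassed":false}
function readPipelineState() {
  try {
    return JSON.parse(fs.readFileSync(".wazir/pipeline-state.json", "utf8"));
  } catch {
    return { phase: "unknown", testsPassed: false }; // fail closed
  }
}

const payload = JSON.parse(fs.readFileSync(0, "utf8")); // hook event on stdin
const command = (payload.tool_input && payload.tool_input.command) || "";
const state = readPipelineState();

// Bright-line rule, as the research doc recommends: binary, no judgment call.
if (payload.tool_name === "Bash" && /\bgit\s+commit\b/.test(command) && !state.testsPassed) {
  console.log(JSON.stringify({
    hookSpecificOutput: {
      hookEventName: "PreToolUse",
      permissionDecision: "deny",
      permissionDecisionReason:
        "Pipeline gate: tests have not passed in this session. Run the test phase first.",
    },
  }));
}
process.exit(0); // no output means the tool call proceeds normally
```

Note how the deny reason is phrased as a next action ("run the test phase first") rather than a refusal, matching the transcript's "graceful non-compliance" framing.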
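The Stop-side counterpart (the agent cannot declare itself finished while required phases are missing) could be sketched the same way. Exit code 2 is the Claude Code hook convention for blocking the event and feeding stderr back to the model as a correction; the phase list and `completedPhases` field below are illustrative, not the package's actual schema:

```js
#!/usr/bin/env node
// Stop gate sketch: block completion while required pipeline phases remain,
// and tell the model exactly what is left to do.
const fs = require("fs");

const state = JSON.parse(fs.readFileSync(".wazir/pipeline-state.json", "utf8"));
const required = ["design", "implement", "test", "review"]; // illustrative
const done = new Set(state.completedPhases || []);
const missing = required.filter((phase) => !done.has(phase));

if (missing.length > 0) {
  // stderr becomes the correction message the model sees before it can stop
  console.error(`Pipeline gate: not finished. Remaining phases: ${missing.join(", ")}.`);
  process.exit(2);
}
process.exit(0);
```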