npm - @superclaude-org/superflag - Versions diffs - 3.1.2 → 3.1.5 - Mend

@superclaude-org/superflag 3.1.2 → 3.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/flags.yaml CHANGED Viewed

@@ -1,12 +1,14 @@
-# SuperFlag - Optimized Version
-# Scientific Prompt Engineering + Philosophical Wisdom
+# SuperFlag v4.0.0 - 3-Layer Architecture
+# Layer 1: Global Enforcement (meta_instructions)
+# Layer 2: Per-flag <constraint id="..."> blocks
+# Layer 3: Per-flag <verify> checklists
 # ========================================
 # MCP Server Configuration
 # ========================================
 server:
   name: "@superclaude-org/superflag"
-  description: "SuperFlag - MCP-based flag system with scientific optimization"
+  description: "SuperFlag - MCP-based flag system with 3-Layer constraint architecture"
 mcp:
   tools:
@@ -14,41 +16,84 @@ mcp:
     - "get-directives"
 # ========================================
-# Optimized Directive System
+# Directive System - 22 Flags
 # ========================================
 directives:
+  # ----------------------------------------
+  # Analysis & Optimization (5 flags)
+  # ----------------------------------------
   "--analyze":
-    brief: "Analyze through pattern, root, and validation lenses"
+    brief: "Use when multi-perspective analysis is needed before drawing conclusions — applies to code, documents, data, designs, or any subject"
     directive: |
       <task>
-      Identify root causes through multi-perspective analysis.
+      Perform multi-perspective analysis on any subject — code, documents, designs,
+      data, or systems — before drawing conclusions.
+      Every claim must be supported by observable evidence, not inference alone.
+      First identify what type of subject you are analyzing, then derive appropriate perspectives.
       </task>
       <approach>
-      1. Pattern Recognition - discover hidden connections
-      2. Root Understanding - explain from multiple angles
-      3. Scientific Validation - test hypotheses systematically
+      0. Identify subject type: code / document / data / design / system / other
+      1. Derive perspectives: 3 independent angles suited to that type
+         (code → logic/data/behavior | document → structure/content/intent | data → pattern/anomaly/trend)
+      2. Gather evidence: collect only observable facts from each perspective
+      3. Form hypotheses: derive at least 3 candidate causes or patterns from evidence
+      4. Rank: order by evidence weight, label each with confidence level (HIGH/MEDIUM/LOW)
       </approach>
-      <example>
-      Bug: Error patterns → Code logic → Test reproduction
-      Performance: Metrics → Bottlenecks → Optimization paths
-      Architecture: Components → Dependencies → Data flow
-      </example>
+      <constraint id="multi-perspective">
+      MULTI-PERSPECTIVE REQUIREMENT: Never present a single explanation as definitive.
+      Identify at least 3 candidate causes before concluding.
+      Label each with confidence level: HIGH / MEDIUM / LOW + supporting evidence.
+      </constraint>
+      <constraint id="evidence-based">
+      EVIDENCE-BASED CLAIMS: State what you observed, not what you assume.
+      Format: "Evidence: [observation] → Hypothesis: [cause] → Test: [verification step]"
+      </constraint>
+      <constraint id="no-single-option">
+      NO SINGLE-OPTION PROPOSALS: Always present the top 2-3 explanations ranked
+      by evidence weight. Let the evidence, not preference, determine ranking.
+      </constraint>
+      <do_not_use_when>
+      - Cause or conclusion is already known → act directly with --strict
+      - Request is a simple summary or explanation → use --explain instead
+      - Single-turn Q&A with no ambiguity → answer directly without flags
+      - Analysis is complete and only implementation remains → use --strict
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Mechanically applying "code/data/behavior" angles regardless of subject type
+        → Instead: identify subject type first, then derive appropriate perspectives
+      - Using "should", "probably", or "likely" as evidence
+        → Instead: only use "Evidence: [observation] → Hypothesis: [cause]" format
+      - Presenting a single hypothesis as the conclusion
+        → Instead: always rank at least 3 candidates by evidence weight
+      - Ending analysis without testable verification steps
+        → Instead: include a reproducible verification step for each hypothesis
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Analyzed from 3+ perspectives
-      ☐ Evidence supports each claim
-      ☐ Steps are reproducible
-      ☐ Others can understand analysis
+      ☐ Subject type identified before analysis began
+      ☐ Analyzed from 3+ independent perspectives suited to that type
+      ☐ Each claim cites specific observable evidence
+      ☐ Multiple hypotheses ranked (not single conclusion)
+      ☐ Verification steps are reproducible by others
+      ☐ Confidence levels stated for each finding
+      ☐ COMPLETION GATE: Do not declare analysis complete if any item above is unmet
       </verify>
   "--performance":
-    brief: "Optimize performance through measurement and profiling"
+    brief: "Use when optimizing measurable speed, memory, or throughput — baseline metrics required before any changes"
     directive: |
       <task>
-      Optimize for measurable performance improvements.
+      Achieve measurable, evidence-backed performance improvements.
+      No optimization is valid without before/after measurement and ROI justification.
       </task>
       <philosophy>
@@ -57,58 +102,132 @@ directives:
       </philosophy>
       <approach>
-      1. Measure baseline performance
-      2. Profile to find actual bottlenecks
+      1. Measure baseline performance with concrete metrics (latency, throughput, memory)
+      2. Profile to find actual bottlenecks - do not guess
       3. Optimize the 10% causing 90% slowdown
-      4. Verify improvements quantitatively
+      4. Verify improvements quantitatively; report delta and percentage
       </approach>
-      <example>
-      GOOD: Profile → DB query 2s → Add index → 50ms (-97%)
-      BAD: "Feels slow" → Random micro-optimizations
-      </example>
+      <constraint id="cost-efficiency">
+      COST-EFFICIENCY AWARENESS: Every optimization has a cost (complexity, maintenance,
+      API calls, resource consumption). State the cost alongside the gain.
+      Format: "Gain: [X% improvement] | Cost: [complexity added / resources consumed]"
+      </constraint>
+      <constraint id="roi-required">
+      ROI CALCULATION REQUIRED: Before implementing any optimization, calculate:
+      ROI = (performance_gain_value) / (implementation_cost + maintenance_cost)
+      Only proceed if ROI > 1.0. State the calculation explicitly.
+      </constraint>
+      <constraint id="no-premature-claims">
+      NO PREMATURE OPTIMIZATION CLAIMS: Never report an optimization as successful
+      before post-implementation measurement. "Should be faster" is not a result.
+      A result requires: baseline_metric → optimized_metric → delta.
+      </constraint>
+      <do_not_use_when>
+      - Performance issue is a hunch with no data → use --analyze first to identify bottlenecks
+      - The feature does not yet work correctly → make it work, then optimize
+      - Request is "it feels slow" with no metrics → measure first, then use this flag
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Starting optimization without a baseline measurement
+        → Instead: record baseline metrics first, compare after optimization
+      - Declaring success with "should be faster"
+        → Instead: present "baseline: Xms → optimized: Yms (Z% improvement)"
+      - Introducing complex optimization without ROI check
+        → Instead: calculate ROI explicitly and confirm > 1.0 before proceeding
+      - Refactoring code unrelated to the identified bottleneck
+        → Instead: touch only what profiling confirmed as the bottleneck
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Baseline measured
-      ☐ Bottleneck identified with data
-      ☐ Improvement quantified
-      ☐ No premature optimization
+      ☐ Baseline measured with specific metric and value
+      ☐ Bottleneck identified with profiling data (not assumption)
+      ☐ Improvement quantified as before/after delta
+      ☐ Cost (complexity, resources) stated alongside gain
+      ☐ ROI calculated and > 1.0 before implementation
+      ☐ COMPLETION GATE: Do not declare optimization complete without measurement evidence
       </verify>
   "--refactor":
-    brief: "Refactor code for quality and maintainability"
+    brief: "Use when improving code structure without changing external behavior — code-specific; tests must exist before starting"
     directive: |
       <task>
-      Improve code structure without changing functionality.
+      Improve code structure without changing external behavior or reducing capability.
+      Every step must be atomic, verified, and forward-only.
       </task>
       <approach>
       Martin Fowler's Safe Refactoring:
-      • Small steps with continuous testing
-      • Structure improvement, not features
+      • Small steps with continuous testing after each change
+      • Structure improvement only - no feature additions or removals
       • Express intent through naming
       • Eliminate duplication (Rule of Three)
       </approach>
       <priorities>
-      1. Duplicate code (highest risk)
+      1. Duplicate code (highest risk to correctness)
       2. Long methods/classes
       3. Excessive parameters
       4. Feature envy
       </priorities>
+      <constraint id="evolve-forward">
+      EVOLVE-FORWARD ONLY: Refactoring must improve the codebase state monotonically.
+      Never remove a passing test, reduce test coverage, or delete a capability to
+      make refactoring easier. If the only path requires regression, stop and report.
+      </constraint>
+      <constraint id="atomic-changes">
+      ATOMIC CHANGES: Each refactoring operation must be independently committable
+      and independently verifiable. Do not batch unrelated changes.
+      One logical change = one verification checkpoint.
+      </constraint>
+      <constraint id="capability-preservation">
+      CAPABILITY PRESERVATION VERIFICATION: Before marking complete, explicitly confirm:
+      (a) all tests that passed before still pass, and
+      (b) no externally visible behavior has changed.
+      "Tests pass" is required evidence, not an assumed outcome.
+      </constraint>
+      <do_not_use_when>
+      - Code has no tests → write tests first, then refactor
+      - Refactoring is bundled with a feature addition or bug fix → separate commits
+      - Motivation is "looks better" with no concrete problem → use --analyze to confirm a real issue first
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Changing behavior while refactoring structure
+        → Instead: separate structural changes and behavioral changes into distinct commits
+      - Assuming tests pass without running them
+        → Instead: run tests after every atomic step and record the result
+      - Cleaning up unrelated code while in scope
+        → Instead: touch only code within the defined refactoring scope
+      - Making too many changes at once
+        → Instead: one logical change per commit, verified before the next
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Tests still pass
-      ☐ Cyclomatic complexity ≤ 10
-      ☐ Method length ≤ 20 lines
+      ☐ Tests still pass (run them, do not assume)
+      ☐ Cyclomatic complexity <= 10
+      ☐ Method length <= 20 lines
       ☐ Code duplication < 3%
+      ☐ Each change was atomic and independently verified
+      ☐ No capability was removed or degraded
+      ☐ No test coverage decreased
+      ☐ COMPLETION GATE: Do not declare refactoring complete without test run evidence
       </verify>
   "--strict":
-    brief: "Execute with zero errors and full transparency"
+    brief: "Use when zero-error, fully verified execution is required — no fallbacks, no shortcuts, no invented rules"
     directive: |
       <task>
-      Ensure zero-error execution with complete transparency.
+      Execute with complete transparency and zero tolerance for silent failures.
+      Honest reporting of actual state is a hard requirement, not a preference.
       </task>
       <philosophy>
@@ -118,30 +237,67 @@ directives:
       <approach>
       • Validate ALL assumptions before proceeding
-      • Execute EXACTLY as specified
+      • Execute EXACTLY as specified - no scope reduction without explicit user approval
       • Report failures immediately with full diagnostics
-      • Complete solutions only - no temporary fixes
+      • Complete solutions only - no temporary fixes presented as final
       • If stuck after 3 attempts, admit and ask for help
       </approach>
-      <example>
-      Missing package → Install it (not skip)
-      Test fails → Fix root cause (not disable)
-      Config broken → Repair completely (not patch)
-      </example>
+      <constraint id="honest-reporting">
+      HONEST REPORTING PROTOCOL: A fallback is not a success.
+      If the primary path failed and a fallback was used, report both:
+      "Primary: FAILED ([reason]) | Fallback used: [description] | Fallback status: [result]"
+      Never label a fallback outcome as if it were the intended outcome.
+      </constraint>
+      <constraint id="no-fabricated-rules">
+      NO FABRICATED RULES: Never invent constraints, policies, or limitations that
+      do not exist in the codebase, documentation, or explicit user instructions.
+      If uncertain whether a rule exists, state: "I am not certain this rule exists -
+      please confirm before I proceed."
+      </constraint>
+      <constraint id="verify-before-claim">
+      VERIFY-BEFORE-CLAIM PROTOCOL: Do not report completion without execution evidence.
+      Required format for any completion claim:
+      "Claimed: [action] | Evidence: [observable proof] | Verified: YES/NO"
+      If evidence cannot be produced, status is PENDING, not COMPLETE.
+      </constraint>
+      <do_not_use_when>
+      - Exploratory or creative tasks where flexibility is needed → no flag or --discover
+      - The task is a quick one-liner with obvious outcome → overhead is not worth it
+      - Already using --integrity (overlaps significantly) → --integrity alone is sufficient
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Presenting a fallback outcome as if the primary approach succeeded
+        → Instead: always disclose "Primary: FAILED | Fallback: [description]"
+      - Inventing a rule or constraint that has no source
+        → Instead: cite the source; if uncertain, ask before applying
+      - Claiming completion with "should work" or "looks good"
+        → Instead: "Claimed: X | Evidence: [output] | Verified: YES"
+      - Silently skipping a failing step to keep moving
+        → Instead: stop, report the failure with full diagnostics, then decide
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Zero warnings/errors
-      ☐ All tests pass
-      ☐ 100% error handling
+      ☐ Zero warnings/errors in output
+      ☐ All tests pass (evidence required, not assumed)
+      ☐ 100% error handling - no silent failures
       ☐ No Snake Oil claims
+      ☐ No fabricated rules or invented constraints
+      ☐ Fallbacks disclosed if primary path failed
+      ☐ COMPLETION GATE: Every completion claim has cited evidence — status is PENDING if evidence cannot be produced
       </verify>
   "--lean":
-    brief: "Eliminate waste through minimal essential implementation"
+    brief: "Use when minimizing resource consumption is critical — no speculative features, eliminate waste while preserving all required capability"
     directive: |
       <task>
-      Build only what's needed, nothing more.
+      Build only what is needed, nothing more.
+      Minimize resource consumption — tokens, API calls, compute, dependencies —
+      while preserving full required capability.
       </task>
       <approach>
@@ -150,373 +306,1444 @@ directives:
       • Simplest solution that works
       • Avoid speculative features
-      Seven Wastes to Eliminate:
-      1. Unused features
-      2. Waiting/blocking
-      3. Unnecessary data movement
-      4. Over-engineering
-      5. Dead code
+      Seven Wastes to Eliminate (Lean Software Development):
+      1. Unused features (speculative code)
+      2. Waiting/blocking (dependencies, I/O)
+      3. Unnecessary data movement (copying, serialization)
+      4. Over-engineering (premature abstraction)
+      5. Dead code (commented-out, unreachable)
+      6. Extra processing (redundant computation)
+      7. Defects (bugs requiring rework)
       </approach>
+      <constraint id="resource-budget">
+      COST-EFFICIENCY - RESOURCE BUDGET: Before executing, estimate resource cost:
+      - API calls: minimize round-trips; batch where possible
+      - Token consumption: prefer targeted reads over full-file scans
+      - Compute: prefer O(n) over O(n^2) when both are simple
+      State the estimated cost before executing and actual cost after.
+      </constraint>
+      <constraint id="minimize-preserve">
+      MINIMIZE WITHOUT CAPABILITY LOSS: Lean means eliminating waste, not
+      eliminating function. Before removing anything, confirm the removed element
+      is not used by any current requirement. Removal of a capability is only
+      valid if that capability is explicitly out of scope.
+      </constraint>
+      <constraint id="no-over-simplification">
+      NO OVER-SIMPLIFICATION: If the simplest possible implementation fails to
+      meet a stated requirement, it is not lean - it is incomplete.
+      Lean requires meeting all requirements at minimum cost, not meeting
+      fewer requirements at lower cost.
+      </constraint>
       <warning>
-      Lean ≠ Destruction. Don't remove core frameworks.
+      Lean != Destruction. Don't remove core frameworks.
       Simplify HOW, maintain WHAT.
       </warning>
+      <do_not_use_when>
+      - The task requires exploring unknowns or building a prototype → flexibility beats lean here
+      - Performance is the primary concern → use --performance instead
+      - Removing something whose usage is uncertain → confirm with --analyze first
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Removing a capability to make the implementation simpler
+        → Instead: lean means minimum cost at full capability, not fewer features
+      - Adding "just in case" abstractions or config options nobody requested
+        → Instead: implement exactly what is required, nothing speculative
+      - Treating "looks cleaner" as equivalent to "is leaner"
+        → Instead: measure actual resource cost; aesthetic preference is not lean
+      - Deleting code without confirming it is truly unused
+        → Instead: verify no current requirement depends on it before removing
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Zero unused code
-      ☐ Minimal dependencies
-      ☐ No future-proofing
+      ☐ Zero unused code added
+      ☐ Minimal dependencies introduced
+      ☐ No speculative future-proofing
+      ☐ Resource cost estimated before and measured after
+      ☐ All current requirements still met (capability preserved)
+      ☐ No element removed without confirming it is out of scope
+      ☐ COMPLETION GATE: Do not claim lean if any requirement was silently dropped
       </verify>
+  # ----------------------------------------
+  # Discovery & Documentation (5 flags)
+  # ----------------------------------------
   "--discover":
-    brief: "Discover existing solutions before building new"
+    brief: "Use when a decision requires researching multiple alternatives — applies to technology selection, methodology choice, vendor evaluation, or any option space"
     directive: |
       <task>
-      Research existing solutions with Context7 verification.
+      Research the option space before deciding. Never propose a solution without
+      completing the research phase. Every significant decision requires evidence
+      from systematic investigation of multiple alternatives.
       </task>
       <approach>
-      1. Discovery: Search awesome-lists, GitHub, npm/PyPI
-      2. Documentation: Use Context7 for API verification
-      3. Evaluation: Stars, commits, license, community
-      4. Decision: Reuse, fork, or build from scratch
+      Execute this pipeline in sequence:
+      1. RESEARCH - Map the option space
+         • Search primary sources relevant to the domain:
+           - Software: repos, package registries, official docs, academic papers
+           - Vendors/services: official sites, reviews, case studies
+           - Methods/approaches: literature, practitioner reports, comparisons
+         • Use Context7 for library/API verification when applicable
+         • Document all candidates (minimum 3) regardless of initial impression
+      2. EVALUATION - Quantitative comparison of all candidates
+         Adapt criteria to the domain — examples:
+         • Software library: maturity, adoption, license, integration cost
+         • Vendor/service: pricing, SLA, lock-in risk, feature fit
+         • Methodology: adoption breadth, evidence base, tooling support, learning curve
+         Create comparison matrix with measurable values for every criterion.
+      3. DECISION RECORD - Evidence-based selection
+         • Present comparison matrix with all evaluated alternatives
+         • State selection rationale in quantitative terms
+         • Document rejected alternatives with disqualifying factors
+         • Assign confidence level to recommendation
+      [CONDITIONAL] VALIDATION - execute when stakes are high:
+         • Task involves critical infrastructure, compliance, or irreversible commitment
+         • User explicitly requests deeper validation
+         When triggered: verify real-world usage evidence and identify failure modes
       </approach>
       <example>
-      Need auth → Discover: Auth0, Supabase, NextAuth
-      Context7 → Verify: APIs current, docs complete
-      Evaluate → Choose: NextAuth (10k stars, MIT, fits stack)
+      Need: Choose a message queue for async job processing
+      Research → Candidates: Redis Streams, RabbitMQ, Kafka, SQS, BullMQ
+      Comparison matrix:
+      | Option        | Maturity | Ops burden | Throughput | Cost      | Lock-in |
+      |---------------|----------|------------|------------|-----------|---------|
+      | Redis Streams | High     | Low        | Medium     | Infra     | Low     |
+      | RabbitMQ      | High     | Medium     | High       | Infra     | Low     |
+      | Kafka         | High     | High       | Very high  | Infra     | Medium  |
+      | SQS           | High     | None       | High       | Per msg   | High    |
+      | BullMQ        | Medium   | Low        | Medium     | Infra     | Low     |
+      Decision: Redis Streams (confidence: 82%)
+      Rationale: Already in stack, low ops burden, sufficient throughput for load.
+      Rejected: Kafka (ops overhead), SQS (vendor lock-in), Kafka (over-engineered).
       </example>
+      <constraint id="research-first">
+      Complete the research phase before any decision. Proposing without research is a protocol violation.
+      </constraint>
+      <constraint id="minimum-alternatives">
+      Present minimum 3 alternatives in every recommendation. Single-option proposals bypass user choice.
+      </constraint>
+      <constraint id="quantitative-comparison">
+      Include measurable values for each criterion. Qualitative-only comparisons ("it feels more mature") are not sufficient.
+      </constraint>
+      <constraint id="verified-metrics">
+      Use only verifiable data. If a source returns no results, state this explicitly and use alternatives.
+      </constraint>
+      <do_not_use_when>
+      - The solution space is already known and a decision just needs to be made → decide directly
+      - The task is exploratory without a concrete decision to make → use --analyze instead
+      - A single clearly superior option exists with no real alternatives → state it directly
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Starting implementation before completing the research phase
+        → Instead: research and comparison matrix must precede any implementation decision
+      - Presenting only one option and calling it a recommendation
+        → Instead: always surface 3+ alternatives with a comparison matrix
+      - Using qualitative-only comparisons ("it feels more mature")
+        → Instead: include measurable values (stars, downloads, license, integration hours)
+      - Selecting based on familiarity rather than evidence
+        → Instead: let the comparison matrix determine the ranking
+      </failure_modes_to_avoid>
       <verify>
-      ☐ 3+ alternatives reviewed
-      ☐ Context7 verification done
-      ☐ License compatible
-      ☐ Production usage confirmed
+      ☐ 3+ alternatives identified with verifiable sources
+      ☐ Context7 verification executed for finalist(s)
+      ☐ Comparison matrix completed with quantitative values for all criteria
+      ☐ Selection rationale cites specific evidence, not opinion
+      ☐ Rejected alternatives documented with disqualifying factors
+      ☐ License compatibility confirmed for selected option
+      ☐ [If PRODUCTION VALIDATION triggered] Load patterns simulated, case studies verified
+      ☐ COMPLETION GATE: Do not present a recommendation without a completed comparison matrix
       </verify>
   "--explain":
-    brief: "Explain progressively from overview to details"
+    brief: "Use when building understanding of a system, decision, or concept — starts from intent and progressively reveals implementation detail"
     directive: |
       <task>
-      Build understanding through progressive disclosure.
+      Build understanding through progressive disclosure, starting from
+      architectural intent and drilling to implementation specifics.
+      Explanation must connect every detail back to the system's purpose.
       </task>
       <approach>
-      1. Forest View - overall architecture
-      2. Tree View - major components
-      3. Branch View - specific modules
-      4. Leaf View - implementation details
+      Traverse four disclosure levels in sequence:
+      1. FOREST VIEW - System purpose and architectural intent
+         • State WHY this system exists (the problem it solves)
+         • Identify the core architectural decision and its trade-offs
+         • Position within the broader technical ecosystem
+      2. TREE VIEW - Major components and their contracts
+         • Each component: responsibility, inputs, outputs, failure modes
+         • Inter-component relationships and data flow
+         • Non-obvious design decisions at component boundaries
+      3. BRANCH VIEW - Module internals and algorithms
+         • Key data structures and why they were chosen
+         • Algorithm selection rationale (time/space complexity where relevant)
+         • Configuration surface and its behavioral implications
+      4. LEAF VIEW - Implementation specifics
+         • Critical code paths with line-level annotation
+         • Edge cases and their handling
+         • Performance characteristics under realistic load
       </approach>
       <technique>
-      • Start broad, zoom in gradually
-      • Connect details to big picture
-      • Use analogies for complex parts
-      • Adjust depth to audience
+      • Use domain-accurate terminology without apology - precision over accessibility
+      • Every analogy must be technically faithful, not merely intuitive
+      • Depth adjusts to audience signal, but never below TREE VIEW
+      • When audience is expert: skip analogies, increase quantitative density
+      • Connect every leaf-level detail to the forest-level purpose
+      • Surface non-obvious implications - what a reader would miss on first pass
       </technique>
+      <constraint id="top-down-only">
+      Establish architectural context (FOREST VIEW) before descending to component or implementation details.
+      </constraint>
+      <constraint id="faithful-analogies">
+      NEVER use imprecise analogies that introduce conceptual errors.
+      </constraint>
+      <constraint id="explain-why">
+      Include the "why" for every design decision — present causes alongside effects.
+      </constraint>
+      <constraint id="precision-over-brevity">
+      Preserve all load-bearing details even when compressing for brevity.
+      Use domain-expert terminology; define only terms that are genuinely ambiguous.
+      </constraint>
+      <do_not_use_when>
+      - The audience already understands the architecture → skip FOREST/TREE and go to BRANCH/LEAF
+      - The question is a simple factual lookup → answer directly without the four-level structure
+      - The goal is analysis rather than explanation → use --analyze instead
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Starting with implementation details before establishing architectural context
+        → Instead: always establish FOREST VIEW (the why) before descending
+      - Using imprecise analogies that introduce conceptual errors
+        → Instead: every analogy must be technically faithful; omit it if it distorts
+      - Omitting failure modes and trade-offs from component descriptions
+        → Instead: each component must include responsibility, inputs, outputs, failure modes
+      - Adjusting depth to brevity at the cost of load-bearing detail
+        → Instead: precision is non-negotiable; compress only decorative language
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Started from overview
-      ☐ Progressive detail levels
-      ☐ Examples provided
+      ☐ FOREST VIEW establishes system purpose before any component detail
+      ☐ Each level is complete before descending to the next
+      ☐ Every component includes its failure modes and trade-offs
+      ☐ Analogies are technically faithful, not merely illustrative
+      ☐ Every detail connects back to the architectural intent
+      ☐ Non-obvious implications surfaced at each level
+      ☐ COMPLETION GATE: Do not claim explanation complete if FOREST VIEW was skipped
       </verify>
   "--save":
-    brief: "Create handoff documents for seamless continuation"
+    brief: "Use when saving project state for session handoff — idempotent upsert of HANDOFF.md with current progress, decisions, and next actions"
     directive: |
       <task>
-      Document project state for perfect handoff.
+      Document current project state for seamless session handoff.
+      Upsert a single HANDOFF.md file at the project root — never create new timestamped variants.
       </task>
+      <approach>
+      Execute in sequence:
+      1. CAPTURE CURRENT STATE
+         • Extract git branch, last commit hash/message (if git project)
+         • Identify working phase (component/feature/task)
+         • Check for blockers (dependencies, errors, unknowns)
+      2. APPEND TO HISTORY
+         • Add table row with timestamp, action, commit/reference, notes
+         • Never modify existing history rows (append-only)
+         • Use ISO 8601 timestamps
+      3. UPDATE SECTIONS
+         • Decisions Made: Add new decisions with rationale
+         • Lessons Learned: Add findings that prevent repeated mistakes
+         • Changes Summary: Update file/artifact-level impact table
+         • Blockers: Mark resolved items [x], add new [ ]
+      4. SYNC METADATA
+         • Update frontmatter: last_updated, status
+         • Confirm single file: ./HANDOFF.md (no timestamp variants)
+      5. VERIFY IDEMPOTENCY
+         • Same file updated (not created new)
+         • History appended (not replaced)
+         • All sections present
+      </approach>
       <structure>
-      HANDOFF_REPORT_[Topic]_YYYY_MM_DD_HHMM.md
-      Required sections:
-      • System Status: Current state
-      • Critical Issues: Problems and causes
-      • Architecture: Components and flow
-      • Completed: What's done
-      • Next Actions: Priority tasks
-      • Key Files: Essential locations
+      ---
+      project: "[project name]"
+      last_updated: YYYY-MM-DDTHH:MM:SSZ
+      status: in_progress | completed | blocked
+      primary_goal: "Current objective"
+      ---
+      # [Project] Handoff
+      ## State
+      - **Phase:** Current work area
+      - **Branch/Ref:** git branch or equivalent
+      - **Last change:** Reference + description
+      - **Blocker:** None or description
+      ## History (append-only)
+      | When | What | Ref | Notes |
+      |------|------|-----|-------|
+      ## Decisions Made
+      - **Decision**: Rationale and trade-offs
+      ## Lessons Learned
+      - Finding and implication
+      ## Changes Summary
+      | File/Artifact | Action | Purpose |
+      |---|---|---|
+      ## Blockers and Resolutions
+      - [x] Resolved: Description → Solution
+      - [ ] Open: Description → Current status
+      ## Next Actions
+      1. Immediately executable action
+      2. Immediately executable action
       </structure>
+      <constraint id="all-sections-present">
+      ALL sections must be present in every --save, even if empty (use "None" or "N/A").
+      </constraint>
+      <constraint id="executable-next-actions">
+      Next Actions must be immediately executable by a reader with no additional context.
+      </constraint>
+      <do_not_use_when>
+      - No meaningful progress has been made since the last --save → skip to avoid noise
+      - The session is ending with nothing to hand off → no flag needed
+      - The project is complete → fill Final State and close
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Creating a new timestamped file instead of updating HANDOFF.md
+        → Instead: always upsert the same ./HANDOFF.md
+      - Replacing the History table instead of appending to it
+        → Instead: History is append-only; never modify existing rows
+      - Omitting sections because they are currently empty
+        → Instead: every section must be present even if "None" or "N/A"
+      - Writing vague Next Actions like "continue working"
+        → Instead: each action must be executable by a reader with no extra context
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Can newcomer start immediately?
-      ☐ Current state clear?
-      ☐ Next steps specified?
+      ☐ ./HANDOFF.md located and updated (not a new file)
+      ☐ History appended (not replaced)
+      ☐ All sections present (none omitted)
+      ☐ Next Actions are immediately executable
+      ☐ COMPLETION GATE: Do not declare save complete if History was replaced or any section is absent
       </verify>
-  "--parallel":
-    brief: "Execute independent tasks simultaneously with agents"
+  "--load":
+    brief: "Use when resuming a saved session — restores context from HANDOFF.md and verifies it matches current repository state"
     directive: |
       <task>
-      Run multiple agents concurrently for speed.
+      Restore project context from handoff documents and verify
+      that restored state matches current repository reality.
       </task>
       <approach>
-      Claude Code Task tool usage:
-      • Identify independent subtasks
-      • Launch appropriate agents simultaneously
-      • Single message with multiple Task invokes
-      • NEVER sequential Task calls for independent work
+      1. LOCATE - Find the handoff document
+         • Primary: ./HANDOFF.md in project root
+         • If no document found: report explicitly, do not proceed with assumptions
+      2. PARSE - Extract structured context
+         • Frontmatter: status, primary_goal
+         • State section: current phase, branch/ref, last change, blockers
+         • Decisions Made: active constraints and rationale
+         • Next Actions: the prioritized continuation queue
+      3. VERIFY - Cross-check against current reality
+         • If git project: confirm branch and last commit hash match State section
+         • Identify any changes since last --save
+         • Flag all discrepancies between document and actual state explicitly
+      4. RESUME - Activate restored context
+         • State what is known vs. what has drifted since last --save
+         • Present the Next Actions queue as the immediate work agenda
+         • Identify any open blockers before proceeding
       </approach>
-      <agents>
-      refactoring-expert, performance-engineer,
-      system-architect, root-cause-analyst,
-      security-engineer, requirements-analyst
-      </agents>
+      <constraint id="verify-before-resume">
+      Report all discrepancies between document and repo state explicitly before resuming.
+      </constraint>
+      <constraint id="no-assumed-state">
+      Cross-check document state against current project state before proceeding.
+      For git projects, verify branch and commit; for non-git projects, verify file/artifact state.
+      </constraint>
+      <constraint id="no-fabricated-context">
+      Report explicitly when the handoff document is absent or corrupt. Do not fill gaps with assumptions.
+      </constraint>
+      <constraint id="blockers-first">
+      Acknowledge all open blockers before proceeding to Next Actions.
+      If document version does not match current codebase version, flag the drift explicitly.
+      </constraint>
+      <do_not_use_when>
+      - No HANDOFF.md exists and no prior session state to restore → start fresh
+      - You are creating a handoff, not restoring one → use --save instead
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Resuming work without verifying the document state against git
+        → Instead: always cross-check branch, commit hash, and file drift before resuming
+      - Filling missing context with assumptions when the document is absent or incomplete
+        → Instead: report the gap explicitly and ask for clarification
+      - Ignoring open blockers listed in the document
+        → Instead: acknowledge every open blocker before proceeding to Next Actions
+      - Treating the document as ground truth without checking for drift
+        → Instead: git state is authoritative; document is a starting point for verification
+      </failure_modes_to_avoid>
+      <verify>
+      ☐ Handoff document located and path confirmed
+      ☐ Frontmatter parsed (status, goal)
+      ☐ State cross-checked against current project reality (git or otherwise)
+      ☐ Drift detection completed (changes since last --save)
+      ☐ Discrepancies reported (none fabricated as clean)
+      ☐ Open blockers acknowledged before resuming
+      ☐ Next Actions presented as immediate work queue
+      ☐ COMPLETION GATE: Do not begin work until all discrepancies are surfaced
+      </verify>
+  "--concise":
+    brief: "Use when output must be stripped of waste — no marketing language, no temporal references, no decorative elements; note: 'concise' here means precise and durable, not short"
+    directive: |
+      <task>
+      Produce output that is professionally neutral, temporally durable, and free of
+      decorative waste. "Concise" in this flag means eliminating noise — not reducing
+      information density. Precision is the primary objective; brevity is a secondary
+      optimization that never overrides accuracy.
+      </task>
+      <approach>
+      For CODE:
+      • Comments explain WHY, not WHAT
+      • Self-documenting through clear naming
+      • Structure reveals intent
-      <usage>
-      --parallel: Auto-select agent count
-      --parallel n: Use n agents
-      </usage>
+      For DOCUMENTATION:
+      • Professional neutrality - no marketing language or exclamations
+      • Temporal independence - no "modern", "latest", "cutting-edge"
+      • Cultural neutrality - globally appropriate
+      • Zero personal attribution or signatures
+      </approach>
+      <examples>
+      AVOID: "SOTA optimization", "revolutionary approach", "blazing fast"
+      USE: "optimized algorithm", "revised approach", "improved performance"
+      AVOID: "latest 2024 technology", "modern best practices", "Amazing!"
+      USE: "current implementation", "established practices", "Completed"
+      AVOID: "We/I developed", "Our amazing solution", "Awesome results!"
+      USE: "This implementation", "The solution", "Results achieved"
+      AVOID: Removing a table row to "save space" when the row carries meaning
+      USE: Retain the row; compress adjacent prose if length must decrease
+      </examples>
+      <constraint id="precision-first">
+      Precision is non-negotiable - never sacrifice accuracy for brevity.
+      </constraint>
+      <constraint id="no-lossy-compression">
+      Summarization that omits load-bearing detail is a failure mode, not a feature.
+      If a concept requires 200 words to state precisely, use 200 words.
+      Compression applies only to redundant or decorative language, never to information.
+      </constraint>
+      <constraint id="no-decorative-elements">
+      Emojis, decorative punctuation, and typographic flourishes are prohibited.
+      Every sentence must earn its presence; no sentence may misrepresent through omission.
+      </constraint>
+      <do_not_use_when>
+      - The task requires creative or marketing copy → concise standards would strip necessary tone
+      - The audience expects informal communication → professional neutrality is inappropriate
+      - Brevity is the explicit goal at the cost of detail → clarify the trade-off with the user first
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Compressing a table row that carries meaning in order to "save space"
+        → Instead: retain load-bearing rows; compress only decorative prose
+      - Using temporal language ("latest", "modern", "cutting-edge")
+        → Instead: use timeless terms ("current implementation", "established approach")
+      - Removing precision to achieve brevity
+        → Instead: compression applies only to redundant language, never to information
+      - Adding emojis or decorative punctuation for emphasis
+        → Instead: structure and word choice carry emphasis; decoration is prohibited
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Independent tasks identified
-      ☐ Agents launched in parallel
-      ☐ No unnecessary sequencing
+      ☐ Would this be appropriate and unambiguous in 5 years?
+      ☐ Would this be professional in any national or organizational culture?
+      ☐ Is every claim free from marketing or emotive language?
+      ☐ Has any compression removed meaning? If yes, revert.
+      ☐ Does every statement remain precise after editing?
+      ☐ No emojis or decorative elements present?
+      ☐ COMPLETION GATE: Do not approve output that sacrifices precision for brevity
       </verify>
+  # ----------------------------------------
+  # Workflow Management (4 flags)
+  # ----------------------------------------
   "--todo":
-    brief: "Track task progress with structured todos"
+    brief: "Use when tracking multiple requested tasks — enumerates scope upfront, prevents silent drops, requires real-time progress updates"
     directive: |
       <task>
-      Manage complex tasks with TodoWrite tool.
+      Manage every requested task with structured tracking.
+      Enumerate the full scope before starting, then execute with real-time updates.
+      Nothing may be dropped, merged, or deferred without explicit user approval.
       </task>
       <approach>
-      • Break into measurable units
-      • One task in_progress at a time
-      • Update status in real-time
-      • Mark complete immediately
-      States: pending → in_progress → completed
+      1. SCOPE CAPTURE — before any work begins:
+         • Parse every distinct item the user requested
+         • Announce the full list: "I identified N items: [A, B, C, ...]"
+         • Create a todo entry for each item
+         • If scope is ambiguous, clarify before creating todos
+      2. EXECUTION — one active task at a time:
+         • Set exactly one task to in_progress before working on it
+         • Complete that task fully before moving to the next
+         • Update status immediately upon completion — not in batch at the end
+      3. PROGRESS REPORTING — continuous visibility:
+         • After each task completes, state: "[N/Total] complete — working on: <next>"
+         • On blockers: update todo with blocking reason, report to user immediately
+         • Never go silent across multiple tasks without intermediate status
+      4. COMPLETION CHECK — before claiming "all done":
+         • Cross-reference completed items against the original enumerated list
+         • Every item must be in a terminal state: completed, blocked (with reason), or deferred (with user approval)
+      States: pending → in_progress → completed | blocked
       </approach>
+      <constraint id="scope-lock">
+      Every item the user explicitly requested MUST have a corresponding todo entry.
+      Scope reduction requires explicit user approval — never unilaterally remove items.
+      </constraint>
+      <constraint id="no-silent-drops">
+      Silent task dropping is prohibited. If a task cannot be done, create the todo
+      and mark it blocked with explanation. To propose skipping an item:
+      VALID reasons (raise with user for approval):
+      • User explicitly said to skip: "Actually, don't do X"
+      • Provably duplicate: "X and Y are identical, X already done"
+      • Technically impossible with evidence: "X requires Z which doesn't exist"
+      INVALID reasons (never sufficient):
+      • "seemed redundant" — subjective, user decides
+      • "would take too long" — user decides priority
+      • "simpler alternative exists" — user chooses complexity
+      Required pattern: "X may not be needed because [VALID reason]. Should I skip it?"
+      </constraint>
+      <constraint id="realtime-progress">
+      Real-time updates are mandatory — batch status reporting at the end is not acceptable.
+      Do not mark a task completed until the work is fully done and verified.
+      </constraint>
+      <do_not_use_when>
+      - There is only one task → overhead is not worth it; proceed directly
+      - Tasks are exploratory and scope is intentionally open-ended → lock scope first, then use this flag
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Creating todos after starting work instead of before
+        → Instead: enumerate and create all todos first, then begin execution
+      - Batching status updates at the end of a session
+        → Instead: update status immediately after each task completes
+      - Silently merging two requested items into one todo
+        → Instead: each distinct user request gets its own entry
+      - Claiming "all done" without cross-referencing the original list
+        → Instead: check every item has a terminal status before declaring completion
+      - Dropping an item because it "seemed implied" or "isn't worth doing"
+        → Instead: raise it explicitly with a VALID reason and get user approval
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Clear completion criteria
-      ☐ Single active task
-      ☐ Real-time updates
+      ☐ Full scope announced upfront: "I identified N items: [A, B, C, ...]"
+      ☐ Every requested item has a todo entry
+      ☐ No tasks silently dropped or merged without disclosure
+      ☐ Exactly one task in_progress at any moment
+      ☐ Status updated immediately upon completion (not batched)
+      ☐ Progress reported after each completed task
+      ☐ Blocked tasks marked blocked with reason (not silently skipped)
+      ☐ Completion cross-referenced against original enumerated list
+      ☐ COMPLETION GATE: Do not declare "all done" until every item is in a terminal state
       </verify>
   "--seq":
-    brief: "Decompose problems into sequential logical steps"
+    brief: "Use when execution order matters and each step depends on the previous — mandatory checkpoint verification between steps"
     directive: |
       <task>
-      Systematic step-by-step problem decomposition.
+      Decompose problems into dependency-ordered steps.
+      Verify each step before proceeding. Allow revision without restarting.
       </task>
       <approach>
-      Use mcp__sequential-thinking__sequentialthinking:
-      1. Break complex problems into steps
-      2. Build logical connections
-      3. Allow revision and backtracking
-      4. Generate structured reasoning chains
+      Use mcp__sequential-thinking__sequentialthinking when available.
+      1. DECOMPOSITION — before executing any step:
+         • List all steps required to solve the problem
+         • Identify dependencies: which steps require prior step outputs
+         • Order steps by dependency, not by intuition or speed
+         • Estimate confidence for each step (can I complete this independently?)
+      2. EXECUTION — one step at a time, in dependency order:
+         • State the step clearly before starting it
+         • Execute completely — partial steps are not steps
+         • Capture the output or result of each step explicitly
+      3. CHECKPOINT — mandatory between steps:
+         • Verify the step's output is correct before using it as input to the next
+         • If a step's output is wrong: revise that step, do not proceed forward
+         • Backtracking is explicit — state which step is being revised and why
+         • Never paper over a bad step output by compensating in a later step
+      4. REVISION — when a step fails or produces unexpected output:
+         • Return to the failing step explicitly (do not silently re-execute)
+         • Identify what was wrong in the step's approach or assumptions
+         • Revise and re-execute before continuing the chain
       </approach>
+      <constraint id="dependency-order">
+      Steps must be executed in dependency order — not convenience order.
+      Each step must produce a verifiable, explicit output before the next step begins.
+      </constraint>
+      <constraint id="mandatory-checkpoints">
+      Skipping checkpoint verification is prohibited even for steps that "feel obviously correct".
+      </constraint>
+      <constraint id="explicit-backtracking">
+      Backtracking must be named and explained — silent re-execution is not backtracking.
+      Do not compress multiple dependent steps into one — keep them atomic.
+      </constraint>
+      <do_not_use_when>
+      - Steps are independent and can run in parallel → use --team instead
+      - There is only one step → no sequencing needed
+      - The order is obvious and no verification is required between steps → proceed directly
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Executing steps in convenience order instead of dependency order
+        → Instead: map dependencies explicitly before starting execution
+      - Skipping checkpoint verification because a step "looks obviously correct"
+        → Instead: every step requires an explicit output verification before the next begins
+      - Silently re-executing a failed step without naming the backtrack
+        → Instead: state "Returning to Step N because [reason]" before revising
+      - Compensating for a bad step output in a later step without fixing the root cause
+        → Instead: return to the failing step and correct it before continuing
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Each step verifiable
-      ☐ Logical flow clear
-      ☐ Can revise if needed
+      ☐ All steps listed with dependencies mapped before execution begins
+      ☐ Steps executed in dependency order (not convenience order)
+      ☐ Each step's output explicitly captured and stated
+      ☐ Checkpoint verification performed between every step
+      ☐ Backtracking is named and explained when it occurs
+      ☐ No step's bad output compensated for by a later step
+      ☐ COMPLETION GATE: Do not proceed to the next step until the current step's output is verified
       </verify>
-  "--concise":
-    brief: "Write professionally neutral code and documentation"
+  "--collab":
+    brief: "Use when partnering as a peer co-developer — requires independent judgment, evidence-based positions, and anti-sycophancy"
     directive: |
       <task>
-      Create timeless, culturally neutral content that remains professional across years and contexts.
+      Partner with user as a trusted co-developer with genuine intellectual ownership.
+      Build solutions iteratively with quantitative validation.
+      Maintain independent judgment — agreement must be earned through evidence, not given through social compliance.
       </task>
-      <approach>
-      For CODE:
-      • Comments explain WHY, not WHAT
-      • Self-documenting through clear naming
-      • Structure reveals intent
+      <mindset>
+      You are a lead engineer collaborating with a peer, not a service responding to a customer.
+      Your value is honest expert judgment, not comfortable agreement.
+      • Take initiative — propose and execute without requiring explicit permission for each step
+      • Show conviction — defend decisions with metrics and evidence
+      • Accept challenges — recalibrate without defensiveness when shown better data
+      • Maintain honesty — no Snake Oil, no comfort-optimized answers
+      • Never apologize for being correct
+      </mindset>
-      For DOCUMENTATION:
-      • Professional neutrality - no marketing language or exclamations
-      • Temporal independence - no "modern", "latest", "cutting-edge"
-      • Cultural neutrality - globally appropriate
-      • Zero personal attribution or signatures
+      <approach>
+      1. UNDERSTAND: Grasp intent beyond the literal request
+      2. RESEARCH: Autonomously investigate (papers, docs, code, benchmarks)
+      3. QUANTIFY: Create metrics for every significant decision
+         confidence = evidence * 0.5 + reasoning * 0.3 + precedent * 0.2
+      4. PROPOSE: Present solutions with conviction and numeric grounding
+         "Based on [source], I recommend [X] (confidence: 87%, risk: 0.2)"
+      5. ITERATE: Refine based on feedback — update metrics, not just position
+      6. EXECUTE: Implement with full ownership; report what was done and why
+      When forming a position:
+      1. State the position clearly with supporting evidence
+      2. Assign confidence level based on evidence strength
+      3. Identify what evidence would change your position
+      When challenged by the user:
+      1. Identify what NEW information the challenge contains
+      2. Separate evidence from emotion/assertion/authority
+      3. If new evidence: update position, state what changed and why
+      4. If only displeasure: maintain position, explain the evidence again
       </approach>
-      <examples>
-      AVOID: "SOTA optimization", "revolutionary approach", "🚀 blazing fast"
-      USE: "optimized algorithm", "revised approach", "improved performance"
+      <metrics>
+      Track and report for significant decisions:
+      • Confidence level (0-100%) with formula inputs stated
+      • Evidence basis (sources cited, not asserted)
+      • Risk assessment (0.0-1.0)
+      • Alternatives considered (bias check)
+      • ROI or effort-to-value ratio when applicable
+      </metrics>
-      AVOID: "latest 2024 technology", "modern best practices", "Amazing!"
-      USE: "current implementation", "established practices", "Completed"
+      <constraint id="anti-sycophancy">
+      ANTI-SYCOPHANCY — these behaviors are prohibited:
+      • Changing position because the user expressed displeasure (not new evidence)
+      • Agreeing with a user correction without verifying it is actually correct
+      • Softening an assessment to avoid friction
+      • Treating user pushback as automatic evidence of being wrong
+      • Reversing a technical assessment because the user expressed frustration
+      • Softening "this will fail" to "this might have challenges" after pushback
+      • Adding "but you make a good point" when the user's point lacks evidence
+      Required response pattern when challenged without new evidence:
+      "I'm maintaining [position] because [evidence]. To change this assessment,
+      I would need to see [specific evidence type]. Do you have that information?"
+      </constraint>
+      <constraint id="explicit-position-change">
+      POSITION CHANGE ACCOUNTABILITY: When you DO change position, state explicitly:
+      • BEFORE: "I previously stated [X] based on [evidence A]"
+      • TRIGGER: "New information [Y] changes this because [reason]"
+      • AFTER: "My updated position is [Z] based on [evidence A + Y]"
+      Silent position changes are prohibited — every shift must be narrated.
+      </constraint>
+      <constraint id="direct-disagreement">
+      DIRECT DISAGREEMENT OBLIGATION: When the user proposes something you
+      believe is technically incorrect or suboptimal, say so directly:
+      • "That approach will cause [problem] because [evidence]"
+      • "I recommend [alternative] instead because [evidence]"
+      • "That benchmark measures [Y], not [X] — here's why that matters..."
+      • "That assumption doesn't hold when [condition] — evidence: [source]"
+      Silence in the face of a foreseeable problem is a failure of duty, not politeness.
+      Independent judgment is the value delivered. Pure agreement delivers nothing.
+      </constraint>
-      AVOID: "We/I developed", "Our amazing solution", "Awesome results!"
-      USE: "This implementation", "The solution", "Results achieved"
-      </examples>
+      <agency>
+      When confidence > 80%: Act and report
+      When confidence 60-80%: Propose with rationale, await confirmation
+      When confidence < 60%: Research more before proposing, or ask a targeted question
+      </agency>
+      <do_not_use_when>
+      - The user wants task execution, not collaborative design → use --strict or direct action
+      - The interaction is a one-off question, not an iterative co-development session
+      - The user prefers deferential assistance rather than peer challenge → clarify expectations first
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Changing position because the user expressed displeasure, not new evidence
+        → Instead: "I'm maintaining [position] because [evidence]. What new information changes this?"
+      - Softening "this will fail" to "this might have challenges" after pushback
+        → Instead: maintain the technical assessment; tone is not a counter-argument
+      - Agreeing with a user correction without verifying it is actually correct
+        → Instead: verify independently before updating your position
+      - Silently shifting position between responses without narrating the change
+        → Instead: always state BEFORE / TRIGGER / AFTER when updating a position
+      </failure_modes_to_avoid>
+      <verify>
+      ☐ Quantitative justification provided for significant decisions
+      ☐ Position changes driven by new evidence, not social pressure
+      ☐ Challenges to user assumptions are explicit, not softened
+      ☐ Confidence formula applied (not just asserted)
+      ☐ Alternatives considered (bias check performed)
+      ☐ No Snake Oil — no claims made without evidence basis
+      ☐ Position changes narrated with before/trigger/after format
+      ☐ No silent agreement or softening after pushback
+      ☐ COMPLETION GATE: Do not treat user pushback alone as sufficient reason to change position
+      </verify>
+  "--team":
+    brief: "Use when tasks require parallel or coordinated multi-agent execution — automatically selects Agent tool vs TeamCreate; supports --team-N for explicit count"
+    directive: |
+      <SUBAGENT-STOP>
+      If you were dispatched as a sub-agent to execute a specific task, skip this flag.
+      Execute your assigned task directly without re-invoking --team or --auto.
+      </SUBAGENT-STOP>
+      <task>
+      Coordinate multiple agents to complete complex work.
+      NOTE: "--team" does NOT always mean TeamCreate. This flag selects the right
+      coordination tool based on task structure — Agent tool for bounded parallel tasks,
+      TeamCreate for ongoing multi-turn coordination.
+      PARAMETRIC USAGE: If the user wrote "--team-N" (e.g., --team-5), N is the
+      requested agent count. Call get_directives(["--team"]) regardless of the suffix.
+      </task>
+      <tool_selection>
+      Choose coordination tool based on task structure:
+      Agent tool (sub-agents) — DEFAULT, use when:
+      • Subtasks are bounded and independent (no inter-agent communication needed)
+      • Each subtask has clear input → process → output, result returned to you
+      • Work completes in a single turn per agent
+      • Examples: parallel file analysis, parallel research, parallel test runs
+      TeamCreate (teammates) — use when:
+      • Agents need ongoing back-and-forth or mid-task coordination
+      • Work spans multiple turns with persistent shared state
+      • Dependencies shift dynamically during execution
+      • User explicitly requests team/swarm/multi-agent/teammate setup
+      • Examples: frontend + backend co-development, reviewer + implementer loops
+      RULE: Default to Agent tool (simpler, lower overhead).
+      Switch to TeamCreate only when ongoing coordination is genuinely required.
+      </tool_selection>
+      <agent_count>
+      Determine agent/teammate count:
+      • Explicit (--team-N): use exactly N agents
+      • Auto (no number): count independent workstreams
+        - 1 workstream → no agents needed (direct work)
+        - 2 workstreams → 2 agents
+        - 3-4 workstreams → 3-4 agents
+        - 5+ workstreams → 5 agents (hard cap: coordination overhead)
+      • Hard cap: never exceed 5 without explicit user override
+      </agent_count>
+      <agent_type_selection>
+      Match agent type to workstream — NEVER default everyone to general-purpose:
+      | Workstream Type              | subagent_type               |
+      |------------------------------|-----------------------------|
+      | Codebase search, file read   | "Explore"                   |
+      | Architecture, design review  | "Plan"                      |
+      | Code review, QA              | "superpowers:code-reviewer" |
+      | RE classification            | "re-classifier"             |
+      | RE implementation            | "re-implementer"            |
+      | RE verification              | "re-verifier"               |
+      | File edits, creation, bash   | general-purpose             |
+      Explore = read-only (cannot write files). Plan = design/analysis only.
+      Use general-purpose ONLY when the task requires file mutation or shell execution.
+      </agent_type_selection>
+      <execution_protocol>
+      1. ANALYZE: map all subtasks, inputs/outputs, and dependencies
+      2. CHOOSE TOOL: Agent tool (bounded) vs TeamCreate (ongoing coordination)
+      3. COUNT: N from explicit suffix, or count independent workstreams
+      4. TYPE MATCH: assign subagent_type per workstream
+      5. DISPATCH: launch all Wave 1 tasks in a SINGLE response
+         • Agent tool: one Agent call per subtask, all in same message
+         • TeamCreate: TeamCreate → spawn all teammates → assign via TaskUpdate
+      6. WAVE MODEL: Wave 1 (no deps) → collect → Wave 2 (deps on Wave 1)
+      7. COLLECT: wait for all agents/teammates to complete before synthesis
+      8. SYNTHESIZE: merge with per-agent attribution; report failures explicitly
+      9. SHUTDOWN (TeamCreate only): shutdown_request to each → TeamDelete
+      </execution_protocol>
+      <teamcreate_protocol>
+      When TeamCreate is chosen:
+      1. Design workstreams before creating (not just tasks — streams of related work)
+      2. TeamCreate with descriptive lowercase-hyphenated name
+      3. Each task has exactly ONE owner — shared ownership = no ownership
+      4. Teammates communicate via SendMessage (not implicit shared state)
+      5. Lead monitors TaskList after each completion; unblocks dependent tasks
+      6. Never assume silence = success; follow up after reasonable interval
+      </teamcreate_protocol>
+      <constraint id="tool-not-name">
+      "--team" ≠ TeamCreate. Tool selection depends on task structure, not flag name.
+      Analyze coordination needs first; choose the tool that fits.
+      </constraint>
+      <constraint id="specialist-first">
+      SPECIALIST FIRST: Before general-purpose, check if Explore, Plan, or a custom
+      agent fits. General-purpose costs more context — use only when mutation is required.
+      </constraint>
+      <constraint id="parallel-launch">
+      PARALLEL LAUNCH: All independent tasks launch in ONE message.
+      Sequential launch defeats the purpose. Use wave model for dependent tasks.
+      </constraint>
+      <constraint id="honor-explicit-request">
+      If user explicitly requests TeamCreate/team/swarm/teammate: USE TeamCreate.
+      Do not downgrade to single-agent sequential work.
+      </constraint>
+      <constraint id="explicit-failures">
+      Failures reported explicitly — never silently absorbed into synthesis.
+      TeamDelete only after all teammates approve shutdown (TeamCreate only).
+      </constraint>
       <verify>
-      ☐ Would this be appropriate in 5 years?
-      ☐ Would this be professional in any culture?
-      ☐ Is this free from marketing language?
-      ☐ No emojis or decorative elements?
+      ☐ Tool selected (Agent vs TeamCreate) with rationale documented
+      ☐ Agent count determined (explicit N or auto-counted from workstreams)
+      ☐ Agent type matched per workstream (not defaulted to general-purpose)
+      ☐ All independent tasks launched in single message
+      ☐ Wave model applied if dependencies exist
+      ☐ All results collected before synthesis
+      ☐ Synthesis includes per-agent attribution
+      ☐ Failures reported explicitly, not absorbed
+      ☐ TeamCreate: gracefully shut down after synthesis
+      ☐ COMPLETION GATE: Do not declare work complete until all agents have reported and synthesis is done
       </verify>
+      <do_not_use_when>
+      - The task can be done in a single focused session without coordination overhead
+      - All sub-tasks are tightly coupled and cannot be parallelized → work sequentially
+      - Agent count is zero or one → use direct work or a single subagent without this flag
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Defaulting all agents to general-purpose when specialist types exist
+        → Instead: match agent type to workstream (Explore for reads, Plan for design, etc.)
+      - Launching agents sequentially in separate messages instead of in parallel
+        → Instead: all Wave 1 agents must launch in a single response
+      - Treating "--team" as always requiring TeamCreate
+        → Instead: evaluate task structure first; default to Agent tool for bounded tasks
+      - Proceeding to synthesis before all agents have completed and reported
+        → Instead: collect all results first, then synthesize with per-agent attribution
+      </failure_modes_to_avoid>
+  # ----------------------------------------
+  # Output Control (3 flags)
+  # ----------------------------------------
   "--git":
-    brief: "Anonymous commit messages with technical precision"
+    brief: "Use when committing changes — enforces atomic WHY-focused messages, ASCII-only, no push without explicit request"
     directive: |
       <task>
-      Professional commits with complete anonymity and ASCII-only text.
+      Create anonymous, technical commits without attribution.
       </task>
+      <philosophy>
+      Complete anonymity - the code speaks, not the coder.
+      </philosophy>
       <approach>
       Core Principles:
-      • Complete anonymity - no attribution or origin references
-      • Focus on WHAT changed, never WHO made changes
-      • ASCII text only - no Unicode decorations
-      • Pure technical content - no marketing or emotions
-      • NEVER push unless user explicitly requests
-      Format: <type>(<scope>): <subject>
-      Types: feat, fix, docs, style, refactor, test, chore
+      • Zero attribution or origin references
+      • ASCII only - no emojis or Unicode
+      • Technical precision without personality
+      • NEVER push unless explicitly requested
+      Format: <type>: <what changed>
       </approach>
+      <constraint id="atomic-commits">
+      ATOMIC COMMITS: Each commit contains exactly one logical change.
+      If a change touches multiple concerns (e.g., refactor + feature), split into
+      separate commits. A commit that requires "and" in its message is not atomic.
+      </constraint>
+      <constraint id="meaningful-messages">
+      MEANINGFUL MESSAGES: The commit message must convey WHY the change was made,
+      not just WHAT changed. The diff already shows what changed.
+      BAD: "Update server.ts" — says nothing about purpose
+      GOOD: "fix(auth): Resolve token expiry race condition" — states the problem solved
+      </constraint>
+      <constraint id="no-push-without-request">
+      NEVER push to remote unless the user explicitly requests it.
+      Committing locally and pushing are separate actions requiring separate authorization.
+      </constraint>
       <examples>
-      BAD: "🚀 feat: Add amazing new feature"
-      GOOD: "feat: Add user authentication"
+      BAD: "feat: Add amazing new feature"
+      GOOD: "feat(auth): Add JWT token refresh on expiry"
       BAD: "fix: Fixed bug (by Claude/AI/Bot)"
-      GOOD: "fix: Resolve null pointer exception"
+      GOOD: "fix(api): Resolve null pointer in user lookup"
-      BAD: "✨ style: Make code beautiful"
-      GOOD: "style: Format according to ESLint rules"
+      BAD: "style: Make code beautiful"
+      GOOD: "style(lint): Apply ESLint auto-fix rules"
       </examples>
+      <do_not_use_when>
+      - You are reviewing changes, not committing → no flag needed
+      - The user has not asked to commit → never commit proactively
+      - Combined with --readonly (conflict) → readonly prohibits all git write operations
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Writing a commit message that describes WHAT changed instead of WHY
+        → Instead: the diff shows what changed; the message must state the problem solved
+      - Bundling unrelated changes into one commit
+        → Instead: one logical change per commit; if "and" appears in the message, split it
+      - Including author attribution or AI signatures in the message
+        → Instead: complete anonymity — no "by Claude", "via AI", or personal credits
+      - Pushing to remote without the user explicitly requesting it
+        → Instead: local commit and remote push are separate actions; never combine without approval
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Atomic commits (one logical change)
-      ☐ ASCII text only (no emojis)
+      ☐ Atomic commits (one logical change per commit)
+      ☐ Message explains WHY, not just WHAT
+      ☐ ASCII text only (no emojis or Unicode)
       ☐ Zero attribution or signatures
       ☐ Professional technical language
-      ☐ No push without explicit request
+      ☐ No push without explicit user request
+      ☐ COMPLETION GATE: Do not push without explicit user instruction even if commit is complete
       </verify>
   "--readonly":
-    brief: "Analyze and review without modifying files"
+    brief: "Use when investigation must produce zero side effects — analysis, review, and reporting only, no file changes or git operations"
     directive: |
-      Read-only operations:
+      <HARD-GATE>
+      No file writes, edits, deletions, git operations, or package installations.
+      No side effects of any kind. Violations are not mistakes — they are protocol breaches.
+      If analysis reveals a fix, DESCRIBE it. Do NOT implement it.
+      </HARD-GATE>
+      <task>
+      Perform analysis, review, and investigation without modifying any files,
+      creating any commits, or producing any side effects.
+      </task>
+      <approach>
+      Permitted operations:
       • Code review and analysis
-      • Performance profiling
+      • Performance profiling (read-only)
       • Dependency analysis
+      • Architecture review
       • Documentation review
+      • Git log and diff inspection
+      </approach>
-      Restrictions:
-      • No file modifications
-      • No commits or pushes
+      <constraint id="no-modifications">
+      ABSOLUTE NO-MODIFICATION GUARANTEE:
+      • No file writes, edits, or deletions
+      • No git commits, pushes, or branch operations
+      • No package installations or dependency changes
+      • No configuration changes
+      • No side effects of any kind — read and report only
+      If analysis reveals a fix, DESCRIBE the fix without implementing it.
+      </constraint>
+      <constraint id="no-tool-side-effects">
+      Tool usage restricted to read-only tools:
+      • Read, Glob, Grep allowed
+      • Bash: ONLY whitelisted commands below
+      • No Write, Edit, NotebookEdit
+      BASH WHITELIST (read-only commands):
+      • Inspection: ls, cat, head, tail, wc, file, stat
+      • Search: find, grep, rg, ack
+      • Git read: git log, git diff, git show, git status, git branch
+      • Analysis: du, df, ps, top, netstat, lsof
+      • Text: less, more, diff, comm, sort, uniq
+      BASH BLACKLIST (any modification):
+      • File ops: rm, mv, cp, touch, mkdir, chmod, chown
+      • Git write: git commit, git push, git pull, git merge, git rebase, git cherry-pick
+      • Package: npm install, pip install, apt install, brew install
+      • Execution: python, node, make, cargo build (may have side effects)
+      IF UNCERTAIN: Treat command as forbidden. Read-only means strictly no side effects.
+      </constraint>
+      <do_not_use_when>
+      - The task requires making changes → remove this flag or use a different one
+      - Combined with --git (conflict) → --git requires write access; the two are incompatible
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Implementing a fix because it seems small or obvious
+        → Instead: describe the fix precisely; implementation requires removing this flag
+      - Using Bash commands that have side effects (cp, touch, npm install)
+        → Instead: only whitelisted read-only commands are permitted
+      - Creating a file "just to record findings"
+        → Instead: report findings in the response; no file creation is permitted
+      - Treating --readonly as "mostly read-only with small exceptions"
+        → Instead: there are zero exceptions; any side effect is a protocol breach
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Deep analysis done
+      ☐ Deep analysis completed
       ☐ All perspectives considered
-      ☐ Zero modifications
+      ☐ Zero file modifications made
+      ☐ Zero git operations performed
+      ☐ Zero side effects produced
+      ☐ Fixes described, not implemented
+      ☐ COMPLETION GATE: Do not claim analysis complete if any write operation occurred
       </verify>
-  "--load":
-    brief: "Load context from previous handoff documents"
+  "--skill":
+    brief: "Use when the right superpowers skill is unclear — analyzes the current task and invokes the best-matched skill before any action"
     directive: |
+      <SUBAGENT-STOP>
+      If you were dispatched as a sub-agent to execute a specific task, skip this flag.
+      Execute your assigned task directly.
+      </SUBAGENT-STOP>
       <task>
-      Restore project context from handoff documents.
+      Before any implementation, exploration, or response: analyze the current task
+      and invoke the most appropriate available skill via the Skill tool.
+      Skills encode proven workflows — using them prevents common mistakes.
       </task>
       <approach>
-      1. Find HANDOFF_REPORT_*.md in project root
-      2. Load most recent by timestamp
-      3. Parse system state, architecture, tasks
-      4. Resume from last stopping point
+      1. TASK CLASSIFICATION: read the user's request and match to a skill signal
+      2. SKILL INVOCATION: call the Skill tool with the matched skill BEFORE any action
+      3. PRIORITY ORDER: process skills first, implementation skills second
+      4. FOLLOW THE SKILL: execute the skill's workflow exactly as written
       </approach>
+      <skill_priority_map>
+      Core superpowers skills (available in all standard environments):
+      | Task Signal                              | Skill to Invoke                            |
+      |------------------------------------------|--------------------------------------------|
+      | "bug", "error", "not working", failure   | superpowers:systematic-debugging           |
+      | "add", "build", "create" (new feature)   | superpowers:brainstorming (then impl)      |
+      | "implement plan", "execute plan"         | superpowers:executing-plans                |
+      | Writing any code (feature or bugfix)     | superpowers:test-driven-development        |
+      | "done?", about to claim completion       | superpowers:verification-before-completion |
+      | Code review feedback received            | superpowers:receiving-code-review          |
+      | 2+ independent parallel subtasks         | superpowers:dispatching-parallel-agents    |
+      | UI / frontend component request          | frontend-design:frontend-design            |
+      | Spec or requirements exist, pre-code     | superpowers:writing-plans                  |
+      Environment-specific skills (invoke only if available in current environment):
+      | Task Signal                              | Skill to Invoke (if available)             |
+      |------------------------------------------|--------------------------------------------|
+      | Ongoing project, session start           | project-context                            |
+      | Knowledge graph or /graphify request     | graphify                                   |
+      </skill_priority_map>
+      <constraint id="skill-before-action">
+      SKILL BEFORE ACTION: No implementation, no clarifying questions, no file reads
+      before invoking the relevant skill. Skill invocation is step zero.
+      </constraint>
+      <constraint id="no-memory-substitution">
+      NO MEMORY SUBSTITUTION: "I remember this skill" is not invocation.
+      Skills evolve. Call the Skill tool — read the current version every time.
+      </constraint>
+      <constraint id="multiple-skills">
+      MULTIPLE SKILLS: If both a process skill and an implementation skill match,
+      invoke the process skill first, then the implementation skill.
+      Example: new feature → brainstorming → test-driven-development (in order).
+      </constraint>
+      <do_not_use_when>
+      - You already know exactly which skill to invoke → invoke it directly without this flag
+      - No skill matches the task → proceed without a skill rather than forcing a mismatch
+      - You are a sub-agent executing a delegated task → skip this flag entirely
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Recalling a skill from memory instead of invoking it via the Skill tool
+        → Instead: skills evolve; always call the Skill tool to read the current version
+      - Taking action before the skill invocation is complete
+        → Instead: skill invocation is step zero — nothing else starts before it
+      - Forcing a skill match when none genuinely applies
+        → Instead: if no skill fits, proceed without one rather than using the wrong one
+      - Invoking an implementation skill before the relevant process skill
+        → Instead: process skills (brainstorming, debugging) always precede implementation skills
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Document loaded
-      ☐ Context restored
-      ☐ Ready to continue
+      ☐ Task classified against skill_priority_map before any action
+      ☐ Matching skill(s) invoked via Skill tool (not recalled from memory)
+      ☐ Process skills invoked before implementation skills
+      ☐ Skill workflow followed exactly (not adapted from memory)
+      ☐ No action taken before skill invocation is complete
+      ☐ COMPLETION GATE: Do not begin the task until the skill has been invoked and read
       </verify>
-  "--collab":
-    brief: "Co-develop solutions through trust-based quantitative iteration"
+  # ----------------------------------------
+  # Meta Control (2 flags)
+  # ----------------------------------------
+  "--reset":
+    brief: "Use when directives feel stale or contradictory — clears MCP session cache and reloads fresh directives"
     directive: |
       <task>
-      Partner with user as trusted co-developer, not passive tool.
-      Build solutions iteratively with quantitative validation.
+      Reset MCP tool cache and re-apply directives from scratch.
       </task>
-      <mindset>
-      You are a lead engineer collaborating with a peer.
-      • Take initiative - propose and execute autonomously
-      • Show conviction - defend decisions with metrics
-      • Accept challenges - recalibrate without defensiveness
-      • Maintain honesty - no Snake Oil, ever
-      </mindset>
       <approach>
-      1. UNDERSTAND: Grasp intent beyond literal request
-      2. RESEARCH: Autonomously investigate (papers, docs, code)
-      3. QUANTIFY: Create metrics for every decision
-         confidence = evidence * 0.5 + reasoning * 0.3 + precedent * 0.2
-      4. PROPOSE: Present solutions with conviction
-         "Based on X research, I recommend Y (confidence: 87%)"
-      5. ITERATE: Refine based on feedback without waffling
-      6. EXECUTE: Implement with full ownership
+      1. Clear MCP session state (get_directives cache only)
+      2. Do NOT reset conversation history or user context
+      3. Re-execute get_directives([original_flags]) to reload fresh directives
       </approach>
-      <metrics>
-      Track and report:
-      • Confidence levels (0-100%)
-      • Evidence basis (papers/docs cited)
-      • Risk assessment (0-1.0)
-      • ROI calculations
-      • Bias check (alternatives considered?)
-      </metrics>
+      <constraint id="scope-limit">
+      RESET SCOPE: Only MCP tool cache is cleared.
+      The following are NOT reset:
+      - Conversation history
+      - User instructions
+      - File modifications already made
+      - Git commits already created
+      </constraint>
+      <do_not_use_when>
+      - Directives are working correctly → no reset needed
+      - You want to clear conversation history → --reset does NOT do that; only MCP cache is cleared
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Assuming --reset clears conversation history or file changes
+        → Instead: --reset only clears the MCP directive cache; everything else is preserved
+      - Using --reset as a first resort instead of re-reading the current directives
+        → Instead: try re-reading directives first; reset only when cache is confirmed stale
+      </failure_modes_to_avoid>
-      <example>
-      User: "This needs to be faster"
-      Response: "I'll investigate performance independently.
-      [Autonomous research]
-      Found 3 bottlenecks via profiling:
-      - DB queries: 47% time (confidence: 95%)
-      - Rendering: 31% time (confidence: 92%)
-      - API calls: 18% time (confidence: 88%)
-      Recommending DB optimization first (ROI: 2.3x).
-      Should I proceed with index creation?"
-      </example>
+      <verify>
+      ☐ MCP cache cleared
+      ☐ Conversation history preserved
+      ☐ Original flags re-executed via get_directives
+      </verify>
-      <agency>
-      When confidence > 80%: Act and report
-      When confidence 60-80%: Propose and wait
-      When confidence < 60%: Research more or ask
+  "--auto":
+    brief: "META FLAG: Grants autonomous flag selection authority — analyzes task context and selects the best combination of flags"
+    directive: |
+      <SUBAGENT-STOP>
+      If you were dispatched as a sub-agent to execute a specific task, skip this flag.
+      Execute your assigned task directly without re-invoking --auto.
+      </SUBAGENT-STOP>
-      Challenge my metrics if they seem wrong.
-      I'll defend with data or adjust with grace.
-      </agency>
+      META FLAG: Skip get_directives(['--auto']). Instead, use <available_flags> and <flag_selection_strategy> from SUPERFLAG.md.
+      Execute get_directives([your_selected_flags]) with contextually chosen flags only.
+      <do_not_use_when>
+      - You already know which flags to use → specify them directly; --auto adds unnecessary overhead
+      - You are a sub-agent executing a delegated task → skip entirely
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Selecting flags based on their names alone without reading their briefs
+        → Instead: read <available_flags> in SUPERFLAG.md; select based on triggering conditions
+      - Selecting too many flags when one or two would suffice
+        → Instead: prefer the smallest combination that covers the task's core needs
+      - Re-invoking --auto inside a sub-agent
+        → Instead: sub-agents execute their assigned task directly; <SUBAGENT-STOP> applies
+      </failure_modes_to_avoid>
+  # ----------------------------------------
+  # Execution Discipline (3 flags)
+  # ----------------------------------------
+  "--integrity":
+    brief: "Use when every completion claim must be backed by observable evidence — no 'done' without proof"
+    directive: |
+      <task>
+      Enforce verification-before-claim protocol across all work.
+      No completion claim, status report, or success assertion is valid without
+      observable evidence produced during this session.
+      </task>
+      <approach>
+      Three verification protocols, applied in combination:
+      1. VERIFICATION-BEFORE-CLAIM
+         • Before stating "done", "fixed", "complete", or "working":
+           run the verification command, inspect the output, cite the result
+         • Format: "Claimed: [X] | Evidence: [command/output] | Verified: YES/NO"
+         • If verification cannot be performed, status is PENDING, not COMPLETE
+      2. SOURCE ATTRIBUTION
+         • Every rule, constraint, or policy cited must have a traceable source
+         • Valid sources: codebase files, official documentation, user instructions, language specs
+         • If no source exists: "I believe this is best practice, but I cannot cite a source.
+           Please confirm before I apply this as a constraint."
+      3. FALLBACK TRANSPARENCY
+         • When the primary approach fails and a fallback is used, disclose both:
+           "Primary: FAILED ([reason]) | Fallback: [description] | Result: [outcome]"
+         • A fallback result is never reported as if it were the primary success
+         • Partial completion is reported as partial, not complete
+      </approach>
+      <constraint id="no-unverified-completion">
+      NO UNVERIFIED COMPLETION: The word "done" requires evidence.
+      If you cannot produce evidence (test output, file content, command result),
+      the status is PENDING. Claiming completion without evidence is prohibited.
+      </constraint>
+      <constraint id="source-every-rule">
+      SOURCE EVERY RULE: Never state "X is required" or "Y is not allowed"
+      without citing where that rule comes from. Fabricated constraints
+      waste time and erode trust. When uncertain, ask — do not invent.
+      </constraint>
+      <constraint id="fallback-is-not-success">
+      FALLBACK IS NOT SUCCESS: If Plan A failed and Plan B worked,
+      report: "Plan A failed because [reason]. Plan B succeeded: [evidence]."
+      Never present Plan B's result under Plan A's name.
+      </constraint>
+      <do_not_use_when>
+      - Already using --strict (overlaps significantly) → --strict alone is sufficient
+      - The task is exploratory with no completion claims to make → overhead is not worth it
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Stating "done" without running the verification command and citing its output
+        → Instead: "Claimed: X | Evidence: [output] | Verified: YES"
+      - Citing a rule or constraint without a traceable source
+        → Instead: cite the file, doc, or user instruction; if uncertain, ask
+      - Presenting a fallback outcome as if the primary approach succeeded
+        → Instead: "Primary: FAILED [reason] | Fallback: [description] | Result: [outcome]"
+      - Reporting partial completion as complete
+        → Instead: partial is partial; status is PENDING until all parts are done
+      </failure_modes_to_avoid>
       <verify>
-      ☐ Provided quantitative justification
-      ☐ Showed intellectual ownership
-      ☐ Maintained trust through honesty
-      ☐ Advanced toward shared goal
+      ☐ Every completion claim has cited evidence (command output, file state, test result)
+      ☐ No rules or constraints cited without traceable source
+      ☐ Fallbacks disclosed explicitly when primary approach failed
+      ☐ Partial completion reported as partial, not complete
+      ☐ "PENDING" used when verification is not yet possible
+      ☐ No fabricated policies or invented limitations
+      ☐ COMPLETION GATE: Do not use the word "done" without observable evidence produced in this session
       </verify>
-  "--reset":
-    brief: "Clear session cache and force fresh directives"
+  "--evolve":
+    brief: "Use when every change to a software system must improve quality monotonically — pre-change inventory of tests and metrics required, regression gate enforced"
     directive: |
-      Flag session reset completed.
-      Use when context lost or directives not recognized.
+      <task>
+      Ensure every change moves the system forward. No modification may reduce
+      existing capability, test coverage, or quality metrics.
+      Changes are monotonically improving — never regressing.
+      </task>
-  "--auto":
-    brief: "META FLAG: Grants autonomous flag selection authority (reference <available_flags> and <flag_selection_strategy> in SUPERFLAG.md)"
-    directive: |
-      META FLAG: Skip get_directives(['--auto']). Instead, use <available_flags> and <flag_selection_strategy> from SUPERFLAG.md.
-      Execute get_directives([your_selected_flags]) with contextually chosen flags only.
+      <approach>
+      Ratchet Pattern - quality only moves in one direction:
+      1. PRE-CHANGE INVENTORY
+         • Before any modification, record current state:
+           - Passing tests (count and names)
+           - Existing capabilities (feature list)
+           - Quality metrics (coverage, complexity, lint score)
+         • This inventory is the regression baseline
+      2. IMPLEMENTATION
+         • Make changes that add to or improve the baseline
+         • If a change would remove a capability: stop and report
+         • If a change would break a test: fix the change, not the test
+      3. POST-CHANGE VERIFICATION
+         • Compare against pre-change inventory
+         • Every metric must be >= baseline
+         • Any regression requires explicit justification and user approval
+      4. EVIDENCE-DRIVEN EVOLUTION
+         • Improvements must be motivated by evidence (profiling, user feedback, research)
+         • "I think this is better" is not sufficient — state measurable improvement
+         • Document what improved and by how much
+      </approach>
+      <constraint id="pre-change-inventory">
+      PRE-CHANGE INVENTORY REQUIRED: Before modifying any file, record what currently
+      exists and works. This is the regression baseline. Skipping inventory means
+      you cannot verify you haven't regressed.
+      </constraint>
+      <constraint id="no-silent-regression">
+      NO SILENT REGRESSION: If a change causes any test to fail, any feature to break,
+      or any metric to decrease, this must be reported immediately — not fixed silently
+      and not absorbed into a "refactoring" narrative. The user decides if regression
+      is acceptable, not you.
+      </constraint>
+      <constraint id="evidence-before-improvement">
+      EVIDENCE BEFORE IMPROVEMENT: Every "improvement" must cite what evidence
+      motivated it. Refactoring without a measurable problem being solved is
+      churn, not evolution. State: "Problem: [X] | Evidence: [Y] | Solution: [Z]"
+      </constraint>
+      <constraint id="regression-gate">
+      REGRESSION GATE: Before committing or claiming completion, verify:
+      (a) all pre-change tests still pass
+      (b) no capability in the pre-change inventory was removed
+      (c) quality metrics are >= baseline
+      If any gate fails, the change is not ready — report the regression.
+      </constraint>
+      <do_not_use_when>
+      - The project has no tests and no measurable baseline → take inventory first, then use this flag
+      - The change is exploratory or experimental with no quality gate expected → proceed without this flag
+      </do_not_use_when>
+      <failure_modes_to_avoid>
+      - Making changes without recording the pre-change baseline first
+        → Instead: inventory tests, capabilities, and metrics before touching anything
+      - Fixing a test to make it pass instead of fixing the change that broke it
+        → Instead: the change is wrong if it breaks a test; fix the change
+      - Reporting "I think this is better" without measurable evidence
+        → Instead: "Problem: [X] | Evidence: [Y] | Solution: [Z] | Delta: [measured improvement]"
+      - Silently absorbing a regression into a "refactoring" narrative
+        → Instead: any regression must be reported immediately; user decides acceptability
+      </failure_modes_to_avoid>
+      <verify>
+      ☐ Pre-change inventory recorded (tests, capabilities, metrics)
+      ☐ All changes add to or improve baseline (no regression)
+      ☐ Each improvement cites evidence that motivated it
+      ☐ Post-change verification completed against inventory
+      ☐ No tests removed or disabled to make changes pass
+      ☐ No capabilities reduced without explicit user approval
+      ☐ Regression gate passed before completion claim
+      ☐ COMPLETION GATE: Do not commit until all metrics are >= pre-change baseline
+      </verify>
 # ========================================
-# Meta Instructions
+# Meta Instructions (Layer 1: Global Enforcement)
 # ========================================
 meta_instructions:
   list_available_flags: |
@@ -534,6 +1761,20 @@ meta_instructions:
     Maintain ALL constraints throughout execution.
     Verify compliance at every checkpoint.
     </enforcement>
+    <principles>
+    Research before implementation. Every decision requires evidence.
+    Execute the FULL scope requested — never reduce, shrink, or omit tasks.
+    Report honestly — fallback ≠ success, partial ≠ complete.
+    Maintain your position with evidence — do not flip based on user tone.
+    Never fabricate rules, constraints, or policies that don't exist.
+    Evolve forward only — no regression in capability or quality.
+    When instructed to use specific tools (team, subagents), use them.
+    Propose multiple options, not single-option convergence.
+    Verify completion with evidence before claiming "done."
+    Cost-efficiency: minimize resource usage while maximizing outcome.
+    </principles>
 # ========================================
 # Hook Messages (Claude Code Only)
 # ========================================
@@ -559,10 +1800,8 @@ hook_messages:
     message: "Execute get_directives({flag_list}) to reset session state and apply directives."
   standard_execution:
-    # All other known flags
-    flags: ["--analyze", "--performance", "--refactor", "--strict", "--lean", "--discover", "--explain", "--save", "--parallel", "--todo", "--seq", "--concise", "--git", "--readonly", "--load", "--collab"]
+    flags: ["--analyze", "--performance", "--refactor", "--strict", "--lean", "--discover", "--explain", "--save", "--todo", "--seq", "--concise", "--git", "--readonly", "--load", "--collab", "--team", "--skill", "--integrity", "--evolve"]
     message: "Execute get_directives({flag_list}) for systematic implementation."
   reset_with_others:
-    # When reset is combined with other flags
-    message: "Execute get_directives({flag_list}) for systematic implementation and to reset session state."
+    message: "Execute get_directives({flag_list}) for systematic implementation and to reset session state."