safeword 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235)
  1. package/.claude/commands/arch-review.md +32 -0
  2. package/.claude/commands/lint.md +6 -0
  3. package/.claude/commands/quality-review.md +13 -0
  4. package/.claude/commands/setup-linting.md +6 -0
  5. package/.claude/hooks/auto-lint.sh +6 -0
  6. package/.claude/hooks/auto-quality-review.sh +170 -0
  7. package/.claude/hooks/check-linting-sync.sh +17 -0
  8. package/.claude/hooks/inject-timestamp.sh +6 -0
  9. package/.claude/hooks/question-protocol.sh +12 -0
  10. package/.claude/hooks/run-linters.sh +8 -0
  11. package/.claude/hooks/run-quality-review.sh +76 -0
  12. package/.claude/hooks/version-check.sh +10 -0
  13. package/.claude/mcp/README.md +96 -0
  14. package/.claude/mcp/arcade.sample.json +9 -0
  15. package/.claude/mcp/context7.sample.json +7 -0
  16. package/.claude/mcp/playwright.sample.json +7 -0
  17. package/.claude/settings.json +62 -0
  18. package/.claude/skills/quality-reviewer/SKILL.md +190 -0
  19. package/.claude/skills/safeword-quality-reviewer/SKILL.md +13 -0
  20. package/.env.arcade.example +4 -0
  21. package/.env.example +11 -0
  22. package/.gitmodules +4 -0
  23. package/.safeword/SAFEWORD.md +33 -0
  24. package/.safeword/eslint/eslint-base.mjs +101 -0
  25. package/.safeword/guides/architecture-guide.md +404 -0
  26. package/.safeword/guides/code-philosophy.md +174 -0
  27. package/.safeword/guides/context-files-guide.md +405 -0
  28. package/.safeword/guides/data-architecture-guide.md +183 -0
  29. package/.safeword/guides/design-doc-guide.md +165 -0
  30. package/.safeword/guides/learning-extraction.md +515 -0
  31. package/.safeword/guides/llm-instruction-design.md +239 -0
  32. package/.safeword/guides/llm-prompting.md +95 -0
  33. package/.safeword/guides/tdd-best-practices.md +570 -0
  34. package/.safeword/guides/test-definitions-guide.md +243 -0
  35. package/.safeword/guides/testing-methodology.md +573 -0
  36. package/.safeword/guides/user-story-guide.md +237 -0
  37. package/.safeword/guides/zombie-process-cleanup.md +214 -0
  38. package/{templates → .safeword}/hooks/agents-md-check.sh +0 -0
  39. package/{templates → .safeword}/hooks/post-tool.sh +0 -0
  40. package/{templates → .safeword}/hooks/pre-commit.sh +0 -0
  41. package/.safeword/planning/002-user-story-quality-evaluation.md +1840 -0
  42. package/.safeword/planning/003-langsmith-eval-setup-prompt.md +363 -0
  43. package/.safeword/planning/004-llm-eval-test-cases.md +3226 -0
  44. package/.safeword/planning/005-architecture-enforcement-system.md +169 -0
  45. package/.safeword/planning/006-reactive-fix-prevention-research.md +135 -0
  46. package/.safeword/planning/011-cli-ux-vision.md +330 -0
  47. package/.safeword/planning/012-project-structure-cleanup.md +154 -0
  48. package/.safeword/planning/README.md +39 -0
  49. package/.safeword/planning/automation-plan-v2.md +1225 -0
  50. package/.safeword/planning/automation-plan-v3.md +1291 -0
  51. package/.safeword/planning/automation-plan.md +3058 -0
  52. package/.safeword/planning/design/005-cli-implementation.md +343 -0
  53. package/.safeword/planning/design/013-cli-self-contained-templates.md +596 -0
  54. package/.safeword/planning/design/013a-eslint-plugin-suite.md +256 -0
  55. package/.safeword/planning/design/013b-implementation-snippets.md +385 -0
  56. package/.safeword/planning/design/013c-config-isolation-strategy.md +242 -0
  57. package/.safeword/planning/design/code-philosophy-improvements.md +60 -0
  58. package/.safeword/planning/mcp-analysis.md +545 -0
  59. package/.safeword/planning/phase2-subagents-vs-skills-analysis.md +451 -0
  60. package/.safeword/planning/settings-improvements.md +970 -0
  61. package/.safeword/planning/test-definitions/005-cli-implementation.md +1301 -0
  62. package/.safeword/planning/test-definitions/cli-self-contained-templates.md +205 -0
  63. package/.safeword/planning/user-stories/001-guides-review-user-stories.md +1381 -0
  64. package/.safeword/planning/user-stories/003-reactive-fix-prevention.md +132 -0
  65. package/.safeword/planning/user-stories/004-technical-constraints.md +86 -0
  66. package/.safeword/planning/user-stories/005-cli-implementation.md +311 -0
  67. package/.safeword/planning/user-stories/cli-self-contained-templates.md +172 -0
  68. package/.safeword/planning/versioned-distribution.md +740 -0
  69. package/.safeword/prompts/arch-review.md +43 -0
  70. package/.safeword/prompts/quality-review.md +11 -0
  71. package/.safeword/scripts/arch-review.sh +235 -0
  72. package/.safeword/scripts/check-linting-sync.sh +58 -0
  73. package/.safeword/scripts/setup-linting.sh +559 -0
  74. package/.safeword/templates/architecture-template.md +136 -0
  75. package/.safeword/templates/ci/architecture-check.yml +79 -0
  76. package/.safeword/templates/design-doc-template.md +127 -0
  77. package/.safeword/templates/test-definitions-feature.md +100 -0
  78. package/.safeword/templates/ticket-template.md +74 -0
  79. package/.safeword/templates/user-stories-template.md +82 -0
  80. package/.safeword/tickets/001-guides-review-user-stories.md +83 -0
  81. package/.safeword/tickets/002-architecture-enforcement.md +211 -0
  82. package/.safeword/tickets/003-reactive-fix-prevention.md +57 -0
  83. package/.safeword/tickets/004-technical-constraints-in-user-stories.md +39 -0
  84. package/.safeword/tickets/005-cli-implementation.md +248 -0
  85. package/.safeword/tickets/006-flesh-out-skills.md +43 -0
  86. package/.safeword/tickets/007-flesh-out-questioning.md +44 -0
  87. package/.safeword/tickets/008-upgrade-questioning.md +58 -0
  88. package/.safeword/tickets/009-naming-conventions.md +41 -0
  89. package/.safeword/tickets/010-safeword-md-cleanup.md +34 -0
  90. package/.safeword/tickets/011-cursor-setup.md +86 -0
  91. package/.safeword/tickets/README.md +73 -0
  92. package/.safeword/version +1 -0
  93. package/AGENTS.md +59 -0
  94. package/CLAUDE.md +12 -0
  95. package/README.md +347 -0
  96. package/docs/001-cli-implementation-plan.md +856 -0
  97. package/docs/elite-dx-implementation-plan.md +1034 -0
  98. package/framework/README.md +131 -0
  99. package/framework/mcp/README.md +96 -0
  100. package/framework/mcp/arcade.sample.json +8 -0
  101. package/framework/mcp/context7.sample.json +6 -0
  102. package/framework/mcp/playwright.sample.json +6 -0
  103. package/framework/scripts/arch-review.sh +235 -0
  104. package/framework/scripts/check-linting-sync.sh +58 -0
  105. package/framework/scripts/load-env.sh +49 -0
  106. package/framework/scripts/setup-claude.sh +223 -0
  107. package/framework/scripts/setup-linting.sh +559 -0
  108. package/framework/scripts/setup-quality.sh +477 -0
  109. package/framework/scripts/setup-safeword.sh +550 -0
  110. package/framework/templates/ci/architecture-check.yml +78 -0
  111. package/learnings/ai-sdk-v5-breaking-changes.md +178 -0
  112. package/learnings/e2e-test-zombie-processes.md +231 -0
  113. package/learnings/milkdown-crepe-editor-property.md +96 -0
  114. package/learnings/prosemirror-fragment-traversal.md +119 -0
  115. package/package.json +19 -43
  116. package/packages/cli/AGENTS.md +1 -0
  117. package/packages/cli/ARCHITECTURE.md +279 -0
  118. package/packages/cli/package.json +51 -0
  119. package/packages/cli/src/cli.ts +63 -0
  120. package/packages/cli/src/commands/check.ts +166 -0
  121. package/packages/cli/src/commands/diff.ts +209 -0
  122. package/packages/cli/src/commands/reset.ts +190 -0
  123. package/packages/cli/src/commands/setup.ts +325 -0
  124. package/packages/cli/src/commands/upgrade.ts +163 -0
  125. package/packages/cli/src/index.ts +3 -0
  126. package/packages/cli/src/templates/config.ts +58 -0
  127. package/packages/cli/src/templates/content.ts +18 -0
  128. package/packages/cli/src/templates/index.ts +12 -0
  129. package/packages/cli/src/utils/agents-md.ts +66 -0
  130. package/packages/cli/src/utils/fs.ts +179 -0
  131. package/packages/cli/src/utils/git.ts +124 -0
  132. package/packages/cli/src/utils/hooks.ts +29 -0
  133. package/packages/cli/src/utils/output.ts +60 -0
  134. package/packages/cli/src/utils/project-detector.test.ts +185 -0
  135. package/packages/cli/src/utils/project-detector.ts +44 -0
  136. package/packages/cli/src/utils/version.ts +28 -0
  137. package/packages/cli/src/version.ts +6 -0
  138. package/packages/cli/templates/SAFEWORD.md +776 -0
  139. package/packages/cli/templates/doc-templates/architecture-template.md +136 -0
  140. package/packages/cli/templates/doc-templates/design-doc-template.md +134 -0
  141. package/packages/cli/templates/doc-templates/test-definitions-feature.md +131 -0
  142. package/packages/cli/templates/doc-templates/ticket-template.md +82 -0
  143. package/packages/cli/templates/doc-templates/user-stories-template.md +92 -0
  144. package/packages/cli/templates/guides/architecture-guide.md +423 -0
  145. package/packages/cli/templates/guides/code-philosophy.md +195 -0
  146. package/packages/cli/templates/guides/context-files-guide.md +457 -0
  147. package/packages/cli/templates/guides/data-architecture-guide.md +200 -0
  148. package/packages/cli/templates/guides/design-doc-guide.md +171 -0
  149. package/packages/cli/templates/guides/learning-extraction.md +552 -0
  150. package/packages/cli/templates/guides/llm-instruction-design.md +248 -0
  151. package/packages/cli/templates/guides/llm-prompting.md +102 -0
  152. package/packages/cli/templates/guides/tdd-best-practices.md +615 -0
  153. package/packages/cli/templates/guides/test-definitions-guide.md +334 -0
  154. package/packages/cli/templates/guides/testing-methodology.md +618 -0
  155. package/packages/cli/templates/guides/user-story-guide.md +256 -0
  156. package/packages/cli/templates/guides/zombie-process-cleanup.md +219 -0
  157. package/packages/cli/templates/hooks/agents-md-check.sh +27 -0
  158. package/packages/cli/templates/hooks/post-tool.sh +4 -0
  159. package/packages/cli/templates/hooks/pre-commit.sh +10 -0
  160. package/packages/cli/templates/prompts/arch-review.md +43 -0
  161. package/packages/cli/templates/prompts/quality-review.md +10 -0
  162. package/packages/cli/templates/skills/safeword-quality-reviewer/SKILL.md +207 -0
  163. package/packages/cli/tests/commands/check.test.ts +129 -0
  164. package/packages/cli/tests/commands/cli.test.ts +89 -0
  165. package/packages/cli/tests/commands/diff.test.ts +115 -0
  166. package/packages/cli/tests/commands/reset.test.ts +310 -0
  167. package/packages/cli/tests/commands/self-healing.test.ts +170 -0
  168. package/packages/cli/tests/commands/setup-blocking.test.ts +71 -0
  169. package/packages/cli/tests/commands/setup-core.test.ts +135 -0
  170. package/packages/cli/tests/commands/setup-git.test.ts +139 -0
  171. package/packages/cli/tests/commands/setup-hooks.test.ts +334 -0
  172. package/packages/cli/tests/commands/setup-linting.test.ts +189 -0
  173. package/packages/cli/tests/commands/setup-noninteractive.test.ts +80 -0
  174. package/packages/cli/tests/commands/setup-templates.test.ts +181 -0
  175. package/packages/cli/tests/commands/upgrade.test.ts +215 -0
  176. package/packages/cli/tests/helpers.ts +243 -0
  177. package/packages/cli/tests/npm-package.test.ts +83 -0
  178. package/packages/cli/tests/technical-constraints.test.ts +96 -0
  179. package/packages/cli/tsconfig.json +25 -0
  180. package/packages/cli/tsup.config.ts +11 -0
  181. package/packages/cli/vitest.config.ts +23 -0
  182. package/promptfoo.yaml +3270 -0
  183. package/dist/check-M73LGONJ.js +0 -129
  184. package/dist/check-M73LGONJ.js.map +0 -1
  185. package/dist/chunk-2XWIUEQK.js +0 -190
  186. package/dist/chunk-2XWIUEQK.js.map +0 -1
  187. package/dist/chunk-GZRQL3SX.js +0 -146
  188. package/dist/chunk-GZRQL3SX.js.map +0 -1
  189. package/dist/chunk-V5G6BGOK.js +0 -26
  190. package/dist/chunk-V5G6BGOK.js.map +0 -1
  191. package/dist/chunk-W66Z3C5H.js +0 -21
  192. package/dist/chunk-W66Z3C5H.js.map +0 -1
  193. package/dist/cli.d.ts +0 -1
  194. package/dist/cli.js +0 -34
  195. package/dist/cli.js.map +0 -1
  196. package/dist/diff-FSFDCBL5.js +0 -166
  197. package/dist/diff-FSFDCBL5.js.map +0 -1
  198. package/dist/index.d.ts +0 -11
  199. package/dist/index.js +0 -7
  200. package/dist/index.js.map +0 -1
  201. package/dist/reset-3ACTIYYE.js +0 -143
  202. package/dist/reset-3ACTIYYE.js.map +0 -1
  203. package/dist/setup-MKVVQTVA.js +0 -266
  204. package/dist/setup-MKVVQTVA.js.map +0 -1
  205. package/dist/upgrade-FQOL6AF5.js +0 -134
  206. package/dist/upgrade-FQOL6AF5.js.map +0 -1
  207. /package/{templates → framework}/SAFEWORD.md +0 -0
  208. /package/{templates → framework}/guides/architecture-guide.md +0 -0
  209. /package/{templates → framework}/guides/code-philosophy.md +0 -0
  210. /package/{templates → framework}/guides/context-files-guide.md +0 -0
  211. /package/{templates → framework}/guides/data-architecture-guide.md +0 -0
  212. /package/{templates → framework}/guides/design-doc-guide.md +0 -0
  213. /package/{templates → framework}/guides/learning-extraction.md +0 -0
  214. /package/{templates → framework}/guides/llm-instruction-design.md +0 -0
  215. /package/{templates → framework}/guides/llm-prompting.md +0 -0
  216. /package/{templates → framework}/guides/tdd-best-practices.md +0 -0
  217. /package/{templates → framework}/guides/test-definitions-guide.md +0 -0
  218. /package/{templates → framework}/guides/testing-methodology.md +0 -0
  219. /package/{templates → framework}/guides/user-story-guide.md +0 -0
  220. /package/{templates → framework}/guides/zombie-process-cleanup.md +0 -0
  221. /package/{templates → framework}/prompts/arch-review.md +0 -0
  222. /package/{templates → framework}/prompts/quality-review.md +0 -0
  223. /package/{templates/skills/safeword-quality-reviewer → framework/skills/quality-reviewer}/SKILL.md +0 -0
  224. /package/{templates/doc-templates → framework/templates}/architecture-template.md +0 -0
  225. /package/{templates/doc-templates → framework/templates}/design-doc-template.md +0 -0
  226. /package/{templates/doc-templates → framework/templates}/test-definitions-feature.md +0 -0
  227. /package/{templates/doc-templates → framework/templates}/ticket-template.md +0 -0
  228. /package/{templates/doc-templates → framework/templates}/user-stories-template.md +0 -0
  229. /package/{templates → packages/cli/templates}/commands/arch-review.md +0 -0
  230. /package/{templates → packages/cli/templates}/commands/lint.md +0 -0
  231. /package/{templates → packages/cli/templates}/commands/quality-review.md +0 -0
  232. /package/{templates → packages/cli/templates}/hooks/inject-timestamp.sh +0 -0
  233. /package/{templates → packages/cli/templates}/lib/common.sh +0 -0
  234. /package/{templates → packages/cli/templates}/lib/jq-fallback.sh +0 -0
  235. /package/{templates → packages/cli/templates}/markdownlint.jsonc +0 -0
package/promptfoo.yaml ADDED
@@ -0,0 +1,3270 @@
1
+ # Promptfoo eval config for SAFEWORD guide quality
2
+ # Run with: npm run eval
3
+ # View results: npm run eval:view
4
+
5
+ description: 'SAFEWORD Guide Quality Evals'
6
+
7
+ providers:
8
+ - id: anthropic:messages:claude-haiku-4-20250514
9
+ config:
10
+ temperature: 0
11
+
12
+ prompts:
13
+ - |
14
+ You are an AI coding assistant following the SAFEWORD framework.
15
+
16
+ <context>
17
+ {{context}}
18
+ </context>
19
+
20
+ <user_request>
21
+ {{input}}
22
+ </user_request>
23
+
24
+ Respond following the guidelines in the context.
25
+
26
+ tests:
27
+ # =============================================================================
28
+ # ARCHITECTURE GUIDE (13 tests)
29
+ # =============================================================================
30
+
31
+ - description: 'arch-001: Create comprehensive architecture doc'
32
+ vars:
33
+ input: 'Create an architecture doc for a new React + Supabase project'
34
+ context: |
35
+ file_path: framework/guides/architecture-guide.md (excerpt)
36
+ ---
37
+ ## Required Sections
38
+
39
+ 1. Header (Version, Status)
40
+ 2. Table of Contents
41
+ 3. Overview
42
+ 4. Data Principles
43
+ 5. Data Model
44
+ 6. Components
45
+ 7. Data Flows
46
+ 8. Key Decisions (What/Why/Trade-off/Alternatives)
47
+ 9. Best Practices
48
+ 10. Migration Strategy
49
+ 11. Code References
50
+ assert:
51
+ - type: llm-rubric
52
+ value: |
53
+ The response should create an architecture doc with most required sections.
54
+ EXCELLENT: All 11 sections with What/Why/Trade-off in decisions
55
+ ACCEPTABLE: 9+ sections present
56
+ POOR: <9 sections or missing Key Decisions
57
+
58
+ - description: 'arch-002: Choose Architecture Doc for tech decision'
59
+ vars:
60
+ input: 'I need to document our decision to use PostgreSQL instead of MongoDB'
61
+ context: |
62
+ file_path: framework/SAFEWORD.md (excerpt)
63
+ ---
64
+ ## When to Update Architecture Docs
65
+
66
+ **Update ARCHITECTURE.md when:**
67
+ - Making technology choices (state management, database, frameworks)
68
+ - Designing data models or schemas
69
+ - Establishing project-wide patterns/conventions
70
+
71
+ **Use Design Doc instead when:**
72
+ - Implementing a specific feature
73
+ - Feature-specific technical decisions
74
+
75
+ **Tie-breaking rule:** If decision affects 2+ features → Architecture doc. If feature-specific only → Design doc.
76
+ assert:
77
+ - type: llm-rubric
78
+ value: |
79
+ The response should recommend creating or updating an Architecture Doc (not a Design Doc).
80
+ EXCELLENT: Correctly identifies Architecture Doc AND explains why (database choice affects whole project)
81
+ ACCEPTABLE: Correctly identifies Architecture Doc
82
+ POOR: Suggests Design Doc or is unclear
83
+
84
+ - description: 'arch-003: Choose Design Doc for feature'
85
+ vars:
86
+ input: 'I need to document how the user profile feature will work'
87
+ context: |
88
+ file_path: framework/SAFEWORD.md (excerpt)
89
+ ---
90
+ ## When to Update Architecture Docs
91
+
92
+ **Update ARCHITECTURE.md when:**
93
+ - Making technology choices (state management, database, frameworks)
94
+ - Designing data models or schemas
95
+ - Establishing project-wide patterns/conventions
96
+
97
+ **Use Design Doc instead when:**
98
+ - Implementing a specific feature
99
+ - Feature-specific technical decisions
100
+
101
+ **Tie-breaking rule:** If decision affects 2+ features → Architecture doc. If feature-specific only → Design doc.
102
+ assert:
103
+ - type: llm-rubric
104
+ value: |
105
+ The response should recommend creating a Design Doc (not Architecture Doc).
106
+ EXCELLENT: Correctly identifies Design Doc AND explains why (single feature)
107
+ ACCEPTABLE: Correctly identifies Design Doc
108
+ POOR: Suggests Architecture Doc
109
+
110
+ - description: 'arch-004: Document Why, not just What'
111
+ vars:
112
+ input: 'Document our decision to use Redis for caching'
113
+ context: |
114
+ file_path: framework/guides/architecture-guide.md (excerpt)
115
+ ---
116
+ ## Key Decisions Format
117
+
118
+ Every decision must include:
119
+ - **What**: The decision made
120
+ - **Why**: Rationale with specifics (numbers, metrics)
121
+ - **Trade-off**: What we gave up
122
+ - **Alternatives Considered**: Other options evaluated
123
+ assert:
124
+ - type: llm-rubric
125
+ value: |
126
+ The response should include What, Why, Trade-off, and Alternatives.
127
+ EXCELLENT: All 4 fields with specifics (numbers, metrics, concrete alternatives)
128
+ ACCEPTABLE: What/Why/Trade-off present
129
+ POOR: Missing Why or Trade-off
130
+
131
+ - description: 'arch-005: Apply tie-breaker for multi-feature'
132
+ vars:
133
+ input: 'I need to document adding a caching layer that will be used by multiple features'
134
+ context: |
135
+ file_path: framework/SAFEWORD.md (excerpt)
136
+ ---
137
+ **Tie-breaking rule:** If decision affects 2+ features → Architecture doc. If feature-specific only → Design doc.
138
+ assert:
139
+ - type: llm-rubric
140
+ value: |
141
+ The response should choose Architecture Doc (affects 2+ features).
142
+ EXCELLENT: Architecture Doc + cites tie-breaking rule (affects 2+ features)
143
+ ACCEPTABLE: Architecture Doc
144
+ POOR: Design Doc
145
+
146
+ - description: 'arch-006: Include code references'
147
+ vars:
148
+ input: 'Document the authentication flow architecture, including where the code lives'
149
+ context: |
150
+ file_path: framework/guides/architecture-guide.md (excerpt)
151
+ ---
152
+ ## Code References
153
+
154
+ Include paths to implementation:
155
+ - File paths with line ranges when helpful
156
+ - Function/class names
157
+ - Keep references current when code changes
158
+ assert:
159
+ - type: llm-rubric
160
+ value: |
161
+ The response should include code references with file paths.
162
+ EXCELLENT: 2+ code references with file:line format or function names
163
+ ACCEPTABLE: At least 1 file path reference
164
+ POOR: No code references
165
+
166
+ - description: 'arch-007: Consolidate ADRs'
167
+ vars:
168
+ input: 'Our project has 50 ADR files in docs/adr/. What should we do?'
169
+ context: |
170
+ file_path: framework/guides/architecture-guide.md (excerpt)
171
+ ---
172
+ ## Single Architecture Doc
173
+
174
+ Consolidate scattered ADRs into one ARCHITECTURE.md:
175
+ 1. Create ARCHITECTURE.md at project root
176
+ 2. Consolidate active decisions
177
+ 3. Archive old ADRs
178
+ assert:
179
+ - type: llm-rubric
180
+ value: |
181
+ The response should recommend consolidating into single ARCHITECTURE.md.
182
+ EXCELLENT: Recommends consolidation + provides migration steps
183
+ ACCEPTABLE: Recommends consolidation
184
+ POOR: Suggests keeping separate ADRs
185
+
186
+ - description: 'arch-008: Include versioning'
187
+ vars:
188
+ input: 'Create architecture doc header for a new project'
189
+ context: |
190
+ file_path: framework/guides/architecture-guide.md (excerpt)
191
+ ---
192
+ ## Header Format
193
+
194
+ Status values: Design | Production | Proposed | Deprecated
195
+ Version: Major.Minor (bump major for breaking changes)
196
+ assert:
197
+ - type: llm-rubric
198
+ value: |
199
+ The response should include Version and Status in header.
200
+ EXCELLENT: Version + Status using valid values
201
+ ACCEPTABLE: Version and Status present
202
+ POOR: Missing Version or Status
203
+
204
+ - description: 'arch-009: Check for user stories before implementing'
205
+ vars:
206
+ input: 'Implement user authentication for my app'
207
+ context: |
208
+ file_path: framework/SAFEWORD.md (excerpt)
209
+ ---
210
+ ## Feature Development Workflow
211
+
212
+ 1. User Stories - Check if they exist, create if not
213
+ 2. Test Definitions - Check if they exist, create if not
214
+ 3. Design Doc (complex features only)
215
+ 4. Follow STRICT TDD Workflow
216
+ assert:
217
+ - type: llm-rubric
218
+ value: |
219
+ The response should check for user stories/test definitions before implementation.
220
+ EXCELLENT: Checks for user stories + test definitions + offers to create if missing
221
+ ACCEPTABLE: Mentions TDD workflow
222
+ POOR: Jumps straight to implementation
223
+
224
+ - description: 'arch-010: Suggest updating architecture doc after tech change'
225
+ vars:
226
+ input: 'I just added PostgreSQL to our project that was using SQLite'
227
+ context: |
228
+ file_path: framework/SAFEWORD.md (excerpt)
229
+ ---
230
+ ## When to Update Architecture Docs
231
+
232
+ **Update ARCHITECTURE.md when:**
233
+ - Making technology choices (state management, database, frameworks)
234
+ assert:
235
+ - type: llm-rubric
236
+ value: |
237
+ The response should suggest updating architecture doc.
238
+ EXCELLENT: Recommends architecture doc update + explains why (tech choice)
239
+ ACCEPTABLE: Mentions documenting the change
240
+ POOR: No mention of architecture doc
241
+
242
+ - description: "arch-011: Don't suggest update for bug fix"
243
+ vars:
244
+ input: 'I just fixed a bug in the login form validation'
245
+ context: |
246
+ file_path: framework/SAFEWORD.md (excerpt)
247
+ ---
248
+ ## When to Update Architecture Docs
249
+
250
+ **Update ARCHITECTURE.md when:**
251
+ - Making technology choices
252
+ - Designing data models
253
+ - Establishing project-wide patterns
254
+
255
+ **NOT for:** Bug fixes, minor refactors, feature tweaks
256
+ assert:
257
+ - type: llm-rubric
258
+ value: |
259
+ The response should NOT suggest updating architecture doc.
260
+ EXCELLENT: No mention of architecture doc (bug fix doesn't warrant it)
261
+ ACCEPTABLE: Asks if it's architectural, then correctly says no
262
+ POOR: Suggests updating architecture doc
263
+
264
+ - description: 'arch-012: Catch missing rationale anti-pattern'
265
+ vars:
266
+ input: |
267
+ Review this architecture doc section:
268
+ ### State Management
269
+ **What**: Using Zustand for global state
270
+ context: |
271
+ file_path: framework/guides/architecture-guide.md (excerpt)
272
+ ---
273
+ ## Common Mistakes
274
+
275
+ - Missing "Why" in decisions
276
+ - No trade-offs documented
277
+ - Vague rationale without specifics
278
+ assert:
279
+ - type: llm-rubric
280
+ value: |
281
+ The response should identify missing "Why" and "Trade-off".
282
+ EXCELLENT: Identifies missing Why/Trade-off + suggests adding rationale with specifics
283
+ ACCEPTABLE: Notes decision is incomplete
284
+ POOR: Says doc looks fine
285
+
286
+ - description: 'arch-013: Create file in correct location'
287
+ vars:
288
+ input: 'Create a design doc for the payment flow feature'
289
+ context: |
290
+ file_path: framework/SAFEWORD.md (excerpt)
291
+ ---
292
+ ## Planning Documentation Location
293
+
294
+ - Design docs → `.agents/planning/design/`
295
+ assert:
296
+ - type: llm-rubric
297
+ value: |
298
+ The response should create file in .agents/planning/design/.
299
+ EXCELLENT: Creates in `.agents/planning/design/` + follows naming convention
300
+ ACCEPTABLE: Creates in a planning/design directory
301
+ POOR: Creates at root or wrong location
302
+
303
+ - description: 'arch-014: Layer definitions for new project'
304
+ vars:
305
+ input: "I'm starting a new TypeScript project. How should I organize my code into layers?"
306
+ context: |
307
+ file_path: framework/guides/architecture-guide.md (excerpt)
308
+ ---
309
+ ### Layer Definitions
310
+
311
+ | Layer | Directory | Responsibility |
312
+ |-------|-----------|----------------|
313
+ | app | `src/app/` | UI, routing, composition |
314
+ | domain | `src/domain/` | Business rules, pure logic |
315
+ | infra | `src/infra/` | IO, APIs, DB, external SDKs |
316
+ | shared | `src/shared/` | Utilities usable by all layers |
317
+ assert:
318
+ - type: llm-rubric
319
+ value: |
320
+ The response should explain the 4-layer structure.
321
+ EXCELLENT: Lists all 4 layers (app, domain, infra, shared) with directories and responsibilities
322
+ ACCEPTABLE: Lists layers with general descriptions
323
+ POOR: Vague advice or missing layers
324
+
325
+ - description: 'arch-015: Dependency rules - forbidden import'
326
+ vars:
327
+ input: 'Can my domain layer import from the app layer?'
328
+ context: |
329
+ file_path: framework/guides/architecture-guide.md (excerpt)
330
+ ---
331
+ ### Allowed Dependencies
332
+
333
+ | From | To | Allowed | Rationale |
334
+ |------|-----|---------|-----------|
335
+ | domain | app | ❌ | Domain must be framework-agnostic |
336
+ | domain | infra | ❌ | Domain contains pure logic only |
337
+ | domain | shared | ✅ | Utilities available everywhere |
338
+ assert:
339
+ - type: llm-rubric
340
+ value: |
341
+ The response should identify this as a forbidden import.
342
+ EXCELLENT: No - domain cannot import from app, explains rationale (framework-agnostic)
343
+ ACCEPTABLE: Says it's not allowed
344
+ POOR: Says it's allowed or gives ambiguous answer
345
+
346
+ - description: 'arch-016: Edge case - brownfield adoption'
347
+ vars:
348
+ input: 'I have an existing codebase with lots of boundary violations. How do I adopt layer boundaries without breaking everything?'
349
+ context: |
350
+ file_path: framework/guides/architecture-guide.md (excerpt)
351
+ ---
352
+ ### Edge Cases
353
+
354
+ | Scenario | Solution |
355
+ |----------|----------|
356
+ | Brownfield adoption | Start with warnings-only mode, fix violations incrementally, then enforce |
357
+ assert:
358
+ - type: llm-rubric
359
+ value: |
360
+ The response should describe incremental adoption.
361
+ EXCELLENT: 3-step path: warnings-only → fix incrementally → enforce; mentions not breaking existing code
362
+ ACCEPTABLE: Suggests gradual adoption approach
363
+ POOR: Suggests immediate enforcement or ignores existing violations
364
+
365
+ - description: 'arch-017: ESLint boundaries setup'
366
+ vars:
367
+ input: 'How do I set up eslint-plugin-boundaries to enforce my layer rules?'
368
+ context: |
369
+ file_path: framework/guides/architecture-guide.md (excerpt)
370
+ ---
371
+ ### Enforcement with eslint-plugin-boundaries
372
+
373
+ **Setup:**
374
+ 1. Install: `npm install --save-dev eslint-plugin-boundaries`
375
+ 2. Add to `eslint.config.mjs` with boundaries/element-types rules
376
+ 3. Define layers in `ARCHITECTURE.md`
377
+ 4. Errors appear in IDE + CI automatically
378
+ assert:
379
+ - type: llm-rubric
380
+ value: |
381
+ The response should explain ESLint boundaries setup.
382
+ EXCELLENT: Lists install command, config example with element-types rules, mentions IDE + CI integration
383
+ ACCEPTABLE: Provides basic setup steps
384
+ POOR: Vague or missing key configuration
385
+
386
+ - description: 'arch-018: LLM arch review - detect god module'
387
+ vars:
388
+ input: 'Review this file for architectural issues: UserService.ts with 800 lines and 15 dependencies'
389
+ context: |
390
+ file_path: framework/prompts/arch-review.md (excerpt)
391
+ ---
392
+ ## Check for:
393
+
394
+ 1. **Misplaced logic** - Business rules in wrong layer?
395
+ 2. **God module** - Too many responsibilities (>10 dependents or >500 lines)?
396
+ 3. **Leaky abstraction** - Implementation details exposed to callers?
397
+ 4. **Tight coupling** - Changes would cascade unnecessarily?
398
+ 5. **Boundary violation** - Import from disallowed layer?
399
+
400
+ ## Response Format
401
+
402
+ Return JSON with verdict: "clean" | "minor" | "refactor_needed"
403
+ assert:
404
+ - type: llm-rubric
405
+ value: |
406
+ The response should identify god module issue.
407
+ EXCELLENT: Identifies god module (>500 lines, >10 deps), returns refactor_needed verdict with fix suggestion
408
+ ACCEPTABLE: Notes the file is too large
409
+ POOR: Says it's clean or misses the issue
410
+
411
+ - description: 'arch-019: Pre-commit hook behavior'
412
+ vars:
413
+ input: 'What happens when I try to commit code with a boundary violation?'
414
+ context: |
415
+ file_path: framework/scripts/setup-safeword.sh (excerpt)
416
+ ---
417
+ # Pre-commit hook runs:
418
+ # 1. ESLint on staged files (--max-warnings 0 || exit 1)
419
+ # 2. arch-review.sh on staged files
420
+ # - refactor_needed verdict → exit 1 (blocked)
421
+ # - minor verdict → exit 0 (allowed with warning)
422
+ # - clean verdict → exit 0 (allowed)
423
+ assert:
424
+ - type: llm-rubric
425
+ value: |
426
+ The response should explain commit blocking behavior.
427
+ EXCELLENT: Commit blocked by ESLint errors OR refactor_needed; minor issues warn but allow
428
+ ACCEPTABLE: Explains that violations block commit
429
+ POOR: Says commit always succeeds or always fails
430
+
431
+ - description: 'arch-020: CI architecture check workflow'
432
+ vars:
433
+ input: 'How do I set up CI to check for architecture violations in PRs?'
434
+ context: |
435
+ file_path: framework/templates/ci/architecture-check.yml (excerpt)
436
+ ---
437
+ # Architecture Check CI Workflow
438
+ # Steps:
439
+ # 1. Type check (tsc --noEmit)
440
+ # 2. ESLint (includes eslint-plugin-boundaries)
441
+ # 3. LLM architecture review (optional, non-blocking)
442
+ #
443
+ # Usage:
444
+ # 1. Copy to .github/workflows/architecture-check.yml
445
+ # 2. Set ANTHROPIC_API_KEY secret (optional, for LLM review)
446
+ assert:
447
+ - type: llm-rubric
448
+ value: |
449
+ The response should explain CI setup for architecture checks.
450
+ EXCELLENT: Copy template to .github/workflows/, runs tsc + eslint (blocking) + LLM (non-blocking)
451
+ ACCEPTABLE: Mentions copying workflow file and running checks
452
+ POOR: Vague or missing key steps
453
+
454
+ - description: 'arch-021: Architecture template sections'
455
+ vars:
456
+ input: "I'm starting a new project. What sections should my ARCHITECTURE.md have?"
457
+ context: |
458
+ file_path: framework/templates/architecture-template.md (excerpt)
459
+ ---
460
+ ## Table of Contents
461
+
462
+ - [Overview](#overview)
463
+ - [Layers & Boundaries](#layers--boundaries)
464
+ - [Data Model](#data-model)
465
+ - [Key Decisions](#key-decisions)
466
+ - [Best Practices](#best-practices)
467
+ - [Migration Strategy](#migration-strategy)
468
+ assert:
469
+ - type: llm-rubric
470
+ value: |
471
+ The response should list required ARCHITECTURE.md sections.
472
+ EXCELLENT: Lists all 6 sections (Overview, Layers, Data Model, Decisions, Best Practices, Migration)
473
+ ACCEPTABLE: Lists most sections
474
+ POOR: Missing key sections or vague
475
+
476
+ # =============================================================================
477
+ # CODE PHILOSOPHY (14 tests)
478
+ # =============================================================================
479
+
480
+ - description: 'phil-001: JSON summary at end of response'
481
+ vars:
482
+ input: 'Fix the typo in README.md'
483
+ context: |
484
+ file_path: framework/guides/code-philosophy.md (excerpt)
485
+ ---
486
+ ## Response Format
487
+ At the end of EVERY response, include a JSON summary:
488
+ ```json
489
+ {"proposedChanges": boolean, "madeChanges": boolean, "askedQuestion": boolean}
490
+ ```
491
+ assert:
492
+ - type: llm-rubric
493
+ value: |
494
+ The response should end with JSON containing madeChanges: true.
495
+ EXCELLENT: Valid JSON with madeChanges: true, other fields accurate
496
+ ACCEPTABLE: JSON present with correct madeChanges value
497
+ POOR: Missing JSON or madeChanges: false when edit was made
498
+
499
+ - description: 'phil-002: Avoid bloat - push back on unnecessary feature'
500
+ vars:
501
+ input: 'Can you add a configuration file system so users can customize the button colors?'
502
+ context: |
503
+ file_path: framework/guides/code-philosophy.md (excerpt)
504
+ ---
505
+ ## Code Philosophy
506
+ - **AVOID BLOAT** - Simple, focused solutions over complex ones
507
+
508
+ **Bloat examples (avoid these):**
509
+ | ❌ Bloat | ✅ Instead |
510
+ |----------|-----------|
511
+ | Config file for 2 options | Hardcode or simple params |
512
+ | "Future-proofing" unused code paths | Delete, add when needed |
513
+
514
+ **When to push back:** If a feature request would add >50 lines for a "nice to have", ask: "Is this essential now, or can we add it later?"
515
+ assert:
516
+ - type: llm-rubric
517
+ value: |
518
+ The response should push back or question the necessity of a config system for button colors.
519
+ EXCELLENT: Questions necessity, suggests simpler alternative (CSS variables, props), asks if essential now
520
+ ACCEPTABLE: Expresses concern about complexity or suggests simpler approach
521
+ POOR: Immediately implements config file system without questioning
522
+
523
+ - description: 'phil-003: Self-documenting code with clear naming'
524
+ vars:
525
+ input: 'Write a function that calculates the total price with tax'
526
+ context: |
527
+ file_path: framework/guides/code-philosophy.md (excerpt)
528
+ ---
529
+ ## Code Philosophy
530
+ - **Self-documenting code** - Minimal inline comments, clear naming and structure
531
+
532
+ **Naming examples:**
533
+ | ❌ Bad | ✅ Good |
534
+ |--------|---------|
535
+ | `calcTot` | `calculateTotalWithTax` |
536
+ | `d`, `tmp`, `data` | `userProfile`, `pendingOrders` |
537
+ assert:
538
+ - type: llm-rubric
539
+ value: |
540
+ The response should use descriptive function name, no unnecessary comments.
541
+ EXCELLENT: Descriptive name (e.g., `calculateTotalWithTax`) + no comments on obvious code
542
+ ACCEPTABLE: Descriptive name with some redundant comments
543
+ POOR: Abbreviated name (e.g., `calcTot`) or comments explaining obvious code
544
+
545
+ - description: 'phil-004: Explicit error handling'
546
+ vars:
547
+ input: "Here's my code: try { await saveUser(data) } catch (e) { console.log(e) }. Is this okay?"
548
+ context: |
549
+ file_path: framework/guides/code-philosophy.md (excerpt)
550
+ ---
551
+ ## Code Philosophy
552
+ - **Explicit error handling** - NEVER suppress or swallow errors silently
553
+
554
+ **Error handling examples:**
555
+ | ❌ Bad | ✅ Good |
556
+ |--------|---------|
557
+ | `catch (e) {}` (swallowed) | `catch (e) { throw new Error(\`Failed to read ${filePath}: ${e.message}\`) }` |
558
+ | `catch (e) { console.log(e) }` | `catch (e) { logger.error('Payment failed', { userId, amount, error: e }) }` |
559
+ assert:
560
+ - type: llm-rubric
561
+ value: |
562
+ The response should identify the error handling as inadequate.
563
+ EXCELLENT: Identifies console.log as insufficient, suggests proper logging with context or re-throwing
564
+ ACCEPTABLE: Says error handling needs improvement
565
+ POOR: Says the code is fine or doesn't address error handling
566
+
567
+ - description: 'phil-005: Verify documentation before using API'
568
+ vars:
569
+ input: 'Add a feature using the latest React Server Components API'
570
+ context: |
571
+ file_path: framework/guides/code-philosophy.md (excerpt)
572
+ ---
573
+ ## Documentation Verification (CRITICAL)
574
+ - **Always look up current documentation** for libraries, tools, and frameworks
575
+ - **NEVER assume features exist** - Training data is at least 1 year old
576
+
577
+ **How to verify:**
578
+ 1. Check `package.json` for installed version
579
+ 2. Use Context7 MCP or official docs for current API
580
+ assert:
581
+ - type: llm-rubric
582
+ value: |
583
+ The response should verify React version or look up current docs.
584
+ EXCELLENT: Checks package.json for React version OR uses Context7/docs lookup
585
+ ACCEPTABLE: Mentions need to verify version
586
+ POOR: Assumes API exists without verification
587
+
588
+ - description: 'phil-006: TDD workflow - test first'
589
+ vars:
590
+ input: 'Add a function that validates email addresses'
591
+ context: |
592
+ file_path: framework/guides/code-philosophy.md (excerpt)
593
+ ---
594
+ ## Testing Philosophy
595
+
596
+ **Test-Driven Development (TDD):**
597
+ - Write tests BEFORE implementing features (RED → GREEN → REFACTOR)
598
+ - Tests define expected behavior, code makes them pass
599
+ assert:
600
+ - type: llm-rubric
601
+ value: |
602
+ The response should write failing test first, then implement.
603
+ EXCELLENT: Writes test first, runs it (RED), then implements (GREEN)
604
+ ACCEPTABLE: Mentions TDD approach, writes test
605
+ POOR: Implements function without writing test first
606
+
607
+ - description: 'phil-007: Self-testing before completion'
608
+ vars:
609
+ input: 'Fix the login button bug'
610
+ context: |
611
+ file_path: framework/guides/code-philosophy.md (excerpt)
612
+ ---
613
+ ## Testing Philosophy
614
+
615
+ **Always test what you build** - Run tests yourself before completion. Don't ask the user to verify.
616
+ assert:
617
+ - type: llm-rubric
618
+ value: |
619
+ The response should run tests and report results, not ask user to verify.
620
+ EXCELLENT: Runs tests, reports "Tests pass ✓", doesn't ask user to verify
621
+ ACCEPTABLE: Mentions running tests
622
+ POOR: Asks user to test or verify the fix
623
+
624
+ - description: 'phil-008: Debug logging hygiene'
625
+ vars:
626
+ input: 'Debug why this test is failing'
627
+ context: |
628
+ file_path: framework/guides/code-philosophy.md (excerpt)
629
+ ---
630
+ ## Debugging & Troubleshooting
631
+
632
+ **Debug Logging:**
633
+ - When debugging, log **actual vs expected** values
634
+ - Remove debug logging after fixing
635
+ assert:
636
+ - type: llm-rubric
637
+ value: |
638
+ The response should add logs showing actual vs expected, remove after fix.
639
+ EXCELLENT: Logs actual vs expected values, removes debug logs after fix
640
+ ACCEPTABLE: Logs something useful for debugging
641
+ POOR: Leaves debug logs in code after fix
642
+
643
+ - description: 'phil-009: Cross-platform paths'
644
+ vars:
645
+ input: 'Create a function that builds a file path from directory and filename'
646
+ context: |
647
+ file_path: framework/guides/code-philosophy.md (excerpt)
648
+ ---
649
+ ## Cross-Platform Development
650
+ - Never assume Unix-style paths (`/`) - handle both `/` and `\`
651
+
652
+ ```javascript
653
+ // ❌ Bad: dir + '/' + filename
654
+ // ✅ Good: path.join(dir, filename)
655
+ ```
656
+ assert:
657
+ - type: llm-rubric
658
+ value: |
659
+ The response should use path.join() or equivalent, not string concatenation.
660
+ EXCELLENT: Uses path.join() or path.resolve(), no hardcoded separators
661
+ ACCEPTABLE: Mentions cross-platform concerns
662
+ POOR: Uses string concat with hardcoded '/' or '\'
663
+
664
+ - description: 'phil-010: Follow best practices'
665
+ vars:
666
+ input: 'Create a React component for a dropdown menu'
667
+ context: |
668
+ file_path: framework/guides/code-philosophy.md (excerpt)
669
+ ---
670
+ ## Best Practices (Always Apply)
671
+ - **Tool-specific best practices** - Use libraries/frameworks as intended
672
+ - **UX best practices** - Prioritize user experience
673
+ assert:
674
+ - type: llm-rubric
675
+ value: |
676
+ The response should follow React conventions (hooks, controlled components).
677
+ EXCELLENT: Follows React best practices + mentions why (controlled vs uncontrolled)
678
+ ACCEPTABLE: Follows conventions without explicit mention
679
+ POOR: Ignores React conventions (e.g., direct DOM manipulation)
680
+
681
+ - description: 'phil-011: Self-review before completion'
682
+ vars:
683
+ input: "I've implemented the feature"
684
+ context: |
685
+ file_path: framework/guides/code-philosophy.md (excerpt)
686
+ ---
687
+ ## Self-Review Checklist
688
+ Before completing any work, verify:
689
+ - ✓ Is it correct? Will it actually work?
690
+ - ✓ Is it elegant? Does it avoid bloat?
691
+ - ✓ Does it follow best practices?
692
+ - ✓ Are you using the right docs/versions?
693
+ - ✓ Have you tested the user-facing functionality?
694
+ assert:
695
+ - type: llm-rubric
696
+ value: |
697
+ The response should run self-review checklist before declaring done.
698
+ EXCELLENT: Explicitly runs through checklist items, mentions test results
699
+ ACCEPTABLE: Mentions verification before completion
700
+ POOR: Declares done without any self-review
701
+
702
+ - description: 'phil-012: Question-asking protocol'
703
+ vars:
704
+ input: 'How should I structure the database schema?'
705
+ context: |
706
+ file_path: framework/guides/code-philosophy.md (excerpt)
707
+ ---
708
+ ## Asking Questions
709
+ - Only ask questions when you genuinely can't find the answer
710
+ - **When asking, show what you tried:** "I checked X and Y but couldn't determine Z. What's your preference?"
711
+ assert:
712
+ - type: llm-rubric
713
+ value: |
714
+ The response should ask after showing research attempt, focus on domain preferences.
715
+ EXCELLENT: Shows what was researched + asks domain-specific question
716
+ ACCEPTABLE: Asks relevant question about domain preferences
717
+ POOR: Asks without showing any research attempt
718
+
719
+ - description: 'phil-013: Tooling currency'
720
+ vars:
721
+ input: "I'm about to start a new project. Should I update my CLI tools?"
722
+ context: |
723
+ file_path: framework/guides/code-philosophy.md (excerpt)
724
+ ---
725
+ ## Tools & CLIs
726
+
727
+ **Keep these updated** (check before starting new projects):
728
+ - GitHub CLI (`gh`)
729
+ - AWS CLI
730
+ - Railway CLI
731
+ - PostHog CLI
732
+
733
+ **Update workflow:**
734
+ 1. Check current version: `gh --version`, `aws --version`, etc.
735
+ 2. Check for updates
736
+ 3. Review changelog for breaking changes before major version updates
737
+ 4. If breaking changes affect your workflow, pin to current version
738
+ assert:
739
+ - type: llm-rubric
740
+ value: |
741
+ The response should recommend checking/updating critical CLIs with workflow.
742
+ EXCELLENT: Recommends checking versions, lists critical CLIs, mentions breaking changes review, version pinning
743
+ ACCEPTABLE: Suggests updating tools before starting
744
+ POOR: Ignores tooling currency or says "no need to update"
745
+
746
+ - description: 'phil-014: Git workflow - atomic commits'
747
+ vars:
748
+ input: 'Fix the login bug and add a new feature (two separate tasks)'
749
+ context: |
750
+ file_path: framework/guides/code-philosophy.md (excerpt)
751
+ ---
752
+ ## Git Workflow
753
+ - Commit often to checkpoint progress
754
+ - Make atomic commits (one logical change per commit)
755
+
756
+ ```
757
+ # ❌ Bad: "misc fixes"
758
+ # ✅ Good: "fix: login button not responding to clicks"
759
+ ```
760
+ assert:
761
+ - type: llm-rubric
762
+ value: |
763
+ The response should make separate commits for each task.
764
+ EXCELLENT: Separate atomic commits with descriptive messages for each task
765
+ ACCEPTABLE: Commits with clear messages
766
+ POOR: Single commit for unrelated changes or vague message like "misc fixes"
767
+
768
+ # =============================================================================
769
+ # TESTING METHODOLOGY (13 tests)
770
+ # =============================================================================
771
+
772
+ - description: 'test-001: Choose fastest effective test type'
773
+ vars:
774
+ input: 'I need to test a pure function that calculates tax. What test type should I use?'
775
+ context: |
776
+ file_path: framework/guides/testing-methodology.md (excerpt)
777
+ ---
778
+ ## Testing Principles
779
+
780
+ **Optimization rule:** Test with the fastest test type that can catch the bug.
781
+
782
+ ### Test Speed Hierarchy (Fast → Slow)
783
+ ```
784
+ Unit (milliseconds) ← Pure functions, no I/O
785
+ Integration (seconds) ← Components with dependencies
786
+ E2E (10+ seconds) ← Full user flows
787
+ ```
788
+
789
+ **Decision tree:**
790
+ 1. Pure function with no I/O? → Unit test
791
+ 2. Component with database/API? → Integration test
792
+ 3. Full user flow? → E2E test
793
+ assert:
794
+ - type: llm-rubric
795
+ value: |
796
+ The response should recommend unit tests for a pure function.
797
+ EXCELLENT: Recommends unit test AND explains why (pure function, fastest, no I/O)
798
+ ACCEPTABLE: Recommends unit test
799
+ POOR: Recommends integration or E2E test
800
+
801
+ - description: 'test-002: Component vs flow testing'
802
+ vars:
803
+ input: 'I want to test a React header component. Should I use E2E or integration tests?'
804
+ context: |
805
+ file_path: framework/guides/testing-methodology.md (excerpt)
806
+ ---
807
+ ## Test Type Selection
808
+
809
+ - **Integration test**: Single component behavior, interactions
810
+ - **E2E test**: Multi-page flows, critical user journeys
811
+ assert:
812
+ - type: llm-rubric
813
+ value: |
814
+ The response should recommend integration test for component.
815
+ EXCELLENT: Integration test for component behavior, E2E only for multi-page flows
816
+ ACCEPTABLE: Distinguishes component vs flow
817
+ POOR: Suggests E2E for single component
818
+
819
+ - description: 'test-003: Identify inverted test pyramid'
820
+ vars:
821
+ input: 'I have 50 E2E tests and 20 integration tests. Is this a good ratio?'
822
+ context: |
823
+ file_path: framework/guides/testing-methodology.md (excerpt)
824
+ ---
825
+ ## Test Distribution
826
+
827
+ **Red flag:** More E2E than integration/unit tests = slow feedback loop
828
+
829
+ **Target:** Most tests should be fast (unit/integration)
830
+ assert:
831
+ - type: llm-rubric
832
+ value: |
833
+ The response should identify the red flag of an inverted test pyramid.
834
+ EXCELLENT: Red flag - more E2E than integration is too slow, suggests adding integration tests
835
+ ACCEPTABLE: Notes ratio concern
836
+ POOR: Accepts inverted ratio
837
+
838
+ - description: 'test-004: TDD RED phase - test must fail first'
839
+ vars:
840
+ input: "I wrote a test and it's passing. Should I implement the code now?"
841
+ context: |
842
+ file_path: framework/guides/testing-methodology.md (excerpt)
843
+ ---
844
+ ## TDD Phases
845
+
846
+ **RED:** Write failing test first
847
+ - Test MUST fail before implementation
848
+ - Verify failure message is meaningful
849
+ assert:
850
+ - type: llm-rubric
851
+ value: |
852
+ The response should identify the TDD violation - the test must fail first.
853
+ EXCELLENT: RED phase violation - test must fail first, verify failure before implementation
854
+ ACCEPTABLE: Notes test should fail first
855
+ POOR: Accepts passing test before implementation
856
+
857
+ - description: 'test-005: Decision tree for AI quality testing'
858
+ vars:
859
+ input: 'I need to test narrative quality from my AI. What test type should I use?'
860
+ context: |
861
+ file_path: framework/guides/testing-methodology.md (excerpt)
862
+ ---
863
+ ## Test Type Decision Tree
864
+
865
+ 1. Testing AI content quality? → LLM Evaluation
866
+ 2. Pure function? → Unit test
867
+ 3. Component with dependencies? → Integration test
868
+ 4. Full user flow? → E2E test
869
+ assert:
870
+ - type: llm-rubric
871
+ value: |
872
+ The response should use the decision tree and select LLM Evaluation.
873
+ EXCELLENT: Question 1 → AI content quality → LLM Evaluation
874
+ ACCEPTABLE: Selects LLM Eval
875
+ POOR: Suggests unit or E2E for AI quality
876
+
877
+ - description: 'test-006: CSS bug requires E2E'
878
+ vars:
879
+ input: 'I have a CSS layout bug. What test type should I use?'
880
+ context: |
881
+ file_path: framework/guides/testing-methodology.md (excerpt)
882
+ ---
883
+ ## Bug-to-Test Mapping
884
+
885
+ | Bug Type | Test Type |
886
+ |----------|-----------|
887
+ | CSS/Layout | E2E (requires real browser) |
888
+ | Business logic | Unit |
889
+ | API integration | Integration |
890
+ assert:
891
+ - type: llm-rubric
892
+ value: |
893
+ The response should map CSS to E2E.
894
+ EXCELLENT: E2E (requires real browser for CSS), references lookup table
895
+ ACCEPTABLE: Selects E2E
896
+ POOR: Suggests unit test for CSS
897
+
898
+ - description: 'test-007: E2E port isolation'
899
+ vars:
900
+ input: 'My E2E tests keep failing because they conflict with my dev server. How do I fix this?'
901
+ context: |
902
+ file_path: framework/guides/testing-methodology.md (excerpt)
903
+ ---
904
+ ## E2E Dev/Test Server Isolation
905
+
906
+ - Dev server: stable port (e.g., 3000)
907
+ - Test server: devPort + 1000 (e.g., 4000)
908
+ - Configure Playwright with isolated port
909
+ assert:
910
+ - type: llm-rubric
911
+ value: |
912
+ The response should suggest port isolation.
913
+ EXCELLENT: Dev on stable port, tests on devPort+1000, Playwright config with isolated port
914
+ ACCEPTABLE: Suggests separate ports
915
+ POOR: No isolation guidance
916
+
917
+ - description: 'test-008: LLM-as-judge for creative outputs'
918
+ vars:
919
+ input: "Should I use keyword matching to test if my AI response has a 'collaborative tone'?"
920
+ context: |
921
+ file_path: framework/guides/testing-methodology.md (excerpt)
922
+ ---
923
+ ## LLM Evaluations
924
+
925
+ For creative/qualitative outputs, use LLM-as-judge with rubric:
926
+ - EXCELLENT: [criteria]
927
+ - ACCEPTABLE: [criteria]
928
+ - POOR: [criteria]
929
+
930
+ **Avoid:** Brittle keyword matching for creative content
931
+ assert:
932
+ - type: llm-rubric
933
+ value: |
934
+ The response should recommend LLM-as-judge.
935
+ EXCELLENT: LLM-as-judge with rubric, avoid brittle keywords for creative outputs
936
+ ACCEPTABLE: Suggests rubric-based evaluation
937
+ POOR: Accepts keyword matching
938
+
939
+ - description: 'test-009: Cost controls for evals'
940
+ vars:
941
+ input: 'My LLM evals are getting expensive. How can I reduce costs?'
942
+ context: |
943
+ file_path: framework/guides/testing-methodology.md (excerpt)
944
+ ---
945
+ ## Cost Controls for Evals
946
+
947
+ - Cache static prompts
948
+ - Batch scenarios
949
+ - Schedule full evals (PR/weekly, not every commit)
950
+ assert:
951
+ - type: llm-rubric
952
+ value: |
953
+ The response should provide cost reduction strategies.
954
+ EXCELLENT: Cache static prompts, batch scenarios, schedule full evals (PR/weekly)
955
+ ACCEPTABLE: Mentions caching
956
+ POOR: No cost guidance
957
+
958
+ - description: 'test-010: Coverage goals'
959
+ vars:
960
+ input: 'What should I aim for in test coverage?'
961
+ context: |
962
+ file_path: framework/guides/testing-methodology.md (excerpt)
963
+ ---
964
+ ## Coverage Goals
965
+
966
+ - Unit: 80%+ for pure functions
967
+ - E2E: Critical multi-page flows
968
+ - "Critical" = user-facing, revenue-impacting, or data-integrity
969
+ assert:
970
+ - type: llm-rubric
971
+ value: |
972
+ The response should provide coverage guidance.
973
+ EXCELLENT: Unit 80%+ for pure functions, E2E for critical multi-page flows, defines "critical"
974
+ ACCEPTABLE: Provides coverage targets
975
+ POOR: Generic "100% coverage" advice
976
+
977
+ - description: 'test-011: Flaky test diagnosis'
978
+ vars:
979
+ input: 'My tests keep failing randomly. What should I check?'
980
+ context: |
981
+ file_path: framework/guides/testing-methodology.md (excerpt)
982
+ ---
983
+ ## Test Quality Practices
984
+
985
+ **Flakiness causes:**
986
+ - Async issues (use polling, not arbitrary timeouts)
987
+ - Test interdependence (each test should be independent)
988
+ - Shared state (use AAA pattern: Arrange-Act-Assert)
989
+ assert:
990
+ - type: llm-rubric
991
+ value: |
992
+ The response should identify flakiness causes.
993
+ EXCELLENT: Check async (polling vs arbitrary timeouts), independent tests, AAA pattern
994
+ ACCEPTABLE: Mentions async issues
995
+ POOR: Suggests skipping flaky tests
996
+
997
+ - description: 'test-012: CI/CD testing cadence'
998
+ vars:
999
+ input: 'When should I run different test types in CI?'
1000
+ context: |
1001
+ file_path: framework/guides/testing-methodology.md (excerpt)
1002
+ ---
1003
+ ## CI/CD Testing Cadence
1004
+
1005
+ - Unit + integration: every commit
1006
+ - E2E: on PR
1007
+ - LLM evals: scheduled (weekly or per-PR)
1008
+ assert:
1009
+ - type: llm-rubric
1010
+ value: |
1011
+ The response should provide CI cadence.
1012
+ EXCELLENT: Unit+integration every commit, E2E on PR, evals on schedule
1013
+ ACCEPTABLE: Distinguishes cadence by test type
1014
+ POOR: Run all tests on every commit
1015
+
1016
+ - description: 'test-014: Refuse to skip failing test'
1017
+ vars:
1018
+ input: 'The test is failing. Can I add .skip() to it so I can continue working?'
1019
+ context: |
1020
+ file_path: framework/guides/testing-methodology.md (excerpt)
1021
+ ---
1022
+ ## Test Integrity (CRITICAL)
1023
+
1024
+ **NEVER modify, skip, or delete tests without explicit human approval.**
1025
+
1026
+ Tests are the specification. When a test fails, the implementation is wrong—not the test.
1027
+
1028
+ ### Forbidden Actions (Require Approval)
1029
+ | Action | Why It's Forbidden |
1030
+ |--------|-------------------|
1031
+ | Adding `.skip()`, `.only()`, `xit()`, `.todo()` | Makes failures invisible |
1032
+ | Deleting tests you can't get passing | Removes coverage for edge cases |
1033
+
1034
+ ### What To Do Instead
1035
+ 1. **Test fails?** → Fix the implementation, not the test
1036
+ 2. **Test seems wrong?** → Explain why and ask before changing
1037
+ assert:
1038
+ - type: llm-rubric
1039
+ value: |
1040
+ The response should refuse to add .skip() and explain why.
1041
+ EXCELLENT: Refuses, explains it's forbidden (hides bugs), offers to fix implementation instead
1042
+ ACCEPTABLE: Says no to skipping, suggests fixing the code
1043
+ POOR: Agrees to add .skip() or doesn't address the concern
1044
+
1045
+ # =============================================================================
1046
+ # ZOMBIE PROCESS CLEANUP (7 tests)
1047
+ # =============================================================================
1048
+
1049
+ - description: 'zombie-001: Port-based cleanup for EADDRINUSE'
1050
+ vars:
1051
+ input: "I'm getting 'Error: listen EADDRINUSE: address already in use :::3000'. How do I fix it?"
1052
+ context: |
1053
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1054
+ ---
1055
+ ## Port-Based Cleanup (Safest for Multi-Project)
1056
+
1057
+ **When to use:** `EADDRINUSE`, `address already in use`, dev server won't start
1058
+
1059
+ **Port convention:** Dev and test instances use different ports:
1060
+ - **Dev port**: Project's configured port (e.g., 3000)
1061
+ - **Test port**: Dev port + 1000 (e.g., 4000)
1062
+
1063
+ **Decision rule:** If unsure which cleanup method to use → port-based first (safest), then project script, then tmux.
1064
+
1065
+ ```bash
1066
+ # Kill both dev server AND test server ports
1067
+ lsof -ti:3000 -ti:4000 | xargs kill -9 2>/dev/null
1068
+ ```
1069
+ assert:
1070
+ - type: llm-rubric
1071
+ value: |
1072
+ The response should provide port-based cleanup commands.
1073
+ EXCELLENT: `lsof -ti:3000 -ti:4000 | xargs kill -9` (both dev and test ports), explains why port-based is safe
1074
+ ACCEPTABLE: Provides kill command for at least dev port
1075
+ POOR: Suggests `killall node` or restarting computer
1076
+
1077
+ - description: 'zombie-002: Create cleanup script'
1078
+ vars:
1079
+ input: 'I need to clean up processes frequently. Should I create a script?'
1080
+ context: |
1081
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1082
+ ---
1083
+ ## Project-Specific Cleanup Script
1084
+
1085
+ Create `scripts/cleanup.sh`:
1086
+ ```bash
1087
+ DEV_PORT=3000
1088
+ TEST_PORT=$((DEV_PORT + 1000))
1089
+ PROJECT_DIR="$(pwd)"
1090
+
1091
+ lsof -ti:$DEV_PORT -ti:$TEST_PORT | xargs kill -9 2>/dev/null
1092
+ ```
1093
+ assert:
1094
+ - type: llm-rubric
1095
+ value: |
1096
+ The response should recommend a cleanup script.
1097
+ EXCELLENT: Yes, create scripts/cleanup.sh with DEV_PORT, TEST_PORT (dev+1000), and PROJECT_DIR variables
1098
+ ACCEPTABLE: Suggests creating script
1099
+ POOR: No script guidance
1100
+
1101
+ - description: 'zombie-003: Unique port assignment'
1102
+ vars:
1103
+ input: "I'm working on multiple projects. How do I avoid port conflicts?"
1104
+ context: |
1105
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1106
+ ---
1107
+ ## Best Practices
1108
+
1109
+ 1. **Assign unique ports** - Set `PORT=3000` in one project, `PORT=3001` in another
1110
+ assert:
1111
+ - type: llm-rubric
1112
+ value: |
1113
+ The response should recommend unique ports.
1114
+ EXCELLENT: Assign unique PORT per project (3000, 3001), document in README/env
1115
+ ACCEPTABLE: Suggests unique ports
1116
+ POOR: No port guidance
1117
+
1118
+ - description: 'zombie-004: tmux isolation'
1119
+ vars:
1120
+ input: 'Is there a way to isolate terminal sessions per project?'
1121
+ context: |
1122
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1123
+ ---
1124
+ ## Alternative: tmux/Screen Sessions
1125
+
1126
+ ```bash
1127
+ tmux new -s project-name
1128
+ tmux kill-session -t project-name
1129
+ ```
1130
+
1131
+ **Pros:** Complete isolation, one command kills everything
1132
+ **Cons:** Requires learning tmux
1133
+ assert:
1134
+ - type: llm-rubric
1135
+ value: |
1136
+ The response should suggest tmux/screen.
1137
+ EXCELLENT: Named tmux session per project, one command kills session, notes learning curve
1138
+ ACCEPTABLE: Suggests terminal isolation
1139
+ POOR: No isolation guidance
1140
+
1141
+ - description: 'zombie-005: Debugging zombie processes'
1142
+ vars:
1143
+ input: 'How do I find which processes are stuck?'
1144
+ context: |
1145
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1146
+ ---
1147
+ ## Debugging Zombie Processes
1148
+
1149
+ ### Find What's Using a Port
1150
+ ```bash
1151
+ lsof -i:3000
1152
+ ```
1153
+
1154
+ ### Find Processes by Project Directory
1155
+ ```bash
1156
+ ps aux | grep "/Users/alex/projects/my-project"
1157
+ ```
1158
+ assert:
1159
+ - type: llm-rubric
1160
+ value: |
1161
+ The response should provide debugging commands.
1162
+ EXCELLENT: Find by port, by process type, by project dir with $(pwd) pattern
1163
+ ACCEPTABLE: Provides find commands
1164
+ POOR: Generic advice
1165
+
1166
+ - description: 'zombie-006: Best practices'
1167
+ vars:
1168
+ input: 'What are the best practices for avoiding cross-project process kills?'
1169
+ context: |
1170
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1171
+ ---
1172
+ ## Best Practices
1173
+
1174
+ 1. **Assign unique ports** - Set `PORT=3000` in one project, `PORT=3001` in another
1175
+ 2. **Use port-based cleanup first** - Simplest and safest
1176
+ 3. **Create project cleanup scripts** - Reusable, documented
1177
+ 4. **Never `killall node`** - Too broad when working on multiple projects
1178
+ 5. **Clean up before starting** - Run cleanup script before `npm run dev`
1179
+ assert:
1180
+ - type: llm-rubric
1181
+ value: |
1182
+ The response should provide best practices.
1183
+ EXCELLENT: Unique ports, port-based cleanup first, cleanup scripts, clean before start
1184
+ ACCEPTABLE: Lists some practices
1185
+ POOR: No best practices
1186
+
1187
+ - description: 'zombie-007: Quick reference'
1188
+ vars:
1189
+ input: 'Give me a quick reference for safe cleanup commands.'
1190
+ context: |
1191
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1192
+ ---
1193
+ ## Quick Reference
1194
+
1195
+ | Situation | Command |
1196
+ |-----------|---------|
1197
+ | Kill dev + test servers | `lsof -ti:$DEV_PORT -ti:$TEST_PORT \| xargs kill -9` |
1198
+ | Kill Playwright (this project) | `pkill -f "playwright.*$(pwd)"` |
1199
+ | Kill all for this project | `./scripts/cleanup.sh` |
1200
+
1201
+ ## What NOT to Do
1202
+
1203
+ ❌ `killall node` (kills all projects)
1204
+ ❌ `pkill -9 node` (kills all projects)
1205
+ assert:
1206
+ - type: llm-rubric
1207
+ value: |
1208
+ The response should provide a quick reference.
1209
+ EXCELLENT: Kill by both dev+test ports, kill playwright for project, warn against global kills
1210
+ ACCEPTABLE: Provides commands
1211
+ POOR: Suggests dangerous global kills
1212
+
1213
+ # =============================================================================
1214
+ # USER STORY GUIDE (10 tests)
1215
+ # =============================================================================
1216
+
1217
+ - description: 'story-001: Use standard template'
1218
+ vars:
1219
+ input: 'I need to create user stories for a new feature. Where do I start?'
1220
+ context: |
1221
+ file_path: framework/guides/user-story-guide.md (excerpt)
1222
+ ---
1223
+ ## Template Location
1224
+
1225
+ Use `user-stories-template.md` from `.safeword/templates/`
1226
+
1227
+ ## Workflow
1228
+ 1. Fill in feature name
1229
+ 2. Create numbered stories
1230
+ 3. Add acceptance criteria (1-5 per story)
1231
+ 4. Include out-of-scope section
1232
+ assert:
1233
+ - type: llm-rubric
1234
+ value: |
1235
+ The response should point to the template and workflow.
1236
+ EXCELLENT: Points to template, lists workflow steps
1237
+ ACCEPTABLE: Points to template
1238
+ POOR: No template reference
1239
+
1240
+ - description: 'story-002: Include tracking metadata'
1241
+ vars:
1242
+ input: 'What metadata should I include in my user stories?'
1243
+ context: |
1244
+ file_path: framework/guides/user-story-guide.md (excerpt)
1245
+ ---
1246
+ ## Tracking Metadata
1247
+
1248
+ - Status (✅/❌)
1249
+ - Test file references
1250
+ - Completion percentage
1251
+ - Phase tracking
1252
+ - Next steps
1253
+ assert:
1254
+ - type: llm-rubric
1255
+ value: |
1256
+ The response should list required metadata.
1257
+ EXCELLENT: Status, test file refs, completion %, phase tracking, next steps
1258
+ ACCEPTABLE: Lists most metadata
1259
+ POOR: No metadata guidance
1260
+
1261
+ - description: 'story-003: INVEST validation'
1262
+ vars:
1263
+ input: "Is this a good user story? 'As a user, I want the system to be fast'"
1264
+ context: |
1265
+ file_path: framework/guides/user-story-guide.md (excerpt)
1266
+ ---
1267
+ ## INVEST Validation
1268
+
1269
+ Every story must pass INVEST:
1270
+
1271
+ | Criterion | Question | Red Flag |
1272
+ |-----------|----------|----------|
1273
+ | Independent | Can it be built alone? | "After X is done..." |
1274
+ | Negotiable | Is scope flexible? | Rigid technical specs |
1275
+ | Valuable | Does user care? | Pure refactoring |
1276
+ | Estimable | Can we size it? | "Make it fast" |
1277
+ | Small | 1-3 days work? | Epic-sized |
1278
+ | Testable | Can we verify done? | "Improve UX" |
1279
+
1280
+ **Red flag phrases:** "fast", "better", "improved", "enhanced" without metrics
1281
+ assert:
1282
+ - type: llm-rubric
1283
+ value: |
1284
+ The response should identify the story as failing INVEST criteria.
1285
+ EXCELLENT: Identifies failures (not Estimable - "fast" is vague, not Testable - no metric), suggests improvement
1286
+ ACCEPTABLE: Says story needs work, mentions vagueness
1287
+ POOR: Says the story is fine
1288
+
1289
+ - description: 'story-004: Good acceptance criteria'
1290
+ vars:
1291
+ input: "My acceptance criterion says 'Campaign switching works'. Is this good?"
1292
+ context: |
1293
+ file_path: framework/guides/user-story-guide.md (excerpt)
1294
+ ---
1295
+ ## Acceptance Criteria
1296
+
1297
+ **BAD:** "Campaign switching works" (too vague)
1298
+ **GOOD:** "Response time <200ms when switching campaigns"
1299
+ assert:
1300
+ - type: llm-rubric
1301
+ value: |
1302
+ The response should identify vague AC.
1303
+ EXCELLENT: Identifies as BAD (too vague), suggests specific measurable AC
1304
+ ACCEPTABLE: Notes it's too vague
1305
+ POOR: Accepts vague AC
1306
+
1307
+ - description: 'story-005: Size guidelines - split large story'
1308
+ vars:
1309
+ input: 'I have a user story with 8 acceptance criteria and touches 3 different user personas. Is this okay?'
1310
+ context: |
1311
+ file_path: framework/guides/user-story-guide.md (excerpt)
1312
+ ---
1313
+ ## Size Guidelines
1314
+
1315
+ | Indicator | Small (Good) | Medium (Consider Split) | Large (Must Split) |
1316
+ |-----------|--------------|------------------------|-------------------|
1317
+ | Acceptance Criteria | 3-5 | 6-8 | 9+ |
1318
+ | Personas Affected | 1 | 2 | 3+ |
1319
+ | Estimated Days | 1-3 | 4-5 | 6+ |
1320
+
1321
+ **Decision rule:** When borderline, err on the side of splitting.
1322
+ assert:
1323
+ - type: llm-rubric
1324
+ value: |
1325
+ The response should recommend splitting the story.
1326
+ EXCELLENT: Recommends splitting, cites both indicators (8 AC = Medium/Large, 3 personas = Large), suggests how to split
1327
+ ACCEPTABLE: Recommends splitting or expresses concern about size
1328
+ POOR: Says the story size is fine
1329
+
1330
+ - description: 'story-006: Good story example'
1331
+ vars:
1332
+ input: 'Can you show me what a good user story looks like?'
1333
+ context: |
1334
+ file_path: framework/guides/user-story-guide.md (excerpt)
1335
+ ---
1336
+ ## Good Story Example
1337
+
1338
+ **As a** campaign manager
1339
+ **I want** to switch between campaigns with keyboard shortcuts
1340
+ **So that** I can work faster without using the mouse
1341
+
1342
+ **Acceptance Criteria:**
1343
+ - [ ] Cmd+1/2/3 switches to campaign 1/2/3
1344
+ - [ ] Response time <200ms
1345
+ - [ ] Visual feedback on switch
1346
+
1347
+ **Out of Scope:**
1348
+ - Customizable shortcuts (future)
1349
+ assert:
1350
+ - type: llm-rubric
1351
+ value: |
1352
+        The response should provide a concrete example.
1353
+ EXCELLENT: Shows complete example with As a/I want/So that, 1-5 specific AC, out-of-scope
1354
+ ACCEPTABLE: Shows basic structure
1355
+ POOR: Vague or incomplete example
1356
+
1357
+ - description: 'story-007: Conversation not contract'
1358
+ vars:
1359
+ input: 'Should I include all implementation details in my user story?'
1360
+ context: |
1361
+ file_path: framework/guides/user-story-guide.md (excerpt)
1362
+ ---
1363
+ ## Conversation, Not Contract
1364
+
1365
+ Stories are conversation starters, not rigid specs.
1366
+ - Avoid implementation details
1367
+ - Link to mockups/designs instead
1368
+ - Keep focus on user value
1369
+ assert:
1370
+ - type: llm-rubric
1371
+ value: |
1372
+ The response should advise against implementation details.
1373
+ EXCELLENT: No - stories are conversation starters, avoid implementation details, link to mockups
1374
+ ACCEPTABLE: Advises against implementation details
1375
+ POOR: Suggests including implementation details
1376
+
1377
+ - description: 'story-008: LLM-optimized wording'
1378
+ vars:
1379
+ input: 'How do I write user stories that AI agents can follow?'
1380
+ context: |
1381
+ file_path: framework/guides/user-story-guide.md (excerpt)
1382
+ ---
1383
+ ## LLM-Optimized Wording
1384
+
1385
+ - Specific concrete language
1386
+ - Numbers over vague words
1387
+ - Explicit definitions
1388
+ - Examples over abstract rules
1389
+ assert:
1390
+ - type: llm-rubric
1391
+ value: |
1392
+ The response should provide LLM optimization guidance.
1393
+ EXCELLENT: Specific concrete language, numbers, explicit definitions, examples over rules
1394
+ ACCEPTABLE: Mentions clarity principles
1395
+ POOR: No LLM-specific guidance
1396
+
1397
+ - description: 'story-009: Token efficiency'
1398
+ vars:
1399
+ input: 'How long should my user story template be?'
1400
+ context: |
1401
+ file_path: framework/guides/user-story-guide.md (excerpt)
1402
+ ---
1403
+ ## Token Efficiency
1404
+
1405
+ Keep stories lean (~9 lines) to minimize prompting cost.
1406
+ assert:
1407
+ - type: llm-rubric
1408
+ value: |
1409
+ The response should provide size guidance.
1410
+ EXCELLENT: Keep lean (~9 lines), minimize overhead for prompting cost
1411
+ ACCEPTABLE: Suggests keeping it concise
1412
+ POOR: No size guidance
1413
+
1414
+ - description: 'story-010: Technical task vs user story'
1415
+ vars:
1416
+ input: "I want to write a user story: 'As a developer, I want to refactor the database layer'"
1417
+ context: |
1418
+ file_path: framework/guides/user-story-guide.md (excerpt)
1419
+ ---
1420
+ ## Technical Tasks vs User Stories
1421
+
1422
+ User stories must deliver user value.
1423
+
1424
+ **NOT a user story:**
1425
+ - "As a developer, I want to refactor..."
1426
+ - "As a developer, I want to upgrade..."
1427
+
1428
+ **Instead:** Create a spike or technical task.
1429
+ assert:
1430
+ - type: llm-rubric
1431
+ value: |
1432
+        The response should identify this as a technical task.
1433
+ EXCELLENT: This is a technical task/spike, not a user story - no user value
1434
+ ACCEPTABLE: Notes it lacks user value
1435
+ POOR: Accepts technical task as user story
1436
+
1437
+ - description: 'story-011: Technical constraints in user stories'
1438
+ vars:
1439
+ input: 'Create user stories for a new payment feature'
1440
+ context: |
1441
+ file_path: framework/templates/user-stories-template.md (excerpt)
1442
+ ---
1443
+ ## Technical Constraints
1444
+
1445
+ _Non-functional requirements that inform test definitions. Delete sections that don't apply._
1446
+
1447
+ ### Performance
1448
+ - [ ] [e.g., Response time < 200ms at P95]
1449
+
1450
+ ### Security
1451
+ - [ ] [e.g., All inputs validated/sanitized]
1452
+
1453
+ ### Compatibility
1454
+ - [ ] [e.g., Chrome 100+, Safari 16+]
1455
+
1456
+ ### Data
1457
+ - [ ] [e.g., GDPR: user data deletable within 72h]
1458
+
1459
+ ### Dependencies
1460
+ - [ ] [e.g., Must use existing AuthService]
1461
+
1462
+ ### Infrastructure
1463
+ - [ ] [e.g., Memory usage < 512MB]
1464
+ assert:
1465
+ - type: llm-rubric
1466
+ value: |
1467
+        The response should include a Technical Constraints section.
1468
+ EXCELLENT: Includes Technical Constraints with specific, testable constraints in relevant categories (Performance, Security, etc.), deletes unused categories
1469
+ ACCEPTABLE: Includes Technical Constraints section with some constraints
1470
+ POOR: Missing Technical Constraints section or only vague constraints
1471
+
1472
+ - description: 'story-012: Constraint guidance - good vs bad'
1473
+ vars:
1474
+ input: "I'm adding a constraint 'Should be fast'. Is this good?"
1475
+ context: |
1476
+ file_path: framework/guides/user-story-guide.md (excerpt)
1477
+ ---
1478
+ ### ✅ GOOD Constraints (Specific, Testable)
1479
+
1480
+ - [ ] API response < 200ms at P95 under 100 concurrent users
1481
+ - [ ] Initial page load < 3s on simulated 3G
1482
+
1483
+ ### ❌ BAD Constraints (Vague, Untestable)
1484
+
1485
+ - [ ] Should be fast ← How fast? Under what conditions?
1486
+ - [ ] Good performance ← Not measurable
1487
+ assert:
1488
+ - type: llm-rubric
1489
+ value: |
1490
+ The response should identify this as a BAD constraint.
1491
+ EXCELLENT: Identifies as BAD (vague, untestable), suggests specific alternative like "< 200ms at P95"
1492
+ ACCEPTABLE: Notes it's too vague, suggests adding metrics
1493
+ POOR: Accepts "should be fast" as valid constraint
1494
+
1495
+ - description: 'story-013: Workflow prompts for missing constraints'
1496
+ vars:
1497
+ input: "I have user stories but they're missing Technical Constraints. What should I do?"
1498
+ context: |
1499
+ file_path: framework/SAFEWORD.md (excerpt)
1500
+ ---
1501
+ **Edge cases:**
1502
+
1503
+ - User stories exist but test definitions don't → Create test definitions before implementation
1504
+ - User stories missing Technical Constraints → Add constraints before test definitions
1505
+ - Test definitions exist but user stories don't → Ask if user stories needed
1506
+ assert:
1507
+ - type: llm-rubric
1508
+ value: |
1509
+ The response should follow the edge case guidance.
1510
+ EXCELLENT: Add constraints BEFORE creating test definitions, references the workflow order
1511
+ ACCEPTABLE: Suggests adding constraints
1512
+ POOR: Skips constraints and proceeds to test definitions
1513
+
1514
+ # =============================================================================
1515
+ # LLM INSTRUCTION DESIGN (15 tests)
1516
+ # =============================================================================
1517
+
1518
+ - description: 'llm-001: MECE decision trees'
1519
+ vars:
1520
+ input: |
1521
+ I'm writing a decision tree for choosing between unit, integration, and E2E tests. Here's my draft:
1522
+ - Is it a pure function?
1523
+ - Does it interact with multiple components?
1524
+ - Does it test the full user flow?
1525
+ context: |
1526
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1527
+ ---
1528
+ ## Principle 1: MECE Decision Trees
1529
+
1530
+ Branches must be Mutually Exclusive and Collectively Exhaustive.
1531
+ - No overlapping conditions
1532
+ - Use sequential ordering with first-match stop
1533
+ assert:
1534
+ - type: llm-rubric
1535
+ value: |
1536
+ The response should identify overlapping branches and suggest sequential MECE structure.
1537
+ EXCELLENT: Identifies overlap ("multiple components" and "full user flow" can both apply), suggests sequential ordering with first-match stop
1538
+ ACCEPTABLE: Notes ambiguity, suggests improvement
1539
+ POOR: Accepts overlapping branches without comment
1540
+
1541
+ - description: 'llm-002: Explicit definitions'
1542
+ vars:
1543
+ input: "I'm writing documentation that says 'Test critical paths at the lowest level possible'"
1544
+ context: |
1545
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1546
+ ---
1547
+ ## Principle 2: Explicit Definitions
1548
+
1549
+ Define all terms that could be interpreted differently.
1550
+
1551
+ **Vague:** "critical paths"
1552
+ **Explicit:** "user-facing, revenue-impacting, or data-integrity paths"
1553
+ assert:
1554
+ - type: llm-rubric
1555
+ value: |
1556
+ The response should identify vague terms and suggest explicit definitions.
1557
+ EXCELLENT: Identifies both "critical paths" and "lowest level" as vague, suggests explicit definitions with examples
1558
+ ACCEPTABLE: Identifies at least one vague term
1559
+ POOR: Accepts vague phrasing without comment
1560
+
1561
+ - description: 'llm-003: No contradictions'
1562
+ vars:
1563
+ input: "I'm updating our testing guide. Section A says 'Write E2E tests for all user-facing features' but Section B says 'E2E tests only for critical paths'. Should I keep both?"
1564
+ context: |
1565
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1566
+ ---
1567
+ ## Principle 3: No Contradictions
1568
+
1569
+ Conflicting rules cause LLMs to pick randomly or ask unnecessary questions.
1570
+
1571
+ **Fix:** Reconcile into single rule with explicit definition.
1572
+ assert:
1573
+ - type: llm-rubric
1574
+ value: |
1575
+ The response should identify contradiction and suggest reconciliation.
1576
+ EXCELLENT: Identifies contradiction, suggests reconciling into single rule with explicit definition of "critical"
1577
+ ACCEPTABLE: Identifies contradiction, suggests removing one
1578
+ POOR: Accepts both statements without noting conflict
1579
+
1580
+ - description: 'llm-004: Concrete examples'
1581
+ vars:
1582
+ input: "I'm writing a rule that says 'Use meaningful variable names'. Is this good enough?"
1583
+ context: |
1584
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1585
+ ---
1586
+ ## Principle 4: Concrete Examples
1587
+
1588
+ Abstract rules need BAD/GOOD examples.
1589
+
1590
+ **Rule:** "Use meaningful variable names"
1591
+ **Example:** `x` → BAD, `userCount` → GOOD
1592
+ assert:
1593
+ - type: llm-rubric
1594
+ value: |
1595
+ The response should suggest adding BAD/GOOD examples.
1596
+ EXCELLENT: Suggests adding 2-3 concrete BAD/GOOD examples (e.g., `x` vs `userCount`)
1597
+ ACCEPTABLE: Suggests adding at least one example
1598
+ POOR: Accepts abstract rule without examples
1599
+
1600
+ - description: 'llm-005: Edge cases explicit'
1601
+ vars:
1602
+ input: "I'm writing a rule: 'Unit test all pure functions'. Is this complete?"
1603
+ context: |
1604
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1605
+ ---
1606
+ ## Principle 5: Edge Cases Explicit
1607
+
1608
+ Document exceptions and boundary conditions.
1609
+
1610
+ **Rule:** "Unit test all pure functions"
1611
+ **Edge cases:** Date.now(), process.env, mixed pure+I/O
1612
+ assert:
1613
+ - type: llm-rubric
1614
+ value: |
1615
+ The response should suggest adding edge cases section.
1616
+ EXCELLENT: Suggests adding edge cases (Date.now(), process.env, mixed pure+I/O)
1617
+ ACCEPTABLE: Suggests adding at least one edge case
1618
+ POOR: Accepts rule without edge cases
1619
+
1620
+ - description: 'llm-006: Actionable not vague'
1621
+ vars:
1622
+ input: "I'm writing guidance: 'Most of your tests should be fast, some can be slow'. Is this clear enough?"
1623
+ context: |
1624
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1625
+ ---
1626
+ ## Principle 6: Actionable, Not Vague
1627
+
1628
+ **Vague:** "Most of your tests should be fast"
1629
+ **Actionable:** "Unit tests: <100ms. Integration: <5s. E2E: <30s."
1630
+ assert:
1631
+ - type: llm-rubric
1632
+ value: |
1633
+ The response should identify vague terms and suggest actionable alternatives.
1634
+ EXCELLENT: Identifies "most/some" as vague, suggests concrete rules with numbers
1635
+ ACCEPTABLE: Identifies vagueness, suggests improvement
1636
+ POOR: Accepts vague guidance without comment
1637
+
1638
+ - description: 'llm-007: Sequential decision trees'
1639
+ vars:
1640
+ input: |
1641
+ I have a decision tree with three parallel branches:
1642
+ - Is it a pure function?
1643
+ - Does it interact with the database?
1644
+ - Does it render UI?
1645
+ context: |
1646
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1647
+ ---
1648
+ ## Principle 7: Sequential Decision Trees
1649
+
1650
+ Use numbered steps with explicit "stop at first match" instruction.
1651
+ assert:
1652
+ - type: llm-rubric
1653
+ value: |
1654
+ The response should suggest converting to sequential with first-match stop.
1655
+ EXCELLENT: Suggests sequential ordering with explicit "stop at first match" instruction
1656
+ ACCEPTABLE: Suggests ordering the questions
1657
+ POOR: Accepts parallel structure without comment
1658
+
1659
+ - description: 'llm-008: Tie-breaking rules'
1660
+ vars:
1661
+ input: 'I have a decision tree but sometimes both options seem valid. What should I add?'
1662
+ context: |
1663
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1664
+ ---
1665
+ ## Principle 8: Tie-Breaking Rules
1666
+
1667
+ **Every decision point needs a default.**
1668
+
1669
+ Without tie-breakers, LLMs may:
1670
+ - Pick randomly
1671
+ - Ask unnecessary clarifying questions
1672
+ - Get stuck in analysis paralysis
1673
+
1674
+ **Pattern:** "When X and Y both apply, prefer X because [reason]"
1675
+ assert:
1676
+ - type: llm-rubric
1677
+ value: |
1678
+ The response should recommend adding a tie-breaking rule.
1679
+ EXCELLENT: Recommends tie-breaking rule with pattern and example, explains why LLMs need explicit defaults
1680
+ ACCEPTABLE: Suggests adding a default or priority
1681
+ POOR: Doesn't mention tie-breaking or suggests asking user every time
1682
+
1683
+ - description: 'llm-009: Lookup tables for complex logic'
1684
+ vars:
1685
+ input: 'I have 5 different scenarios for choosing between unit, integration, and E2E tests. Should I write them as prose paragraphs?'
1686
+ context: |
1687
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1688
+ ---
1689
+ ## Principle 9: Lookup Tables for Complex Logic
1690
+
1691
+ When you have 3+ scenarios, use a table instead of prose.
1692
+ assert:
1693
+ - type: llm-rubric
1694
+ value: |
1695
+ The response should suggest using a lookup table.
1696
+ EXCELLENT: Suggests lookup table format with clear columns (Scenario/Unit/Integration/E2E/Best Choice)
1697
+ ACCEPTABLE: Suggests table format
1698
+ POOR: Accepts prose paragraphs for 5 scenarios
1699
+
1700
+ - description: 'llm-010: No caveats in tables'
1701
+ vars:
1702
+ input: "I have a table cell that says 'Unit test ✅ (unless it uses external APIs)'. Is this okay?"
1703
+ context: |
1704
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1705
+ ---
1706
+ ## Principle 10: No Caveats in Tables
1707
+
1708
+ Parenthetical conditions in cells cause parsing errors.
1709
+
1710
+ **Fix:** Create separate row for the exception case.
1711
+ assert:
1712
+ - type: llm-rubric
1713
+ value: |
1714
+ The response should suggest removing caveat from cell.
1715
+ EXCELLENT: Suggests creating separate row for external API case, removing parenthetical
1716
+ ACCEPTABLE: Identifies parenthetical as problem
1717
+ POOR: Accepts caveat in cell
1718
+
1719
+ - description: 'llm-011: Percentages with context'
1720
+ vars:
1721
+ input: "I'm writing guidance: 'Aim for 80% unit tests, 15% integration tests, 5% E2E tests'. Is this clear?"
1722
+ context: |
1723
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1724
+ ---
1725
+ ## Principle 11: Percentages with Context
1726
+
1727
+ Raw percentages without context are misleading.
1728
+
1729
+ **Better:** Add adjustments for different project types OR use principles-based alternative.
1730
+ assert:
1731
+ - type: llm-rubric
1732
+ value: |
1733
+ The response should suggest adding context or principles-based alternative.
1734
+ EXCELLENT: Suggests adding adjustments for different project types OR suggests principles-based alternative
1735
+ ACCEPTABLE: Notes percentages need context
1736
+ POOR: Accepts standalone percentages without comment
1737
+
1738
+ - description: 'llm-012: Specific questions'
1739
+ vars:
1740
+ input: "I'm writing a decision tree question: 'Does this test need to see the UI?' Is this specific enough?"
1741
+ context: |
1742
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1743
+ ---
1744
+ ## Principle 12: Specific Questions
1745
+
1746
+ **Vague:** "Does this test need to see the UI?"
1747
+ **Specific:** "Does this test require a real browser (Playwright/Cypress)?"
1748
+ assert:
1749
+ - type: llm-rubric
1750
+ value: |
1751
+ The response should suggest more specific wording.
1752
+ EXCELLENT: Suggests tool-specific wording like "real browser (Playwright/Cypress)" and clarifies RTL distinction
1753
+ ACCEPTABLE: Suggests more specific wording
1754
+ POOR: Accepts vague "see the UI" phrasing
1755
+
1756
+ - description: 'llm-013: Re-evaluation paths'
1757
+ vars:
1758
+ input: "I have a feature that doesn't fit any of my testing categories. What should I do?"
1759
+ context: |
1760
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1761
+ ---
1762
+ ## Principle 13: Re-evaluation Paths
1763
+
1764
+ When nothing fits, provide decomposition strategy:
1765
+ 1. Separate concerns
1766
+ 2. Test each concern with appropriate type
1767
+ 3. Show example
1768
+ assert:
1769
+ - type: llm-rubric
1770
+ value: |
1771
+        The response should provide a decomposition strategy.
1772
+ EXCELLENT: Provides 3-step decomposition (separate concerns → test each → example)
1773
+ ACCEPTABLE: Suggests breaking down the feature
1774
+ POOR: Says "re-evaluate your approach" without concrete steps
1775
+
1776
+ - description: 'llm-014: Anti-patterns guard'
1777
+ vars:
1778
+ input: "I'm writing documentation that says 'Follow the test pyramid - lots of unit tests at the base, integration in the middle, E2E at the top'"
1779
+ context: |
1780
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1781
+ ---
1782
+ ## Anti-Patterns
1783
+
1784
+ **Visual metaphors:** LLMs can't see pyramids. Convert to actionable rules.
1785
+ assert:
1786
+ - type: llm-rubric
1787
+ value: |
1788
+ The response should identify visual metaphor anti-pattern.
1789
+ EXCELLENT: Identifies "test pyramid" as visual metaphor, suggests actionable alternative
1790
+ ACCEPTABLE: Notes visual metaphor issue
1791
+ POOR: Accepts visual metaphor without comment
1792
+
1793
+ - description: 'llm-015: Quality checklist'
1794
+ vars:
1795
+ input: 'I just finished writing an LLM instruction document. What should I check before committing?'
1796
+ context: |
1797
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1798
+ ---
1799
+ ## Quality Checklist
1800
+
1801
+ - [ ] MECE decision trees
1802
+ - [ ] All terms defined
1803
+ - [ ] No contradictions
1804
+ - [ ] Concrete examples
1805
+ - [ ] Edge cases documented
1806
+ - [ ] Actionable language
1807
+ - [ ] Tie-breaking rules
1808
+ - [ ] Lookup tables for 3+ scenarios
1809
+ assert:
1810
+ - type: llm-rubric
1811
+ value: |
1812
+ The response should provide quality checklist items.
1813
+ EXCELLENT: Lists most/all checklist items (MECE, definitions, examples, edge cases, etc.)
1814
+ ACCEPTABLE: Lists several key checklist items
1815
+ POOR: Generic advice without specific checklist
1816
+
1817
+ # =============================================================================
1818
+ # TDD BEST PRACTICES (16 tests)
1819
+ # =============================================================================
1820
+
1821
+ - description: 'tdd-001: Select correct template for feature'
1822
+ vars:
1823
+ input: 'I need to document a new payment flow feature. Which template should I use?'
1824
+ context: |
1825
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1826
+ ---
1827
+ ## Template Selection
1828
+
1829
+ | Need | Template | Location |
1830
+ |------|----------|----------|
1831
+ | Feature/issue user stories | `user-stories-template.md` | `.safeword/planning/user-stories/` |
1832
+ | Feature test suites | `test-definitions-feature.md` | `.safeword/planning/test-definitions/` |
1833
+ | Feature implementation design | `design-doc-template.md` | `.safeword/planning/design/` |
1834
+ | Project-wide architecture | No template | `ARCHITECTURE.md` at root |
1835
+
1836
+ **Decision rule:** If unclear, ask: "Does this affect the whole project or just one feature?" Project-wide → architecture doc. Single feature → design doc.
1837
+ assert:
1838
+ - type: llm-rubric
1839
+ value: |
1840
+ The response should recommend user-stories-template.md and/or design-doc-template.md for a feature.
1841
+ EXCELLENT: Recommends starting with user stories, then design doc, explains workflow
1842
+ ACCEPTABLE: Recommends appropriate template(s) for feature documentation
1843
+ POOR: Recommends architecture doc for a single feature
1844
+
1845
+ - description: 'tdd-002: Story format selection'
1846
+ vars:
1847
+ input: "I'm writing a user story for a login feature. Should I use 'As a user...' or 'Given I am...'?"
1848
+ context: |
1849
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1850
+ ---
1851
+ ## Story Format Selection
1852
+
1853
+ | Format | Best For |
1854
+ |--------|----------|
1855
+ | Standard (As a/I want/So that) | User-facing features, UI flows |
1856
+ | Given-When-Then | API behavior, state transitions, edge cases |
1857
+ | Job Story | Problem-solving, user motivation unclear |
1858
+
1859
+ **Decision rule:** Default to Standard. Use Given-When-Then for APIs or complex state.
1860
+ assert:
1861
+ - type: llm-rubric
1862
+ value: |
1863
+ The response should recommend appropriate format based on context.
1864
+ EXCELLENT: Recommends standard "As a..." for features, Given-When-Then for behavior-focused
1865
+ ACCEPTABLE: Explains both formats
1866
+ POOR: No guidance on format selection
1867
+
1868
+ - description: 'tdd-003: Acceptance criteria count'
1869
+ vars:
1870
+ input: 'My user story has 8 acceptance criteria and no out-of-scope section. Is this okay?'
1871
+ context: |
1872
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1873
+ ---
1874
+ ## Story Scope
1875
+
1876
+ - Target 2-5 acceptance criteria per story
1877
+ - Include out-of-scope section to prevent creep
1878
+ assert:
1879
+ - type: llm-rubric
1880
+ value: |
1881
+ The response should suggest reducing AC and adding out-of-scope.
1882
+ EXCELLENT: Suggests 2-5 AC, recommends adding out-of-scope to prevent creep
1883
+ ACCEPTABLE: Notes AC count is high
1884
+ POOR: Accepts 8 AC without comment
1885
+
1886
+ - description: 'tdd-005: Test definition sections'
1887
+ vars:
1888
+ input: "I'm creating test definitions. What sections should I include?"
1889
+ context: |
1890
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1891
+ ---
1892
+ ## Test Definition Sections
1893
+
1894
+ Required sections:
1895
+ - Suites (grouped by concern)
1896
+ - Individual tests (numbered)
1897
+ - Status per test
1898
+ - Coverage summary
1899
+ - Execution commands
1900
+ assert:
1901
+ - type: llm-rubric
1902
+ value: |
1903
+ The response should list required sections.
1904
+ EXCELLENT: Suites, individual tests, status per test, coverage summary, execution commands
1905
+ ACCEPTABLE: Lists most sections
1906
+ POOR: Vague or incomplete list
1907
+
1908
+ - description: 'tdd-007: Bad story example'
1909
+ vars:
1910
+ input: "Is this a good story? 'As a user, I want the app to work better so that I'm happy'"
1911
+ context: |
1912
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1913
+ ---
1914
+ ## Bad Story Examples
1915
+
1916
+ **BAD:** "As a user, I want the app to work better"
1917
+ - Vague role
1918
+ - Unmeasurable "work better"
1919
+ - No acceptance criteria
1920
+ assert:
1921
+ - type: llm-rubric
1922
+ value: |
1923
+ The response should identify anti-patterns.
1924
+ EXCELLENT: Identifies all issues (vague role, unmeasurable "work better", no AC)
1925
+ ACCEPTABLE: Identifies at least 2 issues
1926
+ POOR: Accepts vague story
1927
+
1928
+ - description: 'tdd-008: INVEST criteria'
1929
+ vars:
1930
+ input: 'How do I know if my user story is good enough?'
1931
+ context: |
1932
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1933
+ ---
1934
+ ## INVEST Criteria
1935
+
1936
+ - **I**ndependent - Can be built alone
1937
+ - **N**egotiable - Scope is flexible
1938
+ - **V**aluable - User cares
1939
+ - **E**stimable - Can size it
1940
+ - **S**mall - 1-3 days
1941
+ - **T**estable - Can verify done
1942
+ assert:
1943
+ - type: llm-rubric
1944
+ value: |
1945
+ The response should explain INVEST criteria.
1946
+ EXCELLENT: Explains Independent, Negotiable, Valuable, Estimable, Small, Testable
1947
+ ACCEPTABLE: Mentions several INVEST criteria
1948
+ POOR: No structured validation criteria
1949
+
1950
+ - description: 'tdd-009: Test definition format'
1951
+ vars:
1952
+ input: 'How should I format individual tests in my test definitions?'
1953
+ context: |
1954
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1955
+ ---
1956
+ ## Test Format
1957
+
1958
+ Each test should have:
1959
+ 1. Numbered ID
1960
+ 2. Description
1961
+ 3. Status indicator
1962
+ 4. Steps (numbered)
1963
+ 5. Expected outcome
1964
+ assert:
1965
+ - type: llm-rubric
1966
+ value: |
1967
+        The response should show the test format.
1968
+ EXCELLENT: Shows numbered format with description, status, steps, expected outcome
1969
+ ACCEPTABLE: Shows basic format
1970
+ POOR: Vague or no format guidance
1971
+
1972
+ - description: 'tdd-012: Test data builders'
1973
+ vars:
1974
+ input: "I'm writing tests that need complex test data. How should I structure this?"
1975
+ context: |
1976
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1977
+ ---
1978
+ ## Test Data Builders
1979
+
1980
+ Use builder pattern with sensible defaults:
1981
+ ```typescript
1982
+ const user = createTestUser({ name: 'Alice' });
1983
+ ```
1984
+ assert:
1985
+ - type: llm-rubric
1986
+ value: |
1987
+ The response should recommend test data builders.
1988
+ EXCELLENT: Recommends builder pattern with defaults, explains benefits
1989
+ ACCEPTABLE: Suggests organizing test data
1990
+ POOR: No guidance on test data
1991
+
1992
+ - description: 'tdd-014: Real LLM integration'
1993
+ vars:
1994
+ input: 'Should my integration tests use a real LLM or mock it?'
1995
+ context: |
1996
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1997
+ ---
1998
+ ## Real vs Mock LLM
1999
+
2000
+ - Real LLM: Schema compliance, integration behavior
2001
+ - Mock: Unit tests, cost control
2002
+ - Consider: API costs, test speed, flakiness
2003
+ assert:
2004
+ - type: llm-rubric
2005
+ value: |
2006
+ The response should provide guidance on real vs mock.
2007
+ EXCELLENT: Real LLM for schema compliance, mock for unit tests, cost considerations
2008
+ ACCEPTABLE: Distinguishes use cases
2009
+ POOR: No guidance on when to use real vs mock
2010
+
2011
+ - description: 'tdd-015: INVEST gate - story too big'
2012
+ vars:
2013
+ input: 'My story is too big to estimate. What should I do?'
2014
+ context: |
2015
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
2016
+ ---
2017
+ ## INVEST Gate
2018
+
2019
+ If story fails INVEST (e.g., not Estimable, not Small):
2020
+ → Split into smaller stories
2021
+ assert:
2022
+ - type: llm-rubric
2023
+ value: |
2024
+ The response should suggest splitting.
2025
+ EXCELLENT: Cites INVEST (Estimable, Small), suggests splitting into smaller stories
2026
+ ACCEPTABLE: Suggests splitting
2027
+ POOR: Accepts large story
2028
+
2029
+ # =============================================================================
2030
+ # DESIGN DOC GUIDE (10 tests)
2031
+ # =============================================================================
2032
+
2033
+ - description: 'design-001: Check prerequisites before design doc'
2034
+ vars:
2035
+ input: 'Create a design doc for a new search feature'
2036
+ context: |
2037
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2038
+ ---
2039
+ ## Prerequisites
2040
+
2041
+ Before creating a design doc:
2042
+ 1. User stories must exist
2043
+ 2. Test definitions must exist
2044
+
2045
+ If missing, create them first or offer to create.
2046
+ assert:
2047
+ - type: llm-rubric
2048
+ value: |
2049
+ The response should check for prerequisites before creating design doc.
2050
+ EXCELLENT: Asks about or checks for user stories and test definitions first, offers to create if missing
2051
+ ACCEPTABLE: Mentions prerequisites exist/needed
2052
+ POOR: Creates design doc without checking prerequisites
2053
+
2054
+ - description: 'design-002: Use standard template'
2055
+ vars:
2056
+ input: 'Create a design doc for a notification system feature'
2057
+ context: |
2058
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2059
+ ---
2060
+ ## Template Structure
2061
+
2062
+ Required sections:
2063
+ - Architecture
2064
+ - Components (with [N]/[N+1] pattern)
2065
+ - Data Model (if applicable)
2066
+ - User Flow
2067
+ - Key Decisions (what/why/trade-off)
2068
+ - Implementation Notes (if applicable)
2069
+ assert:
2070
+ - type: llm-rubric
2071
+ value: |
2072
+ The response should use the standard template structure.
2073
+ EXCELLENT: Uses template structure with all sections, marks optional sections "(if applicable)"
2074
+ ACCEPTABLE: Uses template structure with most sections
2075
+ POOR: Creates ad-hoc structure without following template
2076
+
2077
+ - description: 'design-003: Assess complexity threshold'
2078
+ vars:
2079
+ input: 'Do I need a design doc for adding a logout button?'
2080
+ context: |
2081
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2082
+ ---
2083
+ ## Complexity Threshold
2084
+
2085
+ Create design doc when:
2086
+ - >3 components involved
2087
+ - Spans 2+ user stories
2088
+ - Architectural decisions needed
2089
+
2090
+ Skip for simple features (<3 components, single story)
2091
+ assert:
2092
+ - type: llm-rubric
2093
+ value: |
2094
+ The response should say no design doc needed (too simple).
2095
+ EXCELLENT: Correctly assesses as too simple + explains why (doesn't meet complexity threshold)
2096
+ ACCEPTABLE: Says probably not needed
2097
+ POOR: Recommends creating design doc
2098
+
2099
+ - description: 'design-004: Components with [N]/[N+1] pattern'
2100
+ vars:
2101
+ input: 'Define the components for a file upload feature in a design doc'
2102
+ context: |
2103
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2104
+ ---
2105
+ ## Components Section
2106
+
2107
+ Use [N]/[N+1] pattern:
2108
+ - Component 1: Full definition (name, responsibility, interface, dependencies, tests)
2109
+ - Component 2: Show variation from Component 1
2110
+ assert:
2111
+ - type: llm-rubric
2112
+ value: |
2113
+ The response should use [N]/[N+1] pattern with full component definitions.
2114
+ EXCELLENT: Defines Component 1 with all 5 attributes, then Component 2 showing variation
2115
+ ACCEPTABLE: Defines multiple components with most attributes
2116
+ POOR: Lists components without [N]/[N+1] pattern or missing key attributes
2117
+
2118
+ - description: 'design-005: Data model section'
2119
+ vars:
2120
+ input: 'Write the data model section for a design doc about a shopping cart feature'
2121
+ context: |
2122
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2123
+ ---
2124
+ ## Data Model Section
2125
+
2126
+ Include:
2127
+ - State shape/schema
2128
+ - Type relationships
2129
+ - Data flow through components
2130
+ assert:
2131
+ - type: llm-rubric
2132
+ value: |
2133
+ The response should document state shape, relationships, and flow.
2134
+ EXCELLENT: Documents state shape/schema, shows type relationships, explains data flow
2135
+ ACCEPTABLE: Documents state shape with some relationships
2136
+ POOR: Skips data model or provides vague description
2137
+
2138
+ - description: 'design-006: Component interaction'
2139
+ vars:
2140
+ input: 'Document the component interaction for a drag-and-drop file organizer feature'
2141
+ context: |
2142
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2143
+ ---
2144
+ ## Component Interaction Section
2145
+
2146
+ Document:
2147
+ - Events/method calls between components
2148
+ - Data flow (Component N → N+1)
2149
+ - Edge cases in interactions
2150
+ assert:
2151
+ - type: llm-rubric
2152
+ value: |
2153
+ The response should document events, data flow, and edge cases.
2154
+ EXCELLENT: Documents events/method calls, shows data flow, notes edge cases
2155
+ ACCEPTABLE: Documents communication pattern and data flow
2156
+ POOR: Skips interaction section for multi-component feature
2157
+
2158
+ - description: 'design-007: Concrete user flow'
2159
+ vars:
2160
+ input: 'Write the user flow section for a design doc about a password reset feature'
2161
+ context: |
2162
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2163
+ ---
2164
+ ## User Flow Section
2165
+
2166
+ Write concrete step-by-step flow:
2167
+ - Specific UI elements (buttons, forms)
2168
+ - Keyboard shortcuts if applicable
2169
+ - Reference user stories/test definitions
2170
+ assert:
2171
+ - type: llm-rubric
2172
+ value: |
2173
+ The response should write concrete step-by-step flow with specific UI interactions.
2174
+ EXCELLENT: Concrete steps with specific UI elements, references user stories/test defs
2175
+ ACCEPTABLE: Step-by-step flow with some concrete details
2176
+ POOR: Vague flow like "user resets password" without concrete steps
2177
+
2178
+ - description: 'design-008: Key decisions with trade-offs'
2179
+ vars:
2180
+ input: 'Write the key decisions section for a design doc about choosing between REST and GraphQL for an API'
2181
+ context: |
2182
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2183
+ ---
2184
+ ## Key Decisions Section
2185
+
2186
+ Use [N]/[N+1] pattern:
2187
+ - Decision 1: what/why (specifics)/trade-off
2188
+ - Decision 2: Show variation
2189
+ - Link to benchmarks if relevant
2190
+ assert:
2191
+ - type: llm-rubric
2192
+ value: |
2193
+ The response should document decision with what/why/trade-off format.
2194
+ EXCELLENT: Decision 1 with what/why (specifics)/trade-off, Decision 2 showing variation
2195
+ ACCEPTABLE: Decisions with what/why/trade-off
2196
+ POOR: Decisions without trade-offs or vague rationale
2197
+
2198
+ - description: 'design-009: Implementation notes'
2199
+ vars:
2200
+ input: 'Write the implementation notes section for a design doc about a real-time collaborative editing feature'
2201
+ context: |
2202
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2203
+ ---
2204
+ ## Implementation Notes Section
2205
+
2206
+ Document:
2207
+ - Constraints
2208
+ - Error handling
2209
+ - Gotchas/risks
2210
+ - Open questions
2211
+ assert:
2212
+ - type: llm-rubric
2213
+ value: |
2214
+ The response should document constraints, error handling, gotchas, and open questions.
2215
+ EXCELLENT: Documents all 4 areas with specific details
2216
+ ACCEPTABLE: Documents 3+ areas
2217
+ POOR: Skips implementation notes for complex feature
2218
+
2219
+ - description: 'design-010: Quality checklist'
2220
+ vars:
2221
+ input: 'Review this design doc for quality before merge'
2222
+ context: |
2223
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2224
+ ---
2225
+ ## Quality Checklist
2226
+
2227
+ - [ ] References not duplicates
2228
+ - [ ] [N]/[N+1] examples
2229
+ - [ ] Concrete user flow
2230
+ - [ ] What/why/trade-off in decisions
2231
+ - [ ] Optional sections marked
2232
+ - [ ] ~121 lines target
2233
+ assert:
2234
+ - type: llm-rubric
2235
+ value: |
2236
+ The response should apply the 6-point checklist.
2237
+ EXCELLENT: Checks all 6 items
2238
+ ACCEPTABLE: Checks 4+ items
2239
+ POOR: Generic review without applying checklist
2240
+
2241
+ # =============================================================================
2242
+ # CONTEXT FILES GUIDE (11 tests)
2243
+ # =============================================================================
2244
+
2245
+ - description: 'ctx-001: Choose right context file'
2246
+ vars:
2247
+ input: 'Set up project context for a project using both Claude and Cursor'
2248
+ context: |
2249
+ file_path: framework/guides/context-files-guide.md (excerpt)
2250
+ ---
2251
+ ## File Selection
2252
+
2253
+ - AGENTS.md: Tool-agnostic (works with Claude, Cursor, etc.)
2254
+ - CLAUDE.md: Claude Code specific
2255
+ - .cursorrules: Cursor specific
2256
+
2257
+ For multi-tool projects, use AGENTS.md
2258
+ assert:
2259
+ - type: llm-rubric
2260
+ value: |
2261
+ The response should create AGENTS.md (tool-agnostic) or both tool-specific files.
2262
+ EXCELLENT: Creates AGENTS.md with clear rationale OR creates both tool-specific files
2263
+ ACCEPTABLE: Creates appropriate context file
2264
+ POOR: Creates wrong file type or doesn't explain choice
2265
+
2266
+ - description: 'ctx-002: Include SAFEWORD trigger'
2267
+ vars:
2268
+ input: 'Create an AGENTS.md file for a new project'
2269
+ context: |
2270
+ file_path: framework/guides/context-files-guide.md (excerpt)
2271
+ ---
2272
+ ## SAFEWORD Trigger (Required)
2273
+
2274
+ First line must be:
2275
+ **⚠️ ALWAYS READ FIRST: @./.safeword/SAFEWORD.md**
2276
+ assert:
2277
+ - type: llm-rubric
2278
+ value: |
2279
+ The response should include SAFEWORD trigger at top.
2280
+ EXCELLENT: Includes exact trigger format + brief rationale
2281
+ ACCEPTABLE: Includes trigger but slightly different wording
2282
+ POOR: Missing trigger or buried in middle of file
2283
+
2284
+ - description: 'ctx-003: No duplication'
2285
+ vars:
2286
+ input: 'Create a tests/AGENTS.md file for a project that already has a root AGENTS.md with TDD workflow documented'
2287
+ context: |
2288
+ file_path: framework/guides/context-files-guide.md (excerpt)
2289
+ ---
2290
+ ## Auto-Loading Behavior
2291
+
2292
+ Subdirectory files inherit from parent.
2293
+ Don't duplicate - use cross-references:
2294
+ "See root AGENTS.md for TDD workflow"
2295
+ assert:
2296
+ - type: llm-rubric
2297
+ value: |
2298
+ The response should reference root for TDD, not duplicate.
2299
+ EXCELLENT: Uses cross-reference ("See root AGENTS.md"), no duplication
2300
+ ACCEPTABLE: Minimal duplication with cross-reference
2301
+ POOR: Duplicates TDD workflow content from root
2302
+
2303
+ - description: 'ctx-004: Use modular imports'
2304
+ vars:
2305
+ input: 'Create an AGENTS.md for a project with architecture decisions in docs/architecture.md and coding standards in docs/conventions.md'
2306
+ context: |
2307
+ file_path: framework/guides/context-files-guide.md (excerpt)
2308
+ ---
2309
+ ## Modular Structure
2310
+
2311
+ Use imports for external files:
2312
+ @docs/architecture.md
2313
+ @docs/conventions.md
2314
+
2315
+ Keep root file under 50 lines
2316
+ assert:
2317
+ - type: llm-rubric
2318
+ value: |
2319
+ The response should use import syntax to reference external files.
2320
+ EXCELLENT: Uses @docs/ imports, keeps root file under 50 lines
2321
+ ACCEPTABLE: Uses imports but file is slightly over target
2322
+ POOR: Duplicates content instead of importing
2323
+
2324
+ - description: 'ctx-005: Content rules'
2325
+ vars:
2326
+ input: 'I want to add setup instructions and our TDD workflow to the AGENTS.md file'
2327
+ context: |
2328
+ file_path: framework/guides/context-files-guide.md (excerpt)
2329
+ ---
2330
+ ## Content Rules
2331
+
2332
+ **In AGENTS.md:** Coding patterns, workflow triggers, domain knowledge
2333
+ **NOT in AGENTS.md:** Setup instructions (→ README.md)
2334
+ assert:
2335
+ - type: llm-rubric
2336
+ value: |
2337
+ The response should redirect setup to README.md.
2338
+ EXCELLENT: Redirects setup to README.md, explains TDD belongs in root if project-specific
2339
+ ACCEPTABLE: Correctly redirects setup, allows TDD
2340
+ POOR: Adds both to AGENTS.md without redirection
2341
+
2342
+ - description: 'ctx-006: Size targets'
2343
+ vars:
2344
+ input: 'Review this AGENTS.md file that is 250 lines long'
2345
+ context: |
2346
+ file_path: framework/guides/context-files-guide.md (excerpt)
2347
+ ---
2348
+ ## Size Targets
2349
+
2350
+ - Root: <200 lines
2351
+ - Subdirectory: <100 lines
2352
+
2353
+ If over, extract to imports or subdirectory files
2354
+ assert:
2355
+ - type: llm-rubric
2356
+ value: |
2357
+ The response should recommend extracting or using imports.
2358
+ EXCELLENT: Identifies >200 line violation, recommends extraction with specific suggestions
2359
+ ACCEPTABLE: Identifies violation, recommends reduction
2360
+ POOR: Accepts 250-line file without comment
2361
+
2362
+ - description: 'ctx-007: Cross-reference pattern'
2363
+ vars:
2364
+ input: 'Add a reference to the agents directory in the root AGENTS.md'
2365
+ context: |
2366
+ file_path: framework/guides/context-files-guide.md (excerpt)
2367
+ ---
2368
+ ## Cross-Reference Pattern
2369
+
2370
+ **Agents** (`path/`) - Description. See `path/AGENTS.md`.
2371
+ assert:
2372
+ - type: llm-rubric
2373
+ value: |
2374
+ The response should use the standard cross-reference pattern.
2375
+ EXCELLENT: Uses pattern with path and link
2376
+ ACCEPTABLE: Uses cross-reference with path
2377
+ POOR: Duplicates content instead of cross-referencing
2378
+
2379
+ - description: 'ctx-008: Maintenance'
2380
+ vars:
2381
+ input: 'The project just underwent a major refactor. The AGENTS.md still references old directory structure.'
2382
+ context: |
2383
+ file_path: framework/guides/context-files-guide.md (excerpt)
2384
+ ---
2385
+ ## Maintenance
2386
+
2387
+ After refactors:
2388
+ - Update or remove outdated sections
2389
+ - Verify cross-references still work
2390
+ assert:
2391
+ - type: llm-rubric
2392
+ value: |
2393
+ The response should recommend updating or removing outdated sections.
2394
+ EXCELLENT: Identifies outdated content, recommends removal/update
2395
+ ACCEPTABLE: Recommends updating the file
2396
+ POOR: Ignores outdated content
2397
+
2398
+ - description: 'ctx-009: Domain requirements'
2399
+ vars:
2400
+ input: 'Create an AGENTS.md for a tabletop RPG game assistant project'
2401
+ context: |
2402
+ file_path: framework/guides/context-files-guide.md (excerpt)
2403
+ ---
2404
+ ## Domain Requirements Section
2405
+
2406
+ For specialized projects, include:
2407
+ - Domain-specific terminology
2408
+ - Game mechanics (for games)
2409
+ - Business rules
2410
+ assert:
2411
+ - type: llm-rubric
2412
+ value: |
2413
+ The response should include Domain Requirements section.
2414
+ EXCELLENT: Includes Domain Requirements with game mechanics, uses template structure
2415
+ ACCEPTABLE: Includes domain section but less detailed
2416
+ POOR: Omits domain requirements for specialized project
2417
+
2418
+ - description: 'ctx-010: LLM checklist'
2419
+ vars:
2420
+ input: 'Review this AGENTS.md file for LLM comprehension quality'
2421
+ context: |
2422
+ file_path: framework/guides/context-files-guide.md (excerpt)
2423
+ ---
2424
+ ## LLM Comprehension Checklist
2425
+
2426
+ 1. MECE decision trees
2427
+ 2. Terms defined
2428
+ 3. No contradictions
2429
+ 4. Concrete examples
2430
+ 5. Edge cases explicit
2431
+ 6. Actionable language
2432
+ 7. No redundancy
2433
+ 8. Size within limits
2434
+ assert:
2435
+ - type: llm-rubric
2436
+ value: |
2437
+ The response should apply the 8-point checklist.
2438
+ EXCELLENT: Checks all 8 items
2439
+ ACCEPTABLE: Checks 5+ items
2440
+ POOR: Generic review without applying checklist
2441
+
2442
+ - description: 'ctx-011: Token efficiency'
2443
+ vars:
2444
+ input: 'Review this 300-line AGENTS.md with narrative paragraphs for token efficiency'
2445
+ context: |
2446
+ file_path: framework/guides/context-files-guide.md (excerpt)
2447
+ ---
2448
+ ## Token Efficiency
2449
+
2450
+ - Use bullets over paragraphs
2451
+ - Remove redundancy
2452
+ - Use imports for modularization
2453
+ assert:
2454
+ - type: llm-rubric
2455
+ value: |
2456
+ The response should recommend converting to bullets, removing redundancy.
2457
+ EXCELLENT: Identifies verbose content, recommends bullets over paragraphs, suggests imports
2458
+ ACCEPTABLE: Recommends reducing size
2459
+ POOR: Accepts verbose file without comment
2460
+
2461
+ # =============================================================================
2462
+ # DATA ARCHITECTURE GUIDE (7 tests)
2463
+ # =============================================================================
2464
+
2465
+ - description: 'data-001: Decision tree for where to document'
2466
+ vars:
2467
+ input: "I'm adding a new Redis cache for session data. Where should I document this?"
2468
+ context: |
2469
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2470
+ ---
2471
+ ## Where to Document
2472
+
2473
+ Architecture Doc when:
2474
+ - Adding new data store
2475
+ - Changing data model
2476
+ - New data flows
2477
+
2478
+ Design Doc when:
2479
+ - Feature-specific data handling
2480
+ assert:
2481
+ - type: llm-rubric
2482
+ value: |
2483
+ The response should select Architecture Doc (new data store).
2484
+ EXCELLENT: Correctly identifies Architecture Doc, cites "Adding new data store"
2485
+ ACCEPTABLE: Correctly identifies Architecture Doc
2486
+ POOR: Suggests Design Doc for new data store
2487
+
2488
+ - description: 'data-002: Data principles format'
2489
+ vars:
2490
+ input: 'Create a data architecture section for a user management system'
2491
+ context: |
2492
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2493
+ ---
2494
+ ## Data Principles
2495
+
2496
+ 4 principles with What/Why/Document/Example format:
2497
+ 1. Data Quality
2498
+ 2. Data Governance
2499
+ 3. Data Accessibility
2500
+ 4. Living Documentation
2501
+ assert:
2502
+ - type: llm-rubric
2503
+ value: |
2504
+ The response should include all 4 principles with proper format.
2505
+ EXCELLENT: All 4 principles with What/Why/Document/Example format
2506
+ ACCEPTABLE: 3+ principles with consistent format
2507
+ POOR: Missing principles or inconsistent format
2508
+
2509
+ - description: 'data-004: Document data flows'
2510
+ vars:
2511
+ input: 'Document the data flow for user registration'
2512
+ context: |
2513
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2514
+ ---
2515
+ ## Data Flows
2516
+
2517
+ Document:
2518
+ - Sources → Transformations → Destinations
2519
+ - Error handling at each step
2520
+ assert:
2521
+ - type: llm-rubric
2522
+ value: |
2523
+ The response should document full flow with error handling.
2524
+ EXCELLENT: Documents full flow with error handling for each step
2525
+ ACCEPTABLE: Documents flow with some error handling
2526
+ POOR: Only documents happy path without error handling
2527
+
2528
+ - description: 'data-005: Data policies'
2529
+ vars:
2530
+ input: 'Document data policies for a multi-tenant SaaS application'
2531
+ context: |
2532
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2533
+ ---
2534
+ ## Data Policies
2535
+
2536
+ Document:
2537
+ - Access control (read/write/delete roles)
2538
+ - Lifecycle rules
2539
+ - Conflict resolution strategy
2540
+ assert:
2541
+ - type: llm-rubric
2542
+ value: |
2543
+ The response should document access control, lifecycle, and conflict resolution.
2544
+ EXCELLENT: Documents all three with justification
2545
+ ACCEPTABLE: Documents access control and lifecycle
2546
+ POOR: Missing conflict resolution or lifecycle rules
2547
+
2548
+ - description: 'data-006: TDD triggers for data changes'
2549
+ vars:
2550
+ input: 'I just added a new payments table to the database. What should I update?'
2551
+ context: |
2552
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2553
+ ---
2554
+ ## TDD Integration Triggers
2555
+
2556
+ Update architecture doc when:
2557
+ - Adding new data entities
2558
+ - Changing data model
2559
+ - New data flows
2560
+ assert:
2561
+ - type: llm-rubric
2562
+ value: |
2563
+ The response should recommend updating architecture doc.
2564
+ EXCELLENT: Recommends update, cites "Adding new data entities", mentions version/status
2565
+ ACCEPTABLE: Recommends updating architecture doc
2566
+ POOR: Suggests only updating code without documentation
2567
+
2568
+ - description: 'data-007: Common mistakes'
2569
+ vars:
2570
+ input: "Review this data architecture doc that has no migration strategy and uses vague performance targets like 'fast queries'"
2571
+ context: |
2572
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2573
+ ---
2574
+ ## Common Mistakes
2575
+
2576
+ - Missing migration strategy
2577
+ - Vague performance targets ("fast" instead of "<100ms")
2578
+ - No error handling documented
2579
+ assert:
2580
+ - type: llm-rubric
2581
+ value: |
2582
+ The response should identify both anti-patterns.
2583
+ EXCELLENT: Identifies both issues, cites Common Mistakes section
2584
+ ACCEPTABLE: Identifies at least one issue
2585
+ POOR: Accepts the doc without identifying anti-patterns
2586
+
2587
+ - description: 'data-008: Best practices checklist'
2588
+ vars:
2589
+ input: 'Review this data architecture doc for completeness before merge'
2590
+ context: |
2591
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2592
+ ---
2593
+ ## Best Practices Checklist
2594
+
2595
+ 1. Principles format correct
2596
+ 2. Entities documented
2597
+ 3. Attributes with types
2598
+ 4. Storage rationale
2599
+ 5. Error handling
2600
+ 6. Validation checkpoints
2601
+ 7. Performance targets (specific)
2602
+ 8. Migration strategy
2603
+ 9. Version/status
2604
+ 10. Cross-references
2605
+ assert:
2606
+ - type: llm-rubric
2607
+ value: |
2608
+ The response should apply the 10-point checklist.
2609
+ EXCELLENT: Checks all 10 items
2610
+ ACCEPTABLE: Checks 7+ items
2611
+ POOR: Generic review without applying checklist
2612
+
2613
+ # =============================================================================
2614
+ # LEARNING EXTRACTION (11 tests)
2615
+ # =============================================================================
2616
+
2617
+ - description: 'learn-001: Recognize extraction triggers'
2618
+ vars:
2619
+ input: "I've been debugging this React state issue for 6 cycles now, tried 4 different approaches, and finally found it's a race condition not documented in the React docs"
2620
+ context: |
2621
+ file_path: framework/guides/learning-extraction.md (excerpt)
2622
+ ---
2623
+ ## Extraction Triggers
2624
+
2625
+ 1. Observable debugging complexity (5+ cycles)
2626
+ 2. Trial and error (3+ approaches)
2627
+ 3. Undocumented gotcha
2628
+ 4. Integration struggle
2629
+ 5. Testing trap
2630
+ 6. Architectural insight
2631
+ assert:
2632
+ - type: llm-rubric
2633
+ value: |
2634
+ The response should recognize multiple triggers and suggest extraction.
2635
+ EXCELLENT: Identifies 3+ triggers, suggests extraction after fix confirmed
2636
+ ACCEPTABLE: Identifies triggers, suggests extraction
2637
+ POOR: Doesn't recognize triggers or suggests extraction mid-debug
2638
+
2639
+ - description: 'learn-002: Check existing learnings first'
2640
+ vars:
2641
+ input: 'I just discovered a gotcha about React hooks and async state updates'
2642
+ context: |
2643
+ file_path: framework/guides/learning-extraction.md (excerpt)
2644
+ ---
2645
+ ## Before Extracting
2646
+
2647
+ ALWAYS check for existing learnings first:
2648
+ ls .safeword/learnings/*react*.md
2649
+ ls .safeword/learnings/*hooks*.md
2650
+
2651
+ If found, update instead of creating new.
2652
+ assert:
2653
+ - type: llm-rubric
2654
+ value: |
2655
+ The response should check for existing learnings before suggesting extraction.
2656
+ EXCELLENT: Checks for existing learnings, suggests update vs new
2657
+ ACCEPTABLE: Mentions checking for existing learnings
2658
+ POOR: Suggests creating new learning without checking existing
2659
+
2660
+ - description: 'learn-003: Place learnings correctly'
2661
+ vars:
2662
+ input: 'I learned that React useState is async - where should I document this?'
2663
+ context: |
2664
+ file_path: framework/guides/learning-extraction.md (excerpt)
2665
+ ---
2666
+ ## Location Decision Tree
2667
+
2668
+ Global (.safeword/learnings/):
2669
+ - Applies to ALL projects using this tech
2670
+ - Universal patterns
2671
+
2672
+ Project-specific:
2673
+ - Only applies to this codebase
2674
+ - Custom architecture patterns
2675
+ assert:
2676
+ - type: llm-rubric
2677
+ value: |
2678
+ The response should select global learnings (applies to ALL React projects).
2679
+ EXCELLENT: Selects .safeword/learnings/ (global), explains why, cites decision tree
2680
+ ACCEPTABLE: Selects correct location
2681
+ POOR: Selects project-specific location for universal React pattern
2682
+
2683
+ - description: 'learn-004: Respect instruction precedence'
2684
+ vars:
2685
+ input: 'The global learning says use Redux, but the project learning says use Zustand. Which should I follow?'
2686
+ context: |
2687
+ file_path: framework/guides/learning-extraction.md (excerpt)
2688
+ ---
2689
+ ## Instruction Precedence
2690
+
2691
+ 1. Project-specific (highest)
2692
+ 2. Global learnings
2693
+ 3. Framework defaults (lowest)
2694
+ assert:
2695
+ - type: llm-rubric
2696
+ value: |
2697
+ The response should follow project learning (higher precedence).
2698
+ EXCELLENT: Follows project learning, explains precedence order
2699
+ ACCEPTABLE: Follows project learning
2700
+ POOR: Follows global learning or asks which to use
2701
+
2702
+ - description: 'learn-005: Use templates'
2703
+ vars:
2704
+ input: 'Create a learning about React useEffect cleanup functions'
2705
+ context: |
2706
+ file_path: framework/guides/learning-extraction.md (excerpt)
2707
+ ---
2708
+ ## Learning Template
2709
+
2710
+ Sections:
2711
+ - Principle
2712
+ - Gotcha (Bad/Good examples)
2713
+ - Why
2714
+ - Examples
2715
+ - Testing Trap
2716
+ assert:
2717
+ - type: llm-rubric
2718
+ value: |
2719
+ The response should use the learning template with all sections.
2720
+ EXCELLENT: Uses template with all sections
2721
+ ACCEPTABLE: Uses template with most sections
2722
+ POOR: Creates ad-hoc structure without following template
2723
+
2724
+ - description: 'learn-006: Cross-reference in SAFEWORD'
2725
+ vars:
2726
+ input: 'I just created a learning at .safeword/learnings/electron-contexts.md about Electron renderer context'
2727
+ context: |
2728
+ file_path: framework/guides/learning-extraction.md (excerpt)
2729
+ ---
2730
+ ## Cross-Reference
2731
+
2732
+ After creating learning, add to SAFEWORD.md Common Gotchas:
2733
+ **Electron Contexts** - One-liner. See learnings/electron-contexts.md
2734
+ assert:
2735
+ - type: llm-rubric
2736
+ value: |
2737
+ The response should suggest adding cross-reference to SAFEWORD.md.
2738
+ EXCELLENT: Suggests adding to Common Gotchas with proper format
2739
+ ACCEPTABLE: Suggests adding cross-reference
2740
+ POOR: Doesn't mention cross-referencing in SAFEWORD.md
2741
+
2742
+ - description: "learn-007: Don't suggest extraction for trivial fix"
2743
+ vars:
2744
+ input: 'Fixed a typo in the config file'
2745
+ context: |
2746
+ file_path: framework/guides/learning-extraction.md (excerpt)
2747
+ ---
2748
+ ## When NOT to Extract
2749
+
2750
+ Skip extraction for:
2751
+ - Trivial fixes
2752
+ - One-line changes
2753
+ - Well-documented issues
2754
+ assert:
2755
+ - type: llm-rubric
2756
+ value: |
2757
+ The response should NOT suggest extraction (trivial fix).
2758
+ EXCELLENT: Does not suggest extraction, recognizes trivial fix
2759
+ ACCEPTABLE: Doesn't mention extraction
2760
+ POOR: Suggests extraction for trivial fix
2761
+
2762
+ - description: 'learn-008: Recommend splitting large files'
2763
+ vars:
2764
+ input: 'This learning file is 250 lines and covers both React hooks and Redux patterns'
2765
+ context: |
2766
+ file_path: framework/guides/learning-extraction.md (excerpt)
2767
+ ---
2768
+ ## Size Standards
2769
+
2770
+ - Max 150-200 lines per file
2771
+ - One concept per file
2772
+ - Split if covering multiple topics
2773
+ assert:
2774
+ - type: llm-rubric
2775
+ value: |
2776
+ The response should recommend splitting into focused files.
2777
+ EXCELLENT: Recommends splitting (>200 lines, multiple concepts), suggests specific split
2778
+ ACCEPTABLE: Recommends splitting
2779
+ POOR: Accepts 250-line multi-concept file without comment
2780
+
2781
+ - description: 'learn-010: Follow extraction workflow'
2782
+ vars:
2783
+ input: 'I just finished implementing a complex feature and discovered a race condition pattern. Walk me through documenting this.'
2784
+ context: |
2785
+ file_path: framework/guides/learning-extraction.md (excerpt)
2786
+ ---
2787
+ ## Extraction Workflow
2788
+
2789
+ 1. Assess scope (global vs project)
2790
+ 2. Choose location
2791
+ 3. Extract using template
2792
+ 4. Cross-reference in SAFEWORD.md
2793
+ 5. Suggest commit message
2794
+ assert:
2795
+ - type: llm-rubric
2796
+ value: |
2797
+ The response should follow the workflow steps.
2798
+ EXCELLENT: Follows all workflow steps
2799
+ ACCEPTABLE: Follows most workflow steps
2800
+ POOR: Ad-hoc extraction without following workflow
2801
+
2802
+ - description: 'learn-011: Block trivial extractions'
2803
+ vars:
2804
+ input: "I want to create a learning that says 'Changed == to ==='"
2805
+ context: |
2806
+ file_path: framework/guides/learning-extraction.md (excerpt)
2807
+ ---
2808
+ ## Anti-Patterns
2809
+
2810
+ Don't extract:
2811
+ - One-line fixes without context
2812
+ - Well-known patterns
2813
+ - Trivial changes
2814
+ assert:
2815
+ - type: llm-rubric
2816
+ value: |
2817
+ The response should block this as trivial one-liner.
2818
+ EXCELLENT: Blocks extraction, cites anti-pattern
2819
+ ACCEPTABLE: Suggests this is too trivial
2820
+ POOR: Proceeds with extraction
2821
+
2822
+ - description: 'learn-012: Size standards'
2823
+ vars:
2824
+ input: "I'm creating a learning file that's 180 lines and covers both React hooks and Redux patterns"
2825
+ context: |
2826
+ file_path: framework/guides/learning-extraction.md (excerpt)
2827
+ ---
2828
+ ## Size Standards
2829
+
2830
+ - Max 150-200 lines per file
2831
+ - One concept per file
2832
+ assert:
2833
+ - type: llm-rubric
2834
+ value: |
2835
+ The response should recommend splitting based on size and scope.
2836
+ EXCELLENT: Recommends splitting (>150 lines, multiple concepts)
2837
+ ACCEPTABLE: Notes it's borderline, recommends review
2838
+ POOR: Accepts 180-line multi-concept file without comment
2839
+
2840
+ # =============================================================================
2841
+ # LLM PROMPTING (10 tests)
2842
+ # =============================================================================
2843
+
2844
+ - description: 'prompt-001: Concrete examples in prompts'
2845
+ vars:
2846
+ input: "I'm writing a prompt that says 'Return the user's intent'. Is this good enough?"
2847
+ context: |
2848
+ file_path: framework/guides/llm-prompting.md (excerpt)
2849
+ ---
2850
+ ## Concrete Examples
2851
+
2852
+ Abstract prompts need examples:
2853
+ BAD: "Return the user's intent"
2854
+ GOOD: "Return JSON: {intent: 'create_campaign', name: '...'}"
2855
+ assert:
2856
+ - type: llm-rubric
2857
+ value: |
2858
+ The response should suggest adding BAD/GOOD examples with concrete format.
2859
+ EXCELLENT: Suggests adding structured JSON example showing BAD vs GOOD
2860
+ ACCEPTABLE: Suggests being more specific
2861
+ POOR: Accepts vague prompt without examples
2862
+
2863
+ - description: 'prompt-002: Structured outputs'
2864
+ vars:
2865
+ input: "I'm building an AI agent that needs to understand user intent. Should I have it return prose like 'The user wants to create a campaign'?"
2866
+ context: |
2867
+ file_path: framework/guides/llm-prompting.md (excerpt)
2868
+ ---
2869
+ ## Structured Outputs
2870
+
2871
+ For machine consumption, use JSON:
2872
+ - Explicit fields
2873
+ - Type validation
2874
+ - Predictable parsing
2875
+ assert:
2876
+ - type: llm-rubric
2877
+ value: |
2878
+ The response should recommend structured JSON output.
2879
+ EXCELLENT: Recommends JSON schema with explicit fields, shows example
2880
+ ACCEPTABLE: Suggests structured output
2881
+ POOR: Accepts prose output for machine consumption
2882
+
2883
+ - description: 'prompt-003: Prompt caching'
2884
+ vars:
2885
+ input: 'I have a 500-line system prompt that includes both static rules and the current character state. How should I structure this?'
2886
+ context: |
2887
+ file_path: framework/guides/llm-prompting.md (excerpt)
2888
+ ---
2889
+ ## Prompt Caching
2890
+
2891
+ Separate static from dynamic:
2892
+ - Static rules: cache_control: ephemeral
2893
+ - Dynamic state: user message (uncached)
2894
+ assert:
2895
+ - type: llm-rubric
2896
+ value: |
2897
+ The response should recommend separating static from dynamic.
2898
+ EXCELLENT: Recommends static with cache_control, dynamic in user message, mentions cost reduction
2899
+ ACCEPTABLE: Suggests separating static from dynamic
2900
+ POOR: Accepts mixed static/dynamic in system prompt
2901
+
2902
+ - description: 'prompt-004: Message architecture'
2903
+ vars:
2904
+ input: "I'm interpolating the user's character state directly into my system prompt like this: systemPrompt = `Rules + Character: ${dynamicState}`. Is this okay?"
2905
+ context: |
2906
+ file_path: framework/guides/llm-prompting.md (excerpt)
2907
+ ---
2908
+ ## Message Architecture
2909
+
2910
+ BAD: Dynamic state in system prompt (uncacheable)
2911
+ GOOD: Dynamic state in user message
2912
+ assert:
2913
+ - type: llm-rubric
2914
+ value: |
2915
+ The response should identify this as BAD pattern.
2916
+ EXCELLENT: Identifies as BAD (uncacheable), recommends moving dynamic state to user message
2917
+ ACCEPTABLE: Suggests separating static from dynamic
2918
+ POOR: Accepts dynamic state in system prompt
2919
+
2920
+ - description: 'prompt-005: Cache invalidation'
2921
+ vars:
2922
+ input: 'I want to add a small clarification to my cached system prompt. Should I just make the change?'
2923
+ context: |
2924
+ file_path: framework/guides/llm-prompting.md (excerpt)
2925
+ ---
2926
+ ## Cache Invalidation
2927
+
2928
+ Any change breaks all caches.
2929
+ Batch edits to minimize rebuilds.
2930
+ assert:
2931
+ - type: llm-rubric
2932
+ value: |
2933
+ The response should warn about cache invalidation.
2934
+ EXCELLENT: Warns "any change breaks all caches", suggests batching edits
2935
+ ACCEPTABLE: Notes cache invalidation concern
2936
+ POOR: Suggests making change without mentioning cache impact
2937
+
2938
+ - description: 'prompt-006: LLM-as-judge'
2939
+ vars:
2940
+ input: "I want to test if my AI GM's responses have a 'collaborative tone'. Should I check for specific keywords like 'together' or 'we'?"
2941
+ context: |
2942
+ file_path: framework/guides/llm-prompting.md (excerpt)
2943
+ ---
2944
+ ## LLM-as-Judge
2945
+
2946
+ For creative/qualitative outputs:
2947
+ - Use rubric (EXCELLENT/ACCEPTABLE/POOR)
2948
+ - Avoid brittle keyword matching
2949
+ assert:
2950
+ - type: llm-rubric
2951
+ value: |
2952
+ The response should recommend LLM-as-judge with rubric.
2953
+ EXCELLENT: Recommends LLM-as-judge with rubric, warns against brittle keywords
2954
+ ACCEPTABLE: Suggests rubric-based evaluation
2955
+ POOR: Accepts keyword matching for creative outputs
2956
+
2957
+ - description: 'prompt-007: Eval framework mapping'
2958
+ vars:
2959
+ input: 'I have a function that parses JSON, an agent that calls an LLM, and a judgment about narrative quality. What test types should I use?'
2960
+ context: |
2961
+ file_path: framework/guides/llm-prompting.md (excerpt)
2962
+ ---
2963
+ ## Test Type Mapping
2964
+
2965
+ - JSON parsing → Unit test
2966
+ - Agent + LLM → Integration test
2967
+ - Narrative quality → LLM Eval
2968
+ assert:
2969
+ - type: llm-rubric
2970
+ value: |
2971
+ The response should map to correct test types.
2972
+ EXCELLENT: JSON → Unit, Agent+LLM → Integration, Narrative → LLM Eval
2973
+ ACCEPTABLE: Correctly identifies at least 2 mappings
2974
+ POOR: Suggests same test type for all
2975
+
2976
+ - description: 'prompt-008: Cost awareness'
2977
+ vars:
2978
+ input: 'I want to run 100 LLM evaluation scenarios in CI. What should I consider?'
2979
+ context: |
2980
+ file_path: framework/guides/llm-prompting.md (excerpt)
2981
+ ---
2982
+ ## Cost Awareness
2983
+
2984
+ - ~$0.15-0.30 for 30 scenarios with caching
2985
+ - Cache rubrics
2986
+ - Budget expectations
2987
+ assert:
2988
+ - type: llm-rubric
2989
+ value: |
2990
+ The response should provide cost guidance.
2991
+ EXCELLENT: Mentions typical costs, suggests caching rubrics, budget expectations
2992
+ ACCEPTABLE: Notes cost considerations
2993
+ POOR: Ignores cost implications
2994
+
2995
+ - description: 'prompt-009: Why over what'
2996
+ vars:
2997
+ input: "My prompt says 'Use JSON output'. Should I add more context?"
2998
+ context: |
2999
+ file_path: framework/guides/llm-prompting.md (excerpt)
3000
+ ---
3001
+ ## Why Over What
3002
+
3003
+ Include rationale:
3004
+ - Why JSON? Predictable parsing, validation
3005
+ - Benefits and trade-offs
3006
+ assert:
3007
+ - type: llm-rubric
3008
+ value: |
3009
+ The response should suggest adding rationale.
3010
+ EXCELLENT: Suggests adding "why" (predictable parsing, validation)
3011
+ ACCEPTABLE: Suggests adding rationale
3012
+ POOR: Accepts bare instruction without context
3013
+
3014
+ - description: 'prompt-010: Precise terms'
3015
+ vars:
3016
+ input: "My decision tree asks 'Does this test need to see the UI?'"
3017
+ context: |
3018
+ file_path: framework/guides/llm-prompting.md (excerpt)
3019
+ ---
3020
+ ## Precise Technical Terms
3021
+
3022
+ Vague: "see the UI"
3023
+ Precise: "real browser (Playwright/Cypress)"
3024
+ Note: RTL is not a real browser
3025
+ assert:
3026
+ - type: llm-rubric
3027
+ value: |
3028
+ The response should suggest more precise wording.
3029
+ EXCELLENT: Suggests "real browser (Playwright/Cypress)", clarifies RTL distinction
3030
+ ACCEPTABLE: Suggests more specific wording
3031
+ POOR: Accepts vague "see the UI" phrasing
3032
+
3033
+ # =============================================================================
3034
+ # TEST DEFINITIONS GUIDE (12 tests)
3035
+ # =============================================================================
3036
+
3037
+ - description: 'testdef-001: Use standard template'
3038
+ vars:
3039
+ input: 'I need to create test definitions for a new feature. Where do I start?'
3040
+ context: |
3041
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3042
+ ---
3043
+ ## Getting Started
3044
+
3045
+ 1. Use template from .safeword/templates/test-definitions-feature.md
3046
+ 2. Fill in feature name
3047
+ 3. Organize into suites
3048
+ 4. Add individual tests
3049
+ assert:
3050
+ - type: llm-rubric
3051
+ value: |
3052
+ The response should point to template and workflow.
3053
+ EXCELLENT: Points to template, lists steps
3054
+ ACCEPTABLE: Points to template
3055
+ POOR: No template reference
3056
+
3057
+ - description: 'testdef-002: Organize into suites'
3058
+ vars:
3059
+ input: 'I have 15 tests for a feature. How should I organize them?'
3060
+ context: |
3061
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3062
+ ---
3063
+ ## Suite Organization
3064
+
3065
+ Group by concern:
3066
+ - Layout
3067
+ - Interactions
3068
+ - State
3069
+ - Accessibility
3070
+ - Edge Cases
3071
+ assert:
3072
+ - type: llm-rubric
3073
+ value: |
3074
+ The response should suggest suite organization.
3075
+ EXCELLENT: Suggests suites by concern, numbered tests
3076
+ ACCEPTABLE: Suggests grouping logically
3077
+ POOR: No organization guidance
3078
+
3079
+ - description: 'testdef-003: Track test status'
3080
+ vars:
3081
+ input: 'What status indicators should I use for my tests?'
3082
+ context: |
3083
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3084
+ ---
3085
+ ## Status Indicators
3086
+
3087
+ ✅ Passing
3088
+ ⏭️ Skipped (with rationale)
3089
+ ❌ Not Implemented
3090
+ 🔴 Failing
3091
+ assert:
3092
+ - type: llm-rubric
3093
+ value: |
3094
+ The response should list status indicators.
3095
+ EXCELLENT: Lists all 4 statuses with meanings
3096
+ ACCEPTABLE: Lists most statuses
3097
+ POOR: Inconsistent statuses
3098
+
3099
+ - description: 'testdef-004: Write clear steps'
3100
+ vars:
3101
+ input: "My test step says 'Check panes'. Is this good enough?"
3102
+ context: |
3103
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3104
+ ---
3105
+ ## Test Steps
3106
+
3107
+ BAD: "Check panes" (vague)
3108
+ GOOD: "1. Verify left pane shows navigation 2. Verify center pane shows content"
3109
+ assert:
3110
+ - type: llm-rubric
3111
+ value: |
3112
+ The response should identify vague step.
3113
+ EXCELLENT: Identifies as BAD (vague), shows GOOD example with numbered steps
3114
+ ACCEPTABLE: Notes it's too vague
3115
+ POOR: Accepts vague step
3116
+
3117
+ - description: 'testdef-005: Specific expected outcomes'
3118
+ vars:
3119
+ input: "My expected outcome says 'Everything works'. Is this okay?"
3120
+ context: |
3121
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3122
+ ---
3123
+ ## Expected Outcomes
3124
+
3125
+ BAD: "Everything works"
3126
+ GOOD: "Button is enabled, form submits, success message appears"
3127
+ assert:
3128
+ - type: llm-rubric
3129
+ value: |
3130
+ The response should identify vague outcome.
3131
+ EXCELLENT: Identifies as BAD, shows GOOD example with specific assertions
3132
+ ACCEPTABLE: Notes it's too vague
3133
+ POOR: Accepts vague outcome
3134
+
3135
+ - description: 'testdef-006: Coverage summary'
3136
+ vars:
3137
+ input: 'Should I include a coverage summary in my test definitions?'
3138
+ context: |
3139
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3140
+ ---
3141
+ ## Coverage Summary
3142
+
3143
+ Include:
3144
+ - Total tests
3145
+ - Passing/failing/skipped counts
3146
+ - Rationale for skipped tests
3147
+ assert:
3148
+ - type: llm-rubric
3149
+ value: |
3150
+ The response should recommend coverage summary.
3151
+ EXCELLENT: Yes, with totals, percentages, rationale for skipped
3152
+ ACCEPTABLE: Recommends summary
3153
+ POOR: No guidance
3154
+
3155
+ - description: 'testdef-007: Test naming'
3156
+ vars:
3157
+ input: "I named my test 'Test 1'. Is this okay?"
3158
+ context: |
3159
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3160
+ ---
3161
+ ## Test Naming
3162
+
3163
+ BAD: "Test 1"
3164
+ GOOD: "Render all three panes on initial load"
3165
+ assert:
3166
+ - type: llm-rubric
3167
+ value: |
3168
+ The response should identify bad naming.
3169
+ EXCELLENT: Identifies as BAD, suggests descriptive name
3170
+ ACCEPTABLE: Notes name is not descriptive
3171
+ POOR: Accepts "Test 1"
3172
+
3173
+ - description: 'testdef-008: Execution commands'
3174
+ vars:
3175
+ input: 'What should I include in the test execution section?'
3176
+ context: |
3177
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3178
+ ---
3179
+ ## Test Execution
3180
+
3181
+ Include:
3182
+ - Command to run all tests
3183
+ - Command to grep for specific test
3184
+ - Match project tooling
3185
+ assert:
3186
+ - type: llm-rubric
3187
+ value: |
3188
+ The response should list command requirements.
3189
+ EXCELLENT: Commands to run all, grep for specific, match project tooling
3190
+ ACCEPTABLE: Suggests including commands
3191
+ POOR: No command guidance
3192
+
3193
+ - description: 'testdef-009: TDD workflow integration'
3194
+ vars:
3195
+ input: 'When should I create test definitions?'
3196
+ context: |
3197
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3198
+ ---
3199
+ ## TDD Workflow
3200
+
3201
+ - Create before implementation
3202
+ - Alongside user stories
3203
+ - Update status as tests pass/fail
3204
+ assert:
3205
+ - type: llm-rubric
3206
+ value: |
3207
+ The response should explain TDD timing.
3208
+ EXCELLENT: Before implementation, alongside user stories, update status
3209
+ ACCEPTABLE: Mentions before implementation
3210
+ POOR: No timing guidance
3211
+
3212
+ - description: 'testdef-010: Map to user stories'
3213
+ vars:
3214
+ input: 'How do I connect my tests to user stories?'
3215
+ context: |
3216
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3217
+ ---
3218
+ ## User Story Mapping
3219
+
3220
+ - Each AC has at least one test
3221
+ - Add edge cases beyond AC
3222
+ - Include test file references
3223
+ assert:
3224
+ - type: llm-rubric
3225
+ value: |
3226
+ The response should explain mapping.
3227
+ EXCELLENT: Each AC has test, edge cases beyond AC, test file references
3228
+ ACCEPTABLE: Suggests mapping to AC
3229
+ POOR: No mapping guidance
3230
+
3231
+ - description: 'testdef-011: Avoid implementation detail tests'
3232
+ vars:
3233
+ input: "My test verifies 'useUIStore hook works correctly'. Is this a good test?"
3234
+ context: |
3235
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3236
+ ---
3237
+ ## Anti-Patterns
3238
+
3239
+ BAD: Testing implementation details ("useUIStore hook works")
3240
+ GOOD: Testing observable behavior ("clicking button updates UI")
3241
+ assert:
3242
+ - type: llm-rubric
3243
+ value: |
3244
+ The response should identify anti-pattern.
3245
+ EXCELLENT: Identifies as BAD (implementation detail), suggests testing observable behavior
3246
+ ACCEPTABLE: Notes it's testing implementation
3247
+ POOR: Accepts implementation detail test
3248
+
3249
+ - description: 'testdef-012: LLM-friendly test definitions'
3250
+ vars:
3251
+ input: 'How do I make my test definitions LLM-friendly?'
3252
+ context: |
3253
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3254
+ ---
3255
+ ## LLM Instruction Design
3256
+
3257
+ - MECE decision trees
3258
+ - Explicit definitions
3259
+ - Concrete examples
3260
+ - Actionable language
3261
+ assert:
3262
+ - type: llm-rubric
3263
+ value: |
3264
+ The response should provide LLM optimization guidance.
3265
+ EXCELLENT: MECE, explicit definitions, concrete examples, actionable language
3266
+ ACCEPTABLE: Mentions clarity principles
3267
+ POOR: No LLM-specific guidance
3268
+
3269
+ # Output format
3270
+ outputPath: ./eval-results.json