npm - @qball-inc/the-bulwark - Versions diffs - 1.0.0 - Mend

@qball-inc/the-bulwark 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (175) hide show

package/.claude-plugin/plugin.json +43 -0
package/agents/bulwark-fix-validator.md +633 -0
package/agents/bulwark-implementer.md +391 -0
package/agents/bulwark-issue-analyzer.md +308 -0
package/agents/bulwark-standards-reviewer.md +221 -0
package/agents/plan-creation-architect.md +323 -0
package/agents/plan-creation-eng-lead.md +352 -0
package/agents/plan-creation-po.md +300 -0
package/agents/plan-creation-qa-critic.md +334 -0
package/agents/product-ideation-competitive-analyzer.md +298 -0
package/agents/product-ideation-idea-validator.md +268 -0
package/agents/product-ideation-market-researcher.md +292 -0
package/agents/product-ideation-pattern-documenter.md +308 -0
package/agents/product-ideation-segment-analyzer.md +303 -0
package/agents/product-ideation-strategist.md +259 -0
package/agents/statusline-setup.md +97 -0
package/hooks/hooks.json +59 -0
package/package.json +45 -0
package/scripts/hooks/cleanup-stale.sh +13 -0
package/scripts/hooks/enforce-quality.sh +166 -0
package/scripts/hooks/implementer-quality.sh +256 -0
package/scripts/hooks/inject-protocol.sh +52 -0
package/scripts/hooks/suggest-pipeline.sh +175 -0
package/scripts/hooks/track-pipeline-start.sh +37 -0
package/scripts/hooks/track-pipeline-stop.sh +52 -0
package/scripts/init-rules.sh +35 -0
package/scripts/init.sh +151 -0
package/skills/anthropic-validator/SKILL.md +607 -0
package/skills/anthropic-validator/references/agents-checklist.md +131 -0
package/skills/anthropic-validator/references/commands-checklist.md +102 -0
package/skills/anthropic-validator/references/hooks-checklist.md +151 -0
package/skills/anthropic-validator/references/mcp-checklist.md +136 -0
package/skills/anthropic-validator/references/plugins-checklist.md +148 -0
package/skills/anthropic-validator/references/skills-checklist.md +85 -0
package/skills/assertion-patterns/SKILL.md +296 -0
package/skills/bug-magnet-data/SKILL.md +284 -0
package/skills/bug-magnet-data/context/cli-args.md +91 -0
package/skills/bug-magnet-data/context/db-query.md +104 -0
package/skills/bug-magnet-data/context/file-contents.md +103 -0
package/skills/bug-magnet-data/context/http-body.md +91 -0
package/skills/bug-magnet-data/context/process-spawn.md +123 -0
package/skills/bug-magnet-data/data/booleans/boundaries.yaml +143 -0
package/skills/bug-magnet-data/data/collections/arrays.yaml +114 -0
package/skills/bug-magnet-data/data/collections/objects.yaml +123 -0
package/skills/bug-magnet-data/data/concurrency/race-conditions.yaml +118 -0
package/skills/bug-magnet-data/data/concurrency/state-machines.yaml +115 -0
package/skills/bug-magnet-data/data/dates/boundaries.yaml +137 -0
package/skills/bug-magnet-data/data/dates/invalid.yaml +132 -0
package/skills/bug-magnet-data/data/dates/timezone.yaml +118 -0
package/skills/bug-magnet-data/data/encoding/charset.yaml +79 -0
package/skills/bug-magnet-data/data/encoding/normalization.yaml +105 -0
package/skills/bug-magnet-data/data/formats/email.yaml +154 -0
package/skills/bug-magnet-data/data/formats/json.yaml +187 -0
package/skills/bug-magnet-data/data/formats/url.yaml +165 -0
package/skills/bug-magnet-data/data/language-specific/javascript.yaml +182 -0
package/skills/bug-magnet-data/data/language-specific/python.yaml +174 -0
package/skills/bug-magnet-data/data/language-specific/rust.yaml +148 -0
package/skills/bug-magnet-data/data/numbers/boundaries.yaml +161 -0
package/skills/bug-magnet-data/data/numbers/precision.yaml +89 -0
package/skills/bug-magnet-data/data/numbers/special.yaml +69 -0
package/skills/bug-magnet-data/data/strings/boundaries.yaml +109 -0
package/skills/bug-magnet-data/data/strings/injection.yaml +208 -0
package/skills/bug-magnet-data/data/strings/special-chars.yaml +190 -0
package/skills/bug-magnet-data/data/strings/unicode.yaml +139 -0
package/skills/bug-magnet-data/references/external-lists.md +115 -0
package/skills/bulwark-brainstorm/SKILL.md +563 -0
package/skills/bulwark-brainstorm/references/at-teammate-prompts.md +60 -0
package/skills/bulwark-brainstorm/references/role-critical-analyst.md +78 -0
package/skills/bulwark-brainstorm/references/role-development-lead.md +66 -0
package/skills/bulwark-brainstorm/references/role-product-delivery-lead.md +79 -0
package/skills/bulwark-brainstorm/references/role-product-manager.md +62 -0
package/skills/bulwark-brainstorm/references/role-project-sme.md +59 -0
package/skills/bulwark-brainstorm/references/role-technical-architect.md +66 -0
package/skills/bulwark-research/SKILL.md +298 -0
package/skills/bulwark-research/references/viewpoint-contrarian.md +63 -0
package/skills/bulwark-research/references/viewpoint-direct-investigation.md +62 -0
package/skills/bulwark-research/references/viewpoint-first-principles.md +65 -0
package/skills/bulwark-research/references/viewpoint-practitioner.md +62 -0
package/skills/bulwark-research/references/viewpoint-prior-art.md +66 -0
package/skills/bulwark-scaffold/SKILL.md +330 -0
package/skills/bulwark-statusline/SKILL.md +161 -0
package/skills/bulwark-statusline/scripts/statusline.sh +144 -0
package/skills/bulwark-verify/SKILL.md +519 -0
package/skills/code-review/SKILL.md +428 -0
package/skills/code-review/examples/anti-patterns/linting.ts +181 -0
package/skills/code-review/examples/anti-patterns/security.ts +91 -0
package/skills/code-review/examples/anti-patterns/standards.ts +195 -0
package/skills/code-review/examples/anti-patterns/type-safety.ts +108 -0
package/skills/code-review/examples/recommended/linting.ts +195 -0
package/skills/code-review/examples/recommended/security.ts +154 -0
package/skills/code-review/examples/recommended/standards.ts +231 -0
package/skills/code-review/examples/recommended/type-safety.ts +181 -0
package/skills/code-review/frameworks/angular.md +218 -0
package/skills/code-review/frameworks/django.md +235 -0
package/skills/code-review/frameworks/express.md +207 -0
package/skills/code-review/frameworks/flask.md +298 -0
package/skills/code-review/frameworks/generic.md +146 -0
package/skills/code-review/frameworks/react.md +152 -0
package/skills/code-review/frameworks/vue.md +244 -0
package/skills/code-review/references/linting-patterns.md +221 -0
package/skills/code-review/references/security-patterns.md +125 -0
package/skills/code-review/references/standards-patterns.md +246 -0
package/skills/code-review/references/type-safety-patterns.md +130 -0
package/skills/component-patterns/SKILL.md +131 -0
package/skills/component-patterns/references/pattern-cli-command.md +118 -0
package/skills/component-patterns/references/pattern-database.md +166 -0
package/skills/component-patterns/references/pattern-external-api.md +139 -0
package/skills/component-patterns/references/pattern-file-parser.md +168 -0
package/skills/component-patterns/references/pattern-http-server.md +162 -0
package/skills/component-patterns/references/pattern-process-spawner.md +133 -0
package/skills/continuous-feedback/SKILL.md +327 -0
package/skills/continuous-feedback/references/collect-instructions.md +81 -0
package/skills/continuous-feedback/references/specialize-code-review.md +82 -0
package/skills/continuous-feedback/references/specialize-general.md +98 -0
package/skills/continuous-feedback/references/specialize-test-audit.md +81 -0
package/skills/create-skill/SKILL.md +359 -0
package/skills/create-skill/references/agent-conventions.md +194 -0
package/skills/create-skill/references/agent-template.md +195 -0
package/skills/create-skill/references/content-guidance.md +291 -0
package/skills/create-skill/references/decision-framework.md +124 -0
package/skills/create-skill/references/template-pipeline.md +217 -0
package/skills/create-skill/references/template-reference-heavy.md +111 -0
package/skills/create-skill/references/template-research.md +210 -0
package/skills/create-skill/references/template-script-driven.md +172 -0
package/skills/create-skill/references/template-simple.md +80 -0
package/skills/create-subagent/SKILL.md +353 -0
package/skills/create-subagent/references/agent-conventions.md +268 -0
package/skills/create-subagent/references/content-guidance.md +232 -0
package/skills/create-subagent/references/decision-framework.md +134 -0
package/skills/create-subagent/references/template-single-agent.md +192 -0
package/skills/fix-bug/SKILL.md +241 -0
package/skills/governance-protocol/SKILL.md +116 -0
package/skills/init/SKILL.md +341 -0
package/skills/issue-debugging/SKILL.md +385 -0
package/skills/issue-debugging/references/anti-patterns.md +245 -0
package/skills/issue-debugging/references/debug-report-schema.md +227 -0
package/skills/mock-detection/SKILL.md +511 -0
package/skills/mock-detection/references/false-positive-prevention.md +402 -0
package/skills/mock-detection/references/stub-patterns.md +236 -0
package/skills/pipeline-templates/SKILL.md +215 -0
package/skills/pipeline-templates/references/code-change-workflow.md +277 -0
package/skills/pipeline-templates/references/code-review.md +336 -0
package/skills/pipeline-templates/references/fix-validation.md +421 -0
package/skills/pipeline-templates/references/new-feature.md +335 -0
package/skills/pipeline-templates/references/research-brainstorm.md +161 -0
package/skills/pipeline-templates/references/research-planning.md +257 -0
package/skills/pipeline-templates/references/test-audit.md +389 -0
package/skills/pipeline-templates/references/test-execution-fix.md +238 -0
package/skills/plan-creation/SKILL.md +497 -0
package/skills/product-ideation/SKILL.md +372 -0
package/skills/product-ideation/references/analysis-frameworks.md +161 -0
package/skills/session-handoff/SKILL.md +139 -0
package/skills/session-handoff/references/examples.md +223 -0
package/skills/setup-lsp/SKILL.md +312 -0
package/skills/setup-lsp/references/server-registry.md +85 -0
package/skills/setup-lsp/references/troubleshooting.md +135 -0
package/skills/subagent-output-templating/SKILL.md +415 -0
package/skills/subagent-output-templating/references/examples.md +440 -0
package/skills/subagent-prompting/SKILL.md +364 -0
package/skills/subagent-prompting/references/examples.md +342 -0
package/skills/test-audit/SKILL.md +531 -0
package/skills/test-audit/references/known-limitations.md +41 -0
package/skills/test-audit/references/priority-classification.md +30 -0
package/skills/test-audit/references/prompts/deep-mode-detection.md +83 -0
package/skills/test-audit/references/prompts/synthesis.md +57 -0
package/skills/test-audit/references/rewrite-instructions.md +46 -0
package/skills/test-audit/references/schemas/audit-output.yaml +100 -0
package/skills/test-audit/references/schemas/diagnostic-output.yaml +49 -0
package/skills/test-audit/scripts/data-flow-analyzer.ts +509 -0
package/skills/test-audit/scripts/integration-mock-detector.ts +462 -0
package/skills/test-audit/scripts/package.json +20 -0
package/skills/test-audit/scripts/skip-detector.ts +211 -0
package/skills/test-audit/scripts/verification-counter.ts +295 -0
package/skills/test-classification/SKILL.md +310 -0
package/skills/test-fixture-creation/SKILL.md +295 -0

package/skills/test-audit/SKILL.md ADDED Viewed

@@ -0,0 +1,531 @@
+---
+name: test-audit
+description: Audit test suites for T1-T4 violations using AST analysis, mock detection, and multi-stage synthesis. Invoke when user asks to audit tests, check test quality, find mock violations, review test effectiveness, or inspect test suites for over-mocking. Triggers automatic rewrites when quality gates fail.
+user-invocable: true
+argument-hint: [path] [--threshold=N]
+skills:
+  - test-classification
+  - mock-detection
+  - assertion-patterns
+  - component-patterns
+  - bug-magnet-data
+---
+# Test Audit
+User-facing entry point for test suite quality auditing. Orchestrates classification, mock detection, and synthesis stages to identify T1-T4 violations and trigger automatic rewrites when required.
+---
+## When to Use This Skill
+**Load this skill when the user request matches ANY of these patterns:**
+| Trigger Pattern | Example User Request |
+|-----------------|---------------------|
+| Test quality audit | "Audit my tests", "Check test quality", "Review test suite" |
+| Mock detection | "Find mock violations", "Check for T1 violations", "Are my tests over-mocked?" |
+| Test effectiveness | "How effective are my tests?", "Are my tests real or mocked?" |
+| After writing tests | "I just wrote tests for X, can you audit them?" |
+| CI/CD integration | "Add test audit to pipeline", "Validate tests before merge" |
+**DO NOT use for:**
+- Running tests (use `just test`)
+- Writing new tests (implement directly)
+- General code review (use `code-review` skill)
+- Debugging test failures (use `issue-debugging` skill)
+---
+## Pre-Flight Gate (BLOCKING)
+**STOP. Before ANY analysis, you MUST acknowledge what this skill requires.**
+This skill uses a **multi-stage pipeline with sub-agents**. You are the orchestrator, NOT the executor.
+### What You MUST Do
+1. **Run Stage 0 AST scripts** before any LLM stages:
+   - `just verify-count {target}` → `/tmp/claude/ast-verify-count.json`
+   - `just skip-detect {target}` → `/tmp/claude/ast-skip-detect.json`
+   - `just ast-analyze {target}` → `/tmp/claude/ast-data-flow.json`
+   - `just integration-mocks {target}` → `/tmp/claude/ast-integration-mocks.json`
+2. **Select mode** based on file count and threshold (default 5)
+3. **Spawn sub-agents** for each applicable stage:
+   - Stage 1 (Scale mode only): Classification → `Task(subagent_type="general-purpose", model="haiku", ...)`
+   - Stage 2: Mock Detection → `Task(subagent_type="general-purpose", model="sonnet", ...)`
+   - Stage 3: Synthesis → `Task(subagent_type="general-purpose", model="sonnet", ...)`
+4. **Write outputs to logs/**:
+   - `logs/test-classification-{YYYYMMDD-HHMMSS}.yaml` (Scale mode only)
+   - `logs/mock-detection-{YYYYMMDD-HHMMSS}.yaml`
+   - `logs/test-audit-{YYYYMMDD-HHMMSS}.yaml`
+   - `logs/diagnostics/test-audit-{YYYYMMDD-HHMMSS}.yaml`
+5. **Follow the orchestration instructions exactly** - do not substitute your own judgment
+### What You MUST NOT Do
+- **Do NOT skip Stage 0** - AST scripts provide deterministic metadata that LLM stages depend on
+- **Do NOT perform classification yourself** - spawn a Haiku sub-agent (Scale mode)
+- **Do NOT perform mock detection yourself** - spawn a Sonnet sub-agent
+- **Do NOT perform synthesis yourself** - spawn a Sonnet sub-agent
+- **Do NOT skip stages** because you think you can do it faster
+- **Do NOT return to user** until all log files are written
+### Why This Matters
+The pipeline exists for:
+- **Bias avoidance** - Different models for different stages prevent self-review bias
+- **Structured artifacts** - Logs enable observability and debugging
+- **Deterministic workflow** - Reproducible results across sessions
+- **Separation of concerns** - Each stage has a specific role
+**If you find yourself thinking "I can just analyze this directly" - STOP. That violates SC1-SC2 in Rules.md.**
+### Completion Checklist
+Before returning to user, verify ALL items:
+- [ ] Stage 0 (AST) completed - outputs in `/tmp/claude/ast-*.json` (or graceful degradation logged)
+- [ ] Mode selected (Deep or Scale) and displayed to user
+- [ ] Stage 1 (Classification) completed (Scale mode only) - output written to `logs/test-classification-*.yaml`
+- [ ] Stage 2 (Mock Detection) completed - output written to `logs/mock-detection-*.yaml`
+- [ ] Stage 3 (Synthesis) completed - output written to `logs/test-audit-*.yaml`
+- [ ] Summary presented to user with violation counts and REWRITE_REQUIRED status
+- [ ] Diagnostic output written to `logs/diagnostics/test-audit-*.yaml` (includes mode, threshold, AST status)
+**If REWRITE_REQUIRED == true, also verify:**
+- [ ] For each file: component type identified
+- [ ] For each file: `bug-magnet-data` context file loaded for component type
+- [ ] For each file: T0 + T1 edge cases loaded from bug-magnet-data
+- [ ] Verification scripts include edge cases from bug-magnet-data
+- [ ] Destructive patterns (`safe_for_automation: false`) excluded or marked manual-only
+- [ ] Rewrites applied using assertion-patterns and component-patterns
+**Do NOT return to user until all applicable checklist items are verified.**
+---
+## Usage
+```
+/test-audit [path] [--threshold=N]
+```
+**Examples:**
+- `/test-audit tests/` - Audit all tests in tests/ directory
+- `/test-audit src/__tests__/api.test.ts` - Audit specific file
+- `/test-audit tests/ --threshold=10` - Raise the threshold: Deep mode for ≤10 files, Scale mode above that
+- `/test-audit` - Audit tests mentioned in recent context (or prompt for path)
+---
+## Pipeline Overview
+```
+/test-audit tests/
+        ↓
+┌─────────────────────────────────────────────────────────────────────┐
+│              ORCHESTRATOR (Opus) - Main Context                      │
+│                                                                     │
+│  Stage 0: AST Pre-Processing (deterministic, no LLM)                │
+│     └─ just verify-count {target}                                   │
+│     └─ just skip-detect {target}                                    │
+│     └─ just ast-analyze {target}                                    │
+│     └─ just integration-mocks {target}                              │
+│     └─ Output: /tmp/claude/ast-*.json                               │
+│                                                                     │
+│  Mode Selection: file_count ≤ threshold → Deep, else → Scale        │
+│                                                                     │
+│  ┌─── DEEP MODE (≤5 files) ──────── SCALE MODE (>5 files) ────┐    │
+│  │                                                              │    │
+│  │  [skip classification]          Stage 1: Classification      │    │
+│  │                                    └─ Haiku + AST hints      │    │
+│  │                                                              │    │
+│  │  Stage 2: Detection             Stage 2: Detection           │    │
+│  │    └─ Sonnet, ALL files           └─ Sonnet, flagged only    │    │
+│  │    └─ Self-computes metadata      └─ Uses classification     │    │
+│  │                                                              │    │
+│  └──────────────────────────────────────────────────────────────┘    │
+│                                                                     │
+│  Stage 3: Synthesis (Sonnet) — unified for both modes               │
+│                                                                     │
+│  Step 7: Present summary to user                                    │
+│                                                                     │
+│  Step 9: If REWRITE_REQUIRED → Implement rewrites (Opus)            │
+│                                                                     │
+└─────────────────────────────────────────────────────────────────────┘
+```
+---
+## Orchestration Instructions
+When this skill is loaded, follow these steps exactly:
+### Step 1: Resolve Target
+```
+IF $ARGUMENTS provided:
+    target = $1 (first argument)
+    Parse optional flags:
+        --threshold=N  → override default threshold (default: 5)
+ELSE:
+    Look for test files in recent conversation context
+    IF found: target = that path
+    ELSE: Ask user: "Which test directory or file should I audit?"
+```
+### Step 2: Stage 0 — AST Pre-Processing (MANDATORY)
+**This step is BINDING. Do NOT skip it.** AST scripts provide deterministic metadata that replaces heuristic estimates. Skipping Stage 0 degrades audit accuracy.
+1. Generate timestamp: `YYYYMMDD-HHMMSS`
+2. Count test files in target (glob `**/*.test.{ts,tsx,js,jsx}` + `**/*.spec.{ts,tsx,js,jsx}`)
+3. Run all four AST scripts via Justfile recipes:
+```bash
+just verify-count {target} > /tmp/claude/ast-verify-count.json
+just skip-detect {target} > /tmp/claude/ast-skip-detect.json
+just ast-analyze {target} > /tmp/claude/ast-data-flow.json
+just integration-mocks {target} > /tmp/claude/ast-integration-mocks.json
+```
+4. Read each output file and verify valid JSON
+5. If any script fails: log warning in diagnostics, continue with LLM-only analysis for that dimension (graceful degradation)
+**AST output schemas** (for prompt injection into LLM stages):
+```json
+// verify-count output (per file)
+{ "file": "tests/user.test.ts", "metrics": { "total_lines": 156, "test_logic_lines": 98, "assertion_lines": 42, "setup_lines": 56, "effectiveness_percent": 42.86, "framework_detected": "jest" } }
+// skip-detect output (per file)
+{ "file": "tests/user.test.ts", "markers": [{ "type": "test.skip", "line": 42, "test_name": "should handle edge case", "severity": "medium", "rule": "T4" }], "summary": { "skip_count": 1, "only_count": 0, "todo_count": 0 } }
+// ast-analyze output (per file)
+{ "file": "tests/workflow.integration.ts", "violations": [{ "line": 42, "type": "T3+", "confidence": "high", "variable": "orderData", "source": "object_literal", "message": "Variable 'orderData' is manually constructed", "suggestion": "Replace with factory function or upstream function output" }] }
+// integration-mocks output (per file)
+{ "file": "tests/error-handler.test.ts", "sections": [{ "name": "Error Handler Integration", "type": "integration", "signal": "keyword_in_name", "line_start": 559, "line_end": 628 }], "leads": [{ "line": 562, "type": "T3", "confidence": "high", "mock_pattern": "jest.fn().mockImplementation()", "enclosing_block": "Error Handler Integration", "block_type": "integration", "message": "Mock call in integration test block", "suggestion": "Replace mock with actual implementation" }], "summary": { "sections_found": 1, "integration_sections": 1, "e2e_sections": 0, "leads_count": 1, "mock_calls_in_integration": 1, "mock_calls_in_e2e": 0 } }
+```
+### Step 3: Mode Selection
+```
+threshold = $THRESHOLD_FLAG OR 5 (default)
+file_count = count of test files in target
+IF file_count <= threshold:
+    IF file_count > 25:
+        mode = "scale"
+        WARN "Deep mode safety cap exceeded (>25 files). Falling back to Scale mode."
+    ELSE:
+        mode = "deep"
+ELSE:
+    mode = "scale"
+```
+**Display mode selection to user:**
+```
+## Test Audit: {mode} Mode
+**Target:** {target}
+**Files:** {file_count}
+**Threshold:** {threshold}
+**Mode:** {mode} ({rationale})
+Stage 0 (AST): {status — success/partial/failed}
+  verify-count: {ok/failed}
+  skip-detect: {ok/failed}
+  ast-analyze: {ok/failed}
+  integration-mocks: {ok/failed}
+Proceeding with {mode} mode pipeline...
+```
+### Step 4: Classification Stage — Scale Mode Only
+**Skip this step entirely in Deep mode.** In Deep mode, detection (Step 5) self-computes classification metadata using AST output.
+1. Access the `test-classification` skill (loaded via frontmatter dependency)
+**Batching check:**
+```
+IF file_count > 20:
+    Split files into batches of 20-25
+    FOR each batch IN PARALLEL:
+        Construct 4-part prompt with batch file list
+        INCLUDE AST hints in CONTEXT (verify-count + skip-detect + ast-analyze per file)
+        Task(subagent_type="general-purpose", model="haiku",
+             prompt=batch_prompt, run_in_background=true)
+    Read all batch outputs
+    Merge into single classification YAML
+ELSE:
+    Construct 4-part prompt using the skill's template
+    INCLUDE AST hints in CONTEXT (verify-count + skip-detect + ast-analyze per file)
+    Task(subagent_type="general-purpose", model="haiku", prompt=...)
+```
+**AST hints for classification CONTEXT:**
+```
+The following AST-computed metadata is available for each file.
+Use this to improve classification accuracy — these are deterministic,
+not heuristic.
+{for each file in target}:
+  file: {path}
+  ast_verification_lines: {metrics.test_logic_lines}
+  ast_assertion_lines: {metrics.assertion_lines}
+  ast_skip_markers: {markers array or "none"}
+  ast_data_flow_violations: {violations array or "none"}
+```
+2. Read output from `logs/test-classification-{YYYYMMDD-HHMMSS}.yaml`
+3. Verify output contains `files` array with classification data
+### Step 5: Detection Stage (Sonnet)
+**Behavior differs by mode:**
+#### Deep Mode Detection
+In Deep mode, ALL files are analyzed (no classification filtering). The detection agent self-computes classification metadata from AST output.
+1. Access the `mock-detection` skill (loaded via frontmatter dependency)
+2. Construct the Deep Mode Detection Prompt (see "Deep Mode Detection Prompt" section below)
+3. Include ALL test files in the prompt with their AST metadata
+**Batching check (deep mode):**
+```
+IF file_count > 10:
+    Split files into batches of 10-15
+    FOR each batch:
+        Include full AST metadata per file
+        Task(subagent_type="general-purpose", model="sonnet",
+             prompt=deep_mode_batch_prompt, run_in_background=true)
+    Read all batch outputs
+    Merge into single detection YAML
+ELSE:
+    Task(subagent_type="general-purpose", model="sonnet",
+         prompt=deep_mode_prompt)
+```
+#### Scale Mode Detection
+In Scale mode, only files flagged by classification are analyzed.
+1. Access the `mock-detection` skill (loaded via frontmatter dependency)
+2. Extract files with `needs_deep_analysis: true` from classification output
+3. Count flagged files
+**Batching check (scale mode):**
+```
+IF flagged_file_count > 10:
+    Split flagged files into batches of 10-15
+    FOR each batch:
+        Include verification_lines from classification for each file
+        Include AST metadata (data-flow violations, skip markers) per file
+        Task(subagent_type="general-purpose", model="sonnet",
+             prompt=batch_prompt, run_in_background=true)
+    Read all batch outputs
+    Merge into single detection YAML
+ELSE:
+    Construct 4-part prompt using the skill's template
+    Include AST metadata in CONTEXT
+    Task(subagent_type="general-purpose", model="sonnet", prompt=...)
+```
+4. Read output from `logs/mock-detection-{YYYYMMDD-HHMMSS}.yaml`
+5. Verify output contains `violations` array and `file_summaries`
+### Step 6: Synthesis Stage (Sonnet)
+1. Construct synthesis prompt using template below (unified for both modes)
+2. Include detection output in CONTEXT
+3. Include classification output in CONTEXT (Scale mode) or note "Deep mode — no classification stage" (Deep mode)
+4. Include AST skip-detect output (for T4 violation synthesis) and AST verify-count output
+5. Spawn sub-agent:
+   ```
+   Task(
+     subagent_type="general-purpose",
+     model="sonnet",
+     prompt="[synthesis 4-part prompt]"
+   )
+   ```
+6. Read output from `logs/test-audit-{YYYYMMDD-HHMMSS}.yaml`
+7. Verify output contains `directive.REWRITE_REQUIRED` field
+### Step 7: Present Summary
+Display audit summary to user before any rewrites:
+```
+## Test Audit Complete ({mode} Mode)
+**Target:** {target}
+**Files audited:** {total_files}
+**Files analyzed:** {files_analyzed} (deep: all, scale: flagged only)
+**Overall test effectiveness:** {percentage}%
+### Stage 0 (AST)
+- Verification lines: AST-precise (not heuristic)
+- Skip markers (T4): {count} found
+- Data flow leads (T3+): {count} found
+- Integration mock leads (T3): {count} found
+### Violations by Priority
+- P0 (False confidence): {count}
+- P1 (Incomplete verification): {count}
+- P2 (Pattern issues): {count}
+### REWRITE_REQUIRED: {true/false}
+Gate triggered: {gate description}
+[If true] Proceeding with automatic rewrites...
+[If false] No automatic rewrites needed. See recommendations below.
+```
+### Step 8: Evaluate REWRITE_REQUIRED (Two-Gate)
+Apply two-gate logic from audit report:
+**Gate 1 (Impact):**
+```
+IF any P0 violations exist:
+    REWRITE_REQUIRED = true
+    gate_triggered = "Gate 1: Impact (P0 violations - false confidence)"
+```
+**Gate 2 (Threshold):**
+```
+ELSE IF P1 violations exist:
+    IF any file has test_effectiveness < 95%:
+        REWRITE_REQUIRED = true
+        gate_triggered = "Gate 2: Threshold (P1 + effectiveness < 95%)"
+    ELSE:
+        REWRITE_REQUIRED = false
+        status = "Advisory only (P1 above 95% threshold)"
+```
+**Advisory:**
+```
+ELSE (P2 only):
+    REWRITE_REQUIRED = false
+    status = "Advisory only (P2 pattern issues)"
+```
+### Step 9: Rewrite (If Required)
+```
+IF REWRITE_REQUIRED == true:
+    Read `references/rewrite-instructions.md` and follow the procedure
+    for each file in directive.files_to_rewrite (ordered by priority, then effectiveness).
+    Uses: assertion-patterns, component-patterns, bug-magnet-data skills.
+ELSE:
+    Display recommendations without auto-rewrite
+```
+---
+## Deep Mode Detection Prompt
+Read `references/prompts/deep-mode-detection.md` and use as the Task() prompt for the Sonnet detection sub-agent in Deep mode. Inject per-file AST metadata into the prompt's CONTEXT placeholders (verification_lines, skip_markers, data_flow_leads, integration_mock_leads from Stage 0 output).
+---
+## Synthesis Prompt Template
+Read `references/prompts/synthesis.md` and use as the Task() prompt for the Sonnet synthesis sub-agent. Inject the following into the prompt's CONTEXT placeholders:
+- `{deep or scale}` → current mode
+- `{classification_yaml_path}` → classification log path (Scale) or "N/A" (Deep)
+- `{detection_yaml_path}` → detection log path
+- `{skip_detect_json}` → AST skip-detect output
+- `{verify_count_json}` → AST verify-count output
+---
+## Priority Classification
+Full definitions: `references/priority-classification.md`
+- **P0 (False confidence):** T1 (mock SUT), T3+ (broken chain) — test passes but provides no assurance
+- **P1 (Incomplete verification):** T2 (call-only), T3 (mocked boundary) — real code runs but not fully verified
+- **P2 (Pattern issues):** T4 (skip/only/todo), minor patterns — style and disabled tests
+---
+## Output Schema
+Full schema with example: `references/schemas/audit-output.yaml`
+Key fields the orchestrator validates after synthesis:
+- `directive.REWRITE_REQUIRED` — boolean, drives Step 9
+- `directive.gate_triggered` — which gate fired
+- `directive.files_to_rewrite` — ordered list for rewrite step
+- `audit.file_analysis[].test_effectiveness` — per-file percentage
+- `audit.overview.overall_effectiveness` — aggregate metric
+---
+## Diagnostic Output
+Write diagnostic output to `logs/diagnostics/test-audit-{YYYYMMDD-HHMMSS}.yaml`.
+Schema: `references/schemas/diagnostic-output.yaml`. Includes mode selection, Stage 0 AST status, gate evaluation, and per-file decisions with `verification_lines_source: ast | heuristic`.
+---
+## Integration Notes
+### Hook Integration
+This skill can be triggered by:
+1. **Direct invocation:** `/test-audit [path]`
+2. **Pipeline hook:** PostToolUse on `*.test.*` files suggests Test Audit pipeline
+Both paths use the same orchestration flow.
+### AST Scripts
+All AST scripts live in `skills/test-audit/scripts/` and are invoked via Justfile recipes:
+| Recipe | Script | Purpose |
+|--------|--------|---------|
+| `just verify-count` | `verification-counter.ts` | Precise line counting (replaces heuristic) |
+| `just skip-detect` | `skip-detector.ts` | T4 skip/only/todo marker detection |
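+| `just ast-analyze` | `data-flow-analyzer.ts` | T3+ broken chain detection via data flow tracing |
+| `just integration-mocks` | `integration-mock-detector.ts` | T3 mock-call detection inside integration/e2e test blocks |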
+Scripts use ts-morph for AST parsing, run via `npx tsx`, and output JSON to stdout. Dependencies are in `skills/test-audit/scripts/package.json`.
+---
+## Known Limitations
+See `references/known-limitations.md` for full details including resolved limitations history.
+**Active limitations:** T3+ single-file scope (~90% coverage), manual stub detection gaps (mitigated by Deep mode + extended patterns), context limits at scale (mitigated by batching).
+---
+## Supporting Files
+| File | Purpose |
+|------|---------|
+| `references/prompts/deep-mode-detection.md` | 4-part prompt for Deep mode detection sub-agent |
+| `references/prompts/synthesis.md` | 4-part prompt for synthesis sub-agent |
+| `references/schemas/audit-output.yaml` | Output schema with example for audit report |
+| `references/schemas/diagnostic-output.yaml` | Diagnostic output schema |
+| `references/priority-classification.md` | P0/P1/P2 definitions with T-rule impact tables |
+| `references/known-limitations.md` | Active and resolved limitations |
+| `references/rewrite-instructions.md` | Step 9 rewrite procedure with bug-magnet-data integration |
+---
+## Related Skills
+- `test-classification` (P0.6) - Classification prompt template
+- `mock-detection` (P0.7) - Detection prompt template + `references/stub-patterns.md`, `references/false-positive-prevention.md`
+- `pipeline-templates` (P0.3) - Test Audit pipeline definition
+- `subagent-prompting` (P0.1) - 4-part template reference
+- `bug-magnet-data` (P4.2) - Curated edge case test data

package/skills/test-audit/references/known-limitations.md ADDED Viewed

@@ -0,0 +1,41 @@
+# Known Limitations
+This skill has the following known limitations that are documented for transparency.
+## T3+ Detection: Single-File Scope
+**Issue:** AST-based data flow analysis (`ast-analyze`) traces data flow within a single file. Cross-file integration chains (e.g., mock data imported from a shared fixtures file) are not traced.
+**Impact:** An estimated ~90% of T3+ violations are single-file (the variable is constructed in the same test body). The remaining cross-file violations (~10%) rely on LLM heuristics in the detection stage.
+**Mitigation:** AST provides high-confidence leads for single-file cases. Detection agent (Sonnet) uses call graph analysis for cross-file patterns. Future enhancement: cross-file data flow analysis (deferred to P6+).
+**Resolved (P5.10):** Previously, T3+ detection relied entirely on LLM pattern matching for variable names containing "mock". AST data-flow-analyzer now detects violations with generic variable names (e.g., `testOrder`).
+## Manual Stub Pattern Detection
+**Issue:** Manual stub classes (e.g., `StubSharedContext`) used in place of `jest.mock()` are not detected by mock indicator scanning alone.
+**Impact:** Classification may not flag all files needing analysis in projects with custom stubbing patterns.
+**Mitigation:** Integration/e2e files are always flagged regardless of indicators. Extended pattern detection reference docs (`references/stub-patterns.md` in mock-detection skill) provide Meszaros taxonomy patterns to the detection agent. Deep mode (≤5 files) bypasses classification entirely, analyzing all files.
+**Improved (P5.12):** Detection agent now has access to extended stub/fake patterns and false positive prevention reference docs.
+## Context Limits at Scale
+**Issue:** A single sub-agent call can handle ~20-25 test files before approaching context limits.
+**Mitigation:** Batching with parallel sub-agents is implemented for >20 files (classification) and >10 files (detection). The Deep mode safety cap prevents analyzing >25 files without classification filtering.
+## Resolved Limitations
+The following limitations from earlier versions have been fully addressed:
+| Limitation | Resolution | Version |
+|-----------|-----------|---------|
+| Verification line counting approximation | AST-based `verification-counter.ts` provides exact counts | P5.10 |
+| Negative effectiveness percentages | Impossible with AST-precise line counts | P5.10 |
+| T4 detection not automated | AST-based `skip-detector.ts` finds all skip/only/todo markers | P5.10 |
+| No dual-mode for small audits | Deep mode (≤5 files) skips classification | P5.11 |
+| T3+ relies on "mock" in variable names | AST `data-flow-analyzer.ts` traces data sources structurally | P5.10 |

package/skills/test-audit/references/priority-classification.md ADDED Viewed

@@ -0,0 +1,30 @@
+# Priority Classification
+## P0: False Confidence
+Tests that pass but should not be trusted:
+| Rule | Impact |
+|------|--------|
+| T1 | Mock hides real failures - test always passes regardless of SUT behavior |
+| T3+ | Broken integration chain - no real integration is tested |
+## P1: Incomplete Verification
+Tests that run real code but don't fully verify:
+| Rule | Impact |
+|------|--------|
+| T2 | Call happened but effect not verified |
+| T3 | Integration boundary mocked - partial integration only |
+## P2: Pattern Issues
+Style, organization, and disabled test issues:
+| Rule | Impact |
+|------|--------|
+| T4 (.skip) | Test disabled — not running, medium severity |
+| T4 (.only) | Focus marker — other tests excluded from CI, high severity |
+| T4 (.todo) | Test placeholder — not implemented, low severity |
+| Minor patterns | Style and organization recommendations |

package/skills/test-audit/references/prompts/deep-mode-detection.md ADDED Viewed

@@ -0,0 +1,83 @@
+# Deep Mode Detection Prompt
+Use this template for Stage 2 in **Deep mode only**. In Deep mode, the detection agent self-computes classification metadata (normally provided by Stage 1) using AST output.
+## GOAL
+Analyze ALL provided test files for T1-T4 violations using the mock appropriateness rubric and call graph analysis. For each file, self-compute classification metadata (test type, mock indicators, needs_deep_analysis) before performing detection. Track the full scope of each violation for test effectiveness calculation.
+## CONSTRAINTS
+- Do NOT modify any files
+- Analyze ALL provided files (no classification filtering — this is Deep mode)
+- Use AST metadata as ground truth for verification_lines (do not re-estimate)
+- Use AST data-flow violations as starting leads for T3+ analysis
+- Use AST skip markers as T4 violations (deterministic — no further analysis needed)
+- Use call graph analysis to detect T1-T3 violations beyond AST leads
+- Track violation scope (all affected lines, not just violation line)
+- Provide full context for each violation (line, snippet, reason, fix)
+- Complete within 50 tool calls per batch
+## CONTEXT
+**Mode:** Deep (all files analyzed, no classification stage)
+**Files to analyze:** {list of ALL test files}
+**AST metadata per file:**
+```
+{for each file}:
+  file: {path}
+  verification_lines: {metrics.test_logic_lines from verify-count}
+  assertion_lines: {metrics.assertion_lines}
+  framework: {metrics.framework_detected}
+  skip_markers: {markers from skip-detect, or "none"}
+  data_flow_leads: {violations from ast-analyze, or "none"}
+  integration_mock_leads: {leads from integration-mocks, or "none"}
+```
+**Self-classification instructions (MANDATORY — per-section, not per-file):**
+Files commonly contain multiple test types in different sections. You MUST classify each top-level describe/test block independently. DO NOT assign a single test type to the entire file.
+For each top-level describe block or section:
+1. **Test type**: unit / integration / e2e — determine from:
+   - Block/suite name (e.g., "Integration Tests", "E2E: checkout flow")
+   - Preceding comments or section headers (e.g., `// INTEGRATION TESTS`, `# E2E`, `/* system tests */`)
+   - Setup patterns within the block (real DB connections = integration, browser launch = e2e)
+   - These signals are language-agnostic — apply regardless of whether the file is TypeScript, Python, Java, Go, Ruby, etc.
+2. **Mock indicators within that block**: list mock/stub/spy framework calls found
+3. **Evaluate each block against the rubric for ITS test type** — not the file's majority type
+If AST integration-mock metadata is available (from `just integration-mocks`), use it as ground truth for section classification and mock locations. Validate AST leads and add any the AST missed.
+**BINDING: AST classification is final.** When the AST script classifies a section as integration or e2e, that classification is NOT subject to LLM override. You MUST evaluate mocks in that section against integration/e2e rules — even if you believe the section is "actually" a unit test. Your role is to evaluate mock appropriateness within the classified type, not to re-classify sections.
+- If the test author labeled a block "Integration" and the AST confirmed it, both the author's intent and the deterministic signal agree. Do NOT introduce personal judgment to override them.
+- If you believe a section is mislabeled, you MAY note "Advisory: consider renaming this section" — but you MUST still flag T3 violations against the integration/e2e rubric.
+- Dismissing an AST T3 lead by re-classifying the section as "actually unit" is a rule violation.
+**Mock appropriateness rubric:** See the mock-detection skill's "Mock Appropriateness Rubric" section
+**T1-T4 detection patterns:** See the mock-detection skill's "T1-T4 Detection Patterns" section
+**Extended stub/fake patterns:** See `skills/mock-detection/references/stub-patterns.md` (loaded via mock-detection dependency)
+**False positive prevention:** See `skills/mock-detection/references/false-positive-prevention.md` (loaded via mock-detection dependency) — consult BEFORE flagging borderline patterns
+## OUTPUT
+Write violations to: `logs/mock-detection-{YYYYMMDD-HHMMSS}.yaml`
+Write diagnostics to: `logs/diagnostics/mock-detection-{YYYYMMDD-HHMMSS}.yaml`
+Use the same output schema as the mock-detection skill's "Output Schema" section, with one addition — include a `self_classification` block per file:
+```yaml
+self_classification:
+  - file: tests/proxy.test.ts
+    test_type: unit
+    mock_indicators: ["jest.spyOn(child_process, 'spawn')"]
+    needs_deep_analysis: true
+    reason: "Mock intercepts core dependency"
+```