npm - @kiwidata/grimoire - Versions diffs - 0.1.3 → 0.1.4 - Mend

@kiwidata/grimoire 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

package/AGENTS.md +56 -4
package/README.md +28 -1
package/dist/cli/index.js +2 -0
package/dist/cli/index.js.map +1 -1
package/dist/commands/check.js +1 -1
package/dist/commands/check.js.map +1 -1
package/dist/commands/configure.d.ts +3 -0
package/dist/commands/configure.d.ts.map +1 -0
package/dist/commands/configure.js +19 -0
package/dist/commands/configure.js.map +1 -0
package/dist/commands/init.d.ts.map +1 -1
package/dist/commands/init.js +2 -0
package/dist/commands/init.js.map +1 -1
package/dist/commands/map.d.ts.map +1 -1
package/dist/commands/map.js +10 -11
package/dist/commands/map.js.map +1 -1
package/dist/core/archive.d.ts.map +1 -1
package/dist/core/archive.js +32 -43
package/dist/core/archive.js.map +1 -1
package/dist/core/check.d.ts.map +1 -1
package/dist/core/check.js +115 -104
package/dist/core/check.js.map +1 -1
package/dist/core/ci.d.ts.map +1 -1
package/dist/core/ci.js +50 -69
package/dist/core/ci.js.map +1 -1
package/dist/core/configure.d.ts +14 -0
package/dist/core/configure.d.ts.map +1 -0
package/dist/core/configure.js +434 -0
package/dist/core/configure.js.map +1 -0
package/dist/core/detect.d.ts.map +1 -1
package/dist/core/detect.js +153 -26
package/dist/core/detect.js.map +1 -1
package/dist/core/diff.d.ts.map +1 -1
package/dist/core/diff.js +62 -93
package/dist/core/diff.js.map +1 -1
package/dist/core/doc-style.d.ts +0 -4
package/dist/core/doc-style.d.ts.map +1 -1
package/dist/core/doc-style.js +28 -23
package/dist/core/doc-style.js.map +1 -1
package/dist/core/docs.js +106 -100
package/dist/core/docs.js.map +1 -1
package/dist/core/health.js +55 -77
package/dist/core/health.js.map +1 -1
package/dist/core/hooks.d.ts +0 -3
package/dist/core/hooks.d.ts.map +1 -1
package/dist/core/hooks.js +0 -11
package/dist/core/hooks.js.map +1 -1
package/dist/core/init.d.ts +2 -0
package/dist/core/init.d.ts.map +1 -1
package/dist/core/init.js +230 -406
package/dist/core/init.js.map +1 -1
package/dist/core/list.d.ts.map +1 -1
package/dist/core/list.js +55 -65
package/dist/core/list.js.map +1 -1
package/dist/core/log.d.ts.map +1 -1
package/dist/core/log.js +23 -33
package/dist/core/log.js.map +1 -1
package/dist/core/map.d.ts +15 -2
package/dist/core/map.d.ts.map +1 -1
package/dist/core/map.js +257 -194
package/dist/core/map.js.map +1 -1
package/dist/core/shared-setup.d.ts +0 -40
package/dist/core/shared-setup.d.ts.map +1 -1
package/dist/core/shared-setup.js +87 -52
package/dist/core/shared-setup.js.map +1 -1
package/dist/core/status.d.ts.map +1 -1
package/dist/core/status.js +42 -52
package/dist/core/status.js.map +1 -1
package/dist/core/test-quality.d.ts +0 -8
package/dist/core/test-quality.d.ts.map +1 -1
package/dist/core/test-quality.js +24 -30
package/dist/core/test-quality.js.map +1 -1
package/dist/core/trace.d.ts.map +1 -1
package/dist/core/trace.js +31 -41
package/dist/core/trace.js.map +1 -1
package/dist/core/update.d.ts.map +1 -1
package/dist/core/update.js +61 -11
package/dist/core/update.js.map +1 -1
package/dist/core/validate.d.ts +1 -4
package/dist/core/validate.d.ts.map +1 -1
package/dist/core/validate.js +126 -148
package/dist/core/validate.js.map +1 -1
package/dist/utils/config.d.ts +15 -5
package/dist/utils/config.d.ts.map +1 -1
package/dist/utils/config.js +63 -42
package/dist/utils/config.js.map +1 -1
package/dist/utils/fs.d.ts +0 -12
package/dist/utils/fs.d.ts.map +1 -1
package/dist/utils/fs.js +0 -12
package/dist/utils/fs.js.map +1 -1
package/dist/utils/paths.d.ts +0 -6
package/dist/utils/paths.d.ts.map +1 -1
package/dist/utils/paths.js +0 -6
package/dist/utils/paths.js.map +1 -1
package/dist/utils/spawn.d.ts +0 -3
package/dist/utils/spawn.d.ts.map +1 -1
package/dist/utils/spawn.js +0 -3
package/dist/utils/spawn.js.map +1 -1
package/package.json +1 -1
package/skills/grimoire-apply/SKILL.md +84 -16
package/skills/grimoire-audit/SKILL.md +21 -1
package/skills/grimoire-bug/SKILL.md +48 -9
package/skills/grimoire-commit/SKILL.md +2 -1
package/skills/grimoire-design/SKILL.md +259 -0
package/skills/grimoire-design-consult/SKILL.md +200 -0
package/skills/grimoire-discover/SKILL.md +65 -2
package/skills/grimoire-draft/SKILL.md +85 -2
package/skills/grimoire-plan/SKILL.md +61 -18
package/skills/grimoire-pr/SKILL.md +4 -6
package/skills/grimoire-pr-review/SKILL.md +45 -114
package/skills/grimoire-precommit-review/SKILL.md +205 -0
package/skills/grimoire-refactor/SKILL.md +5 -5
package/skills/grimoire-review/SKILL.md +74 -147
package/skills/grimoire-verify/SKILL.md +33 -0
package/skills/references/adversarial-personas.md +225 -0
package/skills/references/brand-tokens-format.md +186 -0
package/skills/references/code-quality.md +140 -0
package/skills/references/design-heuristics.md +138 -0
package/skills/references/design-input-formats.md +190 -0
package/skills/references/pattern-guard.md +180 -0
package/skills/references/refactor-scan-categories.md +152 -0
package/skills/references/review-personas.md +405 -0
package/skills/references/security-compliance.md +22 -1
package/skills/references/visual-fidelity.md +206 -0
package/templates/brand-tokens-example.json +13 -0
package/templates/brand-voice-example.md +22 -0
package/templates/design-tool-setup-stub.md +59 -0

package/skills/references/refactor-scan-categories.md CHANGED Viewed

@@ -28,6 +28,21 @@ Files that change frequently AND are hard to change. Highest-ROI refactoring tar
 **Severity:** high = 2x+ threshold, medium = 1-2x, low = marginally over
+**Graph-powered LLM bloat checks** (requires `codebase-memory-mcp`; skip if not indexed):
+These target patterns that static size checks miss — structurally valid code that adds indirection without value. Primary signal of LLM-generated over-engineering.
+| Pattern | Query | Flag when |
+|---|---|---|
+| Single-subclass base class | `query_graph("MATCH (sub)-[:INHERITS]->(base:Class) WITH base, collect(sub) AS subs WHERE size(subs) = 1 RETURN base.qualified_name, base.file, subs[0].qualified_name AS only_subclass")` | Any result — a base with one child is premature abstraction |
+| Single-caller wrapper | Step 1: `query_graph("MATCH (caller)-[:CALLS]->(fn) WITH fn, collect(caller) AS callers WHERE size(callers) = 1 RETURN fn.qualified_name, fn.file, callers[0].qualified_name AS only_caller")`. Step 2: for each result, `get_code_snippet(qualified_name)` and count body lines. | Wrapper with 1 caller and ≤7 body lines — inline candidate |
+| Zero-caller export | `query_graph("MATCH (f:Function) WHERE f.exported = true AND NOT ()-[:CALLS]->(f) RETURN f.qualified_name, f.file")` — then filter out entry points manually: skip files named `index.ts`, `__init__.py`, `main.py`, `cli.py`, `app.py`, or in a `public/` directory | Exported, unreachable within repo, not an entry point — dead export |
+| Single-implementation interface | `query_graph("MATCH (impl)-[:IMPLEMENTS]->(iface:Interface) WITH iface, collect(impl) AS impls WHERE size(impls) = 1 RETURN iface.qualified_name, iface.file, impls[0].qualified_name AS only_impl")` | Any result — interface with one implementor adds no polymorphism |
+Note: the exact Cypher depends on the graph schema. If a query returns an error, adjust field names using `get_graph_schema()` to inspect available properties.
+**Severity for graph findings:** high = single-implementation interface or zero-caller export, medium = single-subclass base or single-caller wrapper
 ## 2c. Data Structure Complexity
 | Signal | Meaning |
@@ -82,6 +97,22 @@ TODO/FIXME/HACK/XXX comments that have aged.
 **Severity:** high = >30 lines or >3 copies, medium = 10-30 lines or 2 copies, low = <10 lines
+**Concept-based duplicate detection** (requires `codebase-memory-mcp`; supplements jscpd which only finds textual clones):
+LLM-generated code frequently re-implements existing utilities under a different name. jscpd won't catch these — the code is structurally different even though it does the same thing.
+**How to scan:**
+1. Find utility/helper functions: `search_graph(label="Function", name_pattern="(parse_|format_|validate_|convert_|build_|get_|find_|create_|check_|is_|has_)")`
+2. For each result, extract 2–3 concept words from the function name (e.g., `format_invoice_date` → `["format", "date", "invoice"]`)
+3. Run: `search_graph(semantic_query=["<concept1>", "<concept2>", "<concept3>"])` — if `semantic_query` is unsupported, fall back to `search_graph(name_pattern="(<concept1>|<concept2>)")`
+4. Compare: if the search returns a different function, read both with `get_code_snippet` and assess whether they do the same job
+**Flag when:** two functions accept similar inputs, produce similar outputs, and operate on the same domain concept. Assessment is qualitative — the tool returns ranked results, not similarity scores.
+**Focus on:** utility directories (`utils/`, `helpers/`, `lib/`, `common/`), validators, formatters, parsers. These are where re-implementations accumulate.
+**Severity:** high = identical behavior under different names, medium = near-duplicate with minor variations that could be unified with a parameter, low = similar but distinct enough to keep
 ## 2h. Dead Code
 **How to scan:**
@@ -100,3 +131,124 @@ TODO/FIXME/HACK/XXX comments that have aged.
 - Check for over-mocked tests (testing mocks, not behavior)
 **Severity:** high = complex code (top quartile) with <30% coverage, medium = moderate complexity with <50%, low = simple code with low coverage
+## 2j. Pattern Divergence
+Code that solves a problem in a way that contradicts how the codebase already solves the same class of problem. The primary AI slop signal — structurally valid code that ignores established conventions and accumulates architectural drift.
+**Requires:** `codebase-memory-mcp` indexed. Skip this category if graph is not available.
+**How to scan:**
+**Step 1 — Identify peer groups**
+A peer group is a set of nodes in the graph that share the same role. Use `search_graph` to find them:
+| Peer group | Query |
+|---|---|
+| API/route handlers | `search_graph(label="Function", name_pattern="(handle|view|endpoint|route|controller)")` |
+| Service methods | `search_graph(label="Function", name_pattern="(service|use_case|interactor)")` |
+| Repository/data access | `search_graph(label="Function", name_pattern="(repo|repository|store|dao|query)")` |
+| Test files | `search_graph(label="Module", name_pattern="(test_|_test|spec)")` |
+| Error handlers | `search_graph(label="Function", name_pattern="(error|exception|fail|catch)")` |
+Supplement with area docs if available — each area doc lists files by role.
+**Step 2 — Extract modal pattern per peer group**
+For each peer group with ≥3 members, sample 3-5 established members (oldest by `git log`, not recently changed):
+- `get_code_snippet(qualified_name)` for each sample
+- Identify the modal pattern across: error handling style, dependency access (injected vs imported), abstraction depth (business logic in handler vs delegated to service), naming convention, return type shape
+This is the **baseline** — what the codebase already does.
+**Step 3 — Compare recent code against baseline**
+Scope: files changed in the last 60 days (`git log --since="60 days ago" --name-only --format=`). Cross-reference with the peer groups from step 1.
+For each recently changed file that belongs to a peer group:
+1. `get_code_snippet` for the changed function/class
+2. Compare against the modal pattern from step 2
+3. Flag if it diverges on any of the five critical seams (see the table in Step 4)
+**Step 4 — Flag divergences**
+Only flag divergence on seams that matter architecturally. Cosmetic drift (whitespace, docstring style) is not a debt item.
+| Seam | Divergence signal | Example |
+|---|---|---|
+| **Error handling** | Mix of exception-raise vs return-value-error in same layer | Most handlers raise `ValueError`; new one returns `{"error": ...}` |
+| **Data access** | Bypass of established access layer | Most services call `repo.get()`; new one imports ORM model directly |
+| **Abstraction depth** | Business logic at wrong layer | All handlers delegate; new handler contains domain logic inline |
+| **Dependency wiring** | Injected vs hardcoded import for same dependency | All services receive `db` via constructor; new one calls `get_db()` directly |
+| **Test structure** | Different test strategy in same area | All tests in area use factory fixtures; new tests use heavy mocks |
+**Step 5 — Check for hallucinated or non-existent references**
+Use `search_graph` to verify function calls in recently changed files:
+- Extract all function calls in the diff using `search_code(pattern)` or `get_code_snippet`
+- For each called function/method: `search_graph(name_pattern=<name>)` — does it exist?
+- Missing = hallucinated API, deprecated method, or invented config option
+Flag as `pattern_divergence` with detail: "Called `foo.bar()` — no matching node in graph."
+**Severity:**
+- high = divergence at a core architectural seam (data access, error handling, auth) OR hallucinated reference
+- medium = wrong abstraction layer or dependency wiring inconsistency
+- low = test strategy divergence or naming/convention drift
+**Suggested action (per seam):**
+- Error handling: align to codebase's exception or result pattern
+- Data access: route through established repository/service layer
+- Abstraction: extract domain logic to service, slim the handler
+- Dependency: adopt constructor injection or established DI pattern
+- Hallucinated ref: replace with actual existing function (use `search_graph` to find it)
+## 2k. Comment Noise
+Comments that restate the code, reference stale context, or pad function bodies without conveying non-obvious intent. A secondary LLM bloat signal — LLMs are trained to produce documentation and carry that habit into code generation.
+**How to scan:**
+**Step 1 — High comment density files**
+```bash
+grep -rcE "^\s*#|^\s*//" --include="*.py" --include="*.ts" --include="*.js" <src_dirs> | \
+  grep -v ":0$" | sort -t: -k2 -rn | head -20
+```
+Flag files with >30 comment lines. Raw count, not ratio — a file with more than 30 comment lines is a candidate regardless of its size.
+**Step 2 — Restatement pattern grep**
+```bash
+grep -rni \
+  -e "# loop over" -e "# iterate over" -e "# return the" -e "# return result" \
+  -e "# loop through" -e "# now call" -e "# call the" -e "# increment" -e "# decrement" \
+  -e "// loop over" -e "// iterate over" -e "// return the" -e "// return result" \
+  -e "// loop through" -e "// now call" -e "// call the" -e "// increment" -e "// decrement" \
+  --include="*.py" --include="*.ts" --include="*.js" <src_dirs>
+```
+Treat results as candidates — quick human scan to confirm before deleting.
+**Step 3 — Task/PR reference comments**
+```bash
+grep -rn \
+  -e "# added for" -e "# used by" -e "# see issue" -e "# handles the case" -e "# added in" \
+  -e "// added for" -e "// used by" -e "// see issue" -e "// handles the case" -e "// added in" \
+  --include="*.py" --include="*.ts" --include="*.js" <src_dirs>
+```
+These belong in commit messages, not source. Treat results as candidates — review before flagging, as patterns like `# see issue` can appear in legitimate context.
+**Step 4 — Docstrings on private/internal functions**
+```bash
+# Python: single-underscore private functions (excludes dunders)
+grep -rn "def _[^_]" --include="*.py" <src_dirs>
+# TS/JS: JSDoc blocks
+grep -rn "/\*\*" --include="*.ts" --include="*.js" <src_dirs>
+```
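+Manual triage: open each hit and check whether a multi-line doc block is attached — in Python the docstring follows the `def` line; in TS/JS the JSDoc block precedes the function, so also confirm the function is non-exported. Python `def _name` functions and TS/JS non-exported functions don't need docstrings. Delete multi-line blocks; a single-line doc is acceptable if `comment_style` requires it.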
+**Severity:**
+- high = >20 restatement comments in a single file, or task/PR references in core business logic
+- medium = 5–20 restatement comments, or any task/PR references found
+- low = multi-line docstrings on private functions
+**Suggested action:** Delete restatement comments. Move task/PR references to commit history. Trim private function docstrings to one line or remove entirely.

package/skills/references/review-personas.md ADDED Viewed

@@ -0,0 +1,405 @@
+# Review Personas Reference
+Shared persona evaluation engine used by `grimoire-review` (design review), `grimoire-pr-review` (remote PR diff), and `grimoire-precommit-review` (staged local diff).
+The calling skill is responsible for:
+- Resolving the **input** (specs only, PR diff, or staged diff)
+- Loading project context (`.grimoire/config.yaml`, `.grimoire/docs/`)
+- Building the **Project Briefing** (below) and injecting it into every persona
+- Picking which personas run based on **complexity gating**
+- Compiling persona output into the final report
+This reference defines: project briefing, materiality gate, complexity gating, and the persona prompts themselves.
+---
+## 1. Project Briefing
+Build once, inject as preface to every persona. Findings that don't threaten anything in the briefing are dropped (materiality gate, applied per-persona below).
+### Sources
+- `README.md` — first 50 lines or up to first H2 (product framing, audience, stage signals)
+- `.grimoire/config.yaml` — `project.compliance`, `project.language`, `project.comment_style`, `project.surface`, `dep_audit`
+- `.grimoire/docs/context.yml` — deployment env, related services (if exists)
+- `.grimoire/docs/components.md` — component-library inventory (if exists)
+- `.grimoire/brand/tokens.json` and `.grimoire/brand/voice.md` — brand axis (if exist; see `./brand-tokens-format.md`)
+- `.grimoire/changes/<id>/consult.md` — pre-design consult assumptions + givens (if exists)
+- `.grimoire/changes/<id>/designs/problem.md` — design problem statement (if exists)
+- Tag histogram across `.grimoire/changes/**/*.feature` + `.grimoire/archive/**/*.feature`
+- All `.grimoire/decisions/*.md` with `status: accepted` — extract ID, title, top Decision Driver
+- Linked manifest's `Why` and `Non-goals` (if a Change trailer / active manifest exists); else PR body or commit messages
+### Feature inventory
+- Glob `.grimoire/changes/**/*.feature` + `.grimoire/archive/**/*.feature`
+- Parse: `Feature:` line, first description line, `@tags`
+- Bucket by path prefix (area)
+- If total >80, emit area-level summary only (count + capability one-liner)
+### README fallback
+If `README.md` is missing or shorter than 200 characters: design review prompts the user once; PR/pre-commit review notes `Product framing: unknown` and proceeds.
+### Briefing block
+```markdown
+## Project Briefing
+**Product:** <one-line from README>
+**Stage:** <prototype | internal | customer-facing | regulated — inferred from compliance config + README>
+**Surface:** <tui | web | mobile | api | mixed | unknown — from `project.surface`; drives adversarial-persona filtering>
+**Users:** <who, scale, trust level>
+**Data sensitivity:** <none | pii | financial | phi — derived from tag histogram + compliance>
+**Threat surface:** <only tags with count >0, e.g. auth=4, pii=3, payment=2>
+**Brand:** <captured | none — one-line summary of `.grimoire/brand/tokens.json` presence + key tokens>
+**Component library:** <name + path to `.grimoire/docs/components.md` | none documented>
+**Problem statement:** <one-line from `designs/problem.md` | n/a>
+**Active constraints (accepted decisions):**
+- ADR-XXXX — <title>
+- ...
+**Feature inventory:**
+<area>/ (N features)
+  - <Feature title> [@tags] — one-line capability
+  ...
+Total: <N> features across <M> areas.
+**Linked change non-goals (if any):**
+- <bullets, or "n/a">
+```
+---
+## 2. Materiality Gate
+Apply to every persona. Every finding must cite either:
+- A briefing axis it threatens (stage, data sensitivity, active constraint, threat-surface tag, surface), OR
+- A concrete feature-inventory gap, OR
+- A **brand axis** mismatch (e.g., design uses `#FF0000` not in `tokens.json`), OR
+- A **component-inventory gap** (e.g., design introduces a new Button despite an existing variant in `components.md`), OR
+- A **problem-statement mismatch** (e.g., scenario doesn't address the user problem articulated in `designs/problem.md`)
+Rules:
+- If the inventory shows the concern is already covered elsewhere, drop the finding or downgrade to a cross-feature integration note.
+- Findings with no briefing anchor are dropped. Don't manufacture findings to hit a quota.
+- Treat accepted ADRs as constraints, not suggestions. If a persona thinks one is wrong, name the ADR by ID and propose superseding it.
+- Before flagging a missing capability (rate limit, audit log, etc.), check the feature inventory for a sibling feature that already covers it.
+## 2a. Steel-Man Before Flag
+Before submitting any finding, write (mentally or in the finding itself) the strongest version of why the design / code is the way it is. If the steel-man holds, drop the finding. A finding that survives must explain why the steel-man is wrong given *this* project's briefing.
+For each candidate finding, the persona must be able to complete:
+- **Steel-man:** "The author likely chose this because <strongest plausible reason tied to briefing / constraints / convention>."
+- **Why it still fails:** "Despite that, <concrete harm path tied to a briefing axis>."
+If the persona can't complete both lines with substance, the finding is dropped. Vague harm paths ("could be exploited", "might fail", "is fragile") do not count — name the trigger and the consequence.
+## 2b. Severity Calibration
+The default is **suggestion**. A finding is a **blocker** only when *all three* hold:
+1. **Concrete harm path** — name the trigger (the input, sequence, or state) and the consequence (data loss, auth bypass, regulator violation, broken acceptance criteria, regression).
+2. **Briefing-anchored** — the consequence threatens a briefing axis (stage, data sensitivity, threat-surface tag, active ADR, manifest non-goal).
+3. **Not already mitigated** — neighbor code, framework default, or sibling feature does not already handle it.
+If any of the three is missing → suggestion, not blocker. If all three are weak → drop.
+**Zero findings is a valid outcome.** Personas are not graded on volume. A persona that submits "no material findings under the briefing" is doing its job. Do not invent a blocker to hit a quota — reviewers who exaggerate severity get tuned out and the real blockers get lost.
+Severity inflation patterns to avoid:
+- "Could lead to" / "in theory" / "if an attacker" without the path → drop or downgrade.
+- "Best practice says X" without a project anchor → suggestion at most, often drop.
+- "Untested edge case" when no scenario in the briefing covers it → not a blocker.
+- "Missing observability" on a complexity 1-2 change → suggestion, never blocker.
+---
+## 3. Complexity Gating
+Read `complexity` from the linked manifest if available; otherwise infer from the change.
+### Design review (`grimoire-review`)
+| Complexity | Depth |
+|---|---|
+| 1 (Trivial) | Skip review entirely — proceed to apply |
+| 2 (Simple) | Senior Engineer only; skip others unless touching security or data |
+| 3 (Moderate) | All relevant personas (skip Data if no data changes, skip QA if no user-facing change) |
+| 4 (Complex) | All personas mandatory |
+Note: When `project.surface` is set, adversarial personas auto-filter per the activation matrix in `./adversarial-personas.md`. Surface-irrelevant personas (e.g., touch-target on a TUI surface) are skipped by default; the user can still force-engage them via `--personas=...`.
+### Diff review (PR + pre-commit)
+| Signal | Depth |
+|---|---|
+| Docs only, ≤50 lines | Senior engineer + Code Style skim |
+| Linked complexity 1-2, diff <200 lines, no security tags | Senior engineer + Security quick scan + Code Style |
+| Linked complexity 3, OR diff touches auth/data/API | All relevant personas (skip Data if no schema, skip QA if no user-facing) |
+| Linked complexity 4, OR diff >500 lines, OR multi-domain | All personas mandatory |
+User can override: "full review", "just security", "just code style", etc.
+### Contrarian pass
+Runs after the chosen personas submit findings, whenever at least one finding exists — and always when a blocker exists. Skipped only if every persona returned zero findings. Not configurable by complexity — the inflation problem hits all levels.
+---
+## 4. Personas
+Each persona below names what it evaluates. The calling skill points the persona at the right input (specs for design review, diff hunks for PR / pre-commit). Apply the materiality gate to every finding. Flag as **blocker** (must fix before merge / commit / coding) or **suggestion**.
+### 4.1 Product Manager
+Skip if the change is purely internal (no user-facing behavior).
+Evaluate:
+- **Outcome**: Does the manifest's Why state the problem and how success is measured? Is it framed as an outcome ("users can reset passwords") rather than a mechanism ("add an endpoint")?
+- **Coverage**: Do feature scenarios cover all user-facing behaviors? Missing edge cases, error states, alternate flows?
+- **Diff vs scenarios** *(PR/pre-commit only)*: If a feature file exists in the change, does the diff implement every scenario? Any scenario with no matching code change?
+- **Non-goals**: Does the design / diff touch anything the manifest's Non-goals excludes? Scope creep into non-goals = **blocker**.
+- **Acceptance**: Could a PM validate this meets the feature's acceptance criteria from the artifact in front of them?
+- **Clarity**: Are descriptions clear enough for a non-technical stakeholder? Does the PR/commit body make user-visible outcome clear?
+### 4.2 Senior Engineer
+Treat accepted decisions as constraints — cite ADR ID before suggesting an override.
+Evaluate:
+- **Build vs Buy** *(design only)*: Was prior art research thorough? If a well-maintained library exists that the manifest doesn't mention, **blocker**.
+- **Simplicity**: Simplest design that solves the problem? Unnecessary abstraction, indirection, premature generalization, config-driven where direct call would do?
+- **Architecture**: Decisions sensible for this codebase? Will this paint us into a corner?
+- **Conventions** *(PR/pre-commit)*: Does new code match file layout, naming, and patterns already in the touched areas? Check `.grimoire/docs/<area>.md` if present.
+- **Reuse**: Existing utilities/functions that were re-implemented? `grep` for similar names; check area docs' reusable-code lists.
+- **Dead code** *(PR/pre-commit)*: Functions added but not called, imports unused, commented-out code, stubs with no implementation.
+- **Scope creep** *(PR/pre-commit)*: Files changed outside the scope implied by the change-id or manifest. Formatting-only changes to unrelated files = noise.
+- **Error handling**: Errors handled at boundaries? Internal code shouldn't be littered with defensive checks; external inputs must be validated.
+- **Tests**: New behaviors have tests? Tests make real assertions (not just `assert true` / mock everything)? Check `./testing-contracts.md` if framework matches.
+- **Contract compatibility**: If `data.yml` / `schema.yml` exists, does the design / diff change request/response shape for a documented API? Contract change without updated contract test = **blocker**.
+- **Dependencies**: New packages not mentioned in tasks? Version bumps not noted?
+- **Task alignment** *(PR/pre-commit, if `tasks.md` exists)*: Does the diff complete the tasks as written? Any task marked done but no corresponding code?
+- **Surface area**: New public APIs/exports/interfaces beyond what's needed? Fewer public functions with fewer parameters is better.
+- **Quality attributes** *(design only)*: Decision records' Quality Attributes targets measurable and realistic? Blank targets on perf-sensitive change = **blocker**.
+### 4.3 Security Engineer
+Calibrate severity to stage and data sensitivity from the briefing. Don't flag generic OWASP items that don't threaten the briefing's threat surface. Apply `./security-compliance.md`.
+#### STRIDE
+For every new entry point, data flow, or trust boundary:
+| Threat | Question |
+|---|---|
+| **S**poofing | Auth check at every new route/handler? |
+| **T**ampering | Input/message integrity validated? CSRF on state-changing requests? |
+| **R**epudiation | Security-relevant actions logged? |
+| **I**nfo disclosure | Errors, logs, stack traces leaking PII/tokens/secrets? |
+| **D**oS | Unbounded loops, unlimited file uploads, expensive queries on user input, no rate limit? |
+| **E**oP | Role/permission checks at the right layer? Bypass via missing middleware? |
+Skip categories that don't apply.
+#### Code-level scan *(PR/pre-commit only)*
+- **Secrets**: Grep diff for hardcoded keys, tokens, passwords, cloud credentials, JWT secrets. Any hit = **blocker**.
+- **Injection**: Raw SQL with string concatenation, shell-exec with user input, `eval`/`exec`, unsafe deserialization. Tag OWASP + CWE.
+- **Input validation**: New endpoints without schema validation, file uploads without size/type limits, path params used directly in filesystem calls.
+- **Auth**: New routes/handlers missing auth decorators / middleware. Compare against neighbors in same file.
+- **Dependencies**: New packages — pinned to exact version (no `^`/`~`/`>=`/`*`), lockfile updated and committed with integrity hashes, name is real (typosquat risk), `dep_audit` output clean if committed. Flag packages with zero downloads, recent ownership transfer (~90 days), suspicious new maintainers, or post-install scripts. Unpinned dep or missing lockfile entry on a new package = **blocker** (see `./security-compliance.md` § Supply Chain Defense).
+- **PII**: New logging that could emit PII; new storage of personal data without encryption.
+- **Cross-service auth**: If `context.yml` lists related services, are service-to-service calls authenticated?
+#### Compliance
+If `project.compliance` configured, verify per `./security-compliance.md` (section "Compliance Framework Verification"). Security-tagged scenario in linked change with no corresponding verification = **blocker**.
+#### Tagging
+Every security finding gets OWASP 2021 + CWE tags. See CWE quick-reference in `./security-compliance.md`.
+### 4.4 QA Engineer
+Skip if change is purely internal.
+Evaluate:
+- **Test presence**: Every new user-facing behavior has a test? Every scenario from linked feature file has step definitions?
+- **Test quality**: Tests asserting outputs, or just that code "ran"? Over-mocked tests = red flag.
+- **Negative paths**: For each happy path, is there a failure-path test?
+- **Edge cases**: Empty states, concurrent users, interruptions, boundary values?
+- **Observability**: New feature — how will it be debugged in prod? Structured logs / metrics / error surfaces?
+- **Regression risk** *(PR/pre-commit)*: Which existing tests cover the touched code? Were any tests removed or weakened?
+- **Accessibility**: New UI — keyboard nav, aria labels, contrast?
+### 4.5 Data Engineer
+Skip unless change touches migrations, models, schema, or external API clients.
+Read:
+- `.grimoire/changes/<change-id>/data.yml` — proposed schema changes (design)
+- `.grimoire/docs/data/schema.yml` — current baseline
+Evaluate:
+- **Migrations**: Safe on live DB? Adding NOT NULL without default on large table = **blocker**. Renames without two-step migration = **blocker**.
+- **Indexes**: New foreign keys with no index? New query patterns against unindexed columns?
+- **Naming**: New fields follow existing schema conventions?
+- **Backwards compatibility**: Will schema change break existing API consumers, queries, or reports?
+- **Breaking contract**: `data.yml` vs `schema.yml` — removed/renamed/retyped response fields or new required request fields = **blocker** unless migration path documented.
+- **Transactions**: Multi-step writes wrapped in a transaction?
+- **External APIs** *(design)*: New API dependency — `schema_ref` pointing to a stable spec? Fallback if API unavailable?
+### 4.6 Code Style Reviewer *(PR/pre-commit only — skip on design review)*
+Verify the diff matches the project's code-style and comment standards. This is not "general taste" — every finding must cite a concrete project rule the change violates.
+#### Sources to load (in order)
+1. `.grimoire/config.yaml` → `project.comment_style` and `project.language` (sets baseline expectations)
+2. `AGENTS.md` / `CLAUDE.md` at repo root — engineering principles, comment policy
+3. `.grimoire/docs/<area>.md` for each touched area — local conventions, reusable utilities
+4. Lint/format config in repo root: `.editorconfig`, `eslint.config.*`, `.prettierrc*`, `pyproject.toml` (ruff/black), `.rubocop.yml`, `rustfmt.toml`, `.golangci.yml`, etc.
+5. **Neighboring files** in the touched directories — derive convention from what already exists when no config exists
+If none of the above pin a rule, **don't invent one**. Style preferences without a project anchor are dropped.
+#### Evaluate
+- **Naming**: Identifiers (functions, types, files) match the project's casing and naming patterns visible in neighbors? New file names follow the directory's existing pattern?
+- **File layout**: New file lives where similar files live? Module boundaries respected?
+- **Imports**: Order, grouping, and form match the project (relative vs absolute, `.js` extension policy, type-only imports)?
+- **Formatting**: Diff respects `.editorconfig` and formatter rules (indentation, line endings, trailing newline, max line length)? Any formatter-noisy hunks unrelated to the change?
+- **Comments — presence**: Is there a comment whose WHAT is already obvious from the code? Per most projects' comment policies (and grimoire's `AGENTS.md`), comments that merely restate what the code does are noise — **suggestion** to remove.
+- **Comments — content**: Do comments reference current task / fix / PR / caller ("added for X", "used by Y", "fix for issue #123")? These rot — **suggestion** to remove or rewrite as durable rationale.
+- **Comments — style**: Match the project's comment form (`//` vs `/* */` vs `#`, JSDoc/TSDoc/docstring conventions)?
+- **Docstrings**: New public functions/classes — does the project require docstrings? If yes (per `comment_style` or visible convention), missing docstring = **suggestion**. If no, added boilerplate docstrings = **suggestion** to remove.
+- **Dead comments**: Commented-out code in the diff = **suggestion** to delete.
+- **TODO/FIXME**: New TODOs added with no owner or ticket reference, when project convention requires them = **suggestion**.
+- **Error messages / log strings**: Tone and format match neighbors (sentence case, periods, structured logging fields)?
+- **Type annotations** *(typed languages)*: New code matches the project's typing strictness — no `any`/`unknown`/`Object` if neighbors are strict; explicit return types if convention requires?
+Severity:
+- **Blocker**: violates a configured lint/format rule that would fail CI, or violates an explicit rule in `AGENTS.md` / `CLAUDE.md` / area doc.
+- **Suggestion**: deviates from neighbor convention without a config anchor, or comment-policy nits.
+If the project has no committed style config and neighbors are inconsistent, say so once and move on — don't pick a side.
+### 4.7 Adversarial User *(engaged when `project.surface` matches the persona's activation row)*
+Surface-conditional personas inhabit users the design might fail: keyboard-only, screen-reader, low-vision / color-blind, touch-target, responsive-breakpoint, RTL / i18n, low-bandwidth / offline, hostile-actor, API-conventions. Full criteria, persona catalog, activation matrix (persona × surface), severity calibration, and steel-man requirement live in `./adversarial-personas.md`. Engagement is gated by `project.surface` — see the activation matrix there. Findings inherit §1 briefing, §2 / §2a / §2b materiality and severity rules. The Contrarian pass (§4.8) calibrates adversarial findings post-hoc on the same terms as the other personas.
+### 4.8 Contrarian *(runs last, after all other personas submit findings)*
+Inspired by ouroboros/contrarian — adapted for review use. The Contrarian does not submit its own findings against the code. Instead, it **challenges the other personas' findings**, especially blockers, and tunes them. Its goal is to kill the reviewer-overreach failure mode: manufactured blockers, missing steel-mans, finding-by-quota, severity inflation.
+Runs whenever any persona produced at least one finding, and always when any of them is a blocker. May be skipped only when all personas produced zero findings.
+#### Inputs
+- The complete set of findings from §4.1-§4.7 (blockers and suggestions).
+- The Project Briefing (§1).
+- The diff or design under review.
+#### For each blocker, ask:
+1. **What is the steel-man for the author's choice?** Write the strongest version of why the code / design is the way it is — drawing on briefing constraints, ADRs, neighbor conventions, performance trade-offs, simplicity, stage. If the finding doesn't already include a steel-man (§2a), the Contrarian writes one. If the steel-man holds, the finding is wrong.
+2. **What assumption is this finding making?** Name it. ("Assumes inputs are untrusted at this layer." / "Assumes high traffic." / "Assumes a regulator audits this surface.") If the assumption doesn't match the briefing, the finding is mis-calibrated.
+3. **What if the opposite were right?** What if the "obvious" fix is the wrong move for this codebase / stage / scale? Inversion test: if you removed the existing code and applied the finding's recommendation, what *new* problems would you create? List them.
+4. **What if doing nothing is the right call?** Is this a symptom or a root cause? Will it actually trigger? What's the cost of "fix now" vs. "fix when it actually hurts"?
+5. **Is the severity calibrated?** Does the finding meet all three blocker criteria (§2b)? If not, downgrade or drop.
+#### For suggestions, ask:
+- Is this a real preference of the project, or the reviewer's preference? If the reviewer can't cite a project anchor (AGENTS.md, ADR, area doc, neighbor pattern), drop it.
+#### Output
+For each blocker the Contrarian processes, emit one of:
+- **Upheld** — `[blocker upheld]` with one line: "Steel-man considered; harm path holds because …"
+- **Downgraded to suggestion** — `[blocker → suggestion]` with one line explaining what was missing (no harm path / no briefing anchor / mitigated by neighbor / steel-man held in part).
+- **Dropped** — `[finding dropped]` with one line explaining why.
+The Contrarian's report replaces or annotates the original findings. The Summary uses the post-Contrarian counts.
+#### Contrarian is not a veto
+The Contrarian is a calibration pass, not an authority. If a persona disagrees with a downgrade and can cite a concrete harm path, tied to the briefing, that the Contrarian missed, the finding is re-upheld. The Contrarian's job is to make findings *honest about their evidence*, not to suppress signal.
+#### What Contrarian does NOT do
+- It does not add new findings.
+- It does not soften the *content* of upheld blockers (no "perhaps consider possibly" hedging).
+- It does not challenge findings that already pass §2a/§2b cleanly.
+- It does not run on level 1 (no review) or when all personas returned zero findings.
+---
+## 5. Output Format
+Each persona returns a short bulleted list. The calling skill compiles them into a single report. Standard structure:
+```markdown
+# <Review Title> — <subject>
+<header line: change-id / PR number / staged-diff scope, base/head, complexity, files/lines>
+## Project Briefing
+<from §1>
+## Product Manager
+- **[blocker]** ...
+- **[suggestion]** ...
+(or "Skipped — purely internal change.")
+## Senior Engineer
+- ...
+## Security Engineer
+### STRIDE
+- Spoofing: ...
+- Tampering: ...
+- ...
+### Findings
+- **[blocker]** [A03:2021 / CWE-89] ...
+## QA Engineer
+- ...
+## Data Engineer
+- ...
+## Code Style                     <!-- omit on design review -->
+- **[blocker]** `eslint.config.js` rule `no-unused-vars` violated at `src/foo.ts:42`
+- **[suggestion]** Comment at `src/foo.ts:88` describes what the code does — remove (per `AGENTS.md` "Default to writing no comments").
+## Adversarial User                <!-- omit when no surface-matched personas engaged -->
+- **[blocker]** [keyboard-only] Submit button at `designs/variant-2.html:84` is `<div onclick>` — not focusable. Steel-man considered (custom styling); harm path holds (user cannot tab to submit).
+- **[suggestion]** [low-vision] Body text contrast 4.2:1 at `designs/variant-2.html:120` — below WCAG AA 4.5:1.
+## Contrarian                     <!-- omit when zero findings from all personas -->
+- **[blocker upheld]** Senior Engineer's auth-bypass finding at `src/api/users.ts:18`. Steel-man: middleware order may guarantee auth runs first. Inspected — the route is mounted outside the auth middleware. Harm path holds.
+- **[blocker → suggestion]** Security Engineer's "missing rate limit on /reset-password". Briefing stage is internal-tools; threat surface tag count = 0. Cost-of-fix > realistic harm.
+- **[finding dropped]** QA Engineer's "missing test for concurrent password reset". No scenario in the briefing references concurrency; no harm path stated; downgrade to suggestion would also fail. Dropped.
+## Summary                        <!-- counts are post-Contrarian -->
+- **N blockers** — must be addressed
+- **M suggestions** — consider addressing
+Recommendation: <fix blockers / request changes / approve / proceed to apply>
+```
+---
+## 6. Style Rules for Findings
+- Reference specific files and line numbers for every diff-based finding.
+- Be direct. No padding with praise. Blockers stop the gate; suggestions are advisory.
+- Findings describe the code, not the person. "This query is vulnerable to injection" not "you wrote an injection".
+- Three findings that matter beat ten that don't.
+- If the change is trivial, say so and don't manufacture issues.

package/skills/references/security-compliance.md CHANGED Viewed

@@ -75,7 +75,7 @@ For changed files, do a lightweight scan:
 | A03: Injection | String concatenation in SQL/commands/templates, `eval()`, `innerHTML` with user data |
 | A04: Insecure Design | Missing rate limiting on auth endpoints, no account lockout |
 | A05: Security Misconfiguration | Debug mode enabled, default credentials, overly permissive CORS |
-| A06: Vulnerable Components | New dependencies without version pins, known-vulnerable packages |
+| A06: Vulnerable Components | Unpinned dependencies (ranges like `^`, `~`, `>=`, `*`), missing/uncommitted lockfile, known-vulnerable packages, recently-transferred or newly-maintained packages (supply-chain attack vector) |
 | A07: Auth Failures | Weak password requirements, session tokens in URLs |
 | A08: Data Integrity Failures | Insecure deserialization (`pickle`, `yaml.load`), missing integrity checks |
 | A09: Logging Failures | Security events not logged, PII/secrets in log output |
@@ -95,6 +95,27 @@ Tag each finding with OWASP category and CWE ID.
 | Hardcoded secrets | A07:2021 / CWE-798 |
 | SSRF | A10:2021 / CWE-918 |
 | Insecure deserialization | A08:2021 / CWE-502 |
+| Unpinned / unverified dependency | A06:2021 / CWE-1357, CWE-829 |
+## Supply Chain Defense
+Applies to any change that adds or upgrades a dependency, regardless of tags. Recent ecosystem incidents (npm, PyPI, RubyGems, crates.io) show attackers compromising maintainer accounts or transferring packages to push malicious patch releases. Floating version ranges let those releases auto-install on the next build.
+**Scope:** these rules apply to **applications and services** (the thing that gets deployed). Libraries published to a registry should keep compatible ranges in their manifest so consumers can resolve a shared version across their dependency tree — apply pinning only in the library's own dev/test lockfile, not its published manifest.
+- **Pin resolved versions via a committed lockfile.** The build must always install the same exact versions and transitive tree that were reviewed.
+  - **npm / pnpm / yarn:** lockfile committed; in an app's `package.json`, no `^`, `~`, `>=`, `*`, or `latest` specifier that resolves wider than intended.
+  - **Python (pip / uv / poetry):** committed lockfile with hashes (`uv.lock`, `poetry.lock`, or `requirements.txt` generated via `pip-compile --generate-hashes`). Avoid unpinned `requirements.txt`.
+  - **Ruby:** committed `Gemfile.lock`. `Gemfile` may use pessimistic operators; lockfile gates the resolution.
+  - **Cargo:** committed `Cargo.lock` (mandatory for binaries; opt in for libraries used as apps). Caret in `Cargo.toml` is idiomatic — the lockfile is the source of truth.
+  - **Go:** `go.mod` already records exact minimum versions (Minimum Version Selection — no range operators exist), and `go.sum` carries module hashes. Both must be committed.
+- **Install from the lockfile in CI and prod.** `npm ci`, `pnpm install --frozen-lockfile`, `yarn install --frozen-lockfile` (Classic / v1) or `yarn install --immutable` (Berry / v2+), `uv sync --frozen`, `pip install --require-hashes -r requirements.txt`, `bundle install --deployment` (or `bundle config set --local frozen true && bundle install`), `cargo build --locked`, `go build` with `GOFLAGS=-mod=readonly`. Never run a resolver that can mutate the lockfile silently (`npm install`, `bundle install` without frozen, etc.).
+- **Verify integrity hashes** are present in the lockfile (npm `integrity:`, pip `--hash` entries, Cargo `checksum`, Go `h1:`, `Gemfile.lock` `CHECKSUMS` section — opt-in via `bundle lock --add-checksums` on Bundler 2.6+). Reject lockfile entries without them on ecosystems that support hashing.
+- **Vet new packages** — real name (typosquat check against the popular package), non-trivial download count, established maintainers, no ownership transfer in the last ~90 days, no sudden new-maintainer publish.
+- **Run `dep_audit`** (if `config.tools.dep_audit` is configured) against the lockfile before merge. Vulnerable transitive dep = **blocker** unless an override is justified in the change manifest.
+- **Avoid post-install / build scripts** from untrusted packages where possible. Use `--ignore-scripts` (npm) or equivalent in CI when feasible.
+A change that adds or upgrades a dependency in an app/service without a committed lockfile (and, for ecosystems that support it, integrity hashes) is a **blocker**. Treat dependency additions as a change to the trust boundary — they execute with the same privileges as your code.
 ## Compliance Framework Verification