npm - @hegemonart/get-design-done - Versions diffs - 1.48.0 → 1.50.0 - Mend

@hegemonart/get-design-done 1.48.0 → 1.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

package/.claude-plugin/marketplace.json +2 -2
package/.claude-plugin/plugin.json +8 -2
package/CHANGELOG.md +93 -0
package/README.md +4 -0
package/SKILL.md +2 -1
package/agents/design-auditor.md +37 -4
package/agents/design-context-builder.md +2 -0
package/agents/design-debt-crawler.md +36 -5
package/agents/design-executor.md +2 -0
package/agents/design-fixer.md +4 -1
package/agents/design-planner.md +2 -0
package/agents/design-reflector.md +2 -0
package/agents/design-research-synthesizer.md +2 -0
package/agents/design-verifier.md +7 -15
package/dist/claude-code/.claude/skills/audit/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/brief/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/compare/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/connections/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/darkmode/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/design/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/discover/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/do/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/explore/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/fast/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/health/SKILL.md +2 -2
package/dist/claude-code/.claude/skills/live/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/new-skill/SKILL.md +90 -0
package/dist/claude-code/.claude/skills/plan/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/progress/SKILL.md +9 -1
package/dist/claude-code/.claude/skills/quick/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/scan/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/ship/SKILL.md +1 -1
package/dist/claude-code/.claude/skills/verify/SKILL.md +1 -1
package/hooks/gdd-design-quality-check.js +340 -0
package/hooks/hooks.json +9 -0
package/package.json +12 -2
package/reference/anti-slop-rubric.md +173 -0
package/reference/audit-scoring.md +4 -0
package/reference/debt-categories.md +20 -1
package/reference/registry.json +28 -0
package/reference/reviewer-confidence-gate.md +108 -0
package/reference/skill-authoring-contract.md +97 -15
package/reference/skill-graph.md +118 -0
package/reference/visual-tells.md +383 -0
package/scripts/lib/confidence-route.cjs +60 -0
package/scripts/lib/manifest/scaffolder.cjs +261 -0
package/scripts/lib/manifest/schemas/skills.schema.json +14 -0
package/scripts/lib/manifest/skills.json +26 -18
package/scripts/lib/worktree-resolve.cjs +221 -0
package/sdk/mcp/gdd-state/server.js +37 -4
package/sdk/mcp/gdd-state/tools/shared.ts +61 -0
package/skills/audit/SKILL.md +1 -1
package/skills/brief/SKILL.md +1 -1
package/skills/compare/SKILL.md +1 -1
package/skills/connections/SKILL.md +1 -1
package/skills/darkmode/SKILL.md +1 -1
package/skills/design/SKILL.md +1 -1
package/skills/discover/SKILL.md +1 -1
package/skills/do/SKILL.md +1 -1
package/skills/explore/SKILL.md +1 -1
package/skills/fast/SKILL.md +1 -1
package/skills/health/SKILL.md +2 -2
package/skills/live/SKILL.md +1 -1
package/skills/new-skill/SKILL.md +90 -0
package/skills/plan/SKILL.md +1 -1
package/skills/progress/SKILL.md +9 -1
package/skills/quick/SKILL.md +1 -1
package/skills/scan/SKILL.md +1 -1
package/skills/ship/SKILL.md +1 -1
package/skills/verify/SKILL.md +1 -1

package/reference/anti-slop-rubric.md ADDED Viewed

@@ -0,0 +1,173 @@
+---
+name: anti-slop-rubric
+type: reference
+version: 1.0.0
+phase: 50
+tags: [anti-slop, verb-axes, lens-tag, orthogonal, aesthetic-slop, directness, distinctness, hierarchy, authenticity, density]
+last_updated: 2026-06-03
+---
+# Anti-slop Rubric (Verb Axes)
+The 7-pillar audit in `reference/audit-scoring.md` answers "is the typography wrong,
+is the contrast failing, is the spacing off-grid?". This rubric answers a different
+question: "is the work generically AI-default, even when every pillar passes?". A
+screen can clear all seven pillars and still read as a template a model produced
+without a brief. These five axes name that gap.
+## Orthogonal by design
+This is an ORTHOGONAL lens, not a new pillar. It mirrors the lens-tag pattern that
+`emotion_levels`, `composition_alignment`, and `i18n_readiness` already follow in
+`reference/audit-scoring.md`. Adding these axes does NOT:
+- add an eighth scored pillar (the reserved Pillar 8 stays unscored),
+- change any pillar weight,
+- change the qualitative /28 total in `agents/design-auditor.md`,
+- change the weighted 0-100 score in `reference/audit-scoring.md`.
+The axes attach to existing findings as a label. They produce one routing signal (a
+sum threshold) and nothing else touches the scoring math. The `verb_axes` lens-tag is
+registered in `reference/audit-scoring.md` under Lens-Tags (Orthogonal).
+## How to score
+Score each axis 1-10 against its scale, after the pillar pass. Ten is specific,
+chosen, and defensible in three sentences against the brand. One is the move a model
+makes when no brief exists. Read the diagnostic question first, then place the work on
+the scale, then record the number. Each axis below carries three paired before/after
+examples drawn from design-domain content so the boundary between a 3 and an 8 is
+concrete, not a vibe.
+The five scores are independent of the 1-4 pillar scores and never change them.
+---
+## Axis 1: Directness
+**Diagnostic question:** Does the copy and the call to action name the specific
+product, verb, and outcome, or does it fall back to a label that would fit any app?
+| Score | Criteria |
+|-------|----------|
+| 9-10 | Every primary action names its object and outcome; the headline carries a verb and a specific promise |
+| 7-8 | Most actions are specific; one generic label remains on a reversible secondary action |
+| 4-6 | A mix of specific and generic; the hero leans on a template opener |
+| 1-3 | Bare "Get Started", "Submit", "Welcome to [Product]"; no subject, no product-specific verb |
+Paired examples:
+1. Before: button reads "Get Started". After: "Start a free audit".
+2. Before: headline "Welcome to the platform". After: "Ship your first design pass in ten minutes".
+3. Before: empty state "No data". After: "No audits yet. Run your first audit to see findings here."
+---
+## Axis 2: Distinctness
+**Diagnostic question:** Would this surface be recognizable as this product, or is it
+the default palette, the default typeface, and the default decoration any model reaches
+for first?
+| Score | Criteria |
+|-------|----------|
+| 9-10 | Palette and type record a brand decision in tokens; decoration earns its place |
+| 7-8 | A clear identity with one or two default-leaning choices left undocumented |
+| 4-6 | Recognizable in places, generic in others; tokens partly present |
+| 1-3 | Purple-violet accent, Inter alone, gradient and glass standing in for identity |
+Paired examples:
+1. Before: `bg-violet-600` hardcoded on every primary button. After: `bg-primary` routed to a documented brand hue token.
+2. Before: Inter set on the root with no second face and no token. After: a defended display face paired with a body face, both as `--font-*` tokens.
+3. Before: three gradients and frosted glass carrying the hero. After: one solid surface with weight and spacing carrying hierarchy.
+---
+## Axis 3: Hierarchy
+**Diagnostic question:** Does the eye land on one clear focal point and follow an
+obvious reading order, or does every block compete on the same axis with the same
+weight?
+| Score | Criteria |
+|-------|----------|
+| 9-10 | One primary action per view; reading order is instant; weight and spacing group meaning |
+| 7-8 | Mostly clear; one or two competing priorities |
+| 4-6 | The primary action must be hunted; several blocks share equal weight |
+| 1-3 | Centered-everything; flat weight throughout; no discernible focal point |
+Paired examples:
+1. Before: hero, feature grid, and testimonial all centered with `mx-auto text-center`. After: the hero line centered, body and lists left-aligned on a shared reading edge.
+2. Before: three buttons styled as primary on one view. After: one primary action, the rest demoted to secondary or text.
+3. Before: every heading at `font-weight: 400`, same size as body. After: bold headings, regular body, medium labels in a deliberate weight ladder.
+---
+## Axis 4: Authenticity
+**Diagnostic question:** Does the surface show the real product, or does it lean on
+stock scenes, placeholder copy, and badge decoration that signal nothing was actually
+built yet?
+| Score | Criteria |
+|-------|----------|
+| 9-10 | Real screenshots or purpose-drawn art; copy is shipped, not placeholder; badges carry true status |
+| 7-8 | Mostly real with one stock asset or one stray placeholder |
+| 4-6 | A mix of real and stock; some lorem ipsum survives past mockup |
+| 1-3 | Undraw isometric scenes, lorem ipsum, "New" and "AI-powered" badge spam |
+Paired examples:
+1. Before: `undraw_dashboard.svg` in the empty state. After: a real screenshot of the populated dashboard.
+2. Before: "Lorem ipsum dolor sit amet" in a shipped card body. After: the actual feature description in product voice.
+3. Before: a row of "New", "Beta", "AI-powered" badges with no state behind them. After: one badge that reflects a real, current status.
+---
+## Axis 5: Density
+**Diagnostic question:** Is the information density chosen for the content and the
+reader, or is everything inflated to fill space with oversized single words and airy
+padding that says nothing?
+| Score | Criteria |
+|-------|----------|
+| 9-10 | Density fits the content; spacing rides the scale; type sizes serve reading, not decoration |
+| 7-8 | Mostly considered; one oversized display moment that could earn its size |
+| 4-6 | Uneven density; some off-scale padding; a few sizes chosen for drama over meaning |
+| 1-3 | One giant word per section, vast empty padding, no content to justify the scale |
+Paired examples:
+1. Before: a single word at `text-9xl` filling a section with nothing else. After: a sized headline plus the supporting sentence the word was standing in for.
+2. Before: card padding at arbitrary `p-[37px]` off the scale. After: padding snapped to the 8pt step the rest of the layout uses.
+3. Before: a feature grid where each tile holds three words and 200px of air. After: tiles sized to their content with rhythm matched across siblings.
+---
+## Threshold and routing
+Sum the five axis scores for one finding. The maximum is 50 (five axes times ten).
+```
+verb_axes_sum = directness + distinctness + hierarchy + authenticity + density
+```
+When `verb_axes_sum < 35` (out of 50), the work reads as generically AI-default even
+if the pillars pass. Route that finding to `agents/design-debt-crawler.md` as a debt
+item with `category: aesthetic-slop` (see `reference/debt-categories.md`). The auditor
+attaches the per-axis scores as `verb_axes_scored` under the `verb_axes` lens-tag and records which
+visual-tells categories matched (see `reference/visual-tells.md`).
+A sum at or above 35 is not a pass on the pillars; the pillars carry their own scores.
+The threshold is a routing rule for the aesthetic-slop debt class only. It changes no
+pillar weight and no total.
+## What this is not
+This rubric does not replace the pillar audit, does not gate a write, and does not
+produce a 0-100 number. It is a verb-based lens that sits beside the pillars and emits
+one tag plus one routing decision. Real review still applies the full rubric in
+`reference/audit-scoring.md` and the BAN / SLOP catalog in `reference/anti-patterns.md`.

package/reference/audit-scoring.md CHANGED Viewed

@@ -248,3 +248,7 @@ Attach to findings under the Visual Hierarchy pillar that relate to compositiona
 ### `i18n_readiness`
 Attach to findings under the Accessibility pillar (for WCAG 3.1.1 / 3.1.2 violations) or under the Anti-Pattern Compliance pillar (for hardcoded-string / overflow-at-+40% defects). Emitted by the `agents/design-verifier.md` §i18n probes section (Phase 28-06). See [`./i18n.md`](./i18n.md) §WCAG i18n + §Verifier Integration Spec. Does NOT change pillar weights or scores.
+### `verb_axes` (anti-slop)
+Attach to any finding to record how generically AI-default the work reads, orthogonal to whether the pillar itself fails. Emitted by the `agents/design-auditor.md` §Anti-slop scoring section (Phase 50). It attaches `verb_axes_scored: {directness, distinctness, hierarchy, authenticity, density}` (each 1-10) to the finding, per the rubric in [`./anti-slop-rubric.md`](./anti-slop-rubric.md). When the five scores sum below 35 of 50, the finding also routes to `agents/design-debt-crawler.md` with `category: aesthetic-slop` (see [`./debt-categories.md`](./debt-categories.md)). Does NOT add a pillar and does NOT change pillar weights or scores.

package/reference/debt-categories.md CHANGED Viewed

@@ -3,7 +3,7 @@ name: debt-categories
 type: reference
 version: 1.0.0
 phase: 48
-tags: [debt, taxonomy, audit, crawler, priority-scoring, retroactive]
+tags: [debt, taxonomy, audit, crawler, priority-scoring, retroactive, aesthetic-slop, anti-slop]
 last_updated: 2026-06-03
 ---
@@ -105,6 +105,25 @@ empty-state strings such as "No data" or raw error codes.
 **Fix shape:** Add the accessible name or label; rewrite generic copy to be specific
 and actionable. Copy-quality detail lives in `reference/copy-quality.md`.
+### aesthetic-slop
+**Definition:** Work that reads as generically AI-default even when the pillar audit
+passes: template copy, the default palette and typeface used without a decision, stock
+scenes and placeholder content, flat competing hierarchy, and density inflated to fill
+space. This is the orthogonal verb-axis lens from `reference/anti-slop-rubric.md`, not a
+pillar failure. A surface can clear contrast, typography, and spacing and still be
+aesthetic-slop because nothing about it is chosen.
+**Detection signal:** The five verb axes (Directness, Distinctness, Hierarchy,
+Authenticity, Density) scored 1-10 each by `agents/design-auditor.md`, with the sum
+`< 35` of 50, corroborated by matches in `reference/visual-tells.md` (for example
+`stock-photo-people`, `badge-spam`, `oversized-single-word`,
+`motion-without-content-intent`, `narrator-from-a-distance-UI`, or the v1 tells). Record
+the per-axis `verb_axes_scored` values and the matched tell categories as evidence.
+**Fix shape:** Address the lowest axes first: write specific copy, route color and type
+through documented tokens, establish one focal point, replace stock and placeholder with
+real content, and size density to the content. This is usually a redesign-leaning effort,
+not a one-line swap, so it scores low on the effort factor below.
 ---
 ## Priority Scoring Model

package/reference/registry.json CHANGED Viewed

@@ -1121,6 +1121,34 @@
       "type": "output-contract",
       "phase": 48,
       "description": "Phase 48 brief-quality rubric: 5 anti-patterns (vague verbs, missing audience, immeasurable success criteria, scope creep, missing anti-goals) the brief-auditor surfaces."
+    },
+    {
+      "name": "visual-tells",
+      "path": "reference/visual-tells.md",
+      "type": "heuristic",
+      "phase": 49,
+      "description": "Phase 49 visual-tells catalog: 8 default-AI-aesthetic categories (default-AI-hero, gradient-spam, isometric-illustration-fallback, centered-everything-syndrome, inter-everything, purple-violet-default, glassmorphism-spam, decorative-motion-without-intent) with diagnostic regex + remediation; backs the gdd-design-quality-check hook."
+    },
+    {
+      "name": "reviewer-confidence-gate",
+      "path": "reference/reviewer-confidence-gate.md",
+      "type": "meta-rules",
+      "phase": 49,
+      "description": "Phase 49 reviewer confidence gate: 4-question Pre-Report Gate + confidence 0.0-1.0 field; HIGH/CRITICAL require >=0.8 + cited proof, <0.5 stays Tentative and never reaches design-fixer."
+    },
+    {
+      "name": "anti-slop-rubric",
+      "path": "reference/anti-slop-rubric.md",
+      "type": "heuristic",
+      "phase": 50,
+      "description": "Phase 50 verb-based anti-slop rubric: 5 orthogonal axes (Directness, Distinctness, Hierarchy, Authenticity, Density), 1-10 each; sum below 35/50 routes a finding to design-debt-crawler as aesthetic-slop. Lens-tag, not a pillar."
+    },
+    {
+      "name": "skill-graph",
+      "path": "reference/skill-graph.md",
+      "type": "meta-rules",
+      "phase": 50,
+      "description": "Phase 50 auto-generated skill composition graph (mermaid): skills grouped by lifecycle stage with composes_with and next_skills edges; regenerated by scripts/generate-skill-graph.cjs and drift-gated in CI."
     }
   ]
 }

package/reference/reviewer-confidence-gate.md ADDED Viewed

@@ -0,0 +1,108 @@
+---
+name: reviewer-confidence-gate
+type: meta-rules
+version: 1.0.0
+phase: 49
+tags: [review, confidence, audit, verify, gap, routing, anti-slop]
+last_updated: 2026-06-03
+---
+# Reviewer Confidence Gate
+Audit and verify findings can inflate severity without proof. A grep hit gets reported as a BLOCKER; a single line read out of context becomes a MAJOR. This contract adds a confidence discipline so review agents (`design-auditor`, `design-verifier`, `design-debt-crawler`) earn the severity they assign, and so `design-fixer` only auto-applies fixes that are backed by evidence.
+Every emitting agent runs the Pre-Report Gate before writing a finding, stamps each finding with a `confidence` score, and parks weak findings in a `## Tentative` section that the fixer never reads. The routing helper `scripts/lib/confidence-route.cjs` encodes the same rule in code.
+## Pre-Report Gate
+Before you emit any finding or gap, answer these four questions. If you cannot answer all four with a clear yes, the finding is not ready to ship at its stated severity.
+- **a. Can I cite `file:line`?** Point at the exact location. A finding with no concrete location is a hunch, not a defect.
+- **b. Can I state the failure mode in one sentence?** Name what breaks for the user or the build. If the sentence needs an "and" plus a "maybe", the finding is two findings or none.
+- **c. Did I read context beyond the modified file?** Confirm the call site, the token definition, or the parent component. A value that looks wrong in isolation is often correct once you read what feeds it.
+- **d. Is the severity defensible?** A BLOCKER blocks shipping. A MAJOR is a real deviation from intent. If you would not defend the label to the author, lower it.
+## The `confidence` field
+Every finding carries a `confidence: 0.0-1.0` field. It records how sure you are that the finding is real and correctly classified, not how bad the issue is. Severity and confidence are independent axes: a cosmetic issue can be high confidence, and a suspected BLOCKER can be low confidence.
+| Range | Meaning | Where it goes |
+|-------|---------|---------------|
+| `>= 0.8` | Cited `file:line`, one-sentence failure mode, context read. | Reported at full severity; eligible for auto-fix. |
+| `>= 0.5` and `< 0.8` | Real signal, but evidence is partial or context is incomplete. | Reported, routed to user review, never auto-fixed. |
+| `< 0.5` | A hunch, a guess, or a pattern match you could not confirm. | Moved to `## Tentative`; never reaches `design-fixer`. |
+## Routing rule
+The gate controls what reaches the fixer. The rule is:
+- A HIGH severity finding (BLOCKER or MAJOR) requires `confidence >= 0.8` **and** a `file:line` citation **and** a one-sentence failure mode. Below `0.8`, a HIGH finding is surfaced for user review instead of auto-fix.
+- A finding with `confidence < 0.5` stays in the `## Tentative` section and never reaches `design-fixer`.
+- A finding with `confidence` in the `0.5 - 0.8` band (`>= 0.5` and `< 0.8`) is surfaced in the report but routed to user review, not auto-fix.
+`scripts/lib/confidence-route.cjs` exports `route({ severity, confidence, tentative })` and returns `'fix'`, `'user-review'`, or `'drop'`. Agents and the fixer share this single decision so the matrix stays consistent.
+### Routing matrix
+The full decision table the helper encodes:
+| Severity | `tentative` | confidence | Destination |
+|----------|-------------|------------|-------------|
+| any | `true` | any | `drop` (never reaches fixer) |
+| any | `false` | `< 0.5` | `drop` (stays tentative) |
+| BLOCKER or MAJOR | `false` | `>= 0.5` and `< 0.8` | `user-review` |
+| BLOCKER or MAJOR | `false` | `>= 0.8` | `fix` |
+| MINOR or COSMETIC | `false` | `>= 0.5` and `< 0.8` | `user-review` |
+| MINOR or COSMETIC | `false` | `>= 0.8` | `fix` |
+Read the table as: tentative wins first, then the `0.5` floor, then the severity-specific `0.8` auto-fix gate.
+## How to emit a finding
+After the Pre-Report Gate passes, write the finding with the `confidence` field on its own line inside the existing locked format. For `design-verifier` gaps this sits alongside the other gap fields:
+```text
+### BLOCKER G-01: raw error object rendered on payment failure
+- Phase: 2
+- Description: Checkout.tsx renders the error object directly
+- Expected: a human-readable failure message
+- Actual: users see "[object Object]"
+- Location: src/Checkout.tsx:88
+- Suggested fix: render error.message with a fallback string
+- confidence: 0.85
+```
+A finding that scores `< 0.5` is not written in the gap list at all. It goes under a `## Tentative` heading in the same report, in plain prose, so a human can promote it later if context proves it real.
+## Paired examples
+Each pair shows a raw finding (before the gate) and the same finding after the gate corrects it.
+### Example 1: severity inflated, no context read
+**Before:** `BLOCKER: hardcoded color #1a73e8 in Button.tsx breaks theming.`
+**After:** `MINOR G-04: raw #1a73e8 instead of a semantic token. confidence: 0.9`. Reading context (question c) showed `Button.tsx:42` is the token definition file, so theming is not broken; the issue is a style-coherence nit, not a shipping blocker. High confidence, low severity.
+### Example 2: a grep guess that could not be confirmed
+**Before:** `MAJOR: missing reduced-motion guard, animations will trigger vestibular issues.`
+**After:** moved to `## Tentative` with `confidence: 0.4`. The grep matched `framer-motion` but question (a) failed: no single `file:line` proves the guard is absent app-wide, and a root `MotionConfig` may cover it. Parked as tentative; the fixer never sees it.
+### Example 3: real defect, evidence complete
+**Before:** `error states look weak somewhere in the checkout flow.`
+**After:** `BLOCKER G-01: Checkout.tsx:88 renders the raw error object, so users see "[object Object]" on a failed payment. confidence: 0.85`. All four questions pass: cited location, one-sentence failure mode, call site read, severity defensible. Auto-fix eligible.
+### Example 4: partial evidence, honest mid-band score
+**Before:** `MAJOR: empty state copy is generic across the app.`
+**After:** `MINOR G-06: Inbox.tsx:30 empty state reads "No data". confidence: 0.65`. One real instance is cited, but question c is only half done: the "across the app" claim was not verified. Scored mid-band, surfaced for user review rather than auto-fixed, and the severity was lowered to match the single confirmed instance.
+## Agent integration
+- `design-auditor`, `design-verifier`, and `design-debt-crawler` run the Pre-Report Gate, stamp each finding with `confidence`, and route sub-0.5 findings to `## Tentative`.
+- `design-fixer` skips every gap in `## Tentative` and skips BLOCKER or MAJOR gaps whose `confidence < 0.8`, routing those to user review instead of auto-fix.

package/reference/skill-authoring-contract.md CHANGED Viewed

@@ -1,10 +1,10 @@
 ---
 name: skill-authoring-contract
 type: meta-rules
-version: 1.0.0
-phase: 28.5
-tags: [skill, authoring, contract, length-cap, description, frontmatter, progressive-disclosure]
-last_updated: 2026-05-18
+version: 3.0.0
+phase: 50
+tags: [skill, authoring, contract, length-cap, description, frontmatter, progressive-disclosure, composition, skill-graph]
+last_updated: 2026-06-03
 ---
 Source: mattpocock/skills (MIT) - adapted with permission. See `../NOTICE` for the full attribution block.
@@ -48,27 +48,65 @@ worst-offender and is scheduled for Bucket 1 rework in plan `28.5-04`. `skills/h
 Two rules:
 - **Length cap is STRICT.** `description ≤ 1024 chars` - no flag, no override. Under 20 chars
-  is also blocked as under-specification.
-- **Recommended form is LAX by default.** `<what>. Use when <triggers>.` - third person,
-  first sentence what the skill does, second sentence the trigger conditions. Validator
-  enforces the form regex only under `--strict-description` or `STRICT_DESCRIPTION=1`. Default
-  is length-only.
+  is also blocked as under-specification. The 1024-char cap is UNCHANGED in v3.
+- **Recommended form is LAX by default.** The validator enforces a form regex only under
+  `--strict-description` or `STRICT_DESCRIPTION=1`. Default is length-only.
+### v3 form (recommended)
+```text
+<what>. Use when <triggers>. Activates for requests involving <kw1>, <kw2>, <kw3>.
+```
+Three sentences, third person:
+1. **`<what>`** - what the skill does.
+2. **`Use when <triggers>`** - the trigger conditions.
+3. **`Activates for requests involving <kw1>, <kw2>, <kw3>`** - a short keyword list. This
+   trigger sentence is the v3 addition: naming the activating keywords improves retrieval, so the
+   router surfaces the skill on the requests it is meant to handle rather than on near-misses.
+### v2 form (still accepted during the transition window)
+```text
+<what>. Use when <triggers>.
+```
+The v2 form is the two-sentence shape shipped in Phase 28.5 (first sentence what, second sentence
+when). It omits the `Activates for ...` trigger sentence.
+### Transition window
+BOTH the v2 form and the v3 form are accepted for one minor version. Neither is a hard failure
+during the window; the length cap (20-1024) is the only blocking description rule. `gdd-health`
+tracks v3 adoption (the share of descriptions carrying the `Activates for ...` sentence) so the
+rollout is measurable before the v2 form is retired in a later minor.
 Why lax-by-default (D-02): `obra/superpowers/skills/writing-skills/SKILL.md` documents a
-shortcut-effect where an agent reads the description and skips the body - the more
-essential the description summary, the more often this happens. Phase 33 ships an A/B
-study at `.design/research/description-format-ab.md`; until then the regex stays advisory.
+shortcut-effect where an agent reads the description and skips the body - the more essential the
+description summary, the more often this happens. The form regex therefore stays advisory; only
+length is enforced by default.
-Examples (both 20–1024 chars, both pass the length check):
+Examples (all 20-1024 chars, all pass the length check):
 ```text
-# Strict-mode-compliant
+# v3 form (recommended)
+Renders an OKLCH gamut comparison chart. Use when the user asks to see the visible difference between a target gamut and sRGB. Activates for requests involving gamut, OKLCH, sRGB.
+# v2 form (accepted during the transition window)
 Renders an OKLCH gamut comparison chart. Use when the user asks to see the visible difference between a target gamut and sRGB.
-# Lax-mode-only acceptable
+# Lax-mode-only acceptable (length passes; form regex would flag under --strict-description)
 Compares OKLCH gamut coverage against sRGB and prints a visual diff chart.
 ```
+### Anti-boilerplate gate
+`scripts/validate-skill-frontmatter.cjs` is a separate, always-on cohort check: if three or more
+skills share an identical opening sentence OR an identical `Use when` clause, it fails. Collapsed
+boilerplate across many descriptions erases the discriminating signal the router needs, so each
+skill keeps a distinct opening and a distinct trigger clause.
 ## Frontmatter
 Required fields (validator blocks if absent):
@@ -85,6 +123,10 @@ Optional fields (recognized by the Claude Code agent loader):
   whitelist (pure shortcuts like `help`, `stats`, `note`, `health`, `zoom-out`). The
   validator blocks if a non-whitelisted skill sets this field to `true`.
 - `user-invocable: true|false` - whether the slash-command picker exposes the skill.
+- `composes_with: [skill, ...]` - optional (v3). Skill names this skill calls as
+  sub-orchestration. See `## Skill composition` below.
+- `next_skills: [skill, ...]` - optional (v3). A pipeline hint listing the skills that naturally
+  run after this one. See `## Skill composition` below.
 Concrete example:
@@ -97,6 +139,35 @@ disable-model-invocation: true
 ---
 ```
+## Skill composition
+v3 closes the "no skill calls another skill" gap with two optional, machine-parseable frontmatter
+fields. Both are arrays of skill names and both are OPTIONAL; a skill with neither is unchanged.
+- `composes_with: [skill, ...]` - the skills this one calls as sub-orchestration. Use it when a
+  skill spawns or delegates into another skill as part of its own run.
+- `next_skills: [skill, ...]` - a pipeline hint: the skills that naturally run after this one. It does
+  not call them; it records the intended flow so tooling can suggest the next step.
+Each entry becomes a directed edge (this skill points at the referenced skill). The composition
+graph across all skills MUST be a directed acyclic graph: a skill cannot transitively compose back
+into itself, and every referenced name MUST be a real skill. `scripts/validate-composition-graph.cjs`
+reads these fields from `scripts/lib/manifest/skills.json` (either as native array fields or parsed
+from the record's `extra_frontmatter` passthrough lines), then fails on a cycle or a dangling
+reference. `scripts/generate-skill-graph.cjs` reads the same edges and regenerates
+`./skill-graph.md`, a mermaid flowchart of the skills and their composition edges grouped by
+lifecycle stage; CI drift-gates that file with `--check`.
+```yaml
+---
+name: audit
+description: "Runs a design audit and prints a 7-pillar score. Use when the user wants to score the current design. Activates for requests involving audit, score, design review."
+tools: Read, Write, Task, Glob, Bash
+composes_with: [scan]
+next_skills: [reflect]
+---
+```
 ## Progressive disclosure
 References-one-level-deep is the rule (D-06):
@@ -157,3 +228,14 @@ node scripts/validate-skill-length.cjs --quiet --json
 Exit codes: `0` clean, `1` warnings only, `2` blockers present. Flags: `--quiet` suppresses
 per-skill output, `--strict-description` adds the form regex check, `--json` emits
 machine-readable output. Env: `STRICT_DESCRIPTION=1` and `SKILLS_DIR=<path>` are honored.
+v3 adds three source-of-truth (SoT) driven scripts that read `scripts/lib/manifest/skills.json`:
+```text
+node scripts/validate-skill-frontmatter.cjs   # fail on 3+ shared opening/Use-when clauses
+node scripts/validate-composition-graph.cjs   # fail on a composition cycle or dangling ref
+node scripts/generate-skill-graph.cjs --check # drift-gate the generated skill-graph.md
+```
+Each exits `0` clean, `1` on a failure (drift for the generator under `--check`), `2` on an
+internal error.

package/reference/skill-graph.md ADDED Viewed

@@ -0,0 +1,118 @@
+# Skill Composition Graph
+> GENERATED FILE. Do not edit by hand. Source: scripts/lib/manifest/skills.json.
+> Regenerate: `node scripts/generate-skill-graph.cjs`; CI drift-gates it with `--check`.
+This graph visualizes every skill grouped by inferred lifecycle stage, plus the skill
+composition edges declared in v3 frontmatter (see skill-authoring-contract.md). A solid arrow
+is a `composes_with` edge (the source calls the target as sub-orchestration); a dotted arrow is
+a `next_skills` edge (a pipeline hint for what runs next). Stage grouping is best-effort and
+inferred from the skill name; skills with no stage keyword fall under Utility.
+Skills: 88. Composition edges: 0 composes_with, 0 next_skills.
+```mermaid
+flowchart TD
+  subgraph intake["Intake"]
+    n_brief["brief"]
+    n_discover["discover"]
+    n_new_cycle["new-cycle"]
+    n_new_project["new-project"]
+    n_start["start"]
+  end
+  subgraph explore["Explore"]
+    n_benchmark["benchmark"]
+    n_compare["compare"]
+    n_explore["explore"]
+    n_map["map"]
+    n_sketch["sketch"]
+    n_sketch_wrap_up["sketch-wrap-up"]
+    n_spike["spike"]
+    n_spike_wrap_up["spike-wrap-up"]
+  end
+  subgraph decide["Decide"]
+    n_discuss["discuss"]
+    n_list_assumptions["list-assumptions"]
+    n_plan["plan"]
+    n_review_decisions["review-decisions"]
+    n_unlock_decision["unlock-decision"]
+  end
+  subgraph build["Build"]
+    n_bootstrap_ds["bootstrap-ds"]
+    n_darkmode["darkmode"]
+    n_design["design"]
+    n_do["do"]
+    n_export["export"]
+    n_figma_write["figma-write"]
+    n_migrate["migrate"]
+    n_optimize["optimize"]
+  end
+  subgraph verify["Verify"]
+    n_audit["audit"]
+    n_complete_cycle["complete-cycle"]
+    n_quality_gate["quality-gate"]
+    n_review_backlog["review-backlog"]
+    n_scan["scan"]
+    n_turn_closeout["turn-closeout"]
+    n_verify["verify"]
+  end
+  subgraph operate["Operate"]
+    n_live["live"]
+    n_report_issue["report-issue"]
+    n_roi["roi"]
+    n_rollout_status["rollout-status"]
+    n_watch_authorities["watch-authorities"]
+  end
+  subgraph utility["Utility"]
+    n_add_backlog["add-backlog"]
+    n_analyze_dependencies["analyze-dependencies"]
+    n_apply_reflections["apply-reflections"]
+    n_bandit_status["bandit-status"]
+    n_budget["budget"]
+    n_cache_manager["cache-manager"]
+    n_check_update["check-update"]
+    n_connections["connections"]
+    n_continue["continue"]
+    n_debug["debug"]
+    n_extract_learnings["extract-learnings"]
+    n_fast["fast"]
+    n_figma_extract["figma-extract"]
+    n_graphify["graphify"]
+    n_health["health"]
+    n_help["help"]
+    n_list_pins["list-pins"]
+    n_locale["locale"]
+    n_new_skill["new-skill"]
+    n_next["next"]
+    n_note["note"]
+    n_openrouter_status["openrouter-status"]
+    n_pause["pause"]
+    n_peer_cli_add["peer-cli-add"]
+    n_peer_cli_customize["peer-cli-customize"]
+    n_peers["peers"]
+    n_pin["pin"]
+    n_plant_seed["plant-seed"]
+    n_pr_branch["pr-branch"]
+    n_progress["progress"]
+    n_quick["quick"]
+    n_reapply_patches["reapply-patches"]
+    n_recall["recall"]
+    n_reflect["reflect"]
+    n_resume["resume"]
+    n_router["router"]
+    n_settings["settings"]
+    n_ship["ship"]
+    n_skill_manifest["skill-manifest"]
+    n_stats["stats"]
+    n_style["style"]
+    n_synthesize["synthesize"]
+    n_timeline["timeline"]
+    n_todo["todo"]
+    n_undo["undo"]
+    n_unpin["unpin"]
+    n_update["update"]
+    n_using_gdd["using-gdd"]
+    n_warm_cache["warm-cache"]
+    n_zoom_out["zoom-out"]
+  end
+```