npm - @windyroad/itil - Versions diffs - 0.30.0 → 0.30.1-preview.315 - Mend

@windyroad/itil 0.30.0 → 0.30.1-preview.315

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/.claude-plugin/plugin.json +1 -1
package/hooks/itil-readme-refresh-discipline.sh +6 -2
package/hooks/lib/readme-refresh-detect.sh +100 -6
package/hooks/test/itil-readme-refresh-discipline.bats +160 -0
package/package.json +1 -1
package/skills/list-problems/SKILL.md +5 -4
package/skills/manage-incident/SKILL.md +23 -9
package/skills/manage-incident/test/manage-incident-adr-044-contract.bats +69 -1
package/skills/manage-problem/SKILL.md +38 -18
package/skills/manage-problem/test/manage-problem-adr-044-step4-derive-first.bats +151 -0
package/skills/reconcile-readme/SKILL.md +3 -3
package/skills/review-problems/SKILL.md +14 -7
package/skills/review-problems/test/review-problems-likely-verified-cell-shape.bats +229 -0
package/skills/transition-problem/SKILL.md +3 -0
package/skills/transition-problems/SKILL.md +2 -0

package/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
   "name": "wr-itil",
-  "version": "0.30.0",
+  "version": "0.30.1",
   "description": "ITIL-aligned IT service management for Claude Code"
 }

package/hooks/itil-readme-refresh-discipline.sh CHANGED Viewed

@@ -103,8 +103,12 @@ esac
 # Trap detected — emit deny with terse recovery.
 # Voice-tone budget per ADR-045 deny-band ≤300 bytes total. Names the
 # offending ticket ID, the literal recovery command, the BYPASS env
-# var escape, and the P165 cite.
-REASON="BLOCKED: P165. ${TICKET_ID} needs docs/problems/README.md refresh. Run: git add docs/problems/README.md. Bypass: BYPASS_README_REFRESH_GATE=1."
+# var escape with correct propagation syntax (P231 / P173), and the
+# P165 cite. Inline-prefix `VAR=1 git commit ...` does NOT propagate
+# from a Bash subshell to PreToolUse hooks; the env field of
+# `.claude/settings.json` (or shell `export` before `claude` launch)
+# is the working path.
+REASON="BLOCKED: P165. ${TICKET_ID} needs README refresh: git add docs/problems/README.md. Bypass: BYPASS_README_REFRESH_GATE=1 via .claude/settings.json env (P173)."
 cat <<EOF
 {

package/hooks/lib/readme-refresh-detect.sh CHANGED Viewed

@@ -37,8 +37,25 @@
 #
 # Bypass:
 #   - `BYPASS_README_REFRESH_GATE=1` env var → return 0 (allow). For
-#     legitimate narrative-only ticket-body edits that don't change
-#     ranking-bearing fields. Audit-traceable via shell history.
+#     legitimate one-off escape (e.g. force-amend after rebase rewrote
+#     history). Audit-traceable via shell history. Set in
+#     `.claude/settings.json` env field or shell `export` before
+#     launching `claude` — inline-prefix syntax (`VAR=1 git commit ...`)
+#     does NOT propagate from a Bash subshell to PreToolUse hooks (P173).
+#
+# Narrative-only short-circuit (P230):
+#   - When all staged ticket edits are purely narrative — no
+#     ranking-bearing field change (Priority / Effort / Status / WSJF /
+#     Type field-lines), no title change, no rename between state
+#     subdirs, no creation/deletion — AND
+#     `packages/itil/scripts/reconcile-readme.sh` reports exit=0 against
+#     the current README, return 0 (allow). Reconcile-readme is the
+#     authoritative drift oracle for narrative-only edits.
+#   - Ranking-bearing edits still fall through to existing detection
+#     regardless of reconcile state, preserving ADR-014 single-commit
+#     grain for the change-set surface (architect verdict: reconcile is
+#     a robustness layer on top of per-operation refresh, not a
+#     supersession of either).
 #
 # Fail-open contract:
 #   - Outside a git working tree, or when `git diff` fails for any
@@ -111,6 +128,7 @@ detect_readme_refresh_required() {
   local has_readme=0
   local offending_ticket=""
   local path basename
+  local staged_tickets=()
   while IFS= read -r path; do
     [ -n "$path" ] || continue
@@ -135,6 +153,7 @@ detect_readme_refresh_required() {
         case "$basename" in
           [0-9]*.md)
             [ -z "$offending_ticket" ] && offending_ticket="$path"
+            staged_tickets+=("$path")
             ;;
         esac
         ;;
@@ -143,6 +162,7 @@ detect_readme_refresh_required() {
         # Excludes README.md and README-history.md (already cased
         # above; both start with `R`, not a digit).
         [ -z "$offending_ticket" ] && offending_ticket="$path"
+        staged_tickets+=("$path")
         ;;
       *)
         # Non-ticket surface: ignored.
@@ -152,10 +172,84 @@ detect_readme_refresh_required() {
 $staged
 EOF
-  if [ -n "$offending_ticket" ] && [ "$has_readme" -eq 0 ]; then
-    printf '%s\n' "$offending_ticket"
-    return 1
+  # No staged ticket — nothing to gate.
+  [ -n "$offending_ticket" ] || return 0
+  # README staged alongside — clean.
+  [ "$has_readme" -eq 1 ] && return 0
+  # P230 narrative-only short-circuit. Detect whether the staged ticket
+  # set is purely narrative (no ranking-bearing field change, no rename
+  # between state subdirs, no creation/deletion). If so, consult
+  # reconcile-readme.sh as the authoritative drift oracle; exit=0 means
+  # the README is in sync with filesystem truth and narrative-only
+  # ticket edits are safe to allow silently.
+  if ! _readme_refresh_staged_is_ranking_bearing "${staged_tickets[@]}"; then
+    if _readme_refresh_reconcile_clean; then
+      return 0
+    fi
+  fi
+  # Either ranking-bearing, or narrative-only with reconcile drift —
+  # fall through to deny.
+  printf '%s\n' "$offending_ticket"
+  return 1
+}
+# Returns 0 if any staged ticket exhibits a ranking-bearing change:
+#   - field-line diff matching ^[+-]\*\*(Priority|Effort|Status|WSJF|Type)\*\*:
+#   - title-line diff matching '^[+-]# Problem ' (trailing space is part of the pattern)
+#   - new ticket file added (A entry on a ticket path)
+#   - ticket file deleted (D entry on a ticket path)
+#   - rename between state subdirs (R<NN> entry where either path is a
+#     ticket path)
+# Returns 1 if narrative-only.
+_readme_refresh_staged_is_ranking_bearing() {
+  local tickets=("$@")
+  [ "${#tickets[@]}" -gt 0 ] || return 1
+  # (i) Field-line / title-line diff
+  if git diff --staged -- "${tickets[@]}" 2>/dev/null \
+      | grep -qE '^[+-](\*\*(Priority|Effort|Status|WSJF|Type)\*\*:|# Problem )'; then
+    return 0
   fi
-  return 0
+  # (ii) Creation / deletion / rename via --name-status -M
+  local namestatus
+  namestatus=$(git diff --staged --name-status -M 2>/dev/null) || return 1
+  local ticket_re='^docs/problems/(open|verifying|closed|known-error|parked)/[0-9].*\.md$'
+  local legacy_re='^docs/problems/[0-9].*\.md$'
+  while IFS=$'\t' read -r status p1 p2; do
+    [ -n "$status" ] || continue
+    case "$status" in
+      A|D)
+        if [[ "$p1" =~ $ticket_re ]] || [[ "$p1" =~ $legacy_re ]]; then
+          return 0
+        fi
+        ;;
+      R*)
+        if [[ "$p1" =~ $ticket_re ]] || [[ "$p1" =~ $legacy_re ]] \
+           || [[ "$p2" =~ $ticket_re ]] || [[ "$p2" =~ $legacy_re ]]; then
+          return 0
+        fi
+        ;;
+    esac
+  done <<EOF
+$namestatus
+EOF
+  return 1
+}
+# Returns 0 if reconcile-readme.sh reports the README is in sync with
+# filesystem truth (exit=0), 1 otherwise (drift, parse error, or script
+# not located).
+_readme_refresh_reconcile_clean() {
+  local lib_dir
+  lib_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || return 1
+  local reconcile="$lib_dir/../../scripts/reconcile-readme.sh"
+  [ -f "$reconcile" ] || return 1
+  bash "$reconcile" "docs/problems" >/dev/null 2>&1
 }

package/hooks/test/itil-readme-refresh-discipline.bats CHANGED Viewed

@@ -259,3 +259,163 @@ run_bash_hook() {
   [ "$status" -eq 0 ]
   [[ "$output" != *"\"permissionDecision\": \"deny\""* ]]
 }
+# --- P230: narrative-only short-circuit (reconcile-readme is authority) ---
+#
+# When all staged ticket edits are purely narrative (Change Log entries,
+# Investigation Task checkbox ticks, prose edits — no ranking-bearing
+# field change, no rename between state subdirs, no creation/deletion)
+# AND `packages/itil/scripts/reconcile-readme.sh` reports exit=0 against
+# the current README, the hook silently passes. Ranking-bearing edits
+# still fall through to existing detection per ADR-014 single-commit
+# grain (architect verdict: reconcile is robustness layer, not
+# supersession of per-operation refresh).
+seed_valid_readme_p999_open() {
+  cat > docs/problems/README.md <<EOF
+# Problem Backlog
+## WSJF Rankings
+| ID | Title | WSJF |
+|---|---|---|
+| P999 | Test ticket | 1.0 |
+## Verification Queue
+(none)
+## Closed
+(none)
+EOF
+}
+@test "P230 allow: narrative-only edit + reconcile-readme exit=0 → allow silently" {
+  cat > docs/problems/open/999-narrative.md <<EOF
+# Problem 999: Test ticket
+**Status**: Open
+**Priority**: 1
+EOF
+  seed_valid_readme_p999_open
+  git add docs/problems/open/999-narrative.md docs/problems/README.md
+  git -c commit.gpgsign=false commit --quiet -m "seed p999"
+  # Narrative-only edit: append a Change Log line
+  echo "- 2026-05-16 — narrative tweak" >> docs/problems/open/999-narrative.md
+  git add docs/problems/open/999-narrative.md
+  run run_bash_hook "git commit -m 'docs(problems): narrative tweak'"
+  [ "$status" -eq 0 ]
+  [[ "$output" != *"\"permissionDecision\": \"deny\""* ]]
+}
+@test "P230 allow: Investigation Task checkbox tick (narrative-only) + reconcile=0 → allow silently" {
+  cat > docs/problems/open/999-checkbox.md <<EOF
+# Problem 999: Test ticket
+**Status**: Open
+**Priority**: 1
+## Investigation Tasks
+- [ ] First task
+EOF
+  seed_valid_readme_p999_open
+  git add docs/problems/open/999-checkbox.md docs/problems/README.md
+  git -c commit.gpgsign=false commit --quiet -m "seed p999"
+  # Narrative-only edit: tick a checkbox
+  sed -i.bak 's/- \[ \] First task/- [x] First task/' docs/problems/open/999-checkbox.md
+  rm docs/problems/open/999-checkbox.md.bak
+  git add docs/problems/open/999-checkbox.md
+  run run_bash_hook "git commit -m 'docs(problems): tick task'"
+  [ "$status" -eq 0 ]
+  [[ "$output" != *"\"permissionDecision\": \"deny\""* ]]
+}
+@test "P230 deny: ranking-bearing Status field change + reconcile=0 → still deny per ADR-014 single-commit grain" {
+  cat > docs/problems/open/999-ranking.md <<EOF
+# Problem 999: Test ticket
+**Status**: Open
+**Priority**: 1
+EOF
+  seed_valid_readme_p999_open
+  git add docs/problems/open/999-ranking.md docs/problems/README.md
+  git -c commit.gpgsign=false commit --quiet -m "seed p999"
+  # Ranking-bearing edit: change Status
+  sed -i.bak 's/\*\*Status\*\*: Open/\*\*Status\*\*: Verifying/' docs/problems/open/999-ranking.md
+  rm docs/problems/open/999-ranking.md.bak
+  git add docs/problems/open/999-ranking.md
+  run run_bash_hook "git commit -m 'transition'"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"\"permissionDecision\": \"deny\""* ]]
+}
+@test "P230 deny: ranking-bearing Priority field change + reconcile=0 → still deny" {
+  cat > docs/problems/open/999-priority.md <<EOF
+# Problem 999: Test ticket
+**Status**: Open
+**Priority**: 1
+EOF
+  seed_valid_readme_p999_open
+  git add docs/problems/open/999-priority.md docs/problems/README.md
+  git -c commit.gpgsign=false commit --quiet -m "seed p999"
+  # Ranking-bearing edit: change Priority
+  sed -i.bak 's/\*\*Priority\*\*: 1/\*\*Priority\*\*: 5/' docs/problems/open/999-priority.md
+  rm docs/problems/open/999-priority.md.bak
+  git add docs/problems/open/999-priority.md
+  run run_bash_hook "git commit -m 're-rate'"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"\"permissionDecision\": \"deny\""* ]]
+}
+@test "P230 deny: git mv between state subdirs (open→verifying) + no README refresh → deny (canonical iter-subprocess case)" {
+  cat > docs/problems/open/999-rename.md <<EOF
+# Problem 999: Test ticket
+**Status**: Open
+EOF
+  seed_valid_readme_p999_open
+  git add docs/problems/open/999-rename.md docs/problems/README.md
+  git -c commit.gpgsign=false commit --quiet -m "seed p999"
+  # Rename to verifying state subdir
+  git mv docs/problems/open/999-rename.md docs/problems/verifying/999-rename.md
+  run run_bash_hook "git commit -m 'transition'"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"\"permissionDecision\": \"deny\""* ]]
+}
+@test "P230 deny: narrative-only edit + reconcile-readme drift (README missing ticket) → still deny per existing logic" {
+  cat > docs/problems/open/999-narrative-drift.md <<EOF
+# Problem 999: Test ticket
+**Status**: Open
+EOF
+  # README does NOT list P999 → reconcile detects MISSING drift → exit=1
+  cat > docs/problems/README.md <<EOF
+# Problem Backlog
+## WSJF Rankings
+| ID | Title | WSJF |
+|---|---|---|
+EOF
+  git add docs/problems/open/999-narrative-drift.md docs/problems/README.md
+  git -c commit.gpgsign=false commit --quiet -m "seed p999"
+  # Narrative-only edit
+  echo "- 2026-05-16 — narrative line" >> docs/problems/open/999-narrative-drift.md
+  git add docs/problems/open/999-narrative-drift.md
+  run run_bash_hook "git commit -m 'docs(problems): narrative'"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"\"permissionDecision\": \"deny\""* ]]
+}
+# --- P231: deny message advertises correct bypass syntax (Option A) ---
+#
+# Deny message advertises `.claude/settings.json env` rather than inline
+# prefix (which doesn't propagate to PreToolUse hooks per P173).
+@test "P231 deny message advertises .claude/settings.json bypass path + P173 reference" {
+  echo "# Problem 999" > docs/problems/open/999-bypass-msg.md
+  git add docs/problems/open/999-bypass-msg.md
+  run run_bash_hook "git commit -m 'feat'"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"\"permissionDecision\": \"deny\""* ]]
+  [[ "$output" == *".claude/settings.json"* ]]
+  [[ "$output" == *"P173"* ]]
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@windyroad/itil",
-  "version": "0.30.0",
+  "version": "0.30.1-preview.315",
   "description": "ITIL-aligned IT service management for Claude Code (problem, and future incident/change skills)",
   "bin": {
     "windyroad-itil": "./bin/install.mjs"

package/skills/list-problems/SKILL.md CHANGED Viewed

@@ -56,7 +56,7 @@ ls docs/problems/*.parked.md docs/problems/parked/*.md 2>/dev/null        # for
 For each `.open.md` and `.known-error.md` file, read the `**Status**`, `**Priority**`, `**Effort**`, and `**WSJF**` lines from the frontmatter section. Compute WSJF if missing: `WSJF = (Severity × StatusMultiplier) / EffortDivisor` per `/wr-itil:manage-problem` WSJF Prioritisation. Default to M (divisor 2) when Effort is absent; flag missing scores so the user knows a review is overdue.
-For each `.verifying.md` file, read the `## Fix Released` marker and extract the release age for the `Likely verified?` column per P048 Candidate 4 (within-skill default: 14 days = `yes`).
+For each `.verifying.md` file, read the `## Fix Released` marker. The `Likely verified?` column carries an **evidence-first** cell per P186 (supersedes the original P048 Candidate 4 14-day heuristic). <!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 --> When this skill runs against a stale cache, the live-scan path reads the cell value from the `.verifying.md` ticket's `## Fix Released` section (or carries forward the prior cell value from the cached README when present); it does NOT compute the cell from age — responsibility for populating the cell moved to `/wr-itil:review-problems` Step 4 (user confirmation populates `yes — observed: …`) and `run-retro` Step 4a close-on-evidence citations.
 ### 3. Display
@@ -70,12 +70,12 @@ Render three sections matching the README.md format so cached and live output lo
 | <score> | P<NNN> | <title> | <severity> | <status> | <effort> |
 ```
-**Verification Queue** — `.verifying.md` tickets, sorted by `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC) per ADR-022 + P048 user-task semantics. <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Drift here re-opens P150.
+**Verification Queue** — `.verifying.md` tickets, sorted by `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC) per ADR-022 + P048 user-task semantics. <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Drift here re-opens P150. The `Likely verified?` column carries an **evidence-first** cell per P186 — three canonical values: `yes — observed: <evidence>`, `no — not observed` (default for newly-released tickets), `no — observed regression`. <!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 --> Drift on the cell shape re-opens P186.
 ```
 | ID | Title | Released | Likely verified? |
 |----|-------|----------|------------------|
-| P<NNN> | <title> | <release marker> | yes (N days) / no (N days) |
+| P<NNN> | <title> | <release marker> | <yes — observed: …  /  no — not observed  /  no — observed regression> |
 ```
 **Parked** — `.parked.md` tickets:
@@ -106,7 +106,8 @@ After the tables, print one of two short pointers depending on what the output s
 - **ADR-022** (`docs/decisions/022-verification-pending-status.proposed.md`) — Verification Pending status conventions; `.verifying.md` exclusion from WSJF ranking.
 - **ADR-037** (`docs/decisions/037-skill-testing-strategy.proposed.md`) — contract-assertion bats pattern applied to this skill.
 - **P031** — git-history freshness check rationale (mtime unreliable in worktrees).
-- **P048** Candidate 4 — the 14-day `Likely verified?` heuristic.
+- **P048** Candidate 4 — original `Likely verified?` column (14-day age-heuristic). Superseded by P186.
+- **P186** — evidence-first cell shape (`yes — observed: <evidence>` / `no — not observed` / `no — observed regression`) replaces the age-based heuristic; `<!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 -->` drives cross-skill drift detection.
 - **JTBD-001** (`docs/jtbd/solo-developer/JTBD-001-enforce-governance.proposed.md`) — discoverable surface via `/wr-itil:` autocomplete.
 - **JTBD-101** (`docs/jtbd/plugin-developer/JTBD-101-extend-suite.proposed.md`) — one skill per distinct user intent.
 - `packages/itil/skills/manage-problem/SKILL.md` — hosts the thin-router forwarder for the deprecated `manage-problem list` form.

package/skills/manage-incident/SKILL.md CHANGED Viewed

@@ -152,21 +152,33 @@ next=$(printf 'I%03d' $((10#${last:-0} + 1)))
 echo "$next"
 ```
-### 4. For new incidents: Gather information (ADR-044 category-1 direction-setting)
+### 4. For new incidents: Gather information (P132 derive-first; ADR-044 category-4 silent-framework on derivable fields; category-1 direction-setting fallback only on Scope)
-Use `AskUserQuestion` for anything not in `$ARGUMENTS`. Incident-declaration inputs are user-knowledge that the framework cannot infer (only the user observed the symptoms / knows the scope / can rate live business impact); this is the canonical ADR-044 **category-1 (direction-setting)** surface — *"only the user knows the goals that haven't been written down yet."*
+**Derive-first dispatch.** Incident declarations carry observable evidence in the user's prose, the working tree, `RISK-POLICY.md`, and the wall-clock — the framework can resolve most fields without firing `AskUserQuestion`. Only **Scope** is genuinely user-judgment (semantic blast-radius the framework cannot infer); only **Scope** retains an unconditional `AskUserQuestion` gate (Severity falls back to `AskUserQuestion` solely when the evidence is ambiguous — see the dispatch table below).
-- **Title**: short kebab-case-friendly description
-- **Symptoms**: what is observable (errors, latency, missing data)?
-- **Scope**: who/what is affected (users, endpoints, regions)?
-- **Start time**: when did symptoms begin? (UTC, as precise as known)
-- **Severity**: Impact (1-5) × Likelihood (1-5) per `RISK-POLICY.md`, interpreted as live impact
+The P132 inverse-P078 trap (`docs/problems/known-error/132-agents-over-ask-in-interactive-sessions-conflating-mechanical-stages-with-user-interactive-stages.md`) is the load-bearing motivation: the I001 declaration regression fired a 4-question AskUserQuestion with 3 of 4 sub-questions being lazy classifications (Title kebab-derivable, Severity matrix-derivable, Start time git-log-derivable). This dispatch closes that regression on the manage-incident surface and mirrors `/wr-itil:capture-problem` Step 1.5's worked-example pattern (P185 derive-first refactor).
-Do not ask for fields that can be inferred:
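+Resolve each field via the following dispatch. **The order is load-bearing** — every field except Scope resolves silently when the evidence supports it (Title, Start time, and Severity each emit a stderr advisory citing the source; Symptoms is copied verbatim from the description); Scope always fires `AskUserQuestion` as the genuine category-1 surface, and Severity falls back to `AskUserQuestion` only when the evidence is ambiguous (category-5).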
+| Field | Dispatch | ADR-044 category |
+|-------|----------|------------------|
+| **Title** | Derive silently. Kebab-case the first 8-10 non-stopword tokens of the user's prose description (same slug derivation as `/wr-itil:capture-problem` Step 1.4 and `/wr-itil:manage-problem` Step 4). Emit stderr advisory: `manage-incident: derived title='<slug>' from description; re-invoke or rename the file if the slug is wrong`. Do NOT fire AskUserQuestion. | category-4 silent-framework |
+| **Symptoms** | Pull from user prose verbatim — the description text IS the symptoms surface for declaration. Place into the `## Observations` section template at Step 5. Do NOT fire AskUserQuestion. | category-4 silent-framework |
+| **Start time** | Derive silently, three sources in priority order: (a) explicit timestamp in description (regex `\b\d{4}-\d{2}-\d{2}([ T]\d{2}:\d{2})?\b`, or relative form `"<N> (minutes|hours|days) ago"` resolved against current wall-clock); (b) if the description cites a specific file/dir/changeset-holding-area, run `git log --diff-filter=A --follow -- <path> \| tail -1` for first-touch evidence (the I001 regression's "first hold at 2026-04-24" was this exact shape — `git log --diff-filter=A --follow -- docs/changesets-holding/`); (c) otherwise default to current wall-clock UTC. Emit stderr advisory: `manage-incident: start-time derived as <ts> from <source>; cite an additional evidence anchor in the Timeline section if symptoms began earlier`. Do NOT fire AskUserQuestion. | category-4 silent-framework |
+| **Severity** | Derive silently when evidence maps to a clear `RISK-POLICY.md` Impact × Likelihood cell. Cross-reference description signals against the matrix: (a) impact signals (service disruption keywords like `down` / `degraded` / `unavailable` → high; latency / throughput keywords → moderate; cosmetic / typo keywords → low); (b) likelihood signals (`reproducible` / `every request` → high; `intermittent` / `flaky` → medium; `one-off` / `single user` → low); (c) named anchors (held-cluster age cited → use that age to map cell; scorer state cited → use the cited band). When the cross-reference produces a single clear cell, set it silently and emit stderr advisory: `manage-incident: severity derived as <score> (<label>) from RISK-POLICY matrix + evidence: <evidence list>; re-invoke or update if mis-rated`. **Ambiguous-evidence fallback** (no mappable signal in description, or signals point to conflicting cells): fire AskUserQuestion with the Impact (1-5) × Likelihood (1-5) options as the genuine ADR-044 **category-5 (taste)** fallback surface. The fallback is genuine ambiguity, NOT defaults. | category-4 silent-framework (derivable); category-5 fallback (ambiguous) |
+| **Scope** | Retain AskUserQuestion. Scope is the user-judgment surface — only the user knows whether downstream-adopter-risk is in scope, whether mobile is affected, whether the blast radius extends past the cited symptoms. The framework cannot resolve semantic scope deterministically (same reasoning as Step 2 duplicate-check). Construct the call with `header: "Incident scope"`, `multiSelect: false` if a closed enum applies or free-text capture otherwise. This is the canonical ADR-044 **category-1 (direction-setting)** surface — *"only the user knows the goals that haven't been written down yet."* | category-1 direction-setting |
+**Inferred fields (no ask, no advisory needed)**:
 - **Reported**: today's date (UTC)
 - **Status**: always "Investigating" for new incidents
+**Stderr advisory contract**: each derived field emits a SINGLE line to stderr (NOT stdout, NOT in the ticket body) per the capture-problem Step 1.5 pattern. The advisory text shape is I2-isomorphic — identical sentence structure across fields beyond substituted values + source names. Embedding the advisory in stdout would risk machine-readers parsing it as a ticket-body line; embedding it in the ticket body would violate ADR-011's required-section schema. Stderr is the correct channel — visible to interactive maintainers in the terminal; invisible to ticket consumers; loggable by orchestrators that capture subprocess stderr.
+**ADR-026 cost-source grounding**: each derived field cites its source in the advisory (description token sequence for Title; explicit-regex / `git log` / wall-clock for Start time; RISK-POLICY matrix cell + named evidence for Severity). The `re-invoke or update if mis-rated` clause carries the reversibility marker ADR-026 mandates for ungrounded outputs.
+**AFK fail-safe (ADR-013 Rule 6)**: under AFK orchestration, all derivable fields resolve without interactive input; only Scope's AskUserQuestion — or the Severity ambiguous-evidence fallback — can block. The orchestrator should halt-with-stderr citing which field needed input rather than guess (Scope is genuinely user-judgment per JTBD-006's "Problems requiring my judgment ... are queued for my return, not guessed at"). manage-incident is rarely AFK-invoked because incidents are interactive by design (JTBD-201), so the halt-on-Scope path is the expected behaviour, not a regression.
 ### 5. For new incidents: Write the incident file
 **File path**: `docs/incidents/<I###>-<kebab-case-title>.investigating.md`
@@ -331,7 +343,9 @@ Otherwise, after the commit in step 14 lands, drain the release queue so the fix
 ## Related
 - **P136** (`docs/problems/136-adr-044-alignment-audit-master.open.md`) — ADR-044 alignment audit master. This skill is the third high-ask SKILL audited under Phase 2 (after work-problem singular and mitigate-incident).
-- **ADR-044** (`docs/decisions/044-decision-delegation-contract.proposed.md`) — Decision-Delegation Contract. All four AskUserQuestion surfaces in this skill align with the 6-class authority taxonomy: Step 2 duplicate-check is **category-1 (direction-setting)**; Step 4 gather-info is **category-1 (direction-setting)**; Step 6 evidence-gate is **category-2 (deviation-approval)**; Step 14 risk-above-appetite is **category-3 (one-time-override)**.
+- **ADR-044** (`docs/decisions/044-decision-delegation-contract.proposed.md`) — Decision-Delegation Contract. The skill's AskUserQuestion surfaces align with the 6-class authority taxonomy: Step 2 duplicate-check is **category-1 (direction-setting)**; Step 4 is **category-4 (silent-framework)** on Title / Symptoms / Start time / Severity-when-evidence-present + **category-1 (direction-setting)** on Scope + **category-5 (taste)** fallback on Severity-on-ambiguity (P132 derive-first refactor 2026-05-15 re-classified Step 4 from "single cat-1 declaration" to "derive-first dispatch with cat-1 / cat-5 fallback only"); Step 6 evidence-gate is **category-2 (deviation-approval)**; Step 14 risk-above-appetite is **category-3 (one-time-override)**.
+- **P132** (`docs/problems/known-error/132-agents-over-ask-in-interactive-sessions-conflating-mechanical-stages-with-user-interactive-stages.md`) — Agents over-ask in interactive sessions (inverse-P078). Step 4 derive-first refactor closes the 2026-05-06 I001 declaration regression where 3 of 4 sub-questions were lazy classifications. Composes with P185 (capture-problem Step 1.5 derive-first refactor — the in-tree worked-example precedent).
+- **P185** (`docs/problems/...`) — capture-problem Step 1.5 derive-first refactor. Step 4 mirrors the same dispatch shape (silent classifier + stderr advisory + AskUserQuestion only on ambiguity).
 - **ADR-013 amended Rule 1** (`docs/decisions/013-structured-user-interaction-for-governance-decisions.proposed.md`) — structured user interaction; narrowed in P135 to defer to ADR-044 for framework-resolution boundary. All four surfaces retain `AskUserQuestion` as genuine user-authority surfaces under categories enumerated in ADR-044.
 - **ADR-013 Confirmation criterion #1** — `grep -inE "Options:.*\(a\)\|Your call:\|which would you like\|which way?"` returns zero matches. Step 2's prior prompt body violated this with `Would you like to (a) update...` phrasing; the P136 Phase 2 refactor (2026-04-28) closed the regression by lifting options into the `AskUserQuestion` `options[]` mechanism.
 - **ADR-011** (`docs/decisions/011-manage-incident-skill.proposed.md`) — incident lifecycle; evidence-first workflow; reversible-mitigation preference; Sev 4-5 lightweight path. Step 6's evidence-gate refactor (2026-04-28) extends ADR-011's evidence-first rule with the documented `Record anyway` audit-trail bypass that mitigate-incident already used (cool-headed-commitment consistency across the two incident skills).

package/skills/manage-incident/test/manage-incident-adr-044-contract.bats CHANGED Viewed

@@ -67,7 +67,8 @@ setup() {
 }
 # ----------------------------------------------------------------------
-# Surface 2 — Step 4 gather info (cat-1 cosmetic cross-ref)
+# Surface 2 — Step 4 gather info (P132 derive-first refactor — cat-4 silent-framework
+# on derivable fields; cat-1 direction-setting fallback only on Scope)
 # ----------------------------------------------------------------------
 @test "SKILL.md Step 4 gather-info cross-references ADR-044 category-1 (direction-setting)" {
@@ -77,6 +78,73 @@ setup() {
   [[ "$output" == *"direction-setting"* ]] || [[ "$output" == *"category 1"* ]] || [[ "$output" == *"category-1"* ]]
 }
+@test "SKILL.md Step 4 cross-references ADR-044 category-4 (silent-framework) for derivable fields (P132 derive-first)" {
+  # P132 derive-first refactor: Title / Symptoms / Start time / Severity-when-evidence-present
+  # resolve via silent-framework per ADR-044 category 4. Only Scope retains AskUserQuestion as
+  # the genuine category-1 direction-setting surface.
+  run awk '/^### 4\. /,/^### 5\. /' "$SKILL_FILE"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"silent-framework"* ]] || [[ "$output" == *"category 4"* ]] || [[ "$output" == *"category-4"* ]]
+}
+@test "SKILL.md Step 4 derives Title from prose silently (P132 inverse-P078)" {
+  # I001 regression cited in P132 line 14: agent asked "Title" with 3 candidate
+  # options when kebab-casing the description would have produced the slug directly.
+  # The refactor names "Title" + "derive"/"derived"/"kebab" in the same step.
+  run awk '/^### 4\. /,/^### 5\. /' "$SKILL_FILE"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"Title"* ]]
+  [[ "$output" == *"derive"* ]] || [[ "$output" == *"derived"* ]]
+  [[ "$output" == *"kebab"* ]] || [[ "$output" == *"prose"* ]]
+}
+@test "SKILL.md Step 4 derives Start time from evidence sources (P132 inverse-P078)" {
+  # I001 regression cited in P132 line 16: agent asked "Start time" with 3 candidate
+  # options when git log first-touch evidence would have produced 2026-04-24 directly.
+  # The refactor names git-log / timestamp / wall-clock as the three priority-ordered sources.
+  run awk '/^### 4\. /,/^### 5\. /' "$SKILL_FILE"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"Start time"* ]] || [[ "$output" == *"start-time"* ]]
+  [[ "$output" == *"git log"* ]] || [[ "$output" == *"timestamp"* ]]
+}
+@test "SKILL.md Step 4 derives Severity from RISK-POLICY matrix + evidence (P132 inverse-P078)" {
+  # I001 regression cited in P132 line 15: agent asked "Severity" with 4 candidate
+  # options when the RISK-POLICY matrix + observable evidence (cluster age, scorer
+  # state) maps to a clear cell. The refactor cites RISK-POLICY.md + evidence in
+  # the Severity row of the dispatch table. Ambiguous-evidence fallback to
+  # AskUserQuestion is preserved as the genuine cat-5 (taste) surface.
+  run awk '/^### 4\. /,/^### 5\. /' "$SKILL_FILE"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"Severity"* ]]
+  [[ "$output" == *"RISK-POLICY"* ]]
+}
+@test "SKILL.md Step 4 retains Scope as AskUserQuestion direction-setting (negative-of-negative guard)" {
+  # Regression-resistance: the refactor MUST preserve the genuine cat-1 direction-setting
+  # surface on Scope. Semantic scope (who/what affected, blast radius) is user-judgment;
+  # the framework cannot resolve it deterministically. Same reasoning as Step 2 duplicate-check.
+  run awk '/^### 4\. /,/^### 5\. /' "$SKILL_FILE"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"Scope"* ]]
+  [[ "$output" == *"AskUserQuestion"* ]]
+}
+@test "SKILL.md Step 4 cites P132 (inverse-P078 audit traceability)" {
+  # P132 + ADR-044 must appear in Step 4 or Related section so the audit trail
+  # for the I001 regression fix is recoverable from the SKILL.md surface.
+  run grep -nE "P132" "$SKILL_FILE"
+  [ "$status" -eq 0 ]
+}
+@test "SKILL.md Step 4 documents stderr advisory shape for derived fields (ADR-026 grounding)" {
+  # ADR-026 cost-source grounding: each silent derivation emits a stderr advisory
+  # citing the source. Pattern parity with capture-problem Step 1.5 stderr advisory.
+  run awk '/^### 4\. /,/^### 5\. /' "$SKILL_FILE"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"stderr"* ]] || [[ "$output" == *"advisory"* ]]
+}
 # ----------------------------------------------------------------------
 # Surface 3 — Step 6 evidence-first gate refactor (cat-2; align with mitigate-incident)
 # ----------------------------------------------------------------------

package/skills/manage-problem/SKILL.md CHANGED Viewed

@@ -374,19 +374,34 @@ next=$(printf '%03d' $(( 10#$(echo -e "${local_max:-0}\n${origin_max:-0}" | sort
 If the local choice would have collided with an origin ticket created since the last fetch, the `git ls-tree origin/<base>` lookup catches it here and the renumber is automatic. Log the renumber decision in the operation report (e.g. "Bumped next ID from 042 → 043 to avoid collision with origin").
-### 4. For new problems: Gather information
+### 4. For new problems: Gather information (P132 derive-first; ADR-044 category-4 silent-framework on derivable fields; category-1 direction-setting fallback only on Description)
-If the arguments contain a description, extract what you can. For anything missing, use `AskUserQuestion` to gather:
+**Derive-first dispatch.** Problem-declaration inputs carry observable evidence in the user's prose, the working tree, `RISK-POLICY.md`, and the wall-clock — the framework can resolve most fields without firing `AskUserQuestion`. Only **Description** is genuinely user-knowledge (without prose there is literally nothing to capture), so only **Description** retains a category-1 `AskUserQuestion` gate; **Priority** falls back to `AskUserQuestion` solely on ambiguous evidence (the category-5 fallback in the dispatch table below).
-- **Title**: Short kebab-case-friendly description
-- **Description**: What is happening? What should happen instead?
-- **Priority**: Impact (1-5) × Likelihood (1-5) per RISK-POLICY.md
+The P132 inverse-P078 trap (`docs/problems/known-error/132-...md`) is the load-bearing motivation. The 2026-05-06 I001 declaration regression cited in P132 fired a 4-question AskUserQuestion with 3 of 4 sub-questions being lazy classifications (Title kebab-derivable, Severity matrix-derivable, Start time git-log-derivable). manage-problem Step 4 is the second declaration-skill surface under Phase 2a (after manage-incident Step 4 in commit b7cc645) to ship the derive-first dispatch. The pattern is isomorphic across `/wr-itil:capture-problem` Step 1.5 (P185 worked example), `/wr-itil:manage-incident` Step 4, and this skill.
-Do NOT ask for fields that can be inferred:
-- **Reported date**: Use today's date
-- **Status**: Always "Open" for new problems
-- **Symptoms**: Infer from description if possible
-- **Workaround**: Default to "None identified yet." unless obvious from context
+Resolve each field via the following dispatch. **The order is load-bearing** — Title and Priority resolve silently with a stderr advisory citing the source (Priority asks only on its ambiguous-evidence category-5 fallback); Description alone fires `AskUserQuestion` as the genuine category-1 surface, and only when `$ARGUMENTS` carries no prose.
+| Field | Dispatch | ADR-044 category |
+|-------|----------|------------------|
+| **Title** | Derive silently. Kebab-case the first 8-10 non-stopword tokens of the user's prose description (same slug derivation as `/wr-itil:capture-problem` Step 1.4 and `/wr-itil:manage-incident` Step 4). Emit stderr advisory: `manage-problem: derived title='<slug>' from description; re-invoke with the desired title or rename the file if the slug is wrong`. Do NOT fire AskUserQuestion. | category-4 silent-framework |
+| **Description** | Pull verbatim from `$ARGUMENTS` prose into Step 5's `## Description` section. **Fallback**: when `$ARGUMENTS` carries NO prose at all (only flags / status / no body), fire AskUserQuestion as the genuine category-1 direction-setting surface — *"only the user knows the goals that haven't been written down yet."* Question text: *"What is happening? What should happen instead?"* This is the ONLY user-knowledge field at Step 4. | category-1 direction-setting (fallback only; category-4 silent-framework on the typical path where prose is present) |
+| **Priority** (Impact × Likelihood) | Derive silently when description signals map to a clear `RISK-POLICY.md` Impact × Likelihood cell. Cross-reference signals: (a) **impact** — service-disruption keywords (`down` / `degraded` / `unavailable` / `data loss` → high; latency / throughput / slow → moderate; cosmetic / typo / minor friction → low); (b) **likelihood** — reproducibility keywords (`every invocation` / `reproducible` / `100%` → high; `intermittent` / `flaky` / `sometimes` → medium; `one-off` / `single observation` → low); (c) **named anchors** — explicit `Impact: <label>` / `Likelihood: <label>` or `Priority: <score>` mentions in prose take precedence. When the cross-reference produces a single clear cell, set it silently and emit stderr advisory: `manage-problem: priority derived as <score> (<label>) from RISK-POLICY matrix + evidence: <evidence list>; re-invoke or update if mis-rated`. **Ambiguous-evidence fallback** (no mappable signal, or signals point to conflicting cells): fire AskUserQuestion with the Impact (1-5) × Likelihood (1-5) options as the genuine ADR-044 **category-5 (taste)** fallback surface. The fallback is genuine ambiguity, NOT defaults. | category-4 silent-framework (derivable); category-5 fallback (ambiguous) |
+**Inferred fields (no ask, no advisory needed)**:
+- **Reported date**: today's date (`date +%Y-%m-%d`)
+- **Status**: always "Open" for new problems
+- **Symptoms**: infer from description verbatim into Step 5's `## Symptoms` section
+- **Workaround**: default to "None identified yet." unless explicit workaround prose appears in `$ARGUMENTS`
+**Stderr advisory contract**: each derived field emits a SINGLE line to stderr (NOT stdout, NOT in the ticket body) per the capture-problem Step 1.5 + manage-incident Step 4 pattern. The advisory text shape is I2-isomorphic — identical sentence structure across the three declaration-skill surfaces (`<skill>: derived <field>=<value> from <source>; <reversibility-clause>`) beyond substituted values + source names. Embedding the advisory in stdout would risk machine-readers parsing it as a ticket-body line; embedding it in the ticket body would violate the required-section schema. Stderr is the correct channel — visible to interactive maintainers in the terminal; invisible to ticket consumers; loggable by orchestrators that capture subprocess stderr.
+**ADR-026 cost-source grounding**: each derived field cites its source in the advisory (description token sequence for Title; RISK-POLICY matrix cell + named evidence for Priority). The `re-invoke or update if mis-rated` clause carries the reversibility marker ADR-026 mandates for ungrounded outputs.
+**AFK fail-safe (ADR-013 Rule 6)**: under AFK orchestration, all derivable fields resolve without interactive input; only Description-when-absent can block. The orchestrator should halt-with-stderr citing the missing-prose case rather than guess (Description is genuinely user-judgment per JTBD-006's "Problems requiring my judgment ... are queued for my return, not guessed at"). The typical AFK manage-problem call carries prose in `$ARGUMENTS` (or the orchestrator's per-iter context supplies it), so the halt-on-Description path is the rare-corner-case behaviour, not the routine flow.
+**Cross-skill consistency note**: this is the third declaration-skill surface to ship the derive-first dispatch (after `/wr-itil:capture-problem` Step 1.5 and `/wr-itil:manage-incident` Step 4 in commit b7cc645). The architect verdict 2026-05-15 P132 Phase 2a-ii flagged this triplet as the pattern-lock point — the I2-isomorphic stderr advisory format is now established across three skills before Phase 2a-iii (`/wr-architect:create-adr` argument-collection) extends the same pattern to a fourth.
 ### 4b. For new problems: Concern-boundary analysis (multi-concern check)
@@ -483,6 +498,8 @@ After writing the new `.open.md` file, regenerate `docs/problems/README.md` to i
 **Verification Queue sort direction (P150)**: rows in the Verification Queue table are sorted by `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC) per ADR-022 + P048 user-task semantics — older entries are the most likely-verified candidates the user wants to surface first when closing the queue. Newest-first ordering pushes those actionable closure candidates below the fold and contradicts the section header. <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Any future change to the VQ sort direction MUST update this render block, the Step 7 P062 block, the Step 9c presentation block, the Step 9e template, AND `/wr-itil:review-problems` + `/wr-itil:transition-problem` + `/wr-itil:transition-problems` + `/wr-itil:reconcile-readme` + `/wr-itil:list-problems` — drift here re-opens P150.
+**Likely-verified cell shape (P186)**: the `Likely verified?` column carries an **evidence-first** cell — `yes — observed: <evidence>` / `no — not observed` / `no — observed regression`. The 14-day age-based heuristic (originally introduced by P048 Candidate 4) is superseded — age is preserved separately via the `Released` column; the `Likely verified?` column is reserved for session-observed evidence (a Step 9d user confirmation or the equivalent `/wr-itil:review-problems` Step 4 confirmation, in-session test invocation outcome per ADR-026 grounding, or `run-retro` Step 4a close-on-evidence citation). <!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 --> Any future change to the cell shape MUST update this render block, the Step 7 P062 block, the Step 9c presentation block, the Step 9e template, AND `/wr-itil:review-problems` + `/wr-itil:transition-problem` + `/wr-itil:transition-problems` + `/wr-itil:reconcile-readme` + `/wr-itil:list-problems` — drift here re-opens P186.
 1. After `Write`-ing the new `.open.md` file (and, for multi-concern splits per step 4b, after all split files are written), regenerate `docs/problems/README.md` in-place reflecting the new filename set.
 2. Update the "Last reviewed" line per the **Last-reviewed line discipline (P134)** subsection below — name the new ticket as the most-recent fragment (e.g. `P<NNN> opened — <one-line title>`); displaced prior fragments rotate to `docs/problems/README-history.md`.
 3. `git add docs/problems/README.md` — the stage list at Step 11 must include it alongside the new `.open.md` file (Step 11's `git add -u` catch-all handles tracked-file modifications; the new README render lands via this path when README.md already exists in git, and via an explicit `git add docs/problems/README.md` when it is newly created). When line-3 truncation displaces prior content, also `git add docs/problems/README-history.md`.
@@ -650,6 +667,8 @@ The refresh uses the same rendering rules as Step 9e (dual-tolerant glob per RFC
 **Verification Queue sort direction (P150)**: rows in the Verification Queue table are sorted by `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC) per ADR-022 + P048 user-task semantics — older entries are the most likely-verified candidates the user wants to surface first when closing the queue. Newest-first ordering pushes those actionable closure candidates below the fold and contradicts the section header. <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Any future change to the VQ sort direction MUST update this render block, the Step 5 P094 block, the Step 9c presentation block, the Step 9e template, AND `/wr-itil:review-problems` + `/wr-itil:transition-problem` + `/wr-itil:transition-problems` + `/wr-itil:reconcile-readme` + `/wr-itil:list-problems` — drift here re-opens P150.
+**Likely-verified cell shape (P186)**: the `Likely verified?` column carries an **evidence-first** cell — `yes — observed: <evidence>` / `no — not observed` / `no — observed regression`. Age is preserved separately via the `Released` column; session-observed evidence drives the cell. On a Known Error → Verification Pending transition the refresh writes `no — not observed` as the default (no observed evidence yet at the moment of release). <!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 --> Any future change to the cell shape MUST update this render block, the Step 5 P094 block, the Step 9c presentation block, the Step 9e template, AND `/wr-itil:review-problems` + `/wr-itil:transition-problem` + `/wr-itil:transition-problems` + `/wr-itil:reconcile-readme` + `/wr-itil:list-problems` — drift here re-opens P186.
 **Mechanism:**
 1. After renaming + Editing + `git add`-ing the transitioned ticket file (per the staging-trap rule above), regenerate `docs/problems/README.md` in-place reflecting the new filename set and the transitioned ticket's new Status.
@@ -745,14 +764,15 @@ After reviewing all problems, present a WSJF-ranked table for open/known-error p
 | WSJF | ID | Title | Severity | Status | Effort | Reported | Notes |
 |------|-----|-------|----------|--------|--------|----------|-------|
-Then present a separate **Verification Queue** section for `.verifying.md` files (per ADR-022 — ranked by release age, oldest first; no WSJF because the multiplier is 0). Sort key + direction is the canonical `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC) — drift here re-opens P150. <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Highlight each ticket whose release age is **≥ 14 days** (the within-skill default per P048 Candidate 4 — tunable; if it needs cross-skill consistency later, promote to policy) with a `likely verified` marker in the final column. This makes the Verification Queue not just a list but a ranked view of which verifications are most likely ready to close — older entries are the most likely-verified candidates the user wants to surface first when closing the queue:
+Then present a separate **Verification Queue** section for `.verifying.md` files (per ADR-022 — ranked by release age, oldest first; no WSJF because the multiplier is 0). Sort key + direction is the canonical `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC) — drift here re-opens P150. <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> The final `Likely verified?` column carries an **evidence-first** cell (per P186 — supersedes the original P048 Candidate 4 14-day heuristic). <!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 --> Three canonical values:
 | ID | Title | Released | Fix summary | Likely verified? |
 |----|-------|----------|-------------|------------------|
-The `Likely verified?` column takes values:
-- `yes (N days)` — release age ≥ 14 days; the user is unlikely to revert a landed fix after this long. Surface these first in step 9d's verification prompt so the user can batch-close them.
-- `no (N days)` — release age < 14 days; may still be in validation. Fire step 9d for these too, but without the highlight.
+The `Likely verified?` column takes values (per P186):
+- `yes — observed: <evidence>` — session-observed evidence the fix works. Cite the evidence inline (≤ 120 chars): a Step 9d user confirmation phrase quoted, an in-session test invocation + observable outcome per ADR-026 grounding, or a `run-retro` Step 4a close-on-evidence citation. Surface these FIRST in step 9d's verification prompt so the user can batch-close them.
+- `no — not observed` — fix released but no session-observable evidence yet. Default for newly-released tickets. Fire step 9d for these too, without batch-close highlight. Aging surfaces via the `Released` column — NOT in this cell.
+- `no — observed regression` — fix released and the bug recurred this session. Cite the recurrence inline (≤ 120 chars). Do NOT batch-close — these may warrant `.verifying.md` → `.known-error.md` flip-back via `/wr-itil:transition-problem`.
 Then present a separate **Parked** section listing `.parked.md` files (no ranking):
@@ -805,11 +825,11 @@ Edit each problem file where the priority changed. Then write/overwrite `docs/pr
 ## Verification Queue
-Fix released, awaiting user verification (driven off `docs/problems/*.verifying.md` via glob — per ADR-022). Sorted by `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC). <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Drift here re-opens P150 — any change to VQ sort direction MUST update the Step 5 P094 block, the Step 7 P062 block, the Step 9c presentation block, this template, AND `/wr-itil:review-problems` + `/wr-itil:transition-problem` + `/wr-itil:transition-problems` + `/wr-itil:reconcile-readme` + `/wr-itil:list-problems`.
+Fix released, awaiting user verification (driven off `docs/problems/*.verifying.md` via glob — per ADR-022). Sorted by `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC). <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Drift here re-opens P150 — any change to VQ sort direction MUST update the Step 5 P094 block, the Step 7 P062 block, the Step 9c presentation block, this template, AND `/wr-itil:review-problems` + `/wr-itil:transition-problem` + `/wr-itil:transition-problems` + `/wr-itil:reconcile-readme` + `/wr-itil:list-problems`. The `Likely verified?` column carries an **evidence-first** cell per P186 — three canonical values: `yes — observed: <evidence>`, `no — not observed` (default for newly-released tickets), `no — observed regression`. <!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 --> Age is preserved separately via the `Released` column — drift on the cell shape re-opens P186.
-| ID | Title | Released | Fix summary |
-|----|-------|----------|-------------|
-| P<NNN> | <title> | <release marker> | <one-sentence fix summary> |
+| ID | Title | Released | Fix summary | Likely verified? |
+|----|-------|----------|-------------|------------------|
+| P<NNN> | <title> | <release marker> | <one-sentence fix summary> | <yes — observed: …  /  no — not observed  /  no — observed regression> |
 ...
 ## Parked

package/skills/manage-problem/test/manage-problem-adr-044-step4-derive-first.bats ADDED Viewed

@@ -0,0 +1,151 @@
+#!/usr/bin/env bats
+# ADR-044 alignment contract assertions for manage-problem SKILL.md
+# Step 4 (P132 Phase 2a-ii derive-first refactor, 2026-05-15).
+#
+# tdd-review: structural-permitted (justification: SKILL.md prose contract
+# assertions; behavioural skill-runtime harness pending P012 + P081 Phase 2;
+# expected to migrate to behavioural form once the harness exists. Added
+# during P132 Phase 2a-ii per the inline plan's bridge-marker rule —
+# isomorphic precedent at manage-incident-adr-044-contract.bats Surface 2.)
+#
+# This file is the dedicated structural-grep-permitted home for the ADR-044
+# alignment contract during the bridge window. After P081 Phase 2 retrofits
+# the project's structural-grep tests to behavioural form, this file's
+# assertions migrate too.
+#
+# @problem P132 (agents over-ask in interactive sessions — Phase 2a-ii
+#   manage-problem create flow derive-first refactor)
+# @problem P185 (capture-problem Step 1.5 worked-example precedent)
+# @problem P136 (ADR-044 alignment audit master)
+# @adr ADR-044 (Decision-Delegation Contract)
+# @adr ADR-013 amended Rule 1 (structured user interaction)
+# @adr ADR-026 (cost-source grounding — stderr advisory shape)
+# @adr ADR-052 (behavioural-by-default with structural bridge window)
+# @jtbd JTBD-001 (enforce governance without slowing down — primary)
+# @jtbd JTBD-006 (work backlog AFK — queued for return, not guessed at)
+# @jtbd JTBD-101 (extend the suite with consistent patterns)
+setup() {
+  # Point SKILL_FILE at the SKILL.md that sits one directory above this test
+  # file's directory. The final existence check makes setup return non-zero
+  # when the file is missing.
+  SKILL_DIR="$(cd "$(dirname "$BATS_TEST_FILENAME")/.." && pwd)"
+  SKILL_FILE="$SKILL_DIR/SKILL.md"
+  test -f "$SKILL_FILE"
+}
+# ----------------------------------------------------------------------
+# Step 4 derive-first refactor (P132 Phase 2a-ii) — cat-4 silent-framework
+# on Title + Priority-when-evidence-present; cat-1 direction-setting only
+# on Description; cat-5 taste fallback only on Priority-when-ambiguous.
+# ----------------------------------------------------------------------
+@test "SKILL.md Step 4 cross-references ADR-044 category-4 (silent-framework) for derivable fields (P132 derive-first)" {
+  # P132 Phase 2a-ii contract: Title and Priority-with-evidence are resolved by
+  # the framework silently (ADR-044 category 4); Description is the only field
+  # that keeps AskUserQuestion as a cat-1 direction-setting surface.
+  # Any one of the three category-4 spellings in the Step 4 section satisfies this.
+  local step4_range='/^### 4\. /,/^### 4b\. /'
+  run awk "$step4_range" "$SKILL_FILE"
+  test "$status" -eq 0
+  [[ "$output" == *"silent-framework"* || "$output" == *"category 4"* || "$output" == *"category-4"* ]]
+}
+@test "SKILL.md Step 4 cross-references ADR-044 category-1 (direction-setting) for Description fallback" {
+  # Description stays the genuine cat-1 surface: with no prose there is nothing
+  # to capture, so the refactor keeps AskUserQuestion for it. Any one of the
+  # three category-1 spellings in the Step 4 section satisfies this.
+  local step4_range='/^### 4\. /,/^### 4b\. /'
+  run awk "$step4_range" "$SKILL_FILE"
+  test "$status" -eq 0
+  [[ "$output" == *"direction-setting"* || "$output" == *"category 1"* || "$output" == *"category-1"* ]]
+}
+@test "SKILL.md Step 4 derives Title from prose silently (P132 inverse-P078)" {
+  # The 2026-05-06 I001 declaration regression cited in P132 line 14 was the
+  # same agent failure mode on the manage-incident surface: agent asked
+  # "Title" with 3 candidate options when kebab-casing the description
+  # would have produced the slug directly. manage-problem Step 4 must ship
+  # the same derive-first pattern.
+  #
+  # Requires the Step 4 section to mention "Title", a "derive"-stem word, and
+  # either "kebab" or "prose".
+  run awk '/^### 4\. /,/^### 4b\. /' "$SKILL_FILE"
+  [ "$status" -eq 0 ]
+  # `|| false`: on Bash < 4.1 (e.g. macOS 3.2) a failing [[ ]] that is not the last
+  # command of the test does not fail the test; the explicit `false` makes it fatal.
+  [[ "$output" == *"Title"* ]] || false
+  # *"derive"* already matches "derived", so no separate "derived" alternative is needed.
+  [[ "$output" == *"derive"* ]] || false
+  [[ "$output" == *"kebab"* ]] || [[ "$output" == *"prose"* ]]
+}
+@test "SKILL.md Step 4 derives Priority from RISK-POLICY matrix + evidence (P132 inverse-P078)" {
+  # The I001 regression cited in P132 line 15 was the analogous failure on
+  # Severity. manage-problem's Priority (Impact x Likelihood) derives from
+  # the same RISK-POLICY matrix lookup against description signals.
+  # Ambiguous-evidence falls back to AskUserQuestion as cat-5 (taste).
+  #
+  # Requires the Step 4 section to mention both "Priority" and "RISK-POLICY".
+  run awk '/^### 4\. /,/^### 4b\. /' "$SKILL_FILE"
+  [ "$status" -eq 0 ]
+  # `|| false`: on Bash < 4.1 (e.g. macOS 3.2) a failing [[ ]] that is not the last
+  # command of the test does not fail the test; the explicit `false` makes it fatal.
+  [[ "$output" == *"Priority"* ]] || false
+  [[ "$output" == *"RISK-POLICY"* ]]
+}
+@test "SKILL.md Step 4 retains Description as AskUserQuestion fallback (negative-of-negative guard)" {
+  # Regression-resistance: the refactor MUST preserve the genuine cat-1
+  # direction-setting surface on Description. Without user-supplied prose
+  # the SKILL has nothing to derive from — Description IS the input. Same
+  # reasoning as manage-incident Step 4 Scope retention.
+  #
+  # Requires the Step 4 section to mention both "Description" and "AskUserQuestion".
+  run awk '/^### 4\. /,/^### 4b\. /' "$SKILL_FILE"
+  [ "$status" -eq 0 ]
+  # `|| false`: on Bash < 4.1 (e.g. macOS 3.2) a failing [[ ]] that is not the last
+  # command of the test does not fail the test; the explicit `false` makes it fatal.
+  [[ "$output" == *"Description"* ]] || false
+  [[ "$output" == *"AskUserQuestion"* ]]
+}
+@test "SKILL.md Step 4 cites P132 (inverse-P078 audit traceability)" {
+  # Audit-trail guard for the Phase 2a-ii refactor: the P132 identifier has to
+  # be recoverable from SKILL.md. The grep scans the whole file (Step 4 or the
+  # Related section both satisfy it) and only the P132 token is asserted here.
+  run grep -nE "P132" "$SKILL_FILE"
+  test "$status" -eq 0
+}
+@test "SKILL.md Step 4 documents stderr advisory shape for derived fields (ADR-026 grounding)" {
+  # Per ADR-026 cost-source grounding, every silently derived field announces
+  # its source through a stderr advisory, in parity with capture-problem Step
+  # 1.5 and manage-incident Step 4 (I2-isomorphic across the three
+  # declaration-skill surfaces per architect verdict 2026-05-15). The check
+  # passes when the Step 4 section mentions "stderr" or "advisory".
+  local step4_range='/^### 4\. /,/^### 4b\. /'
+  run awk "$step4_range" "$SKILL_FILE"
+  test "$status" -eq 0
+  [[ "$output" == *"stderr"* || "$output" == *"advisory"* ]]
+}
+@test "SKILL.md Step 4 cross-references capture-problem Step 1.5 + manage-incident Step 4 (cross-skill consistency)" {
+  # The architect verdict 2026-05-15 P132 Phase 2a-ii flagged cross-skill
+  # consistency: three declaration-skill surfaces now ship the same
+  # dispatch shape. The Step 4 prose must explicitly cite both prior
+  # surfaces (P185 capture-problem + manage-incident b7cc645) as
+  # worked-example precedents so the I2-isomorphic stderr advisory format
+  # is locked-in by reference before a fourth surface (Phase 2a-iii
+  # create-adr) drifts.
+  #
+  # Requires one capture-problem marker ("P185" or "capture-problem") AND one
+  # manage-incident marker ("manage-incident" or "b7cc645") in the Step 4 section.
+  run awk '/^### 4\. /,/^### 4b\. /' "$SKILL_FILE"
+  [ "$status" -eq 0 ]
+  # `|| false`: on Bash < 4.1 (e.g. macOS 3.2) a failing [[ ]] that is not the last
+  # command of the test does not fail the test; the explicit `false` makes it fatal.
+  [[ "$output" == *"P185"* ]] || [[ "$output" == *"capture-problem"* ]] || false
+  [[ "$output" == *"manage-incident"* ]] || [[ "$output" == *"b7cc645"* ]]
+}
+# ----------------------------------------------------------------------
+# Negative-of-negative guards — Step 4b multi-concern + Step 2
+# duplicate-check MUST remain cat-1 direction-setting AskUserQuestion
+# surfaces (architect verdict 2026-05-15: not touched by Phase 2a-ii).
+# ----------------------------------------------------------------------
+@test "SKILL.md Step 4b multi-concern AskUserQuestion is preserved (cat-1 direction-setting, not touched by Phase 2a-ii)" {
+  # Architect verdict 2026-05-15: Step 4b is a separate cat-1
+  # direction-setting surface — only the user knows whether the concerns
+  # can be independently fixed. The Phase 2a-ii refactor MUST NOT touch
+  # Step 4b's AskUserQuestion gate.
+  #
+  # Requires the Step 4b section (from "### 4b. " through "### 5. ") to mention
+  # "AskUserQuestion" and either "concern" or "split".
+  run awk '/^### 4b\. /,/^### 5\. /' "$SKILL_FILE"
+  [ "$status" -eq 0 ]
+  # `|| false`: on Bash < 4.1 (e.g. macOS 3.2) a failing [[ ]] that is not the last
+  # command of the test does not fail the test; the explicit `false` makes it fatal.
+  [[ "$output" == *"AskUserQuestion"* ]] || false
+  [[ "$output" == *"concern"* ]] || [[ "$output" == *"split"* ]]
+}
+@test "SKILL.md Step 2 duplicate-check AskUserQuestion is preserved (cat-1 direction-setting, not touched by Phase 2a-ii)" {
+  # Per the architect verdict of 2026-05-15, Step 2 is its own cat-1
+  # direction-setting surface: whether an existing ticket shares the root
+  # cause is the user's call. Phase 2a-ii must leave that AskUserQuestion
+  # gate alone, so the Step 2 section still has to mention it.
+  local step2_range='/^### 2\. /,/^### 3\. /'
+  run awk "$step2_range" "$SKILL_FILE"
+  test "$status" -eq 0
+  [[ "$output" == *"AskUserQuestion"* ]]
+}

package/skills/reconcile-readme/SKILL.md CHANGED Viewed

@@ -85,10 +85,10 @@ For each `MISSING` Verification Queue entry, read the `## Fix Released` block:
 sed -n '/^## Fix Released/,/^## /p' docs/problems/<NNN>-*.verifying.md
 ```
-Render the Verification Queue row in the existing format:
+Render the Verification Queue row in the existing format. The `Likely verified?` cell carries an **evidence-first** value per P186 (supersedes the original P048 Candidate 4 14-day heuristic). <!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 --> When reconcile-readme synthesises a missing row, default the cell to `no — not observed` — the row is being added because some prior session committed the `.verifying.md` transition without staging the README refresh; reconcile-readme has no session-observed evidence to cite. Subsequent `/wr-itil:review-problems` Step 4 or `run-retro` Step 4a passes populate `yes — observed: <evidence>` when the user verifies. Drift on the cell shape re-opens P186.
 ```
-| P<NNN> | <title> | <release marker> | <Likely verified? per P048 Candidate 4: yes if ≥14 days, else no (<N> days)> |
+| P<NNN> | <title> | <release marker> | <one-sentence fix summary> | no — not observed |
 ```
 ### Step 4. Apply edits via Edit tool — preserve narrative
@@ -99,7 +99,7 @@ For each REMOVE: `Edit` with the existing row as `old_string`, and remove it (re
 For each ADD to WSJF Rankings: locate the correct WSJF position by descending order. Use `Edit` to insert the new row immediately above the next-lower-WSJF row (or append at the bottom of the table if the new row's WSJF is the lowest). The Edit's `old_string` is the line that the new row inserts above; the `new_string` is the new row + the same line below.
-For each ADD to Verification Queue: insert the new row in `Released date ASC` position (oldest at row 1; same-day releases tiebreak by ID ASC) per the canonical VQ sort direction. <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Recent releases land at the bottom; oldest-pending verifications surface at the top so the user lands on actionable closure candidates first per P048 user-task semantics. Drift here re-opens P150.
+For each ADD to Verification Queue: insert the new row in `Released date ASC` position (oldest at row 1; same-day releases tiebreak by ID ASC) per the canonical VQ sort direction. <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Recent releases land at the bottom; oldest-pending verifications surface at the top so the user lands on actionable closure candidates first per P048 user-task semantics. Drift here re-opens P150. The synthesised cell defaults to `no — not observed` per the P186 evidence-first cell shape — see the "Render the Verification Queue row" block above. <!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 -->
 After all edits, re-run `packages/itil/scripts/reconcile-readme.sh docs/problems` to confirm exit 0. If the second run still reports drift, investigate the residual edits — do NOT re-run reconciliation in a loop, as that hides systematic edit failures.

package/skills/review-problems/SKILL.md CHANGED Viewed

@@ -73,7 +73,13 @@ After re-scoring, present three sections matching the README.md format (same ren
 |------|-----|-------|----------|--------|--------|----------|-------|
 ```
-**Verification Queue** — `.verifying.md` tickets, sorted by `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC) per ADR-022 + P048 user-task semantics. Older entries are the most likely-verified candidates the user wants to surface first when closing the queue; newest-first ordering pushes those actionable closure candidates below the fold and contradicts the section header. <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Any change to the VQ sort direction MUST update this rendering block, Step 5's README template, AND `/wr-itil:manage-problem` SKILL.md Step 5 P094 / Step 7 P062 / Step 9c / Step 9e + `/wr-itil:transition-problem` + `/wr-itil:transition-problems` + `/wr-itil:reconcile-readme` + `/wr-itil:list-problems` — drift re-opens P150. Highlight any ticket whose release age is **≥ 14 days** with a `yes (N days)` marker in the `Likely verified?` column (within-skill default per P048 Candidate 4 — tunable; promote to cross-skill policy if needed):
+**Verification Queue** — `.verifying.md` tickets, sorted by `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC) per ADR-022 + P048 user-task semantics. Older entries are the most likely-verified candidates the user wants to surface first when closing the queue; newest-first ordering pushes those actionable closure candidates below the fold and contradicts the section header. <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Any change to the VQ sort direction MUST update this rendering block, Step 5's README template, AND `/wr-itil:manage-problem` SKILL.md Step 5 P094 / Step 7 P062 / Step 9c / Step 9e + `/wr-itil:transition-problem` + `/wr-itil:transition-problems` + `/wr-itil:reconcile-readme` + `/wr-itil:list-problems` — drift re-opens P150. The `Likely verified?` column carries an **evidence-first** cell (per P186 — supersedes the age-based heuristic). <!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 --> Three canonical values:
+- `yes — observed: <evidence>` — a Step 4 user confirmation, an in-session test invocation + observable outcome (per ADR-026 grounding), or a `run-retro` Step 4a close-on-evidence citation. Quote the evidence inline (≤ 120 chars; abbreviate to ticket/commit/version anchor + verb).
+- `no — not observed` — fix released but no session-observable evidence yet. Default for newly-released tickets. Aging is preserved separately via the `Released` column — the Released column is the aging signal, `Likely verified?` is the evidence signal.
+- `no — observed regression` — fix released and the bug recurred this session. Cite the recurrence inline (≤ 120 chars).
+Any change to the canonical cell shape MUST update this rendering block, Step 5's README template, AND every co-located render site listed in the VQ-SORT-DIRECTION drift-tripwire above — drift re-opens P186. Surface `yes — observed: …` rows first in Step 4's verification prompt (user can batch-close them); `no — observed regression` rows must NOT be batch-closed (they may signal a botched fix and warrant a flip-back to `.known-error.md`).
 ```
 | ID | Title | Released | Fix summary | Likely verified? |
@@ -102,9 +108,9 @@ Target the dual-tolerant glob `docs/problems/*.verifying.md docs/problems/verify
 The question MUST include a fix summary extracted from the `## Fix Released` section — include the first sentence (or first bullet list) of that section in the question body or as the option description, so the user can answer without reading the full problem file. Do NOT ask with only the problem ID + title + version.
-- Surface the Step 3 `yes (N days)` tickets first so the user can batch-close them.
-- If the user confirms: close the problem (`git mv` from `.verifying.md` to `.closed.md`, update Status to "Closed", re-stage per the P057 staging trap).
-- If the user says no or is unsure: leave the ticket as Verification Pending.
+- Surface the Step 3 `yes — observed: …` tickets first so the user can batch-close them (per P186 evidence-first cell shape).
+- If the user confirms: close the problem (`git mv` from `.verifying.md` to `.closed.md`, update Status to "Closed", re-stage per the P057 staging trap). Update the `Likely verified?` cell on the same render path to `yes — observed: user confirmed <YYYY-MM-DD>`.
+- If the user says no or is unsure: leave the ticket as Verification Pending. If the user reports recurrence, update the cell to `no — observed regression — <one-line citation>` and flag for `.verifying.md` → `.known-error.md` flip-back via `/wr-itil:transition-problem`.
 **AFK / non-interactive branch (ADR-013 Rule 6):** when `AskUserQuestion` is unavailable, record the Verification Queue in the review output and skip the prompt. Do NOT auto-close verifying tickets — only the user can make that call. The user sees the queue on next interactive invocation.
@@ -222,11 +228,11 @@ Dev-work queue only. Verification Pending (`.verifying.md`, WSJF multiplier 0) a
 ## Verification Queue
-Fix released, awaiting user verification (driven off the dual-tolerant glob `docs/problems/*.verifying.md docs/problems/verifying/*.md` per ADR-022 + RFC-002 migration window). Sorted by `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC). <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> `Likely verified?` column marks tickets ≥14 days old (P048 Candidate 4 default).
+Fix released, awaiting user verification (driven off the dual-tolerant glob `docs/problems/*.verifying.md docs/problems/verifying/*.md` per ADR-022 + RFC-002 migration window). Sorted by `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC). <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> `Likely verified?` column carries an **evidence-first** cell per P186 — three canonical values: `yes — observed: <evidence>`, `no — not observed` (default for newly-released tickets), `no — observed regression`. <!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 --> Age is preserved separately via the `Released` column — aging surfaces there, not in `Likely verified?`.
 | ID | Title | Released | Likely verified? |
 |----|-------|----------|------------------|
-| P<NNN> | <title> | <release marker> | <yes (N days) / no (N days)> |
+| P<NNN> | <title> | <release marker> | <yes — observed: …  /  no — not observed  /  no — observed regression> |
 ...
 ## Inbound Upstream Reports
@@ -293,7 +299,8 @@ Otherwise, after the commit in Step 6 lands, drain the release queue per the mec
 - **ADR-037** (`docs/decisions/037-skill-testing-strategy.proposed.md`) — contract-assertion bats pattern applied to this skill.
 - **P031** — git-history freshness check rationale (mtime unreliable in worktrees). Applies to the README cache this skill owns.
 - **P047** — live-estimate effort buckets; the Step 2 re-estimate is the lifecycle transition this ticket closes.
-- **P048** Candidate 4 — the 14-day `Likely verified?` heuristic in Step 3.
+- **P048** Candidate 4 — original `Likely verified?` column introduction (14-day age-heuristic). Superseded by P186 evidence-first cell shape.
+- **P186** — evidence-first cell shape (`yes — observed: <evidence>` / `no — not observed` / `no — observed regression`) supersedes the age-based heuristic in Step 3 + Step 5; `<!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 -->` marker drives cross-skill drift detection (P138 / P150 fix-shape precedent).
 - **P057** — staging trap. Step 2's auto-transition MUST re-stage after Edit.
 - **P062** — README.md refresh on transitions. Step 5 is the review-path of the same refresh; `/wr-itil:manage-problem` Step 7 carries the transition-path.
 - **JTBD-001** (`docs/jtbd/solo-developer/JTBD-001-enforce-governance.proposed.md`) — discoverable surface via `/wr-itil:` autocomplete.

package/skills/review-problems/test/review-problems-likely-verified-cell-shape.bats ADDED Viewed

@@ -0,0 +1,229 @@
+#!/usr/bin/env bats
+# P186: `Likely verified?` column in docs/problems/README.md
+# Verification Queue must carry an evidence-first cell shape — NOT
+# the original P048 Candidate 4 age-based heuristic (≥14 days = yes).
+# Sibling proxy-for-evidence anti-pattern to P185 at the review-problems
+# Step 3/5 surface. User critique 2026-05-12: "I don't like 'it's been
+# a while, so likely verified' approach. We want firm evidence. For
+# these, it should be things you actually observe."
+#
+# Three canonical values per P186:
+#   yes — observed: <evidence>   (session-observed evidence the fix works)
+#   no — not observed            (fix released, no evidence yet; default)
+#   no — observed regression     (fix released, bug recurred)
+#
+# Hybrid coverage per ADR-005 + ADR-037 + ADR-052:
+#   - Structural contract-assertions (Permitted Exception per ADR-005 /
+#     contract-assertion pattern per ADR-037 — narrowly scoped to marker
+#     presence per architect verdict): each render-block site carries the
+#     canonical LIKELY-VERIFIED-CELL-SHAPE marker pointing to P186.
+#   - Behavioural-shape assertions: each render site documents the three
+#     canonical cell values + the age-based heuristic is NOT cited as
+#     authority anywhere the marker fires.
+#   - Drift-tripwire prose assertion: primary render sites (review-problems
+#     + manage-problem) name P186 in the drift-re-opens contract per
+#     P138 / P150 fix-shape precedent.
+#
+# @problem P186
+# @jtbd JTBD-001 (enforce governance without slowing down — evidence-grounded
+#   closure decision rather than calendar proxy)
+# @jtbd JTBD-006 (progress backlog AFK — `observed: <evidence>` cell IS the
+#   audit trail the AFK contract requires)
+#
+# Cross-reference:
+#   P186: docs/problems/open/186-vq-likely-verified-column-uses-age-heuristic-not-evidence.md
+#   P185: sibling proxy-for-evidence anti-pattern at capture-problem Step 1.5
+#   P150: sibling fix shape — VQ-SORT-DIRECTION marker
+#   P138: sibling fix shape — TIE-BREAK-LADDER-SOURCE marker
+#   P048: introduced the Verification Queue + 14-day heuristic this ticket supersedes
+#   ADR-022 — `.verifying.md` lifecycle; VQ rendering
+#   ADR-026 — agent output grounding (evidence-citation discipline)
+#   ADR-037 — contract-assertion bats pattern
+#   ADR-052 — behavioural-tests default
+setup() {
+  # Walk five directories up from this test file's directory to reach the
+  # repository root, then derive the path of every SKILL.md under test.
+  local here skills
+  here="$(dirname "$BATS_TEST_FILENAME")"
+  REPO_ROOT="$(cd "$here/../../../../.." && pwd)"
+  skills="$REPO_ROOT/packages/itil/skills"
+  REVIEW_SKILL="$skills/review-problems/SKILL.md"
+  MANAGE_SKILL="$skills/manage-problem/SKILL.md"
+  LIST_SKILL="$skills/list-problems/SKILL.md"
+  TRANSITION_SKILL="$skills/transition-problem/SKILL.md"
+  TRANSITIONS_SKILL="$skills/transition-problems/SKILL.md"
+  RECONCILE_SKILL="$skills/reconcile-readme/SKILL.md"
+  # Exact HTML-comment marker each render site must carry; the tests match
+  # it as a fixed string (grep -F).
+  MARKER='<!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 -->'
+}
+# ---------------------------------------------------------------------------
+# Marker presence at every render site (P138 / P150 fix-shape precedent)
+# ---------------------------------------------------------------------------
+@test "review-problems carries the LIKELY-VERIFIED-CELL-SHAPE marker" {
+  # review-problems is the primary owner: the Step 3 presentation AND the
+  # Step 5 README template both render the column, so at least two lines
+  # of the skill must carry the marker.
+  run grep -q -F "$MARKER" "$REVIEW_SKILL"
+  [ "$status" -eq 0 ]
+  local marker_lines
+  marker_lines=$(grep -c -F "$MARKER" "$REVIEW_SKILL")
+  [ "$marker_lines" -ge 2 ]
+}
+@test "manage-problem carries the LIKELY-VERIFIED-CELL-SHAPE marker at every render site" {
+  # manage-problem renders the VQ at 4 sites: Step 5 P094, Step 7 P062,
+  # Step 9c presentation, Step 9e README template. At least four lines of
+  # the skill must carry the marker — drift re-opens P186.
+  run grep -q -F "$MARKER" "$MANAGE_SKILL"
+  [ "$status" -eq 0 ]
+  local marker_lines
+  marker_lines=$(grep -c -F "$MARKER" "$MANAGE_SKILL")
+  [ "$marker_lines" -ge 4 ]
+}
+@test "list-problems VQ rendering carries the LIKELY-VERIFIED-CELL-SHAPE marker" {
+  # Presence check only: the marker must appear at least once in the skill.
+  run grep -q -F "$MARKER" "$LIST_SKILL"
+  [ "$status" -eq 0 ]
+}
+@test "transition-problem Step 7 README refresh carries the LIKELY-VERIFIED-CELL-SHAPE marker" {
+  # Presence check only: the marker must appear at least once in the skill.
+  run grep -q -F "$MARKER" "$TRANSITION_SKILL"
+  [ "$status" -eq 0 ]
+}
+@test "transition-problems batch render carries the LIKELY-VERIFIED-CELL-SHAPE marker" {
+  # Presence check only: the marker must appear at least once in the skill.
+  run grep -q -F "$MARKER" "$TRANSITIONS_SKILL"
+  [ "$status" -eq 0 ]
+}
+@test "reconcile-readme rendering carries the LIKELY-VERIFIED-CELL-SHAPE marker" {
+  # Presence check only: the marker must appear at least once in the skill.
+  run grep -q -F "$MARKER" "$RECONCILE_SKILL"
+  [ "$status" -eq 0 ]
+}
+# ---------------------------------------------------------------------------
+# Canonical cell values present at every render site
+# ---------------------------------------------------------------------------
+@test "review-problems documents all three canonical cell values" {
+  run grep -F 'yes — observed:' "$REVIEW_SKILL"
+  [ "$status" -eq 0 ]
+  run grep -F 'no — not observed' "$REVIEW_SKILL"
+  [ "$status" -eq 0 ]
+  run grep -F 'no — observed regression' "$REVIEW_SKILL"
+  [ "$status" -eq 0 ]
+}
+@test "manage-problem documents all three canonical cell values" {
+  run grep -F 'yes — observed:' "$MANAGE_SKILL"
+  [ "$status" -eq 0 ]
+  run grep -F 'no — not observed' "$MANAGE_SKILL"
+  [ "$status" -eq 0 ]
+  run grep -F 'no — observed regression' "$MANAGE_SKILL"
+  [ "$status" -eq 0 ]
+}
+@test "list-problems documents all three canonical cell values" {
+  run grep -F 'yes — observed:' "$LIST_SKILL"
+  [ "$status" -eq 0 ]
+  run grep -F 'no — not observed' "$LIST_SKILL"
+  [ "$status" -eq 0 ]
+  run grep -F 'no — observed regression' "$LIST_SKILL"
+  [ "$status" -eq 0 ]
+}
+# ---------------------------------------------------------------------------
+# Drift-tripwire prose at primary render sites (P138 / P150 precedent)
+# ---------------------------------------------------------------------------
+@test "review-problems names drift-re-opens-P186 contract" {
+  # The drift-tripwire sentence must name P186 with this exact wording.
+  run grep -q -F 'drift re-opens P186' "$REVIEW_SKILL"
+  [ "$status" -eq 0 ]
+}
+@test "manage-problem names drift-re-opens-P186 contract" {
+  # manage-problem hosts the drift-tripwire prose at Step 5 P094 AND Step 7
+  # P062; both render sites name P186 alongside the existing P138 / P150
+  # contracts. list-problems / transition-problem(s) / reconcile-readme
+  # carry the marker but defer the canonical drift contract to the primary
+  # owners (manage-problem / review-problems) per the P138 + P150 precedent.
+  #
+  # First the exact wording, then a case-insensitive count requiring the
+  # contract on at least two lines.
+  run grep -q -F 'drift here re-opens P186' "$MANAGE_SKILL"
+  [ "$status" -eq 0 ]
+  local tripwire_lines
+  tripwire_lines=$(grep -c -i 're-opens P186' "$MANAGE_SKILL")
+  [ "$tripwire_lines" -ge 2 ]
+}
+# ---------------------------------------------------------------------------
+# Age-based heuristic must NOT survive as the authoritative cell rule
+# ---------------------------------------------------------------------------
+@test "review-problems no longer cites the 14-day heuristic as the cell rule" {
+  # The P048 Candidate 4 "marks tickets ≥14 days old" phrasing was the
+  # exact framing the user critique targeted. After P186, the cell shape
+  # contract no longer references age as the authoritative trigger — age
+  # is preserved separately via the `Released` column. The phrase may
+  # survive in historical context (e.g. Related-section pointer back to
+  # P048) but NOT as the live rendering rule.
+  run grep -F 'marks tickets ≥14 days old' "$REVIEW_SKILL"
+  # Require exit 1 ("no lines selected") rather than any non-zero status:
+  # grep exits >1 on errors such as a missing or unreadable file, and that
+  # must fail the test instead of passing it vacuously.
+  [ "$status" -eq 1 ]
+}
+@test "manage-problem Step 9c no longer treats age as the cell trigger" {
+  # Pre-P186 Step 9c documented `yes (N days)` and `no (N days)` as the
+  # cell values keyed on a 14-day threshold. The new shape replaces both
+  # with evidence-first values; the literal `yes (N days)` template must
+  # not survive as a documented cell value (it can still appear in
+  # historical narrative such as the README VQ rows pending re-render).
+  run grep -F '`yes (N days)` — release age ≥ 14 days' "$MANAGE_SKILL"
+  # Require exit 1 ("no lines selected") rather than any non-zero status:
+  # grep exits >1 on errors such as a missing or unreadable file, and that
+  # must fail the test instead of passing it vacuously.
+  [ "$status" -eq 1 ]
+}
+# ---------------------------------------------------------------------------
+# Behavioural / template-shape — README template row carries the new shape
+# ---------------------------------------------------------------------------
+@test "review-problems Step 5 README template row uses the new cell-shape vocabulary" {
+  # The template ROW (the `| P<NNN> | <title> | ... |` line below the
+  # Verification Queue header) must reference the new vocabulary, not
+  # the old `yes (N days) / no (N days)` placeholder.
+  #
+  # Scope the positive check to template rows: a file-wide grep would also
+  # be satisfied by the surrounding prose, which is not what this test
+  # claims to verify. The pipeline fails the test when no `| P<NNN> |` row
+  # exists or when none of those rows carries the new vocabulary.
+  grep -F '| P<NNN> |' "$REVIEW_SKILL" | grep -q -F 'yes — observed'
+  # Old placeholder gone. Exit 1 means "no lines selected"; any other
+  # non-zero status is a grep error and must not count as a pass.
+  run grep -F '<yes (N days) / no (N days)>' "$REVIEW_SKILL"
+  [ "$status" -eq 1 ]
+}
+@test "list-problems Step 3 template row uses the new cell-shape vocabulary" {
+  # New vocabulary is present somewhere in the skill ...
+  run grep -q -F 'yes — observed' "$LIST_SKILL"
+  [ "$status" -eq 0 ]
+  # ... and the old age-based placeholder no longer appears in it.
+  run grep -q -F 'yes (N days) / no (N days)' "$LIST_SKILL"
+  [ "$status" -ne 0 ]
+}
+# ---------------------------------------------------------------------------
+# Behavioural — produced README's VQ section uses the new cell vocabulary
+# ---------------------------------------------------------------------------
+#
+# Behavioural assertion per ADR-052: the actual rendered docs/problems/
+# README.md Verification Queue rows must use the new evidence-first cell
+# shape, not age-based markers. This is the user-visible artefact the
+# entire fix targets.
+@test "docs/problems/README.md VQ section contains the new evidence-first cell vocabulary" {
+  local readme
+  readme="$REPO_ROOT/docs/problems/README.md"
+  # The rendered README must exist before its content can be checked.
+  [ -f "$readme" ]
+  # At least one row is expected to carry the new vocabulary once the VQ
+  # section has been re-rendered; this test runs after those edits land.
+  run grep -q -F 'no — not observed' "$readme"
+  [ "$status" -eq 0 ]
+}
+@test "docs/problems/README.md VQ section no longer uses bare age-marker cells like 'no (N days)' as the dominant rendering" {
+  local readme evidence_rows age_rows
+  readme="$REPO_ROOT/docs/problems/README.md"
+  [ -f "$readme" ]
+  # Behavioural check on the rendered surface, not the SKILL.md template.
+  # A small residue of transitional rows or quoted prose is tolerated, but
+  # lines carrying `no — not observed` must outnumber lines carrying a
+  # bare `| no (<digits> day[s]) |` age-marker cell.
+  # grep -c still prints the count when it is 0 but exits 1; `|| true`
+  # keeps that exit status from failing the assignment.
+  evidence_rows=$(grep -c -F 'no — not observed' "$readme" || true)
+  age_rows=$(grep -c -E '\| no \([0-9]+ days?\) \|' "$readme" || true)
+  [ "$age_rows" -lt "$evidence_rows" ]
+}

package/skills/transition-problem/SKILL.md CHANGED Viewed

@@ -173,6 +173,8 @@ The refresh uses the same rendering rules as `/wr-itil:review-problems` Step 9e
 **Verification Queue sort direction (P150)**: Verification Queue rows are sorted by `Released date ASC` (oldest at row 1; same-day releases tiebreak by ID ASC) per ADR-022 + P048 user-task semantics — older entries are the most likely-verified candidates the user wants to surface first when closing the queue. <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Drift here re-opens P150.
+**Likely-verified cell shape (P186)**: the `Likely verified?` column carries an **evidence-first** cell — `yes — observed: <evidence>` / `no — not observed` / `no — observed regression`. On a Known Error → Verification Pending transition the refresh writes `no — not observed` as the default (no observed evidence yet at the moment of release). On a Verification Pending → Closed transition the closing commit's session-observed evidence should populate the cell as `yes — observed: <evidence>` before the row exits the queue. <!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 --> Drift on the cell shape re-opens P186.
 **Mechanism:**
 1. After renaming + Editing + `git add`-ing the transitioned ticket file (per the staging-trap rule above), regenerate `docs/problems/README.md` in-place reflecting the new filename set and the transitioned ticket's new Status.
@@ -234,6 +236,7 @@ Release draining is owned by the caller — `/wr-itil:manage-problem` Step 12 (i
 - **ADR-037** (`docs/decisions/037-skill-testing-strategy.proposed.md`) — contract-assertion bats pattern applied to this skill.
 - **P057** — `git mv` + Edit staging trap rationale; the delegated Step 7 block implements the re-stage. Named here as a transitive contract so callers can reason about the dependency.
 - **P062** — `/wr-itil:review-problems` is the canonical README.md cache writer, but Step 7 transitions also refresh README.md in-place per P062's mechanism. Named here as a transitive contract.
+- **P186** — evidence-first `Likely verified?` cell shape (`yes — observed: <evidence>` / `no — not observed` / `no — observed regression`); `<!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 -->` marker drives cross-skill drift detection (P138 / P150 fix-shape precedent).
 - **P063** — external-root-cause detection at Open → Known Error and at the `upstream-blocked` park path. The delegated Step 7 block owns the prompt; this skill inherits the AFK fallback without re-implementing.
 - **JTBD-001** (`docs/jtbd/solo-developer/JTBD-001-enforce-governance.proposed.md`) — discoverable surface via `/wr-itil:` autocomplete. Users type `/wr-itil:transition-problem 042 known-error` rather than remembering the `manage-problem <NNN> known-error` subcommand.
 - **JTBD-101** (`docs/jtbd/plugin-developer/JTBD-101-extend-suite.proposed.md`) — one skill per distinct user intent.

package/skills/transition-problems/SKILL.md CHANGED Viewed

@@ -180,6 +180,8 @@ Per P062, every Step 7 status transition refreshes README.md. At the batch grain
 The refresh follows the same render rules as `/wr-itil:review-problems` Step 9e (glob `docs/problems/*.open.md` / `*.known-error.md` / `*.verifying.md` / `*.parked.md`; rank open + known-error by WSJF; Verification Queue sorted by `Released date ASC` with same-day tiebreak by ID ASC per ADR-022 + P048; Parked section). It does NOT re-rank — existing WSJF values on ticket files are trusted; the refresh is a render, not a re-rank. <!-- VQ-SORT-DIRECTION: oldest-first per ADR-022 --> Drift on the VQ sort direction re-opens P150.
+**Likely-verified cell shape (P186)**: the `Likely verified?` column carries an **evidence-first** cell — `yes — observed: <evidence>` / `no — not observed` / `no — observed regression`. At batch grain the refresh writes the per-pair cell from the per-pair transition context: a `verifying` destination defaults to `no — not observed` (the batch just released the fix; evidence accrues subsequently); a `close` destination assumes session-observed evidence was the trigger for the batch close (the upstream caller — `run-retro` Step 4a, `review-problems` Step 9d — already verified the evidence) and the row exits the queue (not re-rendered as VQ). <!-- LIKELY-VERIFIED-CELL-SHAPE: evidence-based per P186 --> Drift on the cell shape re-opens P186.
 ```bash
 git add docs/problems/README.md
 ```