@windyroad/itil 0.14.0 → 0.15.0-preview.157

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env bats
2
+ # Contract assertions for manage-incident's `restored` subcommand forwarder (P071 split slice 6b).
3
+ #
4
+ # Per ADR-010 amended (Skill Granularity section) + P071 phased plan:
5
+ # `/wr-itil:manage-incident <I> restored` delegates to the new
6
+ # `/wr-itil:restore-incident <I>` skill via a thin-router forwarder.
7
+ # Original skill already carries `deprecated-arguments: true` frontmatter
8
+ # (slice 5); forwarder emits a canonical one-line systemMessage
9
+ # deprecation notice.
10
+ #
11
+ # Structural assertion — Permitted Exception to the source-grep ban
12
+ # (ADR-005 / P011 / ADR-037 contract-assertion pattern).
13
+ #
14
+ # @problem P071
15
+ # @jtbd JTBD-001 (enforce governance without slowing down)
16
+ # @jtbd JTBD-101 (extend the suite with clear patterns)
17
+ # @jtbd JTBD-201 (restore service fast with an audit trail)
18
+ #
19
+ # Cross-reference:
20
+ # P071: docs/problems/071-argument-based-skill-subcommands-are-not-discoverable.open.md
21
+ # ADR-010 amended — split naming + forwarder contract + deprecated-arguments flag
22
+ # ADR-011 — manage-incident skill-wrapping precedent (restore transition + manage-problem handoff)
23
+ # ADR-013 Rule 1 — structured user interaction (forwarder emits systemMessage, not AskUserQuestion)
24
+ # ADR-037 — contract-assertion bats pattern
25
+
26
+ setup() {
27
+ SKILL_DIR="$(cd "$(dirname "$BATS_TEST_FILENAME")/.." && pwd)"
28
+ SKILL_FILE="${SKILL_DIR}/SKILL.md"
29
+ }
30
+
31
+ @test "manage-incident SKILL.md frontmatter has deprecated-arguments: true (ADR-010 amended)" {
32
+ # Already pinned by slice 5; slice 6b inherits. Guard against regression.
33
+ run grep -nE "^deprecated-arguments:[[:space:]]*true[[:space:]]*$" "$SKILL_FILE"
34
+ [ "$status" -eq 0 ]
35
+ }
36
+
37
+ @test "manage-incident Step 1 forwards 'restored' argument to /wr-itil:restore-incident (P071)" {
38
+ # The forwarder names the target skill explicitly so the router is legible
39
+ # at the contract level. ADR-010's canonical shape: "invokes the new
40
+ # named skill via the Skill tool, not via re-prompting the user".
41
+ run grep -inE "/wr-itil:restore-incident" "$SKILL_FILE"
42
+ [ "$status" -eq 0 ]
43
+ }
44
+
45
+ @test "manage-incident Step 1 emits the canonical restored deprecation notice (ADR-010 amended)" {
46
+ # ADR-010's canonical deprecation-notice template:
47
+ # "/wr-<plugin>:<old> <arg> is deprecated; use /wr-<plugin>:<new>
48
+ # directly. This forwarder will be removed in <plugin>'s next major
49
+ # version."
50
+ # The notice MUST be emitted as a systemMessage (not AskUserQuestion)
51
+ # because deprecation is informational, not decisional (ADR-013 Rule 1
52
+ # structured-interaction scope).
53
+ run grep -inE "is deprecated.*use /wr-itil:restore-incident|deprecated.*restore-incident|restored.*removed in .* next major version" "$SKILL_FILE"
54
+ [ "$status" -eq 0 ]
55
+ }
56
+
57
+ @test "manage-incident Step 1 restored forwarder delegates via Skill tool (P071 regression guard)" {
58
+ # The forwarder must not duplicate the restore logic — it must delegate.
59
+ # Per ADR-010: "thin-router forwarder re-invokes the new named skill
60
+ # via the Skill tool". If the forwarder grows its own rename + handoff
61
+ # logic, the deprecation window will harden into a permanent fork.
62
+ # Guard against this by asserting the forwarder block mentions "delegate"
63
+ # or "Skill tool" language near the restore-incident reference.
64
+ run grep -inE "delegate.*restore-incident|Skill tool.*restore-incident|restore-incident.*Skill tool" "$SKILL_FILE"
65
+ [ "$status" -eq 0 ]
66
+ }
@@ -0,0 +1,211 @@
1
+ ---
2
+ name: wr-itil:mitigate-incident
3
+ description: Record a mitigation attempt against an incident — transitions an investigating incident to mitigating on the first attempt, appends subsequent attempts to the Mitigation attempts timeline. Evidence-first gate enforced per ADR-011.
4
+ allowed-tools: Read, Write, Edit, Bash, Glob, Grep, AskUserQuestion, Skill
5
+ ---
6
+
7
+ # Mitigate Incident
8
+
9
+ Record a mitigation attempt against an active incident and transition its lifecycle. The first mitigation attempt moves the file from `.investigating.md` to `.mitigating.md`; subsequent attempts append to the existing `.mitigating.md` without re-transitioning. Every attempt (successful or not) is recorded so the post-incident audit trail is complete (JTBD-201).
10
+
11
+ This skill is the P071 phased-landing split of `/wr-itil:manage-incident <I> mitigate <action>` per ADR-010 amended Skill Granularity rule: one skill per distinct user intent. The arguments `<I>` (incident ID) and `<action>` (mitigation description) are data parameters, permitted under the amendment — only word-verb-arguments must be split out. The original `/wr-itil:manage-incident <I> mitigate <action>` subcommand route remains as a thin-router forwarder during the deprecation window but is scheduled for removal in `@windyroad/itil`'s next major version.
12
+
13
+ ## Arguments
14
+
15
+ `/wr-itil:mitigate-incident <I###> <action>` — both positional:
16
+
17
+ - `<I###>` — the incident ID (e.g. `I007` or bare `007`). Resolves to `docs/incidents/<I###>-*.{investigating,mitigating}.md`.
18
+ - `<action>` — free-text description of the mitigation being applied (e.g. `rollback checkout service to 1.2.4`, `feature flag checkout.fast-path off`, `restart ingest worker pool`). Prefer **reversible** actions — see "Reversible preference" below.
19
+
20
+ If `$ARGUMENTS` is empty or malformed, ask via `AskUserQuestion` for the incident ID and the action.
21
+
22
+ ## Reversible preference (ADR-011)
23
+
24
+ Prefer **reversible** mitigations over forward fixes:
25
+
26
+ 1. Rollback to a known-good version
27
+ 2. Feature flag off
28
+ 3. Restart / cycle the affected component
29
+ 4. Route traffic away
30
+ 5. Scale up
31
+ 6. Only after reversibles are exhausted: forward fix
32
+
33
+ Record every attempt, successful or not. Failed mitigations are as important to the audit trail as successful ones — they narrow hypothesis space for future investigation.
34
+
35
+ ## Evidence-first gate (ADR-011)
36
+
37
+ **Pre-flight check before the first mitigation attempt**: the incident file must contain at least one hypothesis with cited evidence in the `## Hypotheses` section. If not, block the transition and ask via `AskUserQuestion`:
38
+
39
+ > "Incident `<I###>` has no hypothesis with cited evidence. Per ADR-011, mitigation requires at least one ranked hypothesis backed by a log, repro, diff, or metric reference. (a) Add a hypothesis + evidence now and retry, (b) Record the mitigation anyway with an evidence-skipped justification (requires audit-trail note), (c) Cancel."
40
+
41
+ This gate is the **cool-headed commitment**: it blocks "try this and see" actions during the high-adrenaline phase of an incident unless evidence is cited. The gate runs only on the first mitigation (the `.investigating.md → .mitigating.md` transition); subsequent mitigations on an already-`.mitigating.md` file append directly without re-gating.
42
+
43
+ ## Steps
44
+
45
+ ### 1. Parse arguments
46
+
47
+ Extract `<I###>` and `<action>` from `$ARGUMENTS`. Normalise `<I###>`:
48
+
49
+ - Accept `I007`, `i007`, `007`, `7` → canonicalise to `I007` (uppercase I + zero-padded 3 digits).
50
+ - If missing, ask via `AskUserQuestion`.
51
+
52
+ Extract `<action>` as everything after the incident ID. If missing or trivially short (< 8 chars), ask via `AskUserQuestion` for a descriptive action.
53
+
54
+ ### 2. Locate the incident file
55
+
56
+ ```bash
57
+ ls docs/incidents/<I###>-*.investigating.md docs/incidents/<I###>-*.mitigating.md 2>/dev/null
58
+ ```
59
+
60
+ - If neither exists, report "No active incident `<I###>` found. Check `/wr-itil:list-incidents` for the active backlog or `/wr-itil:manage-incident` to declare a new one." and exit.
61
+ - If exactly one file matches, record its current suffix (`investigating` or `mitigating`) — this drives the transition decision in Step 4.
62
+ - If multiple files match (should not happen under the `<ID>-<title>.<status>.md` naming convention), report the ambiguity and exit.
63
+
64
+ ### 3. Pre-flight: evidence gate (first mitigation only)
65
+
66
+ If the file suffix is `.investigating.md` (i.e. this is the first mitigation), read the `## Hypotheses` section and check for at least one line containing `Evidence:` followed by a non-empty reference. The shape per ADR-011:
67
+
68
+ ```
69
+ - [ranked] <hypothesis> — Evidence: <log/repro/diff/metric reference>. Confidence: <low|med|high>.
70
+ ```
71
+
72
+ - If at least one hypothesis has a cited evidence reference, proceed to Step 4.
73
+ - If no hypothesis carries evidence, invoke `AskUserQuestion` with the three-option prompt from "Evidence-first gate" above. Branch:
74
+ - (a) User adds a hypothesis + evidence now — re-read the file and re-check; if satisfied, proceed. If still missing, report the gate failure and exit.
75
+ - (b) User records anyway — append an `## Audit trail` note to the file: `[<timestamp> UTC] Evidence-gate bypassed by user — reason: <justification>`. Then proceed to Step 4.
76
+ - (c) User cancels — exit without change.
77
+
78
+ If the file suffix is already `.mitigating.md`, skip the gate (it only runs on the transition).
79
+
80
+ ### 4. Record the mitigation and transition if needed
81
+
82
+ Compute a UTC timestamp (e.g. `2026-04-21T14:37Z`). Then:
83
+
84
+ **Case A — first mitigation (`.investigating.md` → `.mitigating.md`)**:
85
+
86
+ 1. `git mv docs/incidents/<I###>-<title>.investigating.md docs/incidents/<I###>-<title>.mitigating.md`
87
+ 2. Update the `**Status**:` field from `Investigating` to `Mitigating` via `Edit`.
88
+ 3. Append to the `## Mitigation attempts` section:
89
+
90
+ ```markdown
91
+ - [<timestamp> UTC] <action> → pending verification
92
+ ```
93
+
94
+ If the `## Mitigation attempts` section contains `*(none yet)*`, replace that placeholder with the first attempt row. Otherwise append below the last attempt.
95
+
96
+ 4. Append to the `## Timeline` section:
97
+
98
+ ```markdown
99
+ - [<timestamp> UTC] Mitigation attempt: <action>
100
+ ```
101
+
102
+ **Case B — subsequent mitigation (`.mitigating.md` stays `.mitigating.md`)**:
103
+
104
+ 1. No `git mv` needed.
105
+ 2. Do not touch the `**Status**:` field.
106
+ 3. Append to the `## Mitigation attempts` section:
107
+
108
+ ```markdown
109
+ - [<timestamp> UTC] <action> → pending verification
110
+ ```
111
+
112
+ 4. Append to the `## Timeline` section:
113
+
114
+ ```markdown
115
+ - [<timestamp> UTC] Mitigation attempt: <action>
116
+ ```
117
+
118
+ The outcome text starts at `pending verification` because verification signals (error-rate recovery, synthetic-probe passing, user report) usually arrive after the mitigation. The `/wr-itil:restore-incident <I###>` flow (formerly `/wr-itil:manage-incident <I###> restored`, which now survives only as a deprecated forwarder) updates the outcome to the final verification signal when service is restored. Failed mitigations should be updated in place (via a subsequent `/wr-itil:manage-incident <I###>` update call or a future `/wr-itil:mitigate-incident` re-record) with the observed outcome — do not delete the original row.
119
+
120
+ ### 5. Low-severity lightweight path (ADR-011 Step 12 edge case)
121
+
122
+ For **Sev 4-5** incidents, the Hypotheses section may be skipped if the user confirmed no investigation was needed at declare time. In that case:
123
+
124
+ - The evidence-first gate in Step 3 does not apply (there are no hypotheses to check).
125
+ - The Mitigation attempts append in Step 4 remains mandatory — Timeline, Observations, and at least one mitigation attempt are always required per ADR-011.
126
+ - Do not upgrade a skipped-hypotheses incident's severity silently; if the user decides mid-incident that investigation IS needed, they should update the incident via `/wr-itil:manage-incident <I###>` and add the hypothesis explicitly.
127
+
128
+ Detect "lightweight path" by reading the Severity label from the incident frontmatter: if Impact × Likelihood resolves to Sev 4 or Sev 5, the gate defaults to bypass with an audit-trail note unless the user has populated Hypotheses explicitly. Run this detection before the Step 3 gate, since its result decides whether the gate applies.
129
+
130
+ ### 6. Quality checks
131
+
132
+ After any mitigation record, verify:
133
+
134
+ - **Status consistency**: `**Status**:` field matches the filename suffix (Investigating + `.investigating.md` OR Mitigating + `.mitigating.md`).
135
+ - **Timeline monotonicity**: the new timeline entry's timestamp is ≥ the last existing timeline entry's timestamp.
136
+ - **Mitigation attempts section exists**: if somehow missing from an older incident file, create it before appending.
137
+ - **No evidence-gate silent bypass**: if the gate was bypassed in Step 3, the `## Audit trail` note must be present.
138
+
139
+ ### 7. Report
140
+
141
+ Report:
142
+
143
+ - The file path created/modified.
144
+ - The incident ID and title.
145
+ - The transition (Investigating → Mitigating, or Mitigating → Mitigating).
146
+ - The recorded action and the `pending verification` outcome.
147
+ - Any quality-check warnings.
148
+ - A pointer: "Run `/wr-itil:restore-incident <I###>` when the verification signal confirms service is restored, or re-invoke `/wr-itil:mitigate-incident <I###> <next-action>` to record another mitigation attempt."
149
+
150
+ ### 8. Commit the completed work (ADR-014)
151
+
152
+ Per ADR-014, governance skills commit their own work.
153
+
154
+ 1. `git add` the renamed / modified incident file.
155
+ 2. Delegate to `wr-risk-scorer:pipeline` (subagent_type: `wr-risk-scorer:pipeline`) to assess the staged changes and create a bypass marker. If the subagent type is not available (spawned subagent surface), invoke `/wr-risk-scorer:assess-release` via the Skill tool instead — per ADR-015 it wraps the same pipeline subagent.
156
+ 3. `git commit -m "docs(incidents): I<NNN> mitigated — <action summary>"`.
157
+ 4. If risk is above appetite: use `AskUserQuestion` to ask whether to commit anyway, remediate first, or park the work. If `AskUserQuestion` is unavailable, skip the commit and report the uncommitted state clearly.
158
+
159
+ ### 9. Auto-release when changesets are queued (ADR-020)
160
+
161
+ **Skip this step if the skill is running inside an AFK orchestrator.** Orchestrators handle release cadence themselves per ADR-018 (Step 6.5). When in doubt, defer to the orchestrator by skipping this step.
162
+
163
+ Otherwise, after the commit in step 8 lands, drain the release queue so the fix actually lands on npm without requiring manual user action.
164
+
165
+ **Mechanism — delegate, do not re-implement scoring (per ADR-015):**
166
+
167
+ 1. Invoke the release scorer. Two paths are valid:
168
+ - **Primary**: delegate to subagent type `wr-risk-scorer:pipeline` via the Agent tool.
169
+ - **Fallback**: if that subagent type is not available, invoke skill `/wr-risk-scorer:assess-release` via the Skill tool.
170
+ 2. Read the returned `RISK_SCORES: commit=X push=Y release=Z` line.
171
+ 3. **Drain condition**: if `push` and `release` are both within appetite (≤ 4/25, "Low" band per `RISK-POLICY.md`), AND `.changeset/` is non-empty, proceed to the drain action. Otherwise, skip the drain and report the unreleased state.
172
+
173
+ **Drain action (non-interactive, policy-authorised per ADR-013 Rule 6):**
174
+
175
+ 1. Run `npm run push:watch` (push + wait for CI to pass).
176
+ 2. If `.changeset/` remains non-empty after push (i.e. a release PR is pending), run `npm run release:watch` (merge the release PR + wait for npm publish).
177
+ 3. Report the release: "Released <package>@<version>. Mitigation record is now live on npm."
178
+
179
+ **Failure handling**: if `push:watch` or `release:watch` fails (CI failure, publish failure), stop and report the failure clearly. Do not retry non-interactively — the user must intervene.
180
+
181
+ **Above-appetite branch**: if push/release risk is above appetite, skip the drain and report: "Release skipped — risk above appetite. Run `npm run push:watch` and `npm run release:watch` manually when ready."
182
+
183
+ ## Ownership boundary
184
+
185
+ `mitigate-incident` writes the Mitigation attempts timeline, the Status field, and the file rename on the first-attempt transition. It does NOT:
186
+
187
+ - Restore the incident to `.restored.md` (that is `/wr-itil:restore-incident <I###>` — slice 6b of the P071 phased plan; the legacy `/wr-itil:manage-incident <I###> restored` form remains only as a deprecated forwarder).
188
+ - Close the incident (that is `/wr-itil:manage-incident <I###> close` — slice 6c).
189
+ - Create or link problems (that is the restore handoff; mitigate-incident does not touch problem state).
190
+ - Add or edit Hypotheses or Observations. Those belong to `/wr-itil:manage-incident <I###>` update flow.
191
+
192
+ If the user wants any of the above, the skill reports the appropriate sibling and exits.
193
+
194
+ ## Related
195
+
196
+ - **P071** (`docs/problems/071-argument-based-skill-subcommands-are-not-discoverable.open.md`) — originating ticket. This skill is slice 6a of the P071 phased-landing plan.
197
+ - **ADR-010 amended** (`docs/decisions/010-rename-wr-problem-to-wr-itil.proposed.md` — Skill Granularity section) — canonical skill-split naming + forwarder contract + `deprecated-arguments: true` frontmatter flag.
198
+ - **ADR-011** (`docs/decisions/011-manage-incident-skill-wrapping.proposed.md`) — incident lifecycle file-suffix conventions (`.investigating.md` / `.mitigating.md` / `.restored.md` / `.closed.md`) + evidence-first rule + reversible-mitigation preference + Sev 4-5 lightweight path.
199
+ - **ADR-013** Rule 1 — structured user interaction (evidence-gate prompt uses AskUserQuestion; deprecation notices use systemMessage).
200
+ - **ADR-013** Rule 6 — policy-within-appetite non-interactive actions (release drain).
201
+ - **ADR-014** — governance skills commit their own work.
202
+ - **ADR-015** — release scorer delegation pattern.
203
+ - **ADR-020** — auto-release when changesets are queued.
204
+ - **ADR-037** (`docs/decisions/037-skill-testing-strategy.proposed.md`) — contract-assertion bats pattern applied to this skill.
205
+ - **JTBD-001** (`docs/jtbd/solo-developer/JTBD-001-enforce-governance.proposed.md`) — discoverable surface via `/wr-itil:` autocomplete.
206
+ - **JTBD-101** (`docs/jtbd/plugin-developer/JTBD-101-extend-suite.proposed.md`) — one skill per distinct user intent.
207
+ - **JTBD-201** (`docs/jtbd/tech-lead/JTBD-201-restore-service-fast.proposed.md`) — evidence-first audit trail preserved post-split.
208
+ - `packages/itil/skills/manage-incident/SKILL.md` — hosts the thin-router forwarder for the deprecated `manage-incident <I###> mitigate <action>` form.
209
+ - `packages/itil/skills/list-incidents/SKILL.md` — slice 5 precedent; the split-skill shape this slice mirrors.
210
+
211
+ $ARGUMENTS
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env bats
2
+ # Contract assertions for /wr-itil:mitigate-incident (P071 split slice 6a).
3
+ #
4
+ # This skill hosts the "mitigate an incident" user intent previously
5
+ # hidden behind /wr-itil:manage-incident <I> mitigate <action>. It records
6
+ # a mitigation attempt, transitions an incident from .investigating.md to
7
+ # .mitigating.md (first mitigation only), and appends the attempt + outcome
8
+ # to the Mitigation attempts timeline per ADR-011.
9
+ #
10
+ # Structural assertion — Permitted Exception to the source-grep ban
11
+ # (ADR-005 / P011 / ADR-037 contract-assertion pattern).
12
+ #
13
+ # @problem P071
14
+ # @jtbd JTBD-001 (enforce governance without slowing down — discoverable surface)
15
+ # @jtbd JTBD-101 (extend the suite with clear patterns — one skill per distinct user intent)
16
+ # @jtbd JTBD-201 (restore service fast with an audit trail — mitigation + evidence gate)
17
+ #
18
+ # Cross-reference:
19
+ # P071: docs/problems/071-argument-based-skill-subcommands-are-not-discoverable.open.md
20
+ # ADR-010 amended (Skill Granularity section) — split naming + forwarder contract
21
+ # ADR-011 — manage-incident skill-wrapping precedent (evidence-gate, reversible preference)
22
+ # ADR-037 — contract-assertion bats pattern
23
+
24
+ setup() {
25
+ SKILL_DIR="$(cd "$(dirname "$BATS_TEST_FILENAME")/.." && pwd)"
26
+ SKILL_FILE="${SKILL_DIR}/SKILL.md"
27
+ }
28
+
29
+ @test "SKILL.md exists and has frontmatter" {
30
+ [ -f "$SKILL_FILE" ]
31
+ run head -1 "$SKILL_FILE"
32
+ [ "$status" -eq 0 ]
33
+ [ "$output" = "---" ]
34
+ }
35
+
36
+ @test "SKILL.md frontmatter name is wr-itil:mitigate-incident (P071 + ADR-010 amended)" {
37
+ # Split naming convention per ADR-010 amendment: <verb>-<object> pair.
38
+ # The new skill's name must match the phased-landing plan pinned in P071.
39
+ run grep -n "^name: wr-itil:mitigate-incident$" "$SKILL_FILE"
40
+ [ "$status" -eq 0 ]
41
+ }
42
+
43
+ @test "SKILL.md frontmatter description names the mitigate intent (P071)" {
44
+ # Description must name "mitigate" and "incident" so Claude Code autocomplete
45
+ # surfaces the user intent rather than a generic name.
46
+ run grep -inE "^description:.*mitigat.*incident|^description:.*incident.*mitigat" "$SKILL_FILE"
47
+ [ "$status" -eq 0 ]
48
+ }
49
+
50
+ @test "SKILL.md frontmatter allowed-tools grants file-mutation surface (P071 — mitigate requires rename + edit)" {
51
+ # Unlike list-incidents (read-only), mitigate-incident renames
52
+ # .investigating.md → .mitigating.md, updates the Status field, and
53
+ # appends to Mitigation attempts. It must declare Write + Edit + Bash
54
+ # (for git mv) in its allowed-tools. AskUserQuestion is required for
55
+ # the evidence-gate pre-flight prompt per ADR-011 when a hypothesis
56
+ # lacks cited evidence.
57
+ run grep -nE "^allowed-tools:" "$SKILL_FILE"
58
+ [ "$status" -eq 0 ]
59
+ run grep -nE "^allowed-tools:.*Write" "$SKILL_FILE"
60
+ [ "$status" -eq 0 ]
61
+ run grep -nE "^allowed-tools:.*Edit" "$SKILL_FILE"
62
+ [ "$status" -eq 0 ]
63
+ run grep -nE "^allowed-tools:.*Bash" "$SKILL_FILE"
64
+ [ "$status" -eq 0 ]
65
+ run grep -nE "^allowed-tools:.*AskUserQuestion" "$SKILL_FILE"
66
+ [ "$status" -eq 0 ]
67
+ }
68
+
69
+ @test "SKILL.md documents the evidence-gate pre-flight (P071 + ADR-011)" {
70
+ # Per ADR-011's "Do not act on a hypothesis without at least one cited
71
+ # evidence source" rule, mitigate-incident's pre-flight must block the
72
+ # first mitigation attempt when no hypothesis has cited evidence.
73
+ # The SKILL.md must name the gate explicitly so the audit-trail
74
+ # invariant (JTBD-201) is legible.
75
+ run grep -inE "evidence|hypothes" "$SKILL_FILE"
76
+ [ "$status" -eq 0 ]
77
+ }
78
+
79
+ @test "SKILL.md documents the .investigating.md → .mitigating.md rename (P071 + ADR-011)" {
80
+ # The first mitigation attempt transitions the incident file from
81
+ # .investigating.md to .mitigating.md. Subsequent mitigations append
82
+ # to the existing .mitigating.md. The SKILL.md must name both
83
+ # suffixes explicitly so the file-suffix contract is legible.
84
+ run grep -inE "\.investigating\.md" "$SKILL_FILE"
85
+ [ "$status" -eq 0 ]
86
+ run grep -inE "\.mitigating\.md" "$SKILL_FILE"
87
+ [ "$status" -eq 0 ]
88
+ }
89
+
90
+ @test "SKILL.md documents the reversible-mitigation preference (P071 + ADR-011)" {
91
+ # Per ADR-011, mitigate-incident prefers reversible mitigations
92
+ # (rollback → feature flag → restart → route traffic → scale → fix)
93
+ # over forward fixes. The SKILL.md must name the preference so the
94
+ # cool-headed-commitment invariant is preserved post-split.
95
+ run grep -inE "reversible|rollback" "$SKILL_FILE"
96
+ [ "$status" -eq 0 ]
97
+ }
98
+
99
+ @test "SKILL.md documents the Mitigation attempts timeline append (P071 + ADR-011)" {
100
+ # Every mitigation attempt, successful or not, must append a
101
+ # [timestamp] action → outcome row to the Mitigation attempts section.
102
+ # The SKILL.md must name the append contract so future maintainers
103
+ # don't drop failed-attempt recording.
104
+ run grep -inE "[Mm]itigation attempt" "$SKILL_FILE"
105
+ [ "$status" -eq 0 ]
106
+ }
107
+
108
+ @test "SKILL.md cites P071 and ADR-010 amended (P071 + ADR-025)" {
109
+ # ADR-025 inheritance per ADR-037: contract-assertion bats should reflect
110
+ # traceability cites on the skill spec document.
111
+ run grep -inE "P071|ADR-010" "$SKILL_FILE"
112
+ [ "$status" -eq 0 ]
113
+ }
114
+
115
+ @test "SKILL.md cites ADR-011 for the incident lifecycle conventions (P071 + ADR-011)" {
116
+ # mitigate-incident inherits file-suffix conventions and the
117
+ # evidence-first rule from ADR-011. The SKILL.md must cite ADR-011
118
+ # so the precedent chain is legible.
119
+ run grep -inE "ADR-011" "$SKILL_FILE"
120
+ [ "$status" -eq 0 ]
121
+ }
122
+
123
+ @test "SKILL.md does not carry a deprecated-arguments frontmatter flag (clean-split skill)" {
124
+ # mitigate-incident is a clean-split skill with no argument-subcommands
125
+ # itself (its arguments are data parameters — incident ID + action).
126
+ # ADR-010 amendment's `deprecated-arguments: true` flag is only valid
127
+ # on host skills with forwarder routes. mitigate-incident is a
128
+ # forwarder TARGET, not a host. It must NOT carry the flag.
129
+ run grep -E "^deprecated-arguments:" "$SKILL_FILE"
130
+ [ "$status" -ne 0 ]
131
+ }
132
+
133
+ @test "SKILL.md does not use word-argument subcommand branching (P071 regression guard)" {
134
+ # The whole point of P071: Claude Code autocomplete does not surface
135
+ # word-argument subcommands. A clean-split skill must not reintroduce
136
+ # word-arg subcommand routing (e.g. `list` / `mitigate` / `restore`).
137
+ # The data parameters <I> and <action> are strings, not verb keywords,
138
+ # so the anti-pattern is patterns like `If arguments start with "list"`.
139
+ run grep -inE "If arguments start with \"(list|mitigate|restore|close|link)\"|If arguments contain \"(list|mitigate|restore|close|link)\"" "$SKILL_FILE"
140
+ [ "$status" -ne 0 ]
141
+ }
142
+
143
+ @test "SKILL.md documents the low-severity lightweight path (P071 + ADR-011 edge case)" {
144
+ # ADR-011's Step 12 edge case: for Sev 4-5 incidents, the Hypotheses
145
+ # section may be skipped if the user confirms no investigation is
146
+ # needed. Timeline, Observations, and at least one mitigation attempt
147
+ # remain mandatory. The split skill must preserve this lightweight
148
+ # path so JTBD-001's "without slowing down" outcome holds during
149
+ # low-severity incidents.
150
+ run grep -inE "lightweight|low.?severity|Sev 4|Sev 5" "$SKILL_FILE"
151
+ [ "$status" -eq 0 ]
152
+ }
@@ -0,0 +1,195 @@
1
+ ---
2
+ name: wr-itil:restore-incident
3
+ description: Mark an incident as service-restored — transitions a mitigating incident to restored, appends a Timeline entry, and hands off to /wr-itil:manage-problem for linked-problem creation or update per ADR-011.
4
+ allowed-tools: Read, Write, Edit, Bash, Glob, Grep, AskUserQuestion, Skill
5
+ ---
6
+
7
+ # Restore Incident
8
+
9
+ Mark an active incident as service-restored and hand off to problem management for root-cause tracking. This skill is the active-restoration path — the point where the "cool-headed, evidence-first" incident lifecycle crosses from mitigation to post-restoration (JTBD-201).
10
+
11
+ The first restore attempt moves the file from `.mitigating.md` to `.restored.md`, updates the `**Status**` field to "Restored", appends a verification-signal entry to the Timeline, and invokes `/wr-itil:manage-problem` via the Skill tool so a problem record captures the root-cause handoff. If the user declines problem creation with a justification, a `## No Problem` section is written on the incident instead — preserving the audit-trail invariant.
12
+
13
+ This skill is the P071 phased-landing split of `/wr-itil:manage-incident <I> restored` per ADR-010 amended Skill Granularity rule: one skill per distinct user intent. The argument `<I>` is a data parameter, permitted under the amendment — only word-verb-arguments must be split out. The original `/wr-itil:manage-incident <I> restored` subcommand route remains as a thin-router forwarder during the deprecation window but is scheduled for removal in `@windyroad/itil`'s next major version.
14
+
15
+ ## Arguments
16
+
17
+ `/wr-itil:restore-incident <I###>` — one positional data parameter:
18
+
19
+ - `<I###>` — the incident ID (e.g. `I007`, `i007`, bare `007`, or `7`; all canonicalise to `I007` in Step 1). Resolves to `docs/incidents/<I###>-*.mitigating.md` (primary path) or `docs/incidents/<I###>-*.restored.md` (idempotent re-invocation).
20
+
21
+ If `$ARGUMENTS` is empty or malformed, ask via `AskUserQuestion` for the incident ID.
22
+
23
+ ## Pre-flight (ADR-011)
24
+
25
+ Restore requires two pre-flight conditions:
26
+
27
+ 1. **Mitigation recorded**: at least one `[<timestamp> UTC] <action> → <outcome>` row in the `## Mitigation attempts` section. A restore with no mitigation recorded usually means the incident self-resolved — in that case the user should record the self-resolution as a mitigation entry first (e.g. `[<timestamp> UTC] (no action taken — issue self-resolved) → pending verification`).
28
+ 2. **Verification signal captured**: the user must describe the signal that confirms service is restored (e.g. "error rate back to baseline per Datadog", "user reports normal", "synthetic probe passing"). This is the verification evidence that the restore is real, not wishful.
29
+
30
+ If either pre-flight fails, block the transition and ask via `AskUserQuestion` what to do: (a) record a mitigation + verification signal now and retry, (b) document why this incident is an exception (e.g. Sev 4-5 lightweight path) and proceed with an Audit-trail note, (c) cancel.
31
+
32
+ ## Steps
33
+
34
+ ### 1. Parse arguments
35
+
36
+ Extract `<I###>` from `$ARGUMENTS`. Normalise:
37
+
38
+ - Accept `I007`, `i007`, `007`, `7` → canonicalise to `I007` (uppercase I + zero-padded 3 digits).
39
+ - If missing, ask via `AskUserQuestion`.
40
+
41
+ ### 2. Locate the incident file
42
+
43
+ ```bash
44
+ ls docs/incidents/<I###>-*.mitigating.md docs/incidents/<I###>-*.restored.md 2>/dev/null
45
+ ```
46
+
47
+ - If neither exists (the incident is still `.investigating.md`, is already `.closed.md`, or has no file at all), report "No active mitigation-or-restored incident `<I###>` found. If the incident is still investigating, record at least one mitigation attempt via `/wr-itil:mitigate-incident <I###> <action>` first." and exit.
48
+ - If a `.mitigating.md` file matches, this is the restore transition (Case A in Step 4).
49
+ - If a `.restored.md` file matches, this is an idempotent re-invocation (Case B in Step 4).
50
+ - If multiple files match (should not happen under the naming convention), report the ambiguity and exit.
51
+
52
+ ### 3. Pre-flight: verification signal + mitigation-attempts check
53
+
54
+ For the Case A path (restore transition), perform the pre-flight checks:
55
+
56
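+ 1. Read the `## Mitigation attempts` section. If it is missing, empty, or contains only `*(none yet)*`, the first pre-flight fails. Ask via `AskUserQuestion` which of the three Pre-flight options to take: (a) record a mitigation + verification signal now and retry, (b) document why this incident is an exception and proceed with an Audit-trail note, or (c) cancel.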
57
+ 2. Ask via `AskUserQuestion` for the verification signal if not already provided in `$ARGUMENTS`. The signal is free-text and should name the metric, probe, or report that confirms restoration.
58
+
59
+ ### 4. Record the restore and transition if needed
60
+
61
+ Compute a UTC timestamp (e.g. `2026-04-21T14:37Z`). Then:
62
+
63
+ **Case A — first restore (`.mitigating.md` → `.restored.md`)**:
64
+
65
+ 1. `git mv docs/incidents/<I###>-<title>.mitigating.md docs/incidents/<I###>-<title>.restored.md`
66
+ 2. Update the `**Status**:` field from `Mitigating` to `Restored` via `Edit`.
67
+ 3. Append to the `## Timeline` section:
68
+
69
+ ```markdown
70
+ - [<timestamp> UTC] Service restored — <verification signal>
71
+ ```
72
+
73
+ **Case B — idempotent re-invocation (`.restored.md` stays `.restored.md`)**:
74
+
75
+ 1. No `git mv` needed.
76
+ 2. Do not re-edit the `**Status**:` field.
77
+ 3. If a verification signal was provided in `$ARGUMENTS` and differs from the signal recorded in the most recent "Service restored" Timeline entry, append an additional `- [<timestamp> UTC] Service restored (re-verified) — <verification signal>` line to the `## Timeline`. Otherwise, skip the Timeline append and proceed to Step 5 (the user may be re-running the handoff).
78
+
79
+ ### 5. Problem handoff
80
+
81
+ Ask via `AskUserQuestion`: "Service restored. Should I create or update a problem record for the root cause? (a) yes — recommended, (b) no — document why (trivial/one-off)".
82
+
83
+ **Branch (a) — Yes, create or update problem**:
84
+
85
+ 1. Construct a handoff payload:
86
+ - Incident ID and title
87
+ - Timeline summary (most recent entries)
88
+ - Top-ranked hypothesis + cited evidence from `## Hypotheses`
89
+ - Mitigation applied + verification signal from `## Mitigation attempts` + the just-appended Timeline entry
90
+ 2. Invoke `/wr-itil:manage-problem` via the **Skill tool** with the payload as arguments. The problem skill's existing dedupe flow handles new-vs-update — do not duplicate that logic here.
91
+ 3. Capture the returned `P<NNN>` (or `P<NNN> (updated)` for a dedupe hit).
92
+ 4. Write (or update) the incident's `## Linked Problem` section:
93
+
94
+ ```markdown
95
+ ## Linked Problem
96
+ P<NNN> (<title>) — <status>
97
+ ```
98
+
99
+ If the section already exists, edit it in place; otherwise append at the end of the file.
100
+
101
+ **Branch (b) — No, document the no-problem justification**:
102
+
103
+ 1. Ask via `AskUserQuestion` for the justification (free-text). Examples: "one-off cosmic-bit-flip; not reproducible", "transient upstream outage; no action on our side", "test incident (training exercise)".
104
+ 2. Write a `## No Problem` section into the incident file:
105
+
106
+ ```markdown
107
+ ## No Problem
108
+ <reason — e.g. "one-off cosmic-bit-flip; not reproducible">
109
+ ```
110
+
111
+ If a `## Linked Problem` section exists, replace it with the `## No Problem` section (one or the other — never both).
112
+
113
+ ### 6. Quality checks
114
+
115
+ After the restore, verify:
116
+
117
+ - **Status consistency**: `**Status**:` field matches the filename suffix (`Restored` + `.restored.md`).
118
+ - **Timeline monotonicity**: the new "Service restored" entry's timestamp is ≥ the last existing timeline entry's timestamp.
119
+ - **Post-restore sections present**: exactly one of `## Linked Problem` or `## No Problem` exists; never both, never neither.
120
+
121
+ ### 7. Report
122
+
123
+ Report:
124
+
125
+ - The file path created/modified.
126
+ - The incident ID and title.
127
+ - The transition (Mitigating → Restored, or Restored → Restored for re-invocations).
128
+ - The verification signal recorded.
129
+ - Whether a problem was created, updated, or skipped with a `## No Problem` justification.
130
+ - The linked problem ID (if any).
131
+ - A pointer: "Run `/wr-itil:close-incident <I###>` when the linked problem reaches Known Error, Verifying, or Closed (or if the incident carries a No Problem justification), or keep the incident in Restored while the root cause work progresses."
132
+
133
+ ### 8. Commit the completed work (ADR-014)
134
+
135
+ Per ADR-014, governance skills commit their own work.
136
+
137
+ 1. `git add` the renamed / modified incident file.
138
+ 2. Delegate to `wr-risk-scorer:pipeline` (subagent_type: `wr-risk-scorer:pipeline`) to assess the staged changes and create a bypass marker. If the subagent type is not available (spawned subagent surface), invoke `/wr-risk-scorer:assess-release` via the Skill tool instead — per ADR-015 it wraps the same pipeline subagent.
139
+ 3. If the assessed risk is within appetite: `git commit -m "docs(incidents): I<NNN> restored — <verification signal summary>"`. Otherwise do not commit yet — follow item 4.
140
+ 4. If risk is above appetite: use `AskUserQuestion` to ask whether to commit anyway, remediate first, or park the work. If `AskUserQuestion` is unavailable, skip the commit and report the uncommitted state clearly.
141
+
142
+ ### 9. Auto-release when changesets are queued (ADR-020)
143
+
144
+ **Skip this step if the skill is running inside an AFK orchestrator.** Orchestrators handle release cadence themselves per ADR-018 (Step 6.5). When in doubt, defer to the orchestrator by skipping this step.
145
+
146
+ Otherwise, after the commit in step 8 lands, drain the release queue so any queued changesets are actually published to npm without requiring manual user action.
147
+
148
+ **Mechanism — delegate, do not re-implement scoring (per ADR-015):**
149
+
150
+ 1. Invoke the release scorer. Two paths are valid:
151
+ - **Primary**: delegate to subagent type `wr-risk-scorer:pipeline` via the Agent tool.
152
+ - **Fallback**: if that subagent type is not available, invoke skill `/wr-risk-scorer:assess-release` via the Skill tool.
153
+ 2. Read the returned `RISK_SCORES: commit=X push=Y release=Z` line.
154
+ 3. **Drain condition**: if `push` and `release` are both within appetite (≤ 4/25, "Low" band per `RISK-POLICY.md`), AND `.changeset/` is non-empty, proceed to the drain action. Otherwise, skip the drain and report the unreleased state.
155
+
156
+ **Drain action (non-interactive, policy-authorised per ADR-013 Rule 6):**
157
+
158
+ 1. Run `npm run push:watch` (push + wait for CI to pass).
159
+ 2. If `.changeset/` remains non-empty after push (i.e. a release PR is pending), run `npm run release:watch` (merge the release PR + wait for npm publish).
160
+ 3. Report the release: "Released <package>@<version>. Restoration record is now live on npm."
161
+
162
+ **Failure handling**: if `push:watch` or `release:watch` fails (CI failure, publish failure), stop and report the failure clearly. Do not retry non-interactively — the user must intervene.
163
+
164
+ **Above-appetite branch**: if push/release risk is above appetite, skip the drain and report: "Release skipped — risk above appetite. Run `npm run push:watch` and `npm run release:watch` manually when ready."
165
+
166
+ ## Ownership boundary
167
+
168
+ `restore-incident` writes the Timeline "Service restored" entry, the Status field, the file rename on the transition, and exactly one of `## Linked Problem` or `## No Problem`. It does NOT:
169
+
170
+ - Close the incident (that is `/wr-itil:close-incident <I###>` — slice 6c).
171
+ - Link the incident to an existing problem without performing the restore transition (that is `/wr-itil:link-incident <I###> P<MMM>` — slice 6d).
172
+ - Record a mitigation attempt (that is `/wr-itil:mitigate-incident <I###> <action>` — slice 6a).
173
+ - Rename or transition a `.investigating.md` file — the incident must already be `.mitigating.md` before restore.
174
+
175
+ If the user wants any of the above, the skill reports the appropriate sibling and exits.
176
+
177
+ ## Related
178
+
179
+ - **P071** (`docs/problems/071-argument-based-skill-subcommands-are-not-discoverable.open.md`) — originating ticket. This skill is slice 6b of the P071 phased-landing plan.
180
+ - **ADR-010 amended** (`docs/decisions/010-rename-wr-problem-to-wr-itil.proposed.md` — Skill Granularity section) — canonical skill-split naming + forwarder contract + `deprecated-arguments: true` frontmatter flag.
181
+ - **ADR-011** (`docs/decisions/011-manage-incident-skill.proposed.md`) — incident lifecycle file-suffix conventions (`.investigating.md` / `.mitigating.md` / `.restored.md` / `.closed.md`) + Decision Outcome point 4 (direct Skill-tool invocation of `/wr-itil:manage-problem` for problem handoff).
182
+ - **ADR-013** Rule 1 — structured user interaction (verification-signal and handoff prompts use AskUserQuestion; deprecation notice uses systemMessage).
183
+ - **ADR-013** Rule 6 — policy-within-appetite non-interactive actions (release drain).
184
+ - **ADR-014** — governance skills commit their own work.
185
+ - **ADR-015** — release scorer delegation pattern.
186
+ - **ADR-020** — auto-release when changesets are queued.
187
+ - **ADR-037** (`docs/decisions/037-skill-testing-strategy.proposed.md`) — contract-assertion bats pattern applied to this skill.
188
+ - **JTBD-001** (`docs/jtbd/solo-developer/JTBD-001-enforce-governance.proposed.md`) — discoverable surface via `/wr-itil:` autocomplete.
189
+ - **JTBD-101** (`docs/jtbd/plugin-developer/JTBD-101-extend-suite.proposed.md`) — one skill per distinct user intent.
190
+ - **JTBD-201** (`docs/jtbd/tech-lead/JTBD-201-restore-service-fast.proposed.md`) — this skill IS the active-restoration path; audit trail invariants preserved post-split.
191
+ - `packages/itil/skills/manage-incident/SKILL.md` — hosts the thin-router forwarder for the deprecated `manage-incident <I###> restored` form.
192
+ - `packages/itil/skills/mitigate-incident/SKILL.md` — slice 6a precedent; restore-incident mirrors the split shape.
193
+ - `packages/itil/skills/manage-problem/SKILL.md` — cross-skill invocation target during problem handoff.
194
+
195
+ $ARGUMENTS