agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
|
@@ -0,0 +1,565 @@
|
|
|
1
|
+
# Incident Management
|
|
2
|
+
|
|
3
|
+
Comprehensive guidelines for detecting, responding to, and learning from incidents.
|
|
4
|
+
|
|
5
|
+
## Core Principles
|
|
6
|
+
|
|
7
|
+
1. **Detect Fast** - Automated monitoring catches issues before users report them
|
|
8
|
+
2. **Communicate Clearly** - Stakeholders know what's happening and when it will be fixed
|
|
9
|
+
3. **Mitigate First** - Stop the bleeding before root cause analysis
|
|
10
|
+
4. **Learn Always** - Every incident is an opportunity to improve
|
|
11
|
+
|
|
12
|
+
## Incident Severity Levels
|
|
13
|
+
|
|
14
|
+
### Severity Definitions
|
|
15
|
+
|
|
16
|
+
```yaml
|
|
17
|
+
sev1_critical:
|
|
18
|
+
description: "Complete service outage or severe security incident"
|
|
19
|
+
criteria:
|
|
20
|
+
- "Core service completely unavailable"
|
|
21
|
+
- "Data breach or security incident in progress"
|
|
22
|
+
- "Data loss affecting customers"
|
|
23
|
+
- "Revenue-impacting payment failures"
|
|
24
|
+
response:
|
|
25
|
+
time_to_acknowledge: "5 minutes"
|
|
26
|
+
time_to_engage: "15 minutes"
|
|
27
|
+
war_room: "Immediately"
|
|
28
|
+
status_page: "Update within 15 minutes"
|
|
29
|
+
executive_notification: "Yes"
|
|
30
|
+
examples:
|
|
31
|
+
- "Production database down"
|
|
32
|
+
- "API returning 5xx for all requests"
|
|
33
|
+
- "Active security breach"
|
|
34
|
+
- "Complete payment processing failure"
|
|
35
|
+
|
|
36
|
+
sev2_major:
|
|
37
|
+
description: "Significant degradation affecting many users"
|
|
38
|
+
criteria:
|
|
39
|
+
- "Major feature unavailable"
|
|
40
|
+
- ">=10% of users affected"
|
|
41
|
+
- "Significant latency increase"
|
|
42
|
+
- "Partial data loss risk"
|
|
43
|
+
response:
|
|
44
|
+
time_to_acknowledge: "15 minutes"
|
|
45
|
+
time_to_engage: "30 minutes"
|
|
46
|
+
war_room: "If not resolved in 30 minutes"
|
|
47
|
+
status_page: "Update within 30 minutes"
|
|
48
|
+
executive_notification: "If > 1 hour duration"
|
|
49
|
+
examples:
|
|
50
|
+
- "Search functionality broken"
|
|
51
|
+
- "50% error rate on checkout"
|
|
52
|
+
- "Mobile app unable to sync"
|
|
53
|
+
- "Authentication intermittently failing"
|
|
54
|
+
|
|
55
|
+
sev3_minor:
|
|
56
|
+
description: "Limited impact with workaround available"
|
|
57
|
+
criteria:
|
|
58
|
+
- "Single feature degraded"
|
|
59
|
+
- "<10% of users affected"
|
|
60
|
+
- "Workaround exists"
|
|
61
|
+
- "Non-critical functionality"
|
|
62
|
+
response:
|
|
63
|
+
time_to_acknowledge: "1 hour"
|
|
64
|
+
time_to_engage: "4 hours"
|
|
65
|
+
war_room: "Not required"
|
|
66
|
+
status_page: "Optional"
|
|
67
|
+
executive_notification: "No"
|
|
68
|
+
examples:
|
|
69
|
+
- "Export feature failing"
|
|
70
|
+
- "Admin dashboard slow"
|
|
71
|
+
- "Email notifications delayed"
|
|
72
|
+
- "Analytics not updating"
|
|
73
|
+
|
|
74
|
+
sev4_low:
|
|
75
|
+
description: "Minimal impact, cosmetic issues"
|
|
76
|
+
criteria:
|
|
77
|
+
- "No user-facing impact"
|
|
78
|
+
- "Internal tooling issues"
|
|
79
|
+
- "Cosmetic bugs"
|
|
80
|
+
response:
|
|
81
|
+
time_to_acknowledge: "Next business day"
|
|
82
|
+
time_to_engage: "Standard sprint work"
|
|
83
|
+
war_room: "Not required"
|
|
84
|
+
status_page: "No"
|
|
85
|
+
executive_notification: "No"
|
|
86
|
+
examples:
|
|
87
|
+
- "Log formatting issues"
|
|
88
|
+
- "Internal dashboard UI bug"
|
|
89
|
+
- "Dev environment problems"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Incident Response Process
|
|
93
|
+
|
|
94
|
+
### Phase 1: Detection
|
|
95
|
+
|
|
96
|
+
```yaml
|
|
97
|
+
detection_sources:
|
|
98
|
+
automated:
|
|
99
|
+
- "Alerting systems (Prometheus, Datadog)"
|
|
100
|
+
- "Synthetic monitoring"
|
|
101
|
+
- "Error tracking (Sentry, Bugsnag)"
|
|
102
|
+
- "APM anomaly detection"
|
|
103
|
+
|
|
104
|
+
human:
|
|
105
|
+
- "User reports (support tickets)"
|
|
106
|
+
- "Internal reports (engineers, QA)"
|
|
107
|
+
- "Social media monitoring"
|
|
108
|
+
- "Partner notifications"
|
|
109
|
+
|
|
110
|
+
detection_goals:
|
|
111
|
+
mttd_targets:
|
|
112
|
+
sev1: "< 5 minutes"
|
|
113
|
+
sev2: "< 15 minutes"
|
|
114
|
+
sev3: "< 1 hour"
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Phase 2: Response
|
|
118
|
+
|
|
119
|
+
```yaml
|
|
120
|
+
initial_response:
|
|
121
|
+
steps:
|
|
122
|
+
1: "Acknowledge alert/report"
|
|
123
|
+
2: "Assess severity based on criteria"
|
|
124
|
+
3: "Declare incident if criteria met"
|
|
125
|
+
4: "Create incident channel"
|
|
126
|
+
5: "Page appropriate responders"
|
|
127
|
+
6: "Start incident document"
|
|
128
|
+
|
|
129
|
+
incident_declaration:
|
|
130
|
+
when_to_declare:
|
|
131
|
+
- "Alert indicates customer impact"
|
|
132
|
+
- "Multiple related alerts firing"
|
|
133
|
+
- "User reports confirmed"
|
|
134
|
+
- "Uncertainty about scope"
|
|
135
|
+
|
|
136
|
+
how_to_declare:
|
|
137
|
+
slack: "/incident create [description] [severity]"
|
|
138
|
+
pagerduty: "Trigger incident with severity"
|
|
139
|
+
|
|
140
|
+
incident_channel:
|
|
141
|
+
naming: "inc-YYYY-MM-DD-short-description"
|
|
142
|
+
topic: "SEV[X] | IC: @name | Status: investigating"
|
|
143
|
+
pinned:
|
|
144
|
+
- "Incident summary"
|
|
145
|
+
- "Timeline document"
|
|
146
|
+
- "Relevant dashboards"
|
|
147
|
+
- "Runbook links"
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Phase 3: Roles and Responsibilities
|
|
151
|
+
|
|
152
|
+
```yaml
|
|
153
|
+
incident_commander:
|
|
154
|
+
also_known_as: "IC"
|
|
155
|
+
responsibilities:
|
|
156
|
+
- "Single point of coordination"
|
|
157
|
+
- "Assign and reassign roles"
|
|
158
|
+
- "Make decisions when no consensus"
|
|
159
|
+
- "Manage communication cadence"
|
|
160
|
+
- "Escalate when needed"
|
|
161
|
+
- "Declare incident resolved"
|
|
162
|
+
should_not:
|
|
163
|
+
- "Debug technical issues directly"
|
|
164
|
+
- "Write code to fix the issue"
|
|
165
|
+
- "Get lost in technical details"
|
|
166
|
+
|
|
167
|
+
technical_lead:
|
|
168
|
+
also_known_as: "Tech Lead, TL"
|
|
169
|
+
responsibilities:
|
|
170
|
+
- "Lead technical investigation"
|
|
171
|
+
- "Coordinate debugging efforts"
|
|
172
|
+
- "Make technical decisions"
|
|
173
|
+
- "Implement fixes"
|
|
174
|
+
- "Verify resolution"
|
|
175
|
+
should_not:
|
|
176
|
+
- "Handle communications"
|
|
177
|
+
- "Update status page"
|
|
178
|
+
- "Brief stakeholders"
|
|
179
|
+
|
|
180
|
+
communications_lead:
|
|
181
|
+
also_known_as: "Comms Lead"
|
|
182
|
+
responsibilities:
|
|
183
|
+
- "Update status page"
|
|
184
|
+
- "Send stakeholder updates"
|
|
185
|
+
- "Draft customer communications"
|
|
186
|
+
- "Coordinate with support team"
|
|
187
|
+
- "Manage external messaging"
|
|
188
|
+
|
|
189
|
+
scribe:
|
|
190
|
+
responsibilities:
|
|
191
|
+
- "Maintain incident timeline"
|
|
192
|
+
- "Document key decisions"
|
|
193
|
+
- "Record actions taken"
|
|
194
|
+
- "Capture screenshots/data"
|
|
195
|
+
- "Note participants"
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
### Phase 4: Mitigation
|
|
199
|
+
|
|
200
|
+
```yaml
|
|
201
|
+
mitigation_priority:
|
|
202
|
+
1_stop_the_bleeding:
|
|
203
|
+
goal: "Restore service to users"
|
|
204
|
+
actions:
|
|
205
|
+
- "Rollback recent changes"
|
|
206
|
+
- "Scale up resources"
|
|
207
|
+
- "Use feature flags to disable broken code"
|
|
208
|
+
- "Failover to backup systems"
|
|
209
|
+
- "Enable maintenance mode"
|
|
210
|
+
|
|
211
|
+
2_stabilize:
|
|
212
|
+
goal: "Ensure service stays up"
|
|
213
|
+
actions:
|
|
214
|
+
- "Apply temporary fixes"
|
|
215
|
+
- "Add extra monitoring"
|
|
216
|
+
- "Scale for headroom"
|
|
217
|
+
- "Disable non-critical features"
|
|
218
|
+
|
|
219
|
+
3_root_cause_later:
|
|
220
|
+
goal: "Don't chase root cause while users are still impacted"
|
|
221
|
+
note: "Root cause analysis happens in postmortem"
|
|
222
|
+
|
|
223
|
+
common_mitigation_actions:
|
|
224
|
+
rollback:
|
|
225
|
+
when: "Recent deployment suspected"
|
|
226
|
+
command: "kubectl rollout undo deployment/[name]"
|
|
227
|
+
verify: "Check error rates return to normal"
|
|
228
|
+
|
|
229
|
+
scale_up:
|
|
230
|
+
when: "Capacity-related issues"
|
|
231
|
+
command: "kubectl scale deployment/[name] --replicas=[N]"
|
|
232
|
+
verify: "Check resource utilization decreases"
|
|
233
|
+
|
|
234
|
+
failover:
|
|
235
|
+
when: "Primary system unrecoverable"
|
|
236
|
+
procedure: "Follow DR runbook"
|
|
237
|
+
verify: "Traffic flowing to secondary"
|
|
238
|
+
|
|
239
|
+
feature_flag:
|
|
240
|
+
when: "Specific feature causing issues"
|
|
241
|
+
action: "Disable flag in LaunchDarkly/Unleash"
|
|
242
|
+
verify: "Feature disabled, errors stop"
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### Phase 5: Resolution
|
|
246
|
+
|
|
247
|
+
```yaml
|
|
248
|
+
resolution_criteria:
|
|
249
|
+
technical:
|
|
250
|
+
- "Error rates at normal levels"
|
|
251
|
+
- "Latency within SLO"
|
|
252
|
+
- "No degradation visible"
|
|
253
|
+
- "Monitors green"
|
|
254
|
+
|
|
255
|
+
operational:
|
|
256
|
+
- "Mitigation is stable (not just temporarily fixed)"
|
|
257
|
+
- "No immediate risk of recurrence"
|
|
258
|
+
- "Team can step down from incident"
|
|
259
|
+
|
|
260
|
+
post_resolution:
|
|
261
|
+
immediate:
|
|
262
|
+
- "Update status page to resolved"
|
|
263
|
+
- "Send final stakeholder update"
|
|
264
|
+
- "Thank participants"
|
|
265
|
+
- "Schedule postmortem"
|
|
266
|
+
|
|
267
|
+
within_24_hours:
|
|
268
|
+
- "Draft initial postmortem"
|
|
269
|
+
- "Gather timeline from participants"
|
|
270
|
+
- "Collect relevant data/logs"
|
|
271
|
+
|
|
272
|
+
within_5_days:
|
|
273
|
+
- "Complete postmortem review"
|
|
274
|
+
- "Create action items"
|
|
275
|
+
- "Share learnings"
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
## Communication Templates
|
|
279
|
+
|
|
280
|
+
### Status Page Update Template
|
|
281
|
+
|
|
282
|
+
```markdown
|
|
283
|
+
**[Investigating/Identified/Monitoring/Resolved]**
|
|
284
|
+
|
|
285
|
+
**Summary**: [One sentence description of user impact]
|
|
286
|
+
|
|
287
|
+
**Affected Services**: [List of affected services]
|
|
288
|
+
|
|
289
|
+
**Current Status**: [What we know and what we're doing]
|
|
290
|
+
|
|
291
|
+
**Next Update**: [Time of next update]
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
Examples:
|
|
295
|
+
|
|
296
|
+
**Investigating**
|
|
297
|
+
We are currently investigating reports of elevated error rates
|
|
298
|
+
on the checkout flow. Some users may experience failures when
|
|
299
|
+
completing purchases. We will provide an update in 15 minutes.
|
|
300
|
+
|
|
301
|
+
**Identified**
|
|
302
|
+
We have identified the cause of checkout failures as a database
|
|
303
|
+
connectivity issue. We are working on restoring connectivity.
|
|
304
|
+
Estimated resolution in 30 minutes.
|
|
305
|
+
|
|
306
|
+
**Monitoring**
|
|
307
|
+
A fix has been implemented. We are monitoring to confirm
|
|
308
|
+
stability. Users should no longer experience checkout failures.
|
|
309
|
+
|
|
310
|
+
**Resolved**
|
|
311
|
+
This incident has been resolved. Checkout functionality has
|
|
312
|
+
been restored. Total duration: 45 minutes.
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
### Stakeholder Update Template
|
|
316
|
+
|
|
317
|
+
```markdown
|
|
318
|
+
## Incident Update: [Title]
|
|
319
|
+
|
|
320
|
+
**Severity**: SEV[X]
|
|
321
|
+
**Status**: [Active/Resolved]
|
|
322
|
+
**Duration**: [X hours Y minutes]
|
|
323
|
+
|
|
324
|
+
### What Happened
|
|
325
|
+
[Brief description of the issue and user impact]
|
|
326
|
+
|
|
327
|
+
### Current Status
|
|
328
|
+
[What we know and what we're doing]
|
|
329
|
+
|
|
330
|
+
### Business Impact
|
|
331
|
+
- Users affected: [X]
|
|
332
|
+
- Revenue impact: [$X] (if applicable)
|
|
333
|
+
- Customer complaints: [X]
|
|
334
|
+
|
|
335
|
+
### Next Steps
|
|
336
|
+
[What happens next, when the next update will be]
|
|
337
|
+
|
|
338
|
+
### Questions?
|
|
339
|
+
Contact: [Incident Commander name and channel]
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
## War Room Guidelines
|
|
343
|
+
|
|
344
|
+
### Setting Up a War Room
|
|
345
|
+
|
|
346
|
+
```yaml
|
|
347
|
+
virtual_war_room:
|
|
348
|
+
when: "SEV1 (immediately) or SEV2 not resolved within 30 minutes"
|
|
349
|
+
|
|
350
|
+
setup:
|
|
351
|
+
- "Create video call (Zoom/Meet/Teams)"
|
|
352
|
+
- "Post link in incident channel"
|
|
353
|
+
- "IC joins immediately"
|
|
354
|
+
- "Technical responders join as needed"
|
|
355
|
+
|
|
356
|
+
ground_rules:
|
|
357
|
+
- "IC runs the call"
|
|
358
|
+
- "Mute when not speaking"
|
|
359
|
+
- "Use raise hand to speak"
|
|
360
|
+
- "Technical work happens off-call, report back"
|
|
361
|
+
- "No side conversations"
|
|
362
|
+
- "Stay focused on mitigation"
|
|
363
|
+
|
|
364
|
+
communication_cadence:
|
|
365
|
+
- "Status update every 15 minutes"
|
|
366
|
+
- "IC asks for updates from each workstream"
|
|
367
|
+
- "Decisions announced and documented"
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
### War Room Commands
|
|
371
|
+
|
|
372
|
+
```yaml
|
|
373
|
+
ic_commands:
|
|
374
|
+
status_check:
|
|
375
|
+
phrase: "Status check - what's your current state?"
|
|
376
|
+
purpose: "Get updates from all participants"
|
|
377
|
+
|
|
378
|
+
decision_time:
|
|
379
|
+
phrase: "We need to make a decision on [X]. Options are [A, B, C]. Any objections to [A]?"
|
|
380
|
+
purpose: "Drive decisions forward"
|
|
381
|
+
|
|
382
|
+
escalation:
|
|
383
|
+
phrase: "We need to escalate to [person/team]. [Name], can you page them?"
|
|
384
|
+
purpose: "Bring in additional help"
|
|
385
|
+
|
|
386
|
+
parallel_work:
|
|
387
|
+
phrase: "[Name] investigate [X]. [Name] investigate [Y]. Report back in 10 minutes."
|
|
388
|
+
purpose: "Divide and conquer"
|
|
389
|
+
|
|
390
|
+
refocus:
|
|
391
|
+
phrase: "Let's refocus on [priority]. We can debug [other thing] after the incident."
|
|
392
|
+
purpose: "Keep team on track"
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
## On-Call Integration
|
|
396
|
+
|
|
397
|
+
### Escalation Paths
|
|
398
|
+
|
|
399
|
+
```yaml
|
|
400
|
+
escalation_matrix:
|
|
401
|
+
api_services:
|
|
402
|
+
primary: "backend-oncall"
|
|
403
|
+
secondary: "backend-lead"
|
|
404
|
+
tertiary: "engineering-manager"
|
|
405
|
+
executive: "vp-engineering"
|
|
406
|
+
|
|
407
|
+
infrastructure:
|
|
408
|
+
primary: "infra-oncall"
|
|
409
|
+
secondary: "sre-lead"
|
|
410
|
+
tertiary: "engineering-manager"
|
|
411
|
+
executive: "vp-engineering"
|
|
412
|
+
|
|
413
|
+
database:
|
|
414
|
+
primary: "dba-oncall"
|
|
415
|
+
secondary: "dba-lead"
|
|
416
|
+
tertiary: "infra-oncall"
|
|
417
|
+
executive: "vp-engineering"
|
|
418
|
+
|
|
419
|
+
security:
|
|
420
|
+
primary: "security-oncall"
|
|
421
|
+
secondary: "security-lead"
|
|
422
|
+
tertiary: "ciso"
|
|
423
|
+
executive: "cto"
|
|
424
|
+
|
|
425
|
+
escalation_triggers:
|
|
426
|
+
automatic:
|
|
427
|
+
- "No acknowledgment within SLA"
|
|
428
|
+
- "Incident duration exceeds threshold"
|
|
429
|
+
- "Severity upgraded"
|
|
430
|
+
|
|
431
|
+
manual:
|
|
432
|
+
- "Primary on-call requests help"
|
|
433
|
+
- "Technical expertise needed"
|
|
434
|
+
- "Business decision required"
|
|
435
|
+
```
|
|
436
|
+
|
|
437
|
+
### Handoff During Incidents
|
|
438
|
+
|
|
439
|
+
```yaml
|
|
440
|
+
shift_handoff_during_incident:
|
|
441
|
+
when: "Incident spans on-call rotation change"
|
|
442
|
+
|
|
443
|
+
process:
|
|
444
|
+
1: "Outgoing on-call notifies IC of shift change"
|
|
445
|
+
2: "30-minute overlap for knowledge transfer"
|
|
446
|
+
3: "Incoming on-call joins war room"
|
|
447
|
+
4: "Outgoing provides verbal summary"
|
|
448
|
+
5: "Incoming confirms understanding"
|
|
449
|
+
6: "IC announces handoff complete"
|
|
450
|
+
|
|
451
|
+
handoff_includes:
|
|
452
|
+
- "Current incident status"
|
|
453
|
+
- "What's been tried"
|
|
454
|
+
- "Current hypothesis"
|
|
455
|
+
- "Assigned tasks"
|
|
456
|
+
- "Open questions"
|
|
457
|
+
```
|
|
458
|
+
|
|
459
|
+
## Incident Documentation
|
|
460
|
+
|
|
461
|
+
### Real-Time Timeline
|
|
462
|
+
|
|
463
|
+
```markdown
|
|
464
|
+
## Incident Timeline: inc-2025-01-15-api-outage
|
|
465
|
+
|
|
466
|
+
All times UTC
|
|
467
|
+
|
|
468
|
+
| Time | Actor | Event |
|
|
469
|
+
|------|-------|-------|
|
|
470
|
+
| 14:00 | System | Deployment api-server v2.3.1 started |
|
|
471
|
+
| 14:02 | System | Deployment completed |
|
|
472
|
+
| 14:05 | Alert | APIHighErrorRate fired |
|
|
473
|
+
| 14:06 | @oncall | Acknowledged alert |
|
|
474
|
+
| 14:08 | @oncall | Incident declared SEV2 |
|
|
475
|
+
| 14:08 | @oncall | Created #inc-2025-01-15-api-outage |
|
|
476
|
+
| 14:10 | @oncall | Checking recent deployments |
|
|
477
|
+
| 14:12 | @oncall | Identified v2.3.1 deployed 5 min before errors |
|
|
478
|
+
| 14:15 | @oncall | Initiating rollback to v2.3.0 |
|
|
479
|
+
| 14:18 | System | Rollback completed |
|
|
480
|
+
| 14:20 | @oncall | Error rates returning to normal |
|
|
481
|
+
| 14:25 | @oncall | Monitoring - errors at baseline |
|
|
482
|
+
| 14:30 | @oncall | Incident resolved, scheduling postmortem |
|
|
483
|
+
```
|
|
484
|
+
|
|
485
|
+
## Metrics and Improvement
|
|
486
|
+
|
|
487
|
+
### Incident Metrics to Track
|
|
488
|
+
|
|
489
|
+
```yaml
|
|
490
|
+
response_metrics:
|
|
491
|
+
mttd:
|
|
492
|
+
name: "Mean Time to Detect"
|
|
493
|
+
target: "< 5 minutes for SEV1"
|
|
494
|
+
measurement: "Time from incident start to first alert or report"
|
|
495
|
+
|
|
496
|
+
mtta:
|
|
497
|
+
name: "Mean Time to Acknowledge"
|
|
498
|
+
target: "< 5 minutes for SEV1"
|
|
499
|
+
measurement: "Time from alert to acknowledgment"
|
|
500
|
+
|
|
501
|
+
mttm:
|
|
502
|
+
name: "Mean Time to Mitigate"
|
|
503
|
+
target: "< 30 minutes for SEV1"
|
|
504
|
+
measurement: "Time from detection to user impact resolved"
|
|
505
|
+
|
|
506
|
+
mttr:
|
|
507
|
+
name: "Mean Time to Resolve"
|
|
508
|
+
target: "< 4 hours for SEV1"
|
|
509
|
+
measurement: "Time from detection to root cause fixed"
|
|
510
|
+
|
|
511
|
+
volume_metrics:
|
|
512
|
+
incidents_per_week:
|
|
513
|
+
target: "Decreasing trend"
|
|
514
|
+
|
|
515
|
+
incidents_by_severity:
|
|
516
|
+
goal: "Fewer SEV1/SEV2, more caught as SEV3/SEV4"
|
|
517
|
+
|
|
518
|
+
repeat_incidents:
|
|
519
|
+
target: "< 10% of incidents are repeats"
|
|
520
|
+
|
|
521
|
+
quality_metrics:
|
|
522
|
+
postmortem_completion_rate:
|
|
523
|
+
target: "100% for SEV1/SEV2"
|
|
524
|
+
|
|
525
|
+
action_item_completion_rate:
|
|
526
|
+
target: "> 90% within 30 days"
|
|
527
|
+
```
|
|
528
|
+
|
|
529
|
+
## Common Pitfalls
|
|
530
|
+
|
|
531
|
+
### During Incidents
|
|
532
|
+
|
|
533
|
+
```yaml
|
|
534
|
+
pitfall_debugging_during_outage:
|
|
535
|
+
wrong: "Spending 30 minutes debugging while users are down"
|
|
536
|
+
right: "Mitigate first (rollback, scale, failover), debug later"
|
|
537
|
+
|
|
538
|
+
pitfall_too_many_cooks:
|
|
539
|
+
wrong: "Everyone jumping in and trying different things"
|
|
540
|
+
right: "IC assigns specific tasks, coordinates parallel work"
|
|
541
|
+
|
|
542
|
+
pitfall_silent_war_room:
|
|
543
|
+
wrong: "People working silently, no one knows what's happening"
|
|
544
|
+
right: "Regular status updates, thinking out loud"
|
|
545
|
+
|
|
546
|
+
pitfall_forgetting_communication:
|
|
547
|
+
wrong: "Technical team so focused they forget to update stakeholders"
|
|
548
|
+
right: "Comms Lead handles all external communication"
|
|
549
|
+
```
|
|
550
|
+
|
|
551
|
+
### After Incidents
|
|
552
|
+
|
|
553
|
+
```yaml
|
|
554
|
+
pitfall_skipping_postmortem:
|
|
555
|
+
wrong: "We fixed it, move on"
|
|
556
|
+
right: "Every SEV1/SEV2 gets a postmortem within 5 days"
|
|
557
|
+
|
|
558
|
+
pitfall_blame_game:
|
|
559
|
+
wrong: "Who deployed the bad code?"
|
|
560
|
+
right: "What systemic issues allowed this to happen?"
|
|
561
|
+
|
|
562
|
+
pitfall_action_item_graveyard:
|
|
563
|
+
wrong: "Create action items that never get done"
|
|
564
|
+
right: "Track action items, report on completion, prioritize fixes"
|
|
565
|
+
```
|