agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
|
@@ -0,0 +1,588 @@
|
|
|
1
|
+
# Postmortems
|
|
2
|
+
|
|
3
|
+
Comprehensive guidelines for conducting blameless postmortems and learning from incidents.
|
|
4
|
+
|
|
5
|
+
## Core Principles
|
|
6
|
+
|
|
7
|
+
1. **Blameless** - Focus on systems, not individuals
|
|
8
|
+
2. **Thorough** - Understand root causes, not just symptoms
|
|
9
|
+
3. **Actionable** - Create specific, tracked action items
|
|
10
|
+
4. **Shared** - Learning benefits the whole organization
|
|
11
|
+
|
|
12
|
+
## The Blameless Culture
|
|
13
|
+
|
|
14
|
+
### Why Blameless?
|
|
15
|
+
|
|
16
|
+
```yaml
|
|
17
|
+
blame_culture_problems:
|
|
18
|
+
hiding_information:
|
|
19
|
+
behavior: "Engineers don't report near-misses"
|
|
20
|
+
impact: "Can't learn from close calls"
|
|
21
|
+
|
|
22
|
+
defensive_responses:
|
|
23
|
+
behavior: "Focus on who, not how"
|
|
24
|
+
impact: "Miss systemic improvements"
|
|
25
|
+
|
|
26
|
+
fear_of_reporting:
|
|
27
|
+
behavior: "Incidents go unreported"
|
|
28
|
+
impact: "Can't improve what we don't know about"
|
|
29
|
+
|
|
30
|
+
simplified_narratives:
|
|
31
|
+
behavior: "Blame single cause/person"
|
|
32
|
+
impact: "Miss complex contributing factors"
|
|
33
|
+
|
|
34
|
+
blameless_principles:
|
|
35
|
+
assume_good_intentions:
|
|
36
|
+
premise: "Engineers make decisions based on available information"
|
|
37
|
+
question: "What did they know at the time?"
|
|
38
|
+
|
|
39
|
+
systems_thinking:
|
|
40
|
+
premise: "Humans are part of a system"
|
|
41
|
+
question: "What about the system allowed this?"
|
|
42
|
+
|
|
43
|
+
learning_focus:
|
|
44
|
+
premise: "Goal is improvement, not punishment"
|
|
45
|
+
question: "How do we prevent this class of error?"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Blameless Language
|
|
49
|
+
|
|
50
|
+
```yaml
|
|
51
|
+
language_examples:
|
|
52
|
+
avoid:
|
|
53
|
+
- "Who broke this?"
|
|
54
|
+
- "Why didn't you check?"
|
|
55
|
+
- "You should have known"
|
|
56
|
+
- "This was a mistake by [person]"
|
|
57
|
+
- "Failure to follow procedure"
|
|
58
|
+
|
|
59
|
+
prefer:
|
|
60
|
+
- "What happened?"
|
|
61
|
+
- "What information was available?"
|
|
62
|
+
- "What made this decision seem reasonable?"
|
|
63
|
+
- "The system allowed this to happen"
|
|
64
|
+
- "The procedure didn't account for this case"
|
|
65
|
+
|
|
66
|
+
reframing_examples:
|
|
67
|
+
- blame: "Developer deployed without testing"
|
|
68
|
+
  blameless: "The deployment process allowed deployment without test verification"
|
|
69
|
+
|
|
70
|
+
- blame: "Engineer didn't follow the runbook"
|
|
71
|
+
  blameless: "The runbook was unclear about this scenario"
|
|
72
|
+
|
|
73
|
+
- blame: "On-call should have caught this"
|
|
74
|
+
  blameless: "The alert didn't fire for this condition"
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Postmortem Process
|
|
78
|
+
|
|
79
|
+
### When to Write a Postmortem
|
|
80
|
+
|
|
81
|
+
```yaml
|
|
82
|
+
always_required:
|
|
83
|
+
- "SEV1 incidents (any duration)"
|
|
84
|
+
- "SEV2 incidents (> 30 minutes)"
|
|
85
|
+
- "Data loss incidents"
|
|
86
|
+
- "Security incidents"
|
|
87
|
+
- "Near-misses that could have been severe"
|
|
88
|
+
|
|
89
|
+
recommended:
|
|
90
|
+
- "SEV3 incidents with learning opportunities"
|
|
91
|
+
- "Recurring issues (3+ occurrences)"
|
|
92
|
+
- "Novel failure modes"
|
|
93
|
+
|
|
94
|
+
optional:
|
|
95
|
+
- "Minor incidents with obvious fixes"
|
|
96
|
+
- "Issues caught before user impact"
|
|
97
|
+
|
|
98
|
+
timeline:
|
|
99
|
+
draft: "Within 24-48 hours of incident"
|
|
100
|
+
review: "Within 5 business days"
|
|
101
|
+
publication: "Within 7 business days"
|
|
102
|
+
action_items: "Due within 30 days"
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Postmortem Meeting
|
|
106
|
+
|
|
107
|
+
```yaml
|
|
108
|
+
meeting_structure:
|
|
109
|
+
duration: "60-90 minutes"
|
|
110
|
+
|
|
111
|
+
attendees:
|
|
112
|
+
required:
|
|
113
|
+
- "Incident responders"
|
|
114
|
+
- "Service owner"
|
|
115
|
+
- "Postmortem facilitator"
|
|
116
|
+
optional:
|
|
117
|
+
- "Related team representatives"
|
|
118
|
+
- "Management (listening, not judging)"
|
|
119
|
+
|
|
120
|
+
agenda:
|
|
121
|
+
- "Timeline review (20 min)"
|
|
122
|
+
- "Contributing factors (20 min)"
|
|
123
|
+
- "What went well (10 min)"
|
|
124
|
+
- "What could be improved (15 min)"
|
|
125
|
+
- "Action items (15 min)"
|
|
126
|
+
|
|
127
|
+
facilitator_role:
|
|
128
|
+
- "Keep discussion blameless"
|
|
129
|
+
- "Ensure all voices heard"
|
|
130
|
+
- "Drive toward action items"
|
|
131
|
+
- "Capture decisions and insights"
|
|
132
|
+
- "Redirect blame to systems"
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Postmortem Template
|
|
136
|
+
|
|
137
|
+
```markdown
|
|
138
|
+
# Postmortem: [Incident Title]
|
|
139
|
+
|
|
140
|
+
## Metadata
|
|
141
|
+
|
|
142
|
+
| Field | Value |
|
|
143
|
+
|-------|-------|
|
|
144
|
+
| **Date** | YYYY-MM-DD |
|
|
145
|
+
| **Authors** | @name, @name |
|
|
146
|
+
| **Reviewers** | @name, @name |
|
|
147
|
+
| **Status** | Draft / In Review / Final |
|
|
148
|
+
| **Severity** | SEV1 / SEV2 / SEV3 |
|
|
149
|
+
| **Incident ID** | INC-XXXX |
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## Executive Summary
|
|
154
|
+
|
|
155
|
+
**One-paragraph summary**: What happened, how long it lasted, what the impact was, and what we're doing to prevent recurrence.
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## Impact
|
|
160
|
+
|
|
161
|
+
### User Impact
|
|
162
|
+
- **Duration**: X hours Y minutes
|
|
163
|
+
- **Users affected**: X% of users / Y total users
|
|
164
|
+
- **Functionality affected**: [List of affected features]
|
|
165
|
+
|
|
166
|
+
### Business Impact
|
|
167
|
+
- **Revenue impact**: $X (if applicable)
|
|
168
|
+
- **Customer complaints**: X tickets filed
|
|
169
|
+
- **SLA breach**: Yes/No
|
|
170
|
+
|
|
171
|
+
### SLO Impact
|
|
172
|
+
- **Error budget consumed**: X%
|
|
173
|
+
- **Monthly budget remaining**: Y%
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Timeline
|
|
178
|
+
|
|
179
|
+
All times in UTC.
|
|
180
|
+
|
|
181
|
+
| Time | Event |
|
|
182
|
+
|------|-------|
|
|
183
|
+
| 14:00 | [First sign of issue - what monitoring showed] |
|
|
184
|
+
| 14:05 | [Alert fired - which alert, who received] |
|
|
185
|
+
| 14:07 | [On-call acknowledged - initial actions] |
|
|
186
|
+
| 14:15 | [Escalation - who was paged, why] |
|
|
187
|
+
| 14:20 | [Root cause identified - how it was found] |
|
|
188
|
+
| 14:25 | [Mitigation started - what action was taken] |
|
|
189
|
+
| 14:30 | [Service restored - verification steps] |
|
|
190
|
+
| 14:45 | [Incident closed - final verification] |
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Root Cause Analysis
|
|
195
|
+
|
|
196
|
+
### What Happened
|
|
197
|
+
|
|
198
|
+
Detailed technical explanation of what went wrong. Include:
|
|
199
|
+
- The immediate cause
|
|
200
|
+
- The sequence of events
|
|
201
|
+
- Technical details relevant to understanding
|
|
202
|
+
|
|
203
|
+
### Why It Happened
|
|
204
|
+
|
|
205
|
+
Use the "5 Whys" technique to get to the root cause:
|
|
206
|
+
|
|
207
|
+
1. **Why** did the service return errors?
|
|
208
|
+
- Because the database connection pool was exhausted.
|
|
209
|
+
|
|
210
|
+
2. **Why** was the connection pool exhausted?
|
|
211
|
+
- Because a slow query was holding connections for 30+ seconds.
|
|
212
|
+
|
|
213
|
+
3. **Why** was the query slow?
|
|
214
|
+
- Because a missing index caused a full table scan.
|
|
215
|
+
|
|
216
|
+
4. **Why** was the index missing?
|
|
217
|
+
- Because the migration to add it was never run in production.
|
|
218
|
+
|
|
219
|
+
5. **Why** wasn't the migration run?
|
|
220
|
+
- Because our deployment process doesn't verify pending migrations.
|
|
221
|
+
|
|
222
|
+
**Root Cause**: Deployment process lacks verification of pending database migrations.
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Contributing Factors
|
|
227
|
+
|
|
228
|
+
Factors that made the incident possible or worse:
|
|
229
|
+
|
|
230
|
+
| Factor | Description | Type |
|
|
231
|
+
|--------|-------------|------|
|
|
232
|
+
| Missing migration check | Deployment doesn't verify migrations | Process |
|
|
233
|
+
| No query timeout | Long queries hold connections indefinitely | Configuration |
|
|
234
|
+
| Insufficient connection pool monitoring | Didn't alert until pool was 100% exhausted | Monitoring |
|
|
235
|
+
| Runbook outdated | Steps didn't match current architecture | Documentation |
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
## What Went Well
|
|
240
|
+
|
|
241
|
+
- **Detection**: Alert fired within 5 minutes of issue starting
|
|
242
|
+
- **Response**: On-call acknowledged within 2 minutes
|
|
243
|
+
- **Communication**: Status page updated promptly
|
|
244
|
+
- **Teamwork**: Cross-team collaboration was smooth
|
|
245
|
+
- **Rollback**: Quick rollback reduced impact duration
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## What Went Poorly
|
|
250
|
+
|
|
251
|
+
- **Detection**: Alert threshold too high, should have fired earlier
|
|
252
|
+
- **Diagnosis**: Initially went down the wrong path investigating the network
|
|
253
|
+
- **Documentation**: Runbook didn't cover this scenario
|
|
254
|
+
- **Communication**: Internal Slack updates were sporadic
|
|
255
|
+
- **Recovery**: Rollback took longer than expected due to manual steps
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## Where We Got Lucky
|
|
260
|
+
|
|
261
|
+
Things that could have made this worse but didn't:
|
|
262
|
+
|
|
263
|
+
- Issue happened during business hours with full team available
|
|
264
|
+
- Recent backup was only 15 minutes old
|
|
265
|
+
- The slow query only affected one service
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
## Action Items
|
|
270
|
+
|
|
271
|
+
| ID | Action | Type | Priority | Owner | Due Date | Status |
|
|
272
|
+
|----|--------|------|----------|-------|----------|--------|
|
|
273
|
+
| 1 | Add migration check to deployment pipeline | Prevent | P1 | @engineer | 2025-02-01 | TODO |
|
|
274
|
+
| 2 | Configure query timeout at 5 seconds | Prevent | P1 | @dba | 2025-01-25 | TODO |
|
|
275
|
+
| 3 | Add connection pool utilization alert at 70% | Detect | P2 | @sre | 2025-02-01 | TODO |
|
|
276
|
+
| 4 | Update runbook with DB troubleshooting steps | Document | P2 | @engineer | 2025-01-30 | TODO |
|
|
277
|
+
| 5 | Automate rollback procedure | Mitigate | P3 | @sre | 2025-02-15 | TODO |
|
|
278
|
+
|
|
279
|
+
### Action Item Types
|
|
280
|
+
- **Prevent**: Stop this class of incident from happening
|
|
281
|
+
- **Detect**: Find this faster next time
|
|
282
|
+
- **Mitigate**: Reduce impact when it happens
|
|
283
|
+
- **Document**: Improve understanding/procedures
|
|
284
|
+
|
|
285
|
+
---
|
|
286
|
+
|
|
287
|
+
## Lessons Learned
|
|
288
|
+
|
|
289
|
+
### Key Takeaways
|
|
290
|
+
|
|
291
|
+
1. **Database migrations need verification**: Our deployment process should verify that all migrations are applied before proceeding.
|
|
292
|
+
|
|
293
|
+
2. **Defense in depth for connection pools**: Multiple safeguards (query timeout, pool limits, alerting) would have limited impact.
|
|
294
|
+
|
|
295
|
+
3. **Runbooks need regular review**: This scenario wasn't covered, suggesting we need periodic runbook audits.
|
|
296
|
+
|
|
297
|
+
### Recommendations for Broader Organization
|
|
298
|
+
|
|
299
|
+
- Consider adding migration checks to the standard deployment template
|
|
300
|
+
- Review other services for similar query timeout gaps
|
|
301
|
+
- Schedule quarterly runbook review process
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
## Supporting Information
|
|
306
|
+
|
|
307
|
+
### Relevant Logs
|
|
308
|
+
|
|
309
|
+
~~~
|
|
310
|
+
2025-01-15 14:05:23 ERROR Database connection timeout after 30s
|
|
311
|
+
2025-01-15 14:05:24 ERROR Connection pool exhausted: 50/50 active
|
|
312
|
+
2025-01-15 14:05:24 ERROR Request failed: unable to acquire connection
|
|
313
|
+
~~~
|
|
314
|
+
|
|
315
|
+
### Relevant Metrics
|
|
316
|
+
|
|
317
|
+
![Connection pool utilization graph](link)
|
|
318
|
+
![Request error rate graph](link)
|
|
319
|
+
|
|
320
|
+
### Related Incidents
|
|
321
|
+
|
|
322
|
+
- [INC-1234](link) - Similar connection pool issue in 2024-06
|
|
323
|
+
- [INC-2345](link) - Related slow query incident
|
|
324
|
+
|
|
325
|
+
---
|
|
326
|
+
|
|
327
|
+
## Appendix
|
|
328
|
+
|
|
329
|
+
### Glossary
|
|
330
|
+
|
|
331
|
+
- **Connection Pool**: Set of reusable database connections
|
|
332
|
+
- **Migration**: Database schema change script
|
|
333
|
+
|
|
334
|
+
### References
|
|
335
|
+
|
|
336
|
+
- [Service Architecture Doc](link)
|
|
337
|
+
- [Database Runbook](link)
|
|
338
|
+
- [Deployment Pipeline](link)
|
|
339
|
+
```
|
|
340
|
+
|
|
341
|
+
## Root Cause Analysis Techniques
|
|
342
|
+
|
|
343
|
+
### 5 Whys
|
|
344
|
+
|
|
345
|
+
```yaml
|
|
346
|
+
five_whys:
|
|
347
|
+
description: "Ask 'why' repeatedly to find root cause"
|
|
348
|
+
|
|
349
|
+
process:
|
|
350
|
+
- "Start with the problem"
|
|
351
|
+
- "Ask 'why did this happen?'"
|
|
352
|
+
- "For each answer, ask 'why?' again"
|
|
353
|
+
- "Continue until you reach a systemic issue"
|
|
354
|
+
- "Usually 5 iterations, but can be more or fewer"
|
|
355
|
+
|
|
356
|
+
example:
|
|
357
|
+
problem: "Service returned 500 errors"
|
|
358
|
+
why_1:
|
|
359
|
+
q: "Why did the service return 500 errors?"
|
|
360
|
+
a: "The database was unreachable"
|
|
361
|
+
why_2:
|
|
362
|
+
q: "Why was the database unreachable?"
|
|
363
|
+
a: "The database ran out of disk space"
|
|
364
|
+
why_3:
|
|
365
|
+
q: "Why did it run out of disk space?"
|
|
366
|
+
a: "Logs were not being rotated"
|
|
367
|
+
why_4:
|
|
368
|
+
q: "Why were logs not being rotated?"
|
|
369
|
+
a: "Log rotation was configured but not enabled"
|
|
370
|
+
why_5:
|
|
371
|
+
q: "Why was it not enabled?"
|
|
372
|
+
a: "The infrastructure template didn't include it"
|
|
373
|
+
root_cause: "Infrastructure templates missing log rotation config"
|
|
374
|
+
|
|
375
|
+
tips:
|
|
376
|
+
- "Multiple branches are OK (parallel why chains)"
|
|
377
|
+
- "Stop when you reach a systemic cause you can fix"
|
|
378
|
+
- "Focus on process/system, not people"
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
### Fishbone Diagram (Ishikawa)
|
|
382
|
+
|
|
383
|
+
```yaml
|
|
384
|
+
fishbone_analysis:
|
|
385
|
+
description: "Categorize contributing factors"
|
|
386
|
+
|
|
387
|
+
categories:
|
|
388
|
+
people:
|
|
389
|
+
- "Training gaps"
|
|
390
|
+
- "Cognitive load"
|
|
391
|
+
- "Communication issues"
|
|
392
|
+
|
|
393
|
+
process:
|
|
394
|
+
- "Missing procedures"
|
|
395
|
+
- "Unclear ownership"
|
|
396
|
+
- "Inadequate review"
|
|
397
|
+
|
|
398
|
+
technology:
|
|
399
|
+
- "Software bugs"
|
|
400
|
+
- "Infrastructure issues"
|
|
401
|
+
- "Tool limitations"
|
|
402
|
+
|
|
403
|
+
environment:
|
|
404
|
+
- "Time pressure"
|
|
405
|
+
- "Resource constraints"
|
|
406
|
+
- "External dependencies"
|
|
407
|
+
|
|
408
|
+
visualization: |
|
|
409
|
+
   People             Process
|
|
410
|
+
       \               /
|
|
411
|
+
        \             /
|
|
412
|
+
         \           /
|
|
413
|
+
          \         /
|
|
414
|
+
           [PROBLEM]
|
|
415
|
+
          /         \
|
|
416
|
+
         /           \
|
|
417
|
+
        /             \
|
|
418
|
+
       /               \
|
|
419
|
+
 Technology          Environment
|
|
420
|
+
```
|
|
421
|
+
|
|
422
|
+
### Timeline Analysis
|
|
423
|
+
|
|
424
|
+
```yaml
|
|
425
|
+
timeline_analysis:
|
|
426
|
+
purpose: "Understand sequence of events"
|
|
427
|
+
|
|
428
|
+
elements:
|
|
429
|
+
- "Timestamp (in UTC)"
|
|
430
|
+
- "Event description"
|
|
431
|
+
- "Actor (human or system)"
|
|
432
|
+
- "Evidence (logs, alerts, messages)"
|
|
433
|
+
|
|
434
|
+
tips:
|
|
435
|
+
- "Be precise with times"
|
|
436
|
+
- "Include non-events (what didn't happen)"
|
|
437
|
+
- "Note decision points"
|
|
438
|
+
- "Highlight delays"
|
|
439
|
+
|
|
440
|
+
questions:
|
|
441
|
+
- "When did the issue actually start?"
|
|
442
|
+
- "When was it first detectable?"
|
|
443
|
+
- "What triggered detection?"
|
|
444
|
+
- "What caused delays in response?"
|
|
445
|
+
```
|
|
446
|
+
|
|
447
|
+
## Action Item Management
|
|
448
|
+
|
|
449
|
+
### Writing Good Action Items
|
|
450
|
+
|
|
451
|
+
```yaml
|
|
452
|
+
good_action_item:
|
|
453
|
+
specific: "Clear what needs to be done"
|
|
454
|
+
measurable: "Know when it's complete"
|
|
455
|
+
assigned: "Single owner (not a team)"
|
|
456
|
+
relevant: "Actually addresses the problem"
|
|
457
|
+
time_bound: "Has a due date"
|
|
458
|
+
|
|
459
|
+
examples:
|
|
460
|
+
bad:
|
|
461
|
+
- "Improve monitoring" # Vague
|
|
462
|
+
- "Don't do this again" # Not actionable
|
|
463
|
+
- "Team should fix" # No owner
|
|
464
|
+
|
|
465
|
+
good:
|
|
466
|
+
- "Add alert for connection pool > 80% (@sre, due 2025-02-01)"
|
|
467
|
+
- "Update deployment pipeline to check pending migrations (@engineer, due 2025-01-25)"
|
|
468
|
+
- "Document database failover procedure in runbook (@dba, due 2025-02-01)"
|
|
469
|
+
```
|
|
470
|
+
|
|
471
|
+
### Tracking Action Items
|
|
472
|
+
|
|
473
|
+
```yaml
|
|
474
|
+
tracking_process:
|
|
475
|
+
documentation:
|
|
476
|
+
- "Action items in postmortem document"
|
|
477
|
+
- "Tickets created in issue tracker"
|
|
478
|
+
- "Link tickets back to postmortem"
|
|
479
|
+
|
|
480
|
+
review:
|
|
481
|
+
- "Weekly review of open action items"
|
|
482
|
+
- "Escalate overdue items"
|
|
483
|
+
- "Close completed items with verification"
|
|
484
|
+
|
|
485
|
+
metrics:
|
|
486
|
+
- "Action item completion rate"
|
|
487
|
+
- "Average time to complete"
|
|
488
|
+
- "Overdue action items"
|
|
489
|
+
|
|
490
|
+
escalation:
|
|
491
|
+
- "7 days overdue: Remind owner"
|
|
492
|
+
- "14 days overdue: Escalate to manager"
|
|
493
|
+
- "30 days overdue: Review in leadership meeting"
|
|
494
|
+
```
|
|
495
|
+
|
|
496
|
+
## Sharing Learnings
|
|
497
|
+
|
|
498
|
+
### Postmortem Review Meeting
|
|
499
|
+
|
|
500
|
+
```yaml
|
|
501
|
+
review_meeting:
|
|
502
|
+
frequency: "Weekly or bi-weekly"
|
|
503
|
+
duration: "30-60 minutes"
|
|
504
|
+
|
|
505
|
+
attendees:
|
|
506
|
+
- "All engineering teams (optional attendance)"
|
|
507
|
+
- "On-call engineers (recommended)"
|
|
508
|
+
- "New team members (learning)"
|
|
509
|
+
|
|
510
|
+
format:
|
|
511
|
+
- "Author presents summary (5 min)"
|
|
512
|
+
- "Q&A and discussion (15 min)"
|
|
513
|
+
- "Lessons for broader application (10 min)"
|
|
514
|
+
|
|
515
|
+
goals:
|
|
516
|
+
- "Share knowledge across teams"
|
|
517
|
+
- "Identify patterns"
|
|
518
|
+
- "Celebrate learning culture"
|
|
519
|
+
```
|
|
520
|
+
|
|
521
|
+
### Postmortem Database
|
|
522
|
+
|
|
523
|
+
```yaml
|
|
524
|
+
postmortem_repository:
|
|
525
|
+
storage: "Wiki, Git, or dedicated tool"
|
|
526
|
+
|
|
527
|
+
searchable_by:
|
|
528
|
+
- "Service affected"
|
|
529
|
+
- "Root cause category"
|
|
530
|
+
- "Date range"
|
|
531
|
+
- "Severity"
|
|
532
|
+
- "Tags/keywords"
|
|
533
|
+
|
|
534
|
+
analytics:
|
|
535
|
+
- "Common root causes"
|
|
536
|
+
- "Recurring issues"
|
|
537
|
+
- "Action item completion rates"
|
|
538
|
+
- "MTTR trends"
|
|
539
|
+
|
|
540
|
+
review:
|
|
541
|
+
- "Quarterly analysis of patterns"
|
|
542
|
+
- "Identify systemic issues"
|
|
543
|
+
- "Prioritize cross-cutting improvements"
|
|
544
|
+
```
|
|
545
|
+
|
|
546
|
+
## Common Pitfalls
|
|
547
|
+
|
|
548
|
+
```yaml
|
|
549
|
+
pitfall_blame_creep:
|
|
550
|
+
problem: "Discussion devolves into blame"
|
|
551
|
+
signs:
|
|
552
|
+
- "Focus on 'who' not 'how'"
|
|
553
|
+
- "Defensive responses"
|
|
554
|
+
- "Finger pointing"
|
|
555
|
+
solution: "Facilitator redirects to systems"
|
|
556
|
+
|
|
557
|
+
pitfall_shallow_analysis:
|
|
558
|
+
problem: "Stop at immediate cause"
|
|
559
|
+
signs:
|
|
560
|
+
- "Single 'why' answer"
|
|
561
|
+
- "Fix only the symptom"
|
|
562
|
+
- "Similar incidents recur"
|
|
563
|
+
solution: "Keep asking why until you reach a systemic issue"
|
|
564
|
+
|
|
565
|
+
pitfall_action_item_graveyard:
|
|
566
|
+
problem: "Action items never completed"
|
|
567
|
+
signs:
|
|
568
|
+
- "Growing backlog"
|
|
569
|
+
- "Same issues recur"
|
|
570
|
+
- "No tracking"
|
|
571
|
+
solution: "Track, review, escalate"
|
|
572
|
+
|
|
573
|
+
pitfall_postmortem_theater:
|
|
574
|
+
problem: "Go through the motions without learning"
|
|
575
|
+
signs:
|
|
576
|
+
- "Copy-paste templates"
|
|
577
|
+
- "No discussion"
|
|
578
|
+
- "No one reads them"
|
|
579
|
+
solution: "Regular review meetings, leadership engagement"
|
|
580
|
+
|
|
581
|
+
pitfall_excessive_action_items:
|
|
582
|
+
problem: "Too many action items from one incident"
|
|
583
|
+
signs:
|
|
584
|
+
- "10+ action items"
|
|
585
|
+
- "Low priority items"
|
|
586
|
+
- "Boil the ocean"
|
|
587
|
+
solution: "Focus on 3-5 highest impact items"
|
|
588
|
+
```
|