agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,588 @@
1
+ # Postmortems
2
+
3
+ Comprehensive guidelines for conducting blameless postmortems and learning from incidents.
4
+
5
+ ## Core Principles
6
+
7
+ 1. **Blameless** - Focus on systems, not individuals
8
+ 2. **Thorough** - Understand root causes, not just symptoms
9
+ 3. **Actionable** - Create specific, tracked action items
10
+ 4. **Shared** - Learning benefits the whole organization
11
+
12
+ ## The Blameless Culture
13
+
14
+ ### Why Blameless?
15
+
16
+ ```yaml
17
+ blame_culture_problems:
18
+ hiding_information:
19
+ behavior: "Engineers don't report near-misses"
20
+ impact: "Can't learn from close calls"
21
+
22
+ defensive_responses:
23
+ behavior: "Focus on who, not how"
24
+ impact: "Miss systemic improvements"
25
+
26
+ fear_of_reporting:
27
+ behavior: "Incidents go unreported"
28
+ impact: "Can't improve what we don't know about"
29
+
30
+ simplified_narratives:
31
+ behavior: "Blame single cause/person"
32
+ impact: "Miss complex contributing factors"
33
+
34
+ blameless_principles:
35
+ assume_good_intentions:
36
+ premise: "Engineers make decisions based on available information"
37
+ question: "What did they know at the time?"
38
+
39
+ systems_thinking:
40
+ premise: "Humans are part of a system"
41
+ question: "What about the system allowed this?"
42
+
43
+ learning_focus:
44
+ premise: "Goal is improvement, not punishment"
45
+ question: "How do we prevent this class of error?"
46
+ ```
47
+
48
+ ### Blameless Language
49
+
50
+ ```yaml
51
+ language_examples:
52
+ avoid:
53
+ - "Who broke this?"
54
+ - "Why didn't you check?"
55
+ - "You should have known"
56
+ - "This was a mistake by [person]"
57
+ - "Failure to follow procedure"
58
+
59
+ prefer:
60
+ - "What happened?"
61
+ - "What information was available?"
62
+ - "What made this decision seem reasonable?"
63
+ - "The system allowed this to happen"
64
+ - "The procedure didn't account for this case"
65
+
66
+ reframing_examples:
67
+ - blame: "Developer deployed without testing"
68
+   blameless: "The deployment process allowed deployment without test verification"
69
+
70
+ - blame: "Engineer didn't follow the runbook"
71
+   blameless: "The runbook was unclear about this scenario"
72
+
73
+ - blame: "On-call should have caught this"
74
+   blameless: "The alert didn't fire for this condition"
75
+ ```
76
+
77
+ ## Postmortem Process
78
+
79
+ ### When to Write a Postmortem
80
+
81
+ ```yaml
82
+ always_required:
83
+ - "SEV1 incidents (any duration)"
84
+ - "SEV2 incidents (> 30 minutes)"
85
+ - "Data loss incidents"
86
+ - "Security incidents"
87
+ - "Near-misses that could have been severe"
88
+
89
+ recommended:
90
+ - "SEV3 incidents with learning opportunities"
91
+ - "Recurring issues (3+ occurrences)"
92
+ - "Novel failure modes"
93
+
94
+ optional:
95
+ - "Minor incidents with obvious fixes"
96
+ - "Issues caught before user impact"
97
+
98
+ timeline:
99
+ draft: "Within 24-48 hours of incident"
100
+ review: "Within 5 business days"
101
+ publication: "Within 7 business days"
102
+ action_items: "Due within 30 days"
103
+ ```
104
+
105
+ ### Postmortem Meeting
106
+
107
+ ```yaml
108
+ meeting_structure:
109
+ duration: "60-90 minutes"
110
+
111
+ attendees:
112
+ required:
113
+ - "Incident responders"
114
+ - "Service owner"
115
+ - "Postmortem facilitator"
116
+ optional:
117
+ - "Related team representatives"
118
+ - "Management (listening, not judging)"
119
+
120
+ agenda:
121
+ - "Timeline review (20 min)"
122
+ - "Contributing factors (20 min)"
123
+ - "What went well (10 min)"
124
+ - "What could be improved (15 min)"
125
+ - "Action items (15 min)"
126
+
127
+ facilitator_role:
128
+ - "Keep discussion blameless"
129
+ - "Ensure all voices heard"
130
+ - "Drive toward action items"
131
+ - "Capture decisions and insights"
132
+ - "Redirect blame to systems"
133
+ ```
134
+
135
+ ## Postmortem Template
136
+
137
+ ```markdown
138
+ # Postmortem: [Incident Title]
139
+
140
+ ## Metadata
141
+
142
+ | Field | Value |
143
+ |-------|-------|
144
+ | **Date** | YYYY-MM-DD |
145
+ | **Authors** | @name, @name |
146
+ | **Reviewers** | @name, @name |
147
+ | **Status** | Draft / In Review / Final |
148
+ | **Severity** | SEV1 / SEV2 / SEV3 |
149
+ | **Incident ID** | INC-XXXX |
150
+
151
+ ---
152
+
153
+ ## Executive Summary
154
+
155
+ **One-paragraph summary**: What happened, how long it lasted, what was the impact, and what we're doing to prevent recurrence.
156
+
157
+ ---
158
+
159
+ ## Impact
160
+
161
+ ### User Impact
162
+ - **Duration**: X hours Y minutes
163
+ - **Users affected**: X% of users / Y total users
164
+ - **Functionality affected**: [List of affected features]
165
+
166
+ ### Business Impact
167
+ - **Revenue impact**: $X (if applicable)
168
+ - **Customer complaints**: X tickets filed
169
+ - **SLA breach**: Yes/No
170
+
171
+ ### SLO Impact
172
+ - **Error budget consumed**: X%
173
+ - **Monthly budget remaining**: Y%
174
+
175
+ ---
176
+
177
+ ## Timeline
178
+
179
+ All times in UTC.
180
+
181
+ | Time | Event |
182
+ |------|-------|
183
+ | 14:00 | [First sign of issue - what monitoring showed] |
184
+ | 14:05 | [Alert fired - which alert, who received] |
185
+ | 14:07 | [On-call acknowledged - initial actions] |
186
+ | 14:15 | [Escalation - who was paged, why] |
187
+ | 14:20 | [Root cause identified - how it was found] |
188
+ | 14:25 | [Mitigation started - what action was taken] |
189
+ | 14:30 | [Service restored - verification steps] |
190
+ | 14:45 | [Incident closed - final verification] |
191
+
192
+ ---
193
+
194
+ ## Root Cause Analysis
195
+
196
+ ### What Happened
197
+
198
+ Detailed technical explanation of what went wrong. Include:
199
+ - The immediate cause
200
+ - The sequence of events
201
+ - Technical details relevant to understanding
202
+
203
+ ### Why It Happened
204
+
205
+ Use the "5 Whys" technique to get to root cause:
206
+
207
+ 1. **Why** did the service return errors?
208
+ - Because the database connection pool was exhausted.
209
+
210
+ 2. **Why** was the connection pool exhausted?
211
+ - Because a slow query was holding connections for 30+ seconds.
212
+
213
+ 3. **Why** was the query slow?
214
+ - Because a missing index caused a full table scan.
215
+
216
+ 4. **Why** was the index missing?
217
+ - Because the migration to add it was never run in production.
218
+
219
+ 5. **Why** wasn't the migration run?
220
+ - Because our deployment process doesn't verify pending migrations.
221
+
222
+ **Root Cause**: Deployment process lacks verification of pending database migrations.
223
+
224
+ ---
225
+
226
+ ## Contributing Factors
227
+
228
+ Factors that made the incident possible or worse:
229
+
230
+ | Factor | Description | Type |
231
+ |--------|-------------|------|
232
+ | Missing migration check | Deployment doesn't verify migrations | Process |
233
+ | No query timeout | Long queries hold connections indefinitely | Configuration |
234
+ | Insufficient connection pool monitoring | Didn't alert until pool was 100% exhausted | Monitoring |
235
+ | Runbook outdated | Steps didn't match current architecture | Documentation |
236
+
237
+ ---
238
+
239
+ ## What Went Well
240
+
241
+ - **Detection**: Alert fired within 5 minutes of issue starting
242
+ - **Response**: On-call acknowledged within 2 minutes
243
+ - **Communication**: Status page updated promptly
244
+ - **Teamwork**: Cross-team collaboration was smooth
245
+ - **Rollback**: Quick rollback reduced impact duration
246
+
247
+ ---
248
+
249
+ ## What Went Poorly
250
+
251
+ - **Detection**: Alert threshold too high, should have fired earlier
252
+ - **Diagnosis**: Initially went down wrong path investigating network
253
+ - **Documentation**: Runbook didn't cover this scenario
254
+ - **Communication**: Internal Slack updates were sporadic
255
+ - **Recovery**: Rollback took longer than expected due to manual steps
256
+
257
+ ---
258
+
259
+ ## Where We Got Lucky
260
+
261
+ Things that could have made this worse but didn't:
262
+
263
+ - Issue happened during business hours with full team available
264
+ - Recent backup was only 15 minutes old
265
+ - The slow query only affected one service
266
+
267
+ ---
268
+
269
+ ## Action Items
270
+
271
+ | ID | Action | Type | Priority | Owner | Due Date | Status |
272
+ |----|--------|------|----------|-------|----------|--------|
273
+ | 1 | Add migration check to deployment pipeline | Prevent | P1 | @engineer | 2025-02-01 | TODO |
274
+ | 2 | Configure query timeout at 5 seconds | Prevent | P1 | @dba | 2025-01-25 | TODO |
275
+ | 3 | Add connection pool utilization alert at 70% | Detect | P2 | @sre | 2025-02-01 | TODO |
276
+ | 4 | Update runbook with DB troubleshooting steps | Document | P2 | @engineer | 2025-01-30 | TODO |
277
+ | 5 | Automate rollback procedure | Mitigate | P3 | @sre | 2025-02-15 | TODO |
278
+
279
+ ### Action Item Types
280
+ - **Prevent**: Stop this class of incident from happening
281
+ - **Detect**: Find this faster next time
282
+ - **Mitigate**: Reduce impact when it happens
283
+ - **Document**: Improve understanding/procedures
284
+
285
+ ---
286
+
287
+ ## Lessons Learned
288
+
289
+ ### Key Takeaways
290
+
291
+ 1. **Database migrations need verification**: Our deployment process should verify that all migrations are applied before proceeding.
292
+
293
+ 2. **Defense in depth for connection pools**: Multiple safeguards (query timeout, pool limits, alerting) would have limited impact.
294
+
295
+ 3. **Runbooks need regular review**: This scenario wasn't covered, suggesting we need periodic runbook audits.
296
+
297
+ ### Recommendations for Broader Organization
298
+
299
+ - Consider adding migration checks to the standard deployment template
300
+ - Review other services for similar query timeout gaps
301
+ - Schedule quarterly runbook review process
302
+
303
+ ---
304
+
305
+ ## Supporting Information
306
+
307
+ ### Relevant Logs
308
+
309
+ ~~~
310
+ 2025-01-15 14:05:23 ERROR Database connection timeout after 30s
311
+ 2025-01-15 14:05:24 ERROR Connection pool exhausted: 50/50 active
312
+ 2025-01-15 14:05:24 ERROR Request failed: unable to acquire connection
313
+ ~~~
314
+
315
+ ### Relevant Metrics
316
+
317
+ ![Error Rate Graph](link-to-screenshot)
318
+ ![Connection Pool Graph](link-to-screenshot)
319
+
320
+ ### Related Incidents
321
+
322
+ - [INC-1234](link) - Similar connection pool issue in 2024-06
323
+ - [INC-2345](link) - Related slow query incident
324
+
325
+ ---
326
+
327
+ ## Appendix
328
+
329
+ ### Glossary
330
+
331
+ - **Connection Pool**: Set of reusable database connections
332
+ - **Migration**: Database schema change script
333
+
334
+ ### References
335
+
336
+ - [Service Architecture Doc](link)
337
+ - [Database Runbook](link)
338
+ - [Deployment Pipeline](link)
339
+ ```
340
+
341
+ ## Root Cause Analysis Techniques
342
+
343
+ ### 5 Whys
344
+
345
+ ```yaml
346
+ five_whys:
347
+ description: "Ask 'why' repeatedly to find root cause"
348
+
349
+ process:
350
+ - "Start with the problem"
351
+ - "Ask 'why did this happen?'"
352
+ - "For each answer, ask 'why?' again"
353
+ - "Continue until you reach a systemic issue"
354
+ - "Usually 5 iterations, but can be more or fewer"
355
+
356
+ example:
357
+ problem: "Service returned 500 errors"
358
+ why_1:
359
+ q: "Why did the service return 500 errors?"
360
+ a: "The database was unreachable"
361
+ why_2:
362
+ q: "Why was the database unreachable?"
363
+ a: "The database ran out of disk space"
364
+ why_3:
365
+ q: "Why did it run out of disk space?"
366
+ a: "Logs were not being rotated"
367
+ why_4:
368
+ q: "Why were logs not being rotated?"
369
+ a: "Log rotation was configured but not enabled"
370
+ why_5:
371
+ q: "Why was it not enabled?"
372
+ a: "The infrastructure template didn't include it"
373
+ root_cause: "Infrastructure templates missing log rotation config"
374
+
375
+ tips:
376
+ - "Multiple branches are OK (parallel why chains)"
377
+ - "Stop when you reach a systemic cause you can fix"
378
+ - "Focus on process/system, not people"
379
+ ```
380
+
381
+ ### Fishbone Diagram (Ishikawa)
382
+
383
+ ```yaml
384
+ fishbone_analysis:
385
+ description: "Categorize contributing factors"
386
+
387
+ categories:
388
+ people:
389
+ - "Training gaps"
390
+ - "Cognitive load"
391
+ - "Communication issues"
392
+
393
+ process:
394
+ - "Missing procedures"
395
+ - "Unclear ownership"
396
+ - "Inadequate review"
397
+
398
+ technology:
399
+ - "Software bugs"
400
+ - "Infrastructure issues"
401
+ - "Tool limitations"
402
+
403
+ environment:
404
+ - "Time pressure"
405
+ - "Resource constraints"
406
+ - "External dependencies"
407
+
408
+ visualization: |
409
+ People Process
410
+ \ /
411
+ \ /
412
+ \ /
413
+ \ /
414
+ [PROBLEM]
415
+ / \
416
+ / \
417
+ / \
418
+ / \
419
+ Technology Environment
420
+ ```
421
+
422
+ ### Timeline Analysis
423
+
424
+ ```yaml
425
+ timeline_analysis:
426
+ purpose: "Understand sequence of events"
427
+
428
+ elements:
429
+ - "Timestamp (in UTC)"
430
+ - "Event description"
431
+ - "Actor (human or system)"
432
+ - "Evidence (logs, alerts, messages)"
433
+
434
+ tips:
435
+ - "Be precise with times"
436
+ - "Include non-events (what didn't happen)"
437
+ - "Note decision points"
438
+ - "Highlight delays"
439
+
440
+ questions:
441
+ - "When did the issue actually start?"
442
+ - "When was it first detectable?"
443
+ - "What triggered detection?"
444
+ - "What caused delays in response?"
445
+ ```
446
+
447
+ ## Action Item Management
448
+
449
+ ### Writing Good Action Items
450
+
451
+ ```yaml
452
+ good_action_item:
453
+ specific: "Clear what needs to be done"
454
+ measurable: "Know when it's complete"
455
+ assigned: "Single owner (not a team)"
456
+ relevant: "Actually addresses the problem"
457
+ time_bound: "Has a due date"
458
+
459
+ examples:
460
+ bad:
461
+ - "Improve monitoring" # Vague
462
+ - "Don't do this again" # Not actionable
463
+ - "Team should fix" # No owner
464
+
465
+ good:
466
+ - "Add alert for connection pool > 80% (@sre, due 2025-02-01)"
467
+ - "Update deployment pipeline to check pending migrations (@engineer, due 2025-01-25)"
468
+ - "Document database failover procedure in runbook (@dba, due 2025-02-01)"
469
+ ```
470
+
471
+ ### Tracking Action Items
472
+
473
+ ```yaml
474
+ tracking_process:
475
+ documentation:
476
+ - "Action items in postmortem document"
477
+ - "Tickets created in issue tracker"
478
+ - "Link tickets back to postmortem"
479
+
480
+ review:
481
+ - "Weekly review of open action items"
482
+ - "Escalate overdue items"
483
+ - "Close completed items with verification"
484
+
485
+ metrics:
486
+ - "Action item completion rate"
487
+ - "Average time to complete"
488
+ - "Overdue action items"
489
+
490
+ escalation:
491
+ - "7 days overdue: Remind owner"
492
+ - "14 days overdue: Escalate to manager"
493
+ - "30 days overdue: Review in leadership meeting"
494
+ ```
495
+
496
+ ## Sharing Learnings
497
+
498
+ ### Postmortem Review Meeting
499
+
500
+ ```yaml
501
+ review_meeting:
502
+ frequency: "Weekly or bi-weekly"
503
+ duration: "30-60 minutes"
504
+
505
+ attendees:
506
+ - "All engineering teams (optional attendance)"
507
+ - "On-call engineers (recommended)"
508
+ - "New team members (learning)"
509
+
510
+ format:
511
+ - "Author presents summary (5 min)"
512
+ - "Q&A and discussion (15 min)"
513
+ - "Lessons for broader application (10 min)"
514
+
515
+ goals:
516
+ - "Share knowledge across teams"
517
+ - "Identify patterns"
518
+ - "Celebrate learning culture"
519
+ ```
520
+
521
+ ### Postmortem Database
522
+
523
+ ```yaml
524
+ postmortem_repository:
525
+ storage: "Wiki, Git, or dedicated tool"
526
+
527
+ searchable_by:
528
+ - "Service affected"
529
+ - "Root cause category"
530
+ - "Date range"
531
+ - "Severity"
532
+ - "Tags/keywords"
533
+
534
+ analytics:
535
+ - "Common root causes"
536
+ - "Recurring issues"
537
+ - "Action item completion rates"
538
+ - "MTTR trends"
539
+
540
+ review:
541
+ - "Quarterly analysis of patterns"
542
+ - "Identify systemic issues"
543
+ - "Prioritize cross-cutting improvements"
544
+ ```
545
+
546
+ ## Common Pitfalls
547
+
548
+ ```yaml
549
+ pitfall_blame_creep:
550
+ problem: "Discussion devolves into blame"
551
+ signs:
552
+ - "Focus on 'who' not 'how'"
553
+ - "Defensive responses"
554
+ - "Finger pointing"
555
+ solution: "Facilitator redirects to systems"
556
+
557
+ pitfall_shallow_analysis:
558
+ problem: "Stop at immediate cause"
559
+ signs:
560
+ - "Single 'why' answer"
561
+ - "Fix only the symptom"
562
+ - "Similar incidents recur"
563
+ solution: "Keep asking why until systemic issue"
564
+
565
+ pitfall_action_item_graveyard:
566
+ problem: "Action items never completed"
567
+ signs:
568
+ - "Growing backlog"
569
+ - "Same issues recur"
570
+ - "No tracking"
571
+ solution: "Track, review, escalate"
572
+
573
+ pitfall_postmortem_theater:
574
+ problem: "Go through motions without learning"
575
+ signs:
576
+ - "Copy-paste templates"
577
+ - "No discussion"
578
+ - "No one reads them"
579
+ solution: "Regular review meetings, leadership engagement"
580
+
581
+ pitfall_excessive_action_items:
582
+ problem: "Too many action items from one incident"
583
+ signs:
584
+ - "10+ action items"
585
+ - "Low priority items"
586
+ - "Boil the ocean"
587
+ solution: "Focus on 3-5 highest impact items"
588
+ ```