agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,565 @@
1
+ # Incident Management
2
+
3
+ Comprehensive guidelines for detecting, responding to, and learning from incidents.
4
+
5
+ ## Core Principles
6
+
7
+ 1. **Detect Fast** - Automated monitoring catches issues before users report them
8
+ 2. **Communicate Clearly** - Stakeholders know what's happening and when it will be fixed
9
+ 3. **Mitigate First** - Stop the bleeding before root cause analysis
10
+ 4. **Learn Always** - Every incident is an opportunity to improve
11
+
12
+ ## Incident Severity Levels
13
+
14
+ ### Severity Definitions
15
+
16
+ ```yaml
17
+ sev1_critical:
18
+ description: "Complete service outage or severe security incident"
19
+ criteria:
20
+ - "Core service completely unavailable"
21
+ - "Data breach or security incident in progress"
22
+ - "Data loss affecting customers"
23
+ - "Revenue-impacting payment failures"
24
+ response:
25
+ time_to_acknowledge: "5 minutes"
26
+ time_to_engage: "15 minutes"
27
+ war_room: "Immediately"
28
+ status_page: "Update within 15 minutes"
29
+ executive_notification: "Yes"
30
+ examples:
31
+ - "Production database down"
32
+ - "API returning 5xx for all requests"
33
+ - "Active security breach"
34
+ - "Complete payment processing failure"
35
+
36
+ sev2_major:
37
+ description: "Significant degradation affecting many users"
38
+ criteria:
39
+ - "Major feature unavailable"
40
+ - ">=10% of users affected"
41
+ - "Significant latency increase"
42
+ - "Partial data loss risk"
43
+ response:
44
+ time_to_acknowledge: "15 minutes"
45
+ time_to_engage: "30 minutes"
46
+ war_room: "If not resolved in 30 minutes"
47
+ status_page: "Update within 30 minutes"
48
+ executive_notification: "If > 1 hour duration"
49
+ examples:
50
+ - "Search functionality broken"
51
+ - "50% error rate on checkout"
52
+ - "Mobile app unable to sync"
53
+ - "Authentication intermittently failing"
54
+
55
+ sev3_minor:
56
+ description: "Limited impact with workaround available"
57
+ criteria:
58
+ - "Single feature degraded"
59
+ - "<10% of users affected"
60
+ - "Workaround exists"
61
+ - "Non-critical functionality"
62
+ response:
63
+ time_to_acknowledge: "1 hour"
64
+ time_to_engage: "4 hours"
65
+ war_room: "Not required"
66
+ status_page: "Optional"
67
+ executive_notification: "No"
68
+ examples:
69
+ - "Export feature failing"
70
+ - "Admin dashboard slow"
71
+ - "Email notifications delayed"
72
+ - "Analytics not updating"
73
+
74
+ sev4_low:
75
+ description: "Minimal impact, cosmetic issues"
76
+ criteria:
77
+ - "No user-facing impact"
78
+ - "Internal tooling issues"
79
+ - "Cosmetic bugs"
80
+ response:
81
+ time_to_acknowledge: "Next business day"
82
+ time_to_engage: "Standard sprint work"
83
+ war_room: "Not required"
84
+ status_page: "No"
85
+ executive_notification: "No"
86
+ examples:
87
+ - "Log formatting issues"
88
+ - "Internal dashboard UI bug"
89
+ - "Dev environment problems"
90
+ ```
91
+
92
+ ## Incident Response Process
93
+
94
+ ### Phase 1: Detection
95
+
96
+ ```yaml
97
+ detection_sources:
98
+ automated:
99
+ - "Alerting systems (Prometheus, Datadog)"
100
+ - "Synthetic monitoring"
101
+ - "Error tracking (Sentry, Bugsnag)"
102
+ - "APM anomaly detection"
103
+
104
+ human:
105
+ - "User reports (support tickets)"
106
+ - "Internal reports (engineers, QA)"
107
+ - "Social media monitoring"
108
+ - "Partner notifications"
109
+
110
+ detection_goals:
111
+ mttd_targets:
112
+ sev1: "< 5 minutes"
113
+ sev2: "< 15 minutes"
114
+ sev3: "< 1 hour"
115
+ ```
116
+
117
+ ### Phase 2: Response
118
+
119
+ ```yaml
120
+ initial_response:
121
+ steps:
122
+ 1: "Acknowledge alert/report"
123
+ 2: "Assess severity based on criteria"
124
+ 3: "Declare incident if criteria met"
125
+ 4: "Create incident channel"
126
+ 5: "Page appropriate responders"
127
+ 6: "Start incident document"
128
+
129
+ incident_declaration:
130
+ when_to_declare:
131
+ - "Alert indicates customer impact"
132
+ - "Multiple related alerts firing"
133
+ - "User reports confirmed"
134
+ - "Uncertainty about scope"
135
+
136
+ how_to_declare:
137
+ slack: "/incident create [description] [severity]"
138
+ pagerduty: "Trigger incident with severity"
139
+
140
+ incident_channel:
141
+ naming: "inc-YYYY-MM-DD-short-description"
142
+ topic: "SEV[X] | IC: @name | Status: investigating"
143
+ pinned:
144
+ - "Incident summary"
145
+ - "Timeline document"
146
+ - "Relevant dashboards"
147
+ - "Runbook links"
148
+ ```
149
+
150
+ ### Phase 3: Roles and Responsibilities
151
+
152
+ ```yaml
153
+ incident_commander:
154
+ also_known_as: "IC"
155
+ responsibilities:
156
+ - "Single point of coordination"
157
+ - "Assign and reassign roles"
158
+ - "Make decisions when no consensus"
159
+ - "Manage communication cadence"
160
+ - "Escalate when needed"
161
+ - "Declare incident resolved"
162
+ should_not:
163
+ - "Debug technical issues directly"
164
+ - "Write code to fix the issue"
165
+ - "Get lost in technical details"
166
+
167
+ technical_lead:
168
+ also_known_as: "Tech Lead, TL"
169
+ responsibilities:
170
+ - "Lead technical investigation"
171
+ - "Coordinate debugging efforts"
172
+ - "Make technical decisions"
173
+ - "Implement fixes"
174
+ - "Verify resolution"
175
+ should_not:
176
+ - "Handle communications"
177
+ - "Update status page"
178
+ - "Brief stakeholders"
179
+
180
+ communications_lead:
181
+ also_known_as: "Comms Lead"
182
+ responsibilities:
183
+ - "Update status page"
184
+ - "Send stakeholder updates"
185
+ - "Draft customer communications"
186
+ - "Coordinate with support team"
187
+ - "Manage external messaging"
188
+
189
+ scribe:
190
+ responsibilities:
191
+ - "Maintain incident timeline"
192
+ - "Document key decisions"
193
+ - "Record actions taken"
194
+ - "Capture screenshots/data"
195
+ - "Note participants"
196
+ ```
197
+
198
+ ### Phase 4: Mitigation
199
+
200
+ ```yaml
201
+ mitigation_priority:
202
+ 1_stop_the_bleeding:
203
+ goal: "Restore service to users"
204
+ actions:
205
+ - "Rollback recent changes"
206
+ - "Scale up resources"
207
+ - "Use feature flags to disable broken code"
208
+ - "Failover to backup systems"
209
+ - "Enable maintenance mode"
210
+
211
+ 2_stabilize:
212
+ goal: "Ensure service stays up"
213
+ actions:
214
+ - "Apply temporary fixes"
215
+ - "Add extra monitoring"
216
+ - "Scale for headroom"
217
+ - "Disable non-critical features"
218
+
219
+ 3_root_cause_later:
220
+ goal: "Don't debug during outage"
221
+ note: "Root cause analysis happens in postmortem"
222
+
223
+ common_mitigation_actions:
224
+ rollback:
225
+ when: "Recent deployment suspected"
226
+ command: "kubectl rollout undo deployment/[name]"
227
+ verify: "Check error rates return to normal"
228
+
229
+ scale_up:
230
+ when: "Capacity-related issues"
231
+ command: "kubectl scale deployment/[name] --replicas=[N]"
232
+ verify: "Check resource utilization decreases"
233
+
234
+ failover:
235
+ when: "Primary system unrecoverable"
236
+ procedure: "Follow DR runbook"
237
+ verify: "Traffic flowing to secondary"
238
+
239
+ feature_flag:
240
+ when: "Specific feature causing issues"
241
+ action: "Disable flag in LaunchDarkly/Unleash"
242
+ verify: "Feature disabled, errors stop"
243
+ ```
244
+
245
+ ### Phase 5: Resolution
246
+
247
+ ```yaml
248
+ resolution_criteria:
249
+ technical:
250
+ - "Error rates at normal levels"
251
+ - "Latency within SLO"
252
+ - "No degradation visible"
253
+ - "Monitors green"
254
+
255
+ operational:
256
+ - "Mitigation is stable (not just temporarily fixed)"
257
+ - "No immediate risk of recurrence"
258
+ - "Team can step down from incident"
259
+
260
+ post_resolution:
261
+ immediate:
262
+ - "Update status page to resolved"
263
+ - "Send final stakeholder update"
264
+ - "Thank participants"
265
+ - "Schedule postmortem"
266
+
267
+ within_24_hours:
268
+ - "Draft initial postmortem"
269
+ - "Gather timeline from participants"
270
+ - "Collect relevant data/logs"
271
+
272
+ within_5_days:
273
+ - "Complete postmortem review"
274
+ - "Create action items"
275
+ - "Share learnings"
276
+ ```
277
+
278
+ ## Communication Templates
279
+
280
+ ### Status Page Update Template
281
+
282
+ ```markdown
283
+ **[Investigating/Identified/Monitoring/Resolved]**
284
+
285
+ **Summary**: [One sentence description of user impact]
286
+
287
+ **Affected Services**: [List of affected services]
288
+
289
+ **Current Status**: [What we know and what we're doing]
290
+
291
+ **Next Update**: [Time of next update]
292
+
293
+ ---
294
+ Examples:
295
+
296
+ **Investigating**
297
+ We are currently investigating reports of elevated error rates
298
+ on the checkout flow. Some users may experience failures when
299
+ completing purchases. We will provide an update in 15 minutes.
300
+
301
+ **Identified**
302
+ We have identified the cause of checkout failures as a database
303
+ connectivity issue. We are working on restoring connectivity.
304
+ Estimated resolution in 30 minutes.
305
+
306
+ **Monitoring**
307
+ A fix has been implemented. We are monitoring to confirm
308
+ stability. Users should no longer experience checkout failures.
309
+
310
+ **Resolved**
311
+ This incident has been resolved. Checkout functionality has
312
+ been restored. Total duration: 45 minutes.
313
+ ```
314
+
315
+ ### Stakeholder Update Template
316
+
317
+ ```markdown
318
+ ## Incident Update: [Title]
319
+
320
+ **Severity**: SEV[X]
321
+ **Status**: [Active/Resolved]
322
+ **Duration**: [X hours Y minutes]
323
+
324
+ ### What Happened
325
+ [Brief description of the issue and user impact]
326
+
327
+ ### Current Status
328
+ [What we know and what we're doing]
329
+
330
+ ### Business Impact
331
+ - Users affected: [X]
332
+ - Revenue impact: [$X] (if applicable)
333
+ - Customer complaints: [X]
334
+
335
+ ### Next Steps
336
+ [What happens next, when the next update will be]
337
+
338
+ ### Questions?
339
+ Contact: [Incident Commander name and channel]
340
+ ```
341
+
342
+ ## War Room Guidelines
343
+
344
+ ### Setting Up a War Room
345
+
346
+ ```yaml
347
+ virtual_war_room:
348
+ when: "SEV1 (immediately) or SEV2 not resolved within 30 minutes"
349
+
350
+ setup:
351
+ - "Create video call (Zoom/Meet/Teams)"
352
+ - "Post link in incident channel"
353
+ - "IC joins immediately"
354
+ - "Technical responders join as needed"
355
+
356
+ ground_rules:
357
+ - "IC runs the call"
358
+ - "Mute when not speaking"
359
+ - "Use raise hand to speak"
360
+ - "Technical work happens off-call, report back"
361
+ - "No side conversations"
362
+ - "Stay focused on mitigation"
363
+
364
+ communication_cadence:
365
+ - "Status update every 15 minutes"
366
+ - "IC asks for updates from each workstream"
367
+ - "Decisions announced and documented"
368
+ ```
369
+
370
+ ### War Room Commands
371
+
372
+ ```yaml
373
+ ic_commands:
374
+ status_check:
375
+ phrase: "Status check - what's your current state?"
376
+ purpose: "Get updates from all participants"
377
+
378
+ decision_time:
379
+ phrase: "We need to make a decision on [X]. Options are [A, B, C]. Any objections to [A]?"
380
+ purpose: "Drive decisions forward"
381
+
382
+ escalation:
383
+ phrase: "We need to escalate to [person/team]. [Name], can you page them?"
384
+ purpose: "Bring in additional help"
385
+
386
+ parallel_work:
387
+ phrase: "[Name] investigate [X]. [Name] investigate [Y]. Report back in 10 minutes."
388
+ purpose: "Divide and conquer"
389
+
390
+ refocus:
391
+ phrase: "Let's refocus on [priority]. We can debug [other thing] after the incident."
392
+ purpose: "Keep team on track"
393
+ ```
394
+
395
+ ## On-Call Integration
396
+
397
+ ### Escalation Paths
398
+
399
+ ```yaml
400
+ escalation_matrix:
401
+ api_services:
402
+ primary: "backend-oncall"
403
+ secondary: "backend-lead"
404
+ tertiary: "engineering-manager"
405
+ executive: "vp-engineering"
406
+
407
+ infrastructure:
408
+ primary: "infra-oncall"
409
+ secondary: "sre-lead"
410
+ tertiary: "engineering-manager"
411
+ executive: "vp-engineering"
412
+
413
+ database:
414
+ primary: "dba-oncall"
415
+ secondary: "dba-lead"
416
+ tertiary: "infra-oncall"
417
+ executive: "vp-engineering"
418
+
419
+ security:
420
+ primary: "security-oncall"
421
+ secondary: "security-lead"
422
+ tertiary: "ciso"
423
+ executive: "cto"
424
+
425
+ escalation_triggers:
426
+ automatic:
427
+ - "No acknowledgment within SLA"
428
+ - "Incident duration exceeds threshold"
429
+ - "Severity upgraded"
430
+
431
+ manual:
432
+ - "Primary on-call requests help"
433
+ - "Technical expertise needed"
434
+ - "Business decision required"
435
+ ```
436
+
437
+ ### Handoff During Incidents
438
+
439
+ ```yaml
440
+ shift_handoff_during_incident:
441
+ when: "Incident spans on-call rotation change"
442
+
443
+ process:
444
+ 1: "Outgoing on-call notifies IC of shift change"
445
+ 2: "30-minute overlap for knowledge transfer"
446
+ 3: "Incoming on-call joins war room"
447
+ 4: "Outgoing provides verbal summary"
448
+ 5: "Incoming confirms understanding"
449
+ 6: "IC announces handoff complete"
450
+
451
+ handoff_includes:
452
+ - "Current incident status"
453
+ - "What's been tried"
454
+ - "Current hypothesis"
455
+ - "Assigned tasks"
456
+ - "Open questions"
457
+ ```
458
+
459
+ ## Incident Documentation
460
+
461
+ ### Real-Time Timeline
462
+
463
+ ```markdown
464
+ ## Incident Timeline: inc-2025-01-15-api-outage
465
+
466
+ All times UTC
467
+
468
+ | Time | Actor | Event |
469
+ |------|-------|-------|
470
+ | 14:00 | System | Deployment api-server v2.3.1 started |
471
+ | 14:02 | System | Deployment completed |
472
+ | 14:05 | Alert | APIHighErrorRate fired |
473
+ | 14:06 | @oncall | Acknowledged alert |
474
+ | 14:08 | @oncall | Incident declared SEV2 |
475
+ | 14:08 | @oncall | Created #inc-2025-01-15-api-outage |
476
+ | 14:10 | @oncall | Checking recent deployments |
477
+ | 14:12 | @oncall | Identified v2.3.1 deployed 5 min before errors |
478
+ | 14:15 | @oncall | Initiating rollback to v2.3.0 |
479
+ | 14:18 | System | Rollback completed |
480
+ | 14:20 | @oncall | Error rates returning to normal |
481
+ | 14:25 | @oncall | Monitoring - errors at baseline |
482
+ | 14:30 | @oncall | Incident resolved, scheduling postmortem |
483
+ ```
484
+
485
+ ## Metrics and Improvement
486
+
487
+ ### Incident Metrics to Track
488
+
489
+ ```yaml
490
+ response_metrics:
491
+ mttd:
492
+ name: "Mean Time to Detect"
493
+ target: "< 5 minutes for SEV1"
494
+ measurement: "Time from incident start to first alert"
495
+
496
+ mtta:
497
+ name: "Mean Time to Acknowledge"
498
+ target: "< 5 minutes for SEV1"
499
+ measurement: "Time from alert to acknowledgment"
500
+
501
+ mttm:
502
+ name: "Mean Time to Mitigate"
503
+ target: "< 30 minutes for SEV1"
504
+ measurement: "Time from detection to user impact resolved"
505
+
506
+ mttr:
507
+ name: "Mean Time to Resolve"
508
+ target: "< 4 hours for SEV1"
509
+ measurement: "Time from detection to incident declared resolved (resolution criteria met)"
510
+
511
+ volume_metrics:
512
+ incidents_per_week:
513
+ target: "Decreasing trend"
514
+
515
+ incidents_by_severity:
516
+ goal: "Fewer SEV1/SEV2, more caught as SEV3/SEV4"
517
+
518
+ repeat_incidents:
519
+ target: "< 10% of incidents are repeats"
520
+
521
+ quality_metrics:
522
+ postmortem_completion_rate:
523
+ target: "100% for SEV1/SEV2"
524
+
525
+ action_item_completion_rate:
526
+ target: "> 90% within 30 days"
527
+ ```
528
+
529
+ ## Common Pitfalls
530
+
531
+ ### During Incidents
532
+
533
+ ```yaml
534
+ pitfall_debugging_during_outage:
535
+ wrong: "Spending 30 minutes debugging while users are down"
536
+ right: "Mitigate first (rollback, scale, failover), debug later"
537
+
538
+ pitfall_too_many_cooks:
539
+ wrong: "Everyone jumping in and trying different things"
540
+ right: "IC assigns specific tasks, coordinates parallel work"
541
+
542
+ pitfall_silent_war_room:
543
+ wrong: "People working silently, no one knows what's happening"
544
+ right: "Regular status updates, thinking out loud"
545
+
546
+ pitfall_forgetting_communication:
547
+ wrong: "Technical team so focused they forget to update stakeholders"
548
+ right: "Comms Lead handles all external communication"
549
+ ```
550
+
551
+ ### After Incidents
552
+
553
+ ```yaml
554
+ pitfall_skipping_postmortem:
555
+ wrong: "We fixed it, move on"
556
+ right: "Every SEV1/SEV2 gets a postmortem within 5 days"
557
+
558
+ pitfall_blame_game:
559
+ wrong: "Who deployed the bad code?"
560
+ right: "What systemic issues allowed this to happen?"
561
+
562
+ pitfall_action_item_graveyard:
563
+ wrong: "Create action items that never get done"
564
+ right: "Track action items, report on completion, prioritize fixes"
565
+ ```