agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,641 @@
1
+ # Disaster Recovery
2
+
3
+ Comprehensive guidelines for disaster recovery planning, testing, and execution.
4
+
5
+ ## Core Principles
6
+
7
+ 1. **Plan for Failure** - Assume disasters will happen, prepare accordingly
8
+ 2. **Test Regularly** - Untested recovery plans are just documentation
9
+ 3. **Automate Recovery** - Manual procedures are slow and error-prone
10
+ 4. **Document Everything** - Recovery is stressful; don't rely on memory
11
+
12
+ ## Recovery Objectives
13
+
14
+ ### RTO and RPO
15
+
16
+ ```yaml
17
+ rto:
18
+ name: "Recovery Time Objective"
19
+ definition: "Maximum acceptable time to restore service"
20
+ question: "How long can we be down?"
21
+ factors:
22
+ - "Business impact per hour"
23
+ - "Contractual obligations"
24
+ - "User expectations"
25
+
26
+ rpo:
27
+ name: "Recovery Point Objective"
28
+ definition: "Maximum acceptable data loss"
29
+ question: "How much data can we lose?"
30
+ factors:
31
+ - "Data criticality"
32
+ - "Regulatory requirements"
33
+ - "Cost of data recreation"
34
+
35
+ relationship: |
36
+ ┌─────────────────────────────────────────────────────────────────┐
37
+ │ TIMELINE │
38
+ │ │
39
+ │ Last Good Disaster Recovery Full │
40
+ │ Backup Occurs Begins Recovery │
41
+ │ │ │ │ │ │
42
+ │ ▼ ▼ ▼ ▼ │
43
+ │ ────┼─────────────────┼───────────────┼───────────────┼──── │
44
+ │ │ │ │ │ │
45
+ │ │◄───── RPO ─────►│ │ │ │
46
+ │ │ (Data at risk) │ │ │ │
47
+ │ │◄──────────── RTO ────────────►│ │
48
+ │ │ (Downtime duration) │ │
49
+ │ │
50
+ └─────────────────────────────────────────────────────────────────┘
51
+ ```
52
+
53
+ ### Service Tiers
54
+
55
+ ```yaml
56
+ tier_1_critical:
57
+ description: "Core business functions"
58
+ services:
59
+ - "User authentication"
60
+ - "Payment processing"
61
+ - "Primary API"
62
+ rto: "15 minutes"
63
+ rpo: "0 (no data loss)"
64
+ strategy: "Active-active multi-region"
65
+ backup_frequency: "Continuous replication"
66
+
67
+ tier_2_important:
68
+ description: "Important but not critical"
69
+ services:
70
+ - "Search functionality"
71
+ - "Notifications"
72
+ - "Analytics ingestion"
73
+ rto: "1 hour"
74
+ rpo: "15 minutes"
75
+ strategy: "Warm standby with automated failover"
76
+ backup_frequency: "Every 15 minutes"
77
+
78
+ tier_3_standard:
79
+ description: "Supporting services"
80
+ services:
81
+ - "Admin dashboard"
82
+ - "Reporting"
83
+ - "Batch processing"
84
+ rto: "4 hours"
85
+ rpo: "1 hour"
86
+ strategy: "Cold standby with manual failover"
87
+ backup_frequency: "Hourly"
88
+
89
+ tier_4_non_critical:
90
+ description: "Nice to have services"
91
+ services:
92
+ - "Developer tools"
93
+ - "Internal dashboards"
94
+ - "Documentation"
95
+ rto: "24 hours"
96
+ rpo: "24 hours"
97
+ strategy: "Restore from backup"
98
+ backup_frequency: "Daily"
99
+ ```
100
+
101
+ ## Disaster Scenarios
102
+
103
+ ### Scenario Classification
104
+
105
+ ```yaml
106
+ infrastructure_failure:
107
+ examples:
108
+ - "Single server failure"
109
+ - "Network partition"
110
+ - "Storage failure"
111
+ - "Cloud provider AZ failure"
112
+ likelihood: "Common"
113
+ preparation: "Redundancy, failover"
114
+
115
+ regional_outage:
116
+ examples:
117
+ - "Cloud region unavailable"
118
+ - "Natural disaster in region"
119
+ - "Major network outage"
120
+ likelihood: "Rare"
121
+ preparation: "Multi-region deployment"
122
+
123
+ data_corruption:
124
+ examples:
125
+ - "Database corruption"
126
+ - "Bad deployment corrupting data"
127
+ - "Ransomware"
128
+ likelihood: "Uncommon"
129
+ preparation: "Backups, point-in-time recovery"
130
+
131
+ security_incident:
132
+ examples:
133
+ - "Data breach"
134
+ - "Compromised credentials"
135
+ - "Malicious insider"
136
+ likelihood: "Uncommon"
137
+ preparation: "Incident response plan, isolation"
138
+
139
+ human_error:
140
+ examples:
141
+ - "Accidental data deletion"
142
+ - "Misconfiguration"
143
+ - "Wrong environment deployment"
144
+ likelihood: "Common"
145
+ preparation: "RBAC, backups, change management"
146
+ ```
147
+
148
+ ## Backup Strategy
149
+
150
+ ### Backup Types
151
+
152
+ ```yaml
153
+ full_backup:
154
+ description: "Complete copy of all data"
155
+ frequency: "Weekly"
156
+ pros:
157
+ - "Fast restore"
158
+ - "Self-contained"
159
+ cons:
160
+ - "Slow to create"
161
+ - "Storage intensive"
162
+
163
+ incremental_backup:
164
+ description: "Only changes since last backup"
165
+ frequency: "Daily or hourly"
166
+ pros:
167
+ - "Fast to create"
168
+ - "Storage efficient"
169
+ cons:
170
+ - "Slower restore"
171
+ - "Depends on previous backups"
172
+
173
+ continuous_replication:
174
+ description: "Real-time data sync"
175
+ frequency: "Continuous"
176
+ pros:
177
+ - "Minimal data loss"
178
+ - "Fast failover"
179
+ cons:
180
+ - "Complex setup"
181
+ - "Can replicate corruption"
182
+ ```
183
+
184
+ ### Backup Configuration
185
+
186
+ ```yaml
187
+ database_backup:
188
+ postgresql:
189
+ continuous:
190
+ method: "WAL archiving + streaming replication"
191
+ rpo: "< 1 minute"
192
+ retention: "7 days of WAL"
193
+
194
+ point_in_time:
195
+ method: "pg_basebackup + WAL"
196
+ recovery: "Restore to any point in time"
197
+
198
+ logical:
199
+ method: "pg_dump"
200
+ frequency: "Daily"
201
+ retention: "30 days"
202
+
203
+ commands: |
204
+ # Continuous backup with WAL archiving
205
+ # postgresql.conf
206
+ archive_mode = on
207
+ archive_command = 'aws s3 cp %p s3://backups/wal/%f'
208
+
209
+ # Daily logical backup
210
+ pg_dump -Fc database > backup.dump
211
+ aws s3 cp backup.dump s3://backups/daily/$(date +%Y-%m-%d).dump
212
+
213
+ object_storage_backup:
214
+ method: "Cross-region replication"
215
+ configuration: |
216
+ # S3 bucket replication
217
+ aws s3api put-bucket-replication --bucket source-bucket --replication-configuration '{
218
+ "Role": "arn:aws:iam::account:role/replication-role",
219
+ "Rules": [{
220
+ "Status": "Enabled",
221
+ "Destination": {
222
+ "Bucket": "arn:aws:s3:::dest-bucket",
223
+ "StorageClass": "STANDARD"
224
+ }
225
+ }]
226
+ }'
227
+
228
+ kubernetes_backup:
229
+ method: "Velero"
230
+ includes:
231
+ - "Cluster state"
232
+ - "Persistent volumes"
233
+ - "Secrets and ConfigMaps"
234
+ commands: |
235
+ # Install Velero
236
+ velero install --provider aws --plugins velero/velero-plugin-for-aws:<version> --bucket backups --secret-file ./credentials
237
+
238
+ # Create backup
239
+ velero backup create daily-backup --include-namespaces production
240
+
241
+ # Schedule backups
242
+ velero schedule create daily --schedule="0 1 * * *" --include-namespaces production
243
+ ```
244
+
245
+ ### Backup Verification
246
+
247
+ ```yaml
248
+ backup_testing:
249
+ frequency: "Monthly at minimum"
250
+
251
+ process:
252
+ - "Select random backup"
253
+ - "Restore to test environment"
254
+ - "Verify data integrity"
255
+ - "Test application functionality"
256
+ - "Document results"
257
+
258
+ checklist:
259
+ - "Backup files exist and accessible"
260
+ - "Backup can be decrypted"
261
+ - "Restore completes without errors"
262
+ - "Data matches expected state"
263
+ - "Application can read restored data"
264
+ - "Restore time within RTO"
265
+
266
+ integrity_checks: |
267
+ # PostgreSQL backup verification
268
+ pg_restore --list backup.dump > /dev/null && echo "Backup valid"
269
+
270
+ # Compare row counts
271
+ psql -c "SELECT count(*) FROM users" production
272
+ psql -c "SELECT count(*) FROM users" restored_db
273
+
274
+ # Checksum verification
275
+ sha256sum backup.dump > backup.sha256
276
+ # Store backup.sha256 alongside the backup; verify before restoring with: sha256sum -c backup.sha256
277
+ ```
278
+
279
+ ## Failover Procedures
280
+
281
+ ### Automated Failover
282
+
283
+ ```yaml
284
+ database_failover:
285
+ postgresql_patroni:
286
+ description: "Automatic leader election"
287
+ detection: "Health checks every 10 seconds"
288
+ failover_time: "30-60 seconds"
289
+ configuration: |
290
+ # Patroni configuration
291
+ bootstrap:
292
+ dcs:
293
+ ttl: 30
294
+ loop_wait: 10
295
+ retry_timeout: 10
296
+ maximum_lag_on_failover: 1048576
297
+
298
+ rds_multi_az:
299
+ description: "AWS managed failover"
300
+ detection: "Automatic"
301
+ failover_time: "60-120 seconds"
302
+ action: "Automatic, no intervention needed"
303
+
304
+ application_failover:
305
+ kubernetes:
306
+ description: "Pod rescheduling"
307
+ detection: "Liveness/readiness probes"
308
+ failover_time: "Seconds to minutes"
309
+ configuration: |
310
+ livenessProbe:
311
+ httpGet:
312
+ path: /health
313
+ port: 8080
314
+ initialDelaySeconds: 10
315
+ periodSeconds: 10
316
+ failureThreshold: 3
317
+
318
+ load_balancer:
319
+ description: "Health check based routing"
320
+ detection: "HTTP health checks"
321
+ failover_time: "Seconds"
322
+ ```
323
+
324
+ ### Regional Failover
325
+
326
+ ```yaml
327
+ regional_failover_process:
328
+ detection:
329
+ triggers:
330
+ - "Multiple AZ failures"
331
+ - "Regional network issues"
332
+ - "Extended outage (> 15 minutes)"
333
+ monitoring:
334
+ - "Regional health dashboard"
335
+ - "External synthetic monitoring"
336
+ - "Cross-region health checks"
337
+
338
+ decision:
339
+ criteria:
340
+ - "Primary region unrecoverable"
341
+ - "Data sync status known"
342
+ - "Business approval (if applicable)"
343
+ timeframe: "Decide within 15 minutes"
344
+
345
+ execution:
346
+ steps:
347
+ 1_verify: "Confirm secondary region ready"
348
+ 2_scale: "Scale secondary region capacity"
348
+ 3_dns: "Update DNS to secondary region"
350
+ 4_verify: "Verify traffic flowing"
351
+ 5_monitor: "Monitor error rates"
352
+
353
+ communication:
354
+ - "Status page update"
355
+ - "Internal notification"
356
+ - "Customer communication"
357
+
358
+ dns_failover: |
359
+ # Route 53 health check based failover
360
+ aws route53 change-resource-record-sets --hosted-zone-id Z123 --change-batch '{
361
+ "Changes": [{
362
+ "Action": "UPSERT",
363
+ "ResourceRecordSet": {
364
+ "Name": "api.example.com",
365
+ "Type": "A",
366
+ "SetIdentifier": "secondary",
367
+ "Failover": "SECONDARY",
368
+ "TTL": 60,
369
+ "ResourceRecords": [{"Value": "secondary-ip"}],
370
+ "HealthCheckId": "health-check-id"
371
+ }
372
+ }]
373
+ }'
374
+ ```
375
+
376
+ ## DR Runbook Template
377
+
378
+ ```markdown
379
+ # Disaster Recovery Runbook: [Scenario]
380
+
381
+ ## Overview
382
+
383
+ **Scenario**: [Description of disaster scenario]
384
+ **Affected Services**: [List of services]
385
+ **Recovery Strategy**: [Active-Active | Warm Standby | Cold Standby]
386
+ **Target RTO**: [Time]
387
+ **Target RPO**: [Time]
388
+
389
+ ---
390
+
391
+ ## Detection
392
+
393
+ ### Monitoring
394
+ - Dashboard: [Grafana link]
395
+ - Alerts: [Alert names that indicate this scenario]
396
+ - External monitoring: [Synthetic checks]
397
+
398
+ ### Verification
399
+ Before declaring disaster:
400
+ 1. Verify issue is not transient (wait 5 minutes)
401
+ 2. Confirm with multiple monitoring sources
402
+ 3. Check cloud provider status page
403
+ 4. Attempt basic remediation
404
+
405
+ ---
406
+
407
+ ## Declaration
408
+
409
+ ### When to Declare
410
+ - [ ] Primary region unreachable for > 15 minutes
411
+ - [ ] Data center evacuation required
412
+ - [ ] Security incident requires isolation
413
+ - [ ] Other: [specific criteria]
414
+
415
+ ### Declaration Process
416
+ 1. Notify incident commander
417
+ 2. Start incident channel: #dr-YYYY-MM-DD
418
+ 3. Page DR response team
419
+ 4. Update status page: "Major outage, activating DR"
420
+
421
+ ---
422
+
423
+ ## Failover Procedure
424
+
425
+ ### Pre-Failover Checks
426
+
427
+ ```bash
428
+ # Verify secondary region health
429
+ curl https://secondary-region-healthcheck.example.com/health
430
+
431
+ # Check replication lag
432
+ # [Database-specific command]
433
+
434
+ # Verify backup status
435
+ # [Command to check latest backup]
436
+ ```
437
+
438
+ ### Step 1: Stop Traffic to Primary
439
+
440
+ ```bash
441
+ # Update load balancer
442
+ aws elbv2 modify-listener --listener-arn <arn> --default-actions Type=fixed-response,FixedResponseConfig={StatusCode=503}
443
+
444
+ # Or update DNS TTL (if not already low)
445
+ # DNS should already have low TTL (60s) for DR
446
+ ```
447
+
448
+ ### Step 2: Promote Secondary Database
449
+
450
+ ```bash
451
+ # PostgreSQL promotion
452
+ patronictl failover <cluster-name> --candidate secondary-node
453
+
454
+ # Or RDS
455
+ aws rds promote-read-replica --db-instance-identifier secondary-db
456
+ ```
457
+
458
+ ### Step 3: Scale Secondary Application
459
+
460
+ ```bash
461
+ # Scale up secondary region
462
+ kubectl config use-context secondary-region
463
+ kubectl scale deployment/api-server --replicas=20
464
+ kubectl scale deployment/web-server --replicas=10
465
+ ```
466
+
467
+ ### Step 4: Update DNS
468
+
469
+ ```bash
470
+ # Switch DNS to secondary
471
+ aws route53 change-resource-record-sets --hosted-zone-id Z123 --change-batch file://failover-dns.json
472
+
473
+ # Or if using Route 53 failover
474
+ # Health check failure should trigger automatic failover
475
+ ```
476
+
477
+ ### Step 5: Verify Recovery
478
+
479
+ ```bash
480
+ # Check application health
481
+ curl https://api.example.com/health
482
+
483
+ # Check error rates
484
+ # [Query Prometheus/Datadog]
485
+
486
+ # Run smoke tests
487
+ ./scripts/smoke-test.sh
488
+ ```
489
+
490
+ ---
491
+
492
+ ## Post-Failover
493
+
494
+ ### Immediate (0-1 hour)
495
+ - [ ] Verify all critical functions working
496
+ - [ ] Update status page: "Operating in DR mode"
497
+ - [ ] Notify stakeholders
498
+ - [ ] Monitor error rates
499
+
500
+ ### Short-term (1-24 hours)
501
+ - [ ] Assess primary region status
502
+ - [ ] Document data loss (if any)
503
+ - [ ] Plan failback procedure
504
+ - [ ] Customer communication (if needed)
505
+
506
+ ### Recovery (24-72 hours)
507
+ - [ ] Repair primary region
508
+ - [ ] Resync data
509
+ - [ ] Test primary region
510
+ - [ ] Schedule failback
511
+
512
+ ---
513
+
514
+ ## Failback Procedure
515
+
516
+ ### Prerequisites
517
+ - [ ] Primary region fully operational
518
+ - [ ] Data synced from secondary to primary
519
+ - [ ] Testing completed in primary
520
+ - [ ] Change window scheduled
521
+
522
+ ### Failback Steps
523
+ 1. Stop writes to secondary (if needed)
524
+ 2. Final data sync
525
+ 3. Verify data consistency
526
+ 4. Switch traffic to primary
527
+ 5. Monitor and verify
528
+ 6. Decommission DR mode
529
+
530
+ ---
531
+
532
+ ## Contacts
533
+
534
+ | Role | Contact | Responsibility |
535
+ |------|---------|----------------|
536
+ | DR Coordinator | @dr-lead | Overall coordination |
537
+ | Database | @dba-oncall | Database failover |
538
+ | Infrastructure | @infra-oncall | DNS, load balancers |
539
+ | Application | @app-oncall | Application verification |
540
+
541
+ ---
542
+
543
+ ## Revision History
544
+
545
+ | Date | Author | Change |
546
+ |------|--------|--------|
547
+ | 2025-01-15 | @engineer | Initial version |
548
+ ```
549
+
550
+ ## DR Testing
551
+
552
+ ### Test Types
553
+
554
+ ```yaml
555
+ tabletop_exercise:
556
+ description: "Walk through DR plan verbally"
557
+ frequency: "Quarterly"
558
+ duration: "2-4 hours"
559
+ participants: "All on-call engineers"
560
+ outcome: "Identify gaps in documentation"
561
+
562
+ component_failover:
563
+ description: "Test individual component recovery"
564
+ frequency: "Monthly"
565
+ examples:
566
+ - "Database replica promotion"
567
+ - "Single AZ failure simulation"
568
+ - "Service restart recovery"
569
+ outcome: "Verify automated failover works"
570
+
571
+ regional_failover:
572
+ description: "Full region evacuation test"
573
+ frequency: "Semi-annually (twice a year)"
574
+ preparation:
575
+ - "Schedule maintenance window"
576
+ - "Notify customers"
577
+ - "Prepare rollback"
578
+ outcome: "Validate end-to-end DR capability"
579
+
580
+ chaos_engineering:
581
+ description: "Inject failures in production"
582
+ frequency: "Ongoing"
583
+ examples:
584
+ - "Kill random pods"
585
+ - "Inject network latency"
586
+ - "Simulate AZ failure"
587
+ outcome: "Continuous validation of resilience"
588
+ ```
589
+
590
+ ### DR Test Checklist
591
+
592
+ ```yaml
593
+ test_planning:
594
+ - "Define test objectives"
595
+ - "Identify success criteria"
596
+ - "Schedule appropriate window"
597
+ - "Notify stakeholders"
598
+ - "Prepare rollback plan"
599
+
600
+ during_test:
601
+ - "Document all actions"
602
+ - "Record timing for each step"
603
+ - "Note any deviations from plan"
604
+ - "Capture issues encountered"
605
+
606
+ post_test:
607
+ - "Compare actual vs expected RTO/RPO"
608
+ - "Document lessons learned"
609
+ - "Update runbooks"
610
+ - "Create action items for improvements"
611
+ - "Schedule follow-up test for gaps"
612
+ ```
613
+
614
+ ## Common Pitfalls
615
+
616
+ ```yaml
617
+ pitfall_untested_backups:
618
+ problem: "Backups exist but never tested"
619
+ impact: "Discover corruption during actual disaster"
620
+ solution: "Monthly restore testing"
621
+
622
+ pitfall_stale_runbooks:
623
+ problem: "DR runbooks outdated"
624
+ impact: "Wrong commands, missing steps"
625
+ solution: "Update runbooks after every test and change"
626
+
627
+ pitfall_single_region:
628
+ problem: "All resources in one region"
629
+ impact: "Complete outage if region fails"
630
+ solution: "Multi-region architecture for critical services"
631
+
632
+ pitfall_no_communication_plan:
633
+ problem: "No plan for customer communication"
634
+ impact: "Confusion, support overload"
635
+ solution: "Pre-written communication templates"
636
+
637
+ pitfall_manual_failover:
638
+ problem: "Failover requires manual steps"
639
+ impact: "Slow recovery, human error"
640
+ solution: "Automate failover where possible"
641
+ ```