agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
|
@@ -0,0 +1,641 @@
|
|
|
1
|
+
# Disaster Recovery
|
|
2
|
+
|
|
3
|
+
Comprehensive guidelines for disaster recovery planning, testing, and execution.
|
|
4
|
+
|
|
5
|
+
## Core Principles
|
|
6
|
+
|
|
7
|
+
1. **Plan for Failure** - Assume disasters will happen, prepare accordingly
|
|
8
|
+
2. **Test Regularly** - Untested recovery plans are just documentation
|
|
9
|
+
3. **Automate Recovery** - Manual procedures are slow and error-prone
|
|
10
|
+
4. **Document Everything** - Recovery is stressful; don't rely on memory
|
|
11
|
+
|
|
12
|
+
## Recovery Objectives
|
|
13
|
+
|
|
14
|
+
### RTO and RPO
|
|
15
|
+
|
|
16
|
+
```yaml
|
|
17
|
+
rto:
|
|
18
|
+
name: "Recovery Time Objective"
|
|
19
|
+
definition: "Maximum acceptable time to restore service"
|
|
20
|
+
question: "How long can we be down?"
|
|
21
|
+
factors:
|
|
22
|
+
- "Business impact per hour"
|
|
23
|
+
- "Contractual obligations"
|
|
24
|
+
- "User expectations"
|
|
25
|
+
|
|
26
|
+
rpo:
|
|
27
|
+
name: "Recovery Point Objective"
|
|
28
|
+
definition: "Maximum acceptable data loss"
|
|
29
|
+
question: "How much data can we lose?"
|
|
30
|
+
factors:
|
|
31
|
+
- "Data criticality"
|
|
32
|
+
- "Regulatory requirements"
|
|
33
|
+
- "Cost of data recreation"
|
|
34
|
+
|
|
35
|
+
relationship: |
|
|
36
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
37
|
+
│ TIMELINE │
|
|
38
|
+
│ │
|
|
39
|
+
│ Last Good Disaster Recovery Full │
|
|
40
|
+
│ Backup Occurs Begins Recovery │
|
|
41
|
+
│ │ │ │ │ │
|
|
42
|
+
│ ▼ ▼ ▼ ▼ │
|
|
43
|
+
│ ────┼─────────────────┼───────────────┼───────────────┼──── │
|
|
44
|
+
│ │ │ │ │ │
|
|
45
|
+
│ │◄───── RPO ─────►│ │ │ │
|
|
46
|
+
│ │ (Data at risk) │ │ │ │
|
|
47
|
+
│ │◄──────────── RTO ────────────►│ │
|
|
48
|
+
│ │ (Downtime duration) │ │
|
|
49
|
+
│ │
|
|
50
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Service Tiers
|
|
54
|
+
|
|
55
|
+
```yaml
|
|
56
|
+
tier_1_critical:
|
|
57
|
+
description: "Core business functions"
|
|
58
|
+
services:
|
|
59
|
+
- "User authentication"
|
|
60
|
+
- "Payment processing"
|
|
61
|
+
- "Primary API"
|
|
62
|
+
rto: "15 minutes"
|
|
63
|
+
rpo: "0 (no data loss)"
|
|
64
|
+
strategy: "Active-active multi-region"
|
|
65
|
+
backup_frequency: "Continuous replication"
|
|
66
|
+
|
|
67
|
+
tier_2_important:
|
|
68
|
+
description: "Important but not critical"
|
|
69
|
+
services:
|
|
70
|
+
- "Search functionality"
|
|
71
|
+
- "Notifications"
|
|
72
|
+
- "Analytics ingestion"
|
|
73
|
+
rto: "1 hour"
|
|
74
|
+
rpo: "15 minutes"
|
|
75
|
+
strategy: "Warm standby with automated failover"
|
|
76
|
+
backup_frequency: "Every 15 minutes"
|
|
77
|
+
|
|
78
|
+
tier_3_standard:
|
|
79
|
+
description: "Supporting services"
|
|
80
|
+
services:
|
|
81
|
+
- "Admin dashboard"
|
|
82
|
+
- "Reporting"
|
|
83
|
+
- "Batch processing"
|
|
84
|
+
rto: "4 hours"
|
|
85
|
+
rpo: "1 hour"
|
|
86
|
+
strategy: "Cold standby with manual failover"
|
|
87
|
+
backup_frequency: "Hourly"
|
|
88
|
+
|
|
89
|
+
tier_4_non_critical:
|
|
90
|
+
description: "Nice-to-have services"
|
|
91
|
+
services:
|
|
92
|
+
- "Developer tools"
|
|
93
|
+
- "Internal dashboards"
|
|
94
|
+
- "Documentation"
|
|
95
|
+
rto: "24 hours"
|
|
96
|
+
rpo: "24 hours"
|
|
97
|
+
strategy: "Restore from backup"
|
|
98
|
+
backup_frequency: "Daily"
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Disaster Scenarios
|
|
102
|
+
|
|
103
|
+
### Scenario Classification
|
|
104
|
+
|
|
105
|
+
```yaml
|
|
106
|
+
infrastructure_failure:
|
|
107
|
+
examples:
|
|
108
|
+
- "Single server failure"
|
|
109
|
+
- "Network partition"
|
|
110
|
+
- "Storage failure"
|
|
111
|
+
- "Cloud provider AZ failure"
|
|
112
|
+
likelihood: "Common"
|
|
113
|
+
preparation: "Redundancy, failover"
|
|
114
|
+
|
|
115
|
+
regional_outage:
|
|
116
|
+
examples:
|
|
117
|
+
- "Cloud region unavailable"
|
|
118
|
+
- "Natural disaster in region"
|
|
119
|
+
- "Major network outage"
|
|
120
|
+
likelihood: "Rare"
|
|
121
|
+
preparation: "Multi-region deployment"
|
|
122
|
+
|
|
123
|
+
data_corruption:
|
|
124
|
+
examples:
|
|
125
|
+
- "Database corruption"
|
|
126
|
+
- "Bad deployment corrupting data"
|
|
127
|
+
- "Ransomware"
|
|
128
|
+
likelihood: "Uncommon"
|
|
129
|
+
preparation: "Backups, point-in-time recovery"
|
|
130
|
+
|
|
131
|
+
security_incident:
|
|
132
|
+
examples:
|
|
133
|
+
- "Data breach"
|
|
134
|
+
- "Compromised credentials"
|
|
135
|
+
- "Malicious insider"
|
|
136
|
+
likelihood: "Uncommon"
|
|
137
|
+
preparation: "Incident response plan, isolation"
|
|
138
|
+
|
|
139
|
+
human_error:
|
|
140
|
+
examples:
|
|
141
|
+
- "Accidental data deletion"
|
|
142
|
+
- "Misconfiguration"
|
|
143
|
+
- "Wrong environment deployment"
|
|
144
|
+
likelihood: "Common"
|
|
145
|
+
preparation: "RBAC, backups, change management"
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## Backup Strategy
|
|
149
|
+
|
|
150
|
+
### Backup Types
|
|
151
|
+
|
|
152
|
+
```yaml
|
|
153
|
+
full_backup:
|
|
154
|
+
description: "Complete copy of all data"
|
|
155
|
+
frequency: "Weekly"
|
|
156
|
+
pros:
|
|
157
|
+
- "Fast restore"
|
|
158
|
+
- "Self-contained"
|
|
159
|
+
cons:
|
|
160
|
+
- "Slow to create"
|
|
161
|
+
- "Storage intensive"
|
|
162
|
+
|
|
163
|
+
incremental_backup:
|
|
164
|
+
description: "Only changes since last backup"
|
|
165
|
+
frequency: "Daily or hourly"
|
|
166
|
+
pros:
|
|
167
|
+
- "Fast to create"
|
|
168
|
+
- "Storage efficient"
|
|
169
|
+
cons:
|
|
170
|
+
- "Slower restore"
|
|
171
|
+
- "Depends on previous backups"
|
|
172
|
+
|
|
173
|
+
continuous_replication:
|
|
174
|
+
description: "Real-time data sync"
|
|
175
|
+
frequency: "Continuous"
|
|
176
|
+
pros:
|
|
177
|
+
- "Minimal data loss"
|
|
178
|
+
- "Fast failover"
|
|
179
|
+
cons:
|
|
180
|
+
- "Complex setup"
|
|
181
|
+
- "Can replicate corruption"
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### Backup Configuration
|
|
185
|
+
|
|
186
|
+
```yaml
|
|
187
|
+
database_backup:
|
|
188
|
+
postgresql:
|
|
189
|
+
continuous:
|
|
190
|
+
method: "WAL archiving + streaming replication"
|
|
191
|
+
rpo: "< 1 minute"
|
|
192
|
+
retention: "7 days of WAL"
|
|
193
|
+
|
|
194
|
+
point_in_time:
|
|
195
|
+
method: "pg_basebackup + WAL"
|
|
196
|
+
recovery: "Restore to any point in time"
|
|
197
|
+
|
|
198
|
+
logical:
|
|
199
|
+
method: "pg_dump"
|
|
200
|
+
frequency: "Daily"
|
|
201
|
+
retention: "30 days"
|
|
202
|
+
|
|
203
|
+
commands: |
|
|
204
|
+
# Continuous backup with WAL archiving
|
|
205
|
+
# postgresql.conf
|
|
206
|
+
archive_mode = on
|
|
207
|
+
archive_command = 'aws s3 cp %p s3://backups/wal/%f'
|
|
208
|
+
|
|
209
|
+
# Daily logical backup
|
|
210
|
+
pg_dump -Fc database > backup.dump
|
|
211
|
+
aws s3 cp backup.dump s3://backups/daily/$(date +%Y-%m-%d).dump
|
|
212
|
+
|
|
213
|
+
object_storage_backup:
|
|
214
|
+
method: "Cross-region replication"
|
|
215
|
+
configuration: |
|
|
216
|
+
# S3 bucket replication
|
|
217
|
+
aws s3api put-bucket-replication --bucket source-bucket --replication-configuration '{
|
|
218
|
+
"Role": "arn:aws:iam::account:role/replication-role",
|
|
219
|
+
"Rules": [{
|
|
220
|
+
"Status": "Enabled",
|
|
221
|
+
"Destination": {
|
|
222
|
+
"Bucket": "arn:aws:s3:::dest-bucket",
|
|
223
|
+
"StorageClass": "STANDARD"
|
|
224
|
+
}
|
|
225
|
+
}]
|
|
226
|
+
}'
|
|
227
|
+
|
|
228
|
+
kubernetes_backup:
|
|
229
|
+
method: "Velero"
|
|
230
|
+
includes:
|
|
231
|
+
- "Cluster state"
|
|
232
|
+
- "Persistent volumes"
|
|
233
|
+
- "Secrets and ConfigMaps"
|
|
234
|
+
commands: |
|
|
235
|
+
# Install Velero
|
|
236
|
+
velero install --provider aws --bucket backups --secret-file ./credentials
|
|
237
|
+
|
|
238
|
+
# Create backup
|
|
239
|
+
velero backup create daily-backup --include-namespaces production
|
|
240
|
+
|
|
241
|
+
# Schedule backups
|
|
242
|
+
velero schedule create daily --schedule="0 1 * * *" --include-namespaces production
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### Backup Verification
|
|
246
|
+
|
|
247
|
+
```yaml
|
|
248
|
+
backup_testing:
|
|
249
|
+
frequency: "Monthly at minimum"
|
|
250
|
+
|
|
251
|
+
process:
|
|
252
|
+
- "Select random backup"
|
|
253
|
+
- "Restore to test environment"
|
|
254
|
+
- "Verify data integrity"
|
|
255
|
+
- "Test application functionality"
|
|
256
|
+
- "Document results"
|
|
257
|
+
|
|
258
|
+
checklist:
|
|
259
|
+
- "Backup files exist and are accessible"
|
|
260
|
+
- "Backup can be decrypted"
|
|
261
|
+
- "Restore completes without errors"
|
|
262
|
+
- "Data matches expected state"
|
|
263
|
+
- "Application can read restored data"
|
|
264
|
+
- "Restore time within RTO"
|
|
265
|
+
|
|
266
|
+
integrity_checks: |
|
|
267
|
+
# PostgreSQL backup verification
|
|
268
|
+
pg_restore --list backup.dump > /dev/null && echo "Backup valid"
|
|
269
|
+
|
|
270
|
+
# Compare row counts
|
|
271
|
+
psql -c "SELECT count(*) FROM users" production
|
|
272
|
+
psql -c "SELECT count(*) FROM users" restored_db
|
|
273
|
+
|
|
274
|
+
# Checksum verification
|
|
275
|
+
sha256sum backup.dump > backup.sha256
|
|
276
|
+
# Store the checksum alongside the backup and verify later with: sha256sum -c backup.sha256
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
## Failover Procedures
|
|
280
|
+
|
|
281
|
+
### Automated Failover
|
|
282
|
+
|
|
283
|
+
```yaml
|
|
284
|
+
database_failover:
|
|
285
|
+
postgresql_patroni:
|
|
286
|
+
description: "Automatic leader election"
|
|
287
|
+
detection: "Health checks every 10 seconds"
|
|
288
|
+
failover_time: "30-60 seconds"
|
|
289
|
+
configuration: |
|
|
290
|
+
# Patroni configuration
|
|
291
|
+
bootstrap:
|
|
292
|
+
dcs:
|
|
293
|
+
ttl: 30
|
|
294
|
+
loop_wait: 10
|
|
295
|
+
retry_timeout: 10
|
|
296
|
+
maximum_lag_on_failover: 1048576
|
|
297
|
+
|
|
298
|
+
rds_multi_az:
|
|
299
|
+
description: "AWS managed failover"
|
|
300
|
+
detection: "Automatic"
|
|
301
|
+
failover_time: "60-120 seconds"
|
|
302
|
+
action: "Automatic, no intervention needed"
|
|
303
|
+
|
|
304
|
+
application_failover:
|
|
305
|
+
kubernetes:
|
|
306
|
+
description: "Pod rescheduling"
|
|
307
|
+
detection: "Liveness/readiness probes"
|
|
308
|
+
failover_time: "Seconds to minutes"
|
|
309
|
+
configuration: |
|
|
310
|
+
livenessProbe:
|
|
311
|
+
httpGet:
|
|
312
|
+
path: /health
|
|
313
|
+
port: 8080
|
|
314
|
+
initialDelaySeconds: 10
|
|
315
|
+
periodSeconds: 10
|
|
316
|
+
failureThreshold: 3
|
|
317
|
+
|
|
318
|
+
load_balancer:
|
|
319
|
+
description: "Health check based routing"
|
|
320
|
+
detection: "HTTP health checks"
|
|
321
|
+
failover_time: "Seconds"
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
### Regional Failover
|
|
325
|
+
|
|
326
|
+
```yaml
|
|
327
|
+
regional_failover_process:
|
|
328
|
+
detection:
|
|
329
|
+
triggers:
|
|
330
|
+
- "Multiple AZ failures"
|
|
331
|
+
- "Regional network issues"
|
|
332
|
+
- "Extended outage (> 15 minutes)"
|
|
333
|
+
monitoring:
|
|
334
|
+
- "Regional health dashboard"
|
|
335
|
+
- "External synthetic monitoring"
|
|
336
|
+
- "Cross-region health checks"
|
|
337
|
+
|
|
338
|
+
decision:
|
|
339
|
+
criteria:
|
|
340
|
+
- "Primary region unrecoverable"
|
|
341
|
+
- "Data sync status known"
|
|
342
|
+
- "Business approval (if applicable)"
|
|
343
|
+
timeframe: "Decide within 15 minutes"
|
|
344
|
+
|
|
345
|
+
execution:
|
|
346
|
+
steps:
|
|
347
|
+
1_verify: "Confirm secondary region ready"
|
|
348
|
+
2_dns: "Update DNS to secondary region"
|
|
349
|
+
3_scale: "Scale secondary region capacity"
|
|
350
|
+
4_verify: "Verify traffic flowing"
|
|
351
|
+
5_monitor: "Monitor error rates"
|
|
352
|
+
|
|
353
|
+
communication:
|
|
354
|
+
- "Status page update"
|
|
355
|
+
- "Internal notification"
|
|
356
|
+
- "Customer communication"
|
|
357
|
+
|
|
358
|
+
dns_failover: |
|
|
359
|
+
# Route 53 health check based failover
|
|
360
|
+
aws route53 change-resource-record-sets --hosted-zone-id Z123 --change-batch '{
|
|
361
|
+
"Changes": [{
|
|
362
|
+
"Action": "UPSERT",
|
|
363
|
+
"ResourceRecordSet": {
|
|
364
|
+
"Name": "api.example.com",
|
|
365
|
+
"Type": "A",
|
|
366
|
+
"SetIdentifier": "secondary",
|
|
367
|
+
"Failover": "SECONDARY",
|
|
368
|
+
"TTL": 60,
|
|
369
|
+
"ResourceRecords": [{"Value": "secondary-ip"}],
|
|
370
|
+
"HealthCheckId": "health-check-id"
|
|
371
|
+
}
|
|
372
|
+
}]
|
|
373
|
+
}'
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
## DR Runbook Template
|
|
377
|
+
|
|
378
|
+
```markdown
|
|
379
|
+
# Disaster Recovery Runbook: [Scenario]
|
|
380
|
+
|
|
381
|
+
## Overview
|
|
382
|
+
|
|
383
|
+
**Scenario**: [Description of disaster scenario]
|
|
384
|
+
**Affected Services**: [List of services]
|
|
385
|
+
**Recovery Strategy**: [Active-Active | Warm Standby | Cold Standby]
|
|
386
|
+
**Target RTO**: [Time]
|
|
387
|
+
**Target RPO**: [Time]
|
|
388
|
+
|
|
389
|
+
---
|
|
390
|
+
|
|
391
|
+
## Detection
|
|
392
|
+
|
|
393
|
+
### Monitoring
|
|
394
|
+
- Dashboard: [Grafana link]
|
|
395
|
+
- Alerts: [Alert names that indicate this scenario]
|
|
396
|
+
- External monitoring: [Synthetic checks]
|
|
397
|
+
|
|
398
|
+
### Verification
|
|
399
|
+
Before declaring disaster:
|
|
400
|
+
1. Verify issue is not transient (wait 5 minutes)
|
|
401
|
+
2. Confirm with multiple monitoring sources
|
|
402
|
+
3. Check cloud provider status page
|
|
403
|
+
4. Attempt basic remediation
|
|
404
|
+
|
|
405
|
+
---
|
|
406
|
+
|
|
407
|
+
## Declaration
|
|
408
|
+
|
|
409
|
+
### When to Declare
|
|
410
|
+
- [ ] Primary region unreachable for > 15 minutes
|
|
411
|
+
- [ ] Data center evacuation required
|
|
412
|
+
- [ ] Security incident requires isolation
|
|
413
|
+
- [ ] Other: [specific criteria]
|
|
414
|
+
|
|
415
|
+
### Declaration Process
|
|
416
|
+
1. Notify incident commander
|
|
417
|
+
2. Start incident channel: #dr-YYYY-MM-DD
|
|
418
|
+
3. Page DR response team
|
|
419
|
+
4. Update status page: "Major outage, activating DR"
|
|
420
|
+
|
|
421
|
+
---
|
|
422
|
+
|
|
423
|
+
## Failover Procedure
|
|
424
|
+
|
|
425
|
+
### Pre-Failover Checks
|
|
426
|
+
|
|
427
|
+
```bash
|
|
428
|
+
# Verify secondary region health
|
|
429
|
+
curl https://secondary-region-healthcheck.example.com/health
|
|
430
|
+
|
|
431
|
+
# Check replication lag
|
|
432
|
+
# [Database-specific command]
|
|
433
|
+
|
|
434
|
+
# Verify backup status
|
|
435
|
+
# [Command to check latest backup]
|
|
436
|
+
```
|
|
437
|
+
|
|
438
|
+
### Step 1: Stop Traffic to Primary
|
|
439
|
+
|
|
440
|
+
```bash
|
|
441
|
+
# Update load balancer
|
|
442
|
+
aws elbv2 modify-listener --listener-arn <arn> --default-actions Type=fixed-response,FixedResponseConfig={StatusCode=503}
|
|
443
|
+
|
|
444
|
+
# Or update DNS TTL (if not already low)
|
|
445
|
+
# DNS should already have low TTL (60s) for DR
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
### Step 2: Promote Secondary Database
|
|
449
|
+
|
|
450
|
+
```bash
|
|
451
|
+
# PostgreSQL promotion
|
|
452
|
+
patronictl failover --candidate secondary-node  # the old --master option is deprecated; an unplanned failover only needs the candidate
|
|
453
|
+
|
|
454
|
+
# Or RDS
|
|
455
|
+
aws rds promote-read-replica --db-instance-identifier secondary-db
|
|
456
|
+
```
|
|
457
|
+
|
|
458
|
+
### Step 3: Scale Secondary Application
|
|
459
|
+
|
|
460
|
+
```bash
|
|
461
|
+
# Scale up secondary region
|
|
462
|
+
kubectl config use-context secondary-region
|
|
463
|
+
kubectl scale deployment/api-server --replicas=20
|
|
464
|
+
kubectl scale deployment/web-server --replicas=10
|
|
465
|
+
```
|
|
466
|
+
|
|
467
|
+
### Step 4: Update DNS
|
|
468
|
+
|
|
469
|
+
```bash
|
|
470
|
+
# Switch DNS to secondary
|
|
471
|
+
aws route53 change-resource-record-sets --hosted-zone-id Z123 --change-batch file://failover-dns.json
|
|
472
|
+
|
|
473
|
+
# Or if using Route 53 failover
|
|
474
|
+
# Health check failure should trigger automatic failover
|
|
475
|
+
```
|
|
476
|
+
|
|
477
|
+
### Step 5: Verify Recovery
|
|
478
|
+
|
|
479
|
+
```bash
|
|
480
|
+
# Check application health
|
|
481
|
+
curl https://api.example.com/health
|
|
482
|
+
|
|
483
|
+
# Check error rates
|
|
484
|
+
# [Query Prometheus/Datadog]
|
|
485
|
+
|
|
486
|
+
# Run smoke tests
|
|
487
|
+
./scripts/smoke-test.sh
|
|
488
|
+
```
|
|
489
|
+
|
|
490
|
+
---
|
|
491
|
+
|
|
492
|
+
## Post-Failover
|
|
493
|
+
|
|
494
|
+
### Immediate (0-1 hour)
|
|
495
|
+
- [ ] Verify all critical functions working
|
|
496
|
+
- [ ] Update status page: "Operating in DR mode"
|
|
497
|
+
- [ ] Notify stakeholders
|
|
498
|
+
- [ ] Monitor error rates
|
|
499
|
+
|
|
500
|
+
### Short-term (1-24 hours)
|
|
501
|
+
- [ ] Assess primary region status
|
|
502
|
+
- [ ] Document data loss (if any)
|
|
503
|
+
- [ ] Plan failback procedure
|
|
504
|
+
- [ ] Customer communication (if needed)
|
|
505
|
+
|
|
506
|
+
### Recovery (24-72 hours)
|
|
507
|
+
- [ ] Repair primary region
|
|
508
|
+
- [ ] Resync data
|
|
509
|
+
- [ ] Test primary region
|
|
510
|
+
- [ ] Schedule failback
|
|
511
|
+
|
|
512
|
+
---
|
|
513
|
+
|
|
514
|
+
## Failback Procedure
|
|
515
|
+
|
|
516
|
+
### Prerequisites
|
|
517
|
+
- [ ] Primary region fully operational
|
|
518
|
+
- [ ] Data synced from secondary to primary
|
|
519
|
+
- [ ] Testing completed in primary
|
|
520
|
+
- [ ] Change window scheduled
|
|
521
|
+
|
|
522
|
+
### Failback Steps
|
|
523
|
+
1. Stop writes to secondary (if needed)
|
|
524
|
+
2. Final data sync
|
|
525
|
+
3. Verify data consistency
|
|
526
|
+
4. Switch traffic to primary
|
|
527
|
+
5. Monitor and verify
|
|
528
|
+
6. Decommission DR mode
|
|
529
|
+
|
|
530
|
+
---
|
|
531
|
+
|
|
532
|
+
## Contacts
|
|
533
|
+
|
|
534
|
+
| Role | Contact | Responsibility |
|
|
535
|
+
|------|---------|----------------|
|
|
536
|
+
| DR Coordinator | @dr-lead | Overall coordination |
|
|
537
|
+
| Database | @dba-oncall | Database failover |
|
|
538
|
+
| Infrastructure | @infra-oncall | DNS, load balancers |
|
|
539
|
+
| Application | @app-oncall | Application verification |
|
|
540
|
+
|
|
541
|
+
---
|
|
542
|
+
|
|
543
|
+
## Revision History
|
|
544
|
+
|
|
545
|
+
| Date | Author | Change |
|
|
546
|
+
|------|--------|--------|
|
|
547
|
+
| 2025-01-15 | @engineer | Initial version |
|
|
548
|
+
```
|
|
549
|
+
|
|
550
|
+
## DR Testing
|
|
551
|
+
|
|
552
|
+
### Test Types
|
|
553
|
+
|
|
554
|
+
```yaml
|
|
555
|
+
tabletop_exercise:
|
|
556
|
+
description: "Walk through DR plan verbally"
|
|
557
|
+
frequency: "Quarterly"
|
|
558
|
+
duration: "2-4 hours"
|
|
559
|
+
participants: "All on-call engineers"
|
|
560
|
+
outcome: "Identify gaps in documentation"
|
|
561
|
+
|
|
562
|
+
component_failover:
|
|
563
|
+
description: "Test individual component recovery"
|
|
564
|
+
frequency: "Monthly"
|
|
565
|
+
examples:
|
|
566
|
+
- "Database replica promotion"
|
|
567
|
+
- "Single AZ failure simulation"
|
|
568
|
+
- "Service restart recovery"
|
|
569
|
+
outcome: "Verify automated failover works"
|
|
570
|
+
|
|
571
|
+
regional_failover:
|
|
572
|
+
description: "Full region evacuation test"
|
|
573
|
+
frequency: "Semi-annually (twice per year)"
|
|
574
|
+
preparation:
|
|
575
|
+
- "Schedule maintenance window"
|
|
576
|
+
- "Notify customers"
|
|
577
|
+
- "Prepare rollback"
|
|
578
|
+
outcome: "Validate end-to-end DR capability"
|
|
579
|
+
|
|
580
|
+
chaos_engineering:
|
|
581
|
+
description: "Inject failures in production"
|
|
582
|
+
frequency: "Ongoing"
|
|
583
|
+
examples:
|
|
584
|
+
- "Kill random pods"
|
|
585
|
+
- "Inject network latency"
|
|
586
|
+
- "Simulate AZ failure"
|
|
587
|
+
outcome: "Continuous validation of resilience"
|
|
588
|
+
```
|
|
589
|
+
|
|
590
|
+
### DR Test Checklist
|
|
591
|
+
|
|
592
|
+
```yaml
|
|
593
|
+
test_planning:
|
|
594
|
+
- "Define test objectives"
|
|
595
|
+
- "Identify success criteria"
|
|
596
|
+
- "Schedule appropriate window"
|
|
597
|
+
- "Notify stakeholders"
|
|
598
|
+
- "Prepare rollback plan"
|
|
599
|
+
|
|
600
|
+
during_test:
|
|
601
|
+
- "Document all actions"
|
|
602
|
+
- "Record timing for each step"
|
|
603
|
+
- "Note any deviations from plan"
|
|
604
|
+
- "Capture issues encountered"
|
|
605
|
+
|
|
606
|
+
post_test:
|
|
607
|
+
- "Compare actual vs expected RTO/RPO"
|
|
608
|
+
- "Document lessons learned"
|
|
609
|
+
- "Update runbooks"
|
|
610
|
+
- "Create action items for improvements"
|
|
611
|
+
- "Schedule follow-up test for gaps"
|
|
612
|
+
```
|
|
613
|
+
|
|
614
|
+
## Common Pitfalls
|
|
615
|
+
|
|
616
|
+
```yaml
|
|
617
|
+
pitfall_untested_backups:
|
|
618
|
+
problem: "Backups exist but are never tested"
|
|
619
|
+
impact: "Discover corruption during actual disaster"
|
|
620
|
+
solution: "Monthly restore testing"
|
|
621
|
+
|
|
622
|
+
pitfall_stale_runbooks:
|
|
623
|
+
problem: "DR runbooks outdated"
|
|
624
|
+
impact: "Wrong commands, missing steps"
|
|
625
|
+
solution: "Update runbooks after every test and change"
|
|
626
|
+
|
|
627
|
+
pitfall_single_region:
|
|
628
|
+
problem: "All resources in one region"
|
|
629
|
+
impact: "Complete outage if region fails"
|
|
630
|
+
solution: "Multi-region architecture for critical services"
|
|
631
|
+
|
|
632
|
+
pitfall_no_communication_plan:
|
|
633
|
+
problem: "No plan for customer communication"
|
|
634
|
+
impact: "Confusion, support overload"
|
|
635
|
+
solution: "Pre-written communication templates"
|
|
636
|
+
|
|
637
|
+
pitfall_manual_failover:
|
|
638
|
+
problem: "Failover requires manual steps"
|
|
639
|
+
impact: "Slow recovery, human error"
|
|
640
|
+
solution: "Automate failover where possible"
|
|
641
|
+
```
|