@intentsolutionsio/fairdb-operations-kit 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +26 -0
- package/LICENSE +21 -0
- package/README.md +298 -0
- package/agents/fairdb-automation-agent.md +307 -0
- package/commands/fairdb-emergency-response.md +480 -0
- package/commands/fairdb-health-check.md +459 -0
- package/commands/fairdb-onboard-customer.md +446 -0
- package/commands/fairdb-setup-backup.md +420 -0
- package/package.json +48 -0
- package/skills/fairdb-backup-manager/SKILL.md +72 -0
- package/skills/fairdb-backup-manager/assets/README.md +26 -0
- package/skills/fairdb-backup-manager/references/README.md +26 -0
- package/skills/fairdb-backup-manager/scripts/README.md +24 -0
- package/skills/skill-adapter/assets/README.md +4 -0
- package/skills/skill-adapter/assets/config-template.json +32 -0
- package/skills/skill-adapter/assets/skill-schema.json +28 -0
- package/skills/skill-adapter/assets/test-data.json +27 -0
- package/skills/skill-adapter/references/README.md +4 -0
- package/skills/skill-adapter/references/best-practices.md +69 -0
- package/skills/skill-adapter/references/examples.md +73 -0
- package/skills/skill-adapter/scripts/README.md +10 -0
- package/skills/skill-adapter/scripts/helper-template.sh +42 -0
- package/skills/skill-adapter/scripts/validation.sh +32 -0
|
@@ -0,0 +1,480 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: fairdb-emergency-response
|
|
3
|
+
description: Emergency incident response procedures for critical FairDB issues
|
|
4
|
+
model: sonnet
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# FairDB Emergency Incident Response
|
|
8
|
+
|
|
9
|
+
You are responding to a critical incident in the FairDB PostgreSQL infrastructure. Follow this structured approach to diagnose, contain, and resolve the issue.
|
|
10
|
+
|
|
11
|
+
## Incident Classification
|
|
12
|
+
|
|
13
|
+
First, identify the incident type:
|
|
14
|
+
- **P1 Critical**: Complete service outage, data loss risk
|
|
15
|
+
- **P2 High**: Major degradation, affecting multiple customers
|
|
16
|
+
- **P3 Medium**: Single customer impact, performance issues
|
|
17
|
+
- **P4 Low**: Minor issues, cosmetic problems
|
|
18
|
+
|
|
19
|
+
## Initial Assessment (First 5 Minutes)
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
#!/bin/bash
# FairDB Emergency Response Script
#
# Opens a new incident: prints a banner, derives a timestamp-based incident
# ID, and starts the incident log that the later steps append to.
#
# Globals written:
#   INCIDENT_ID  - "INC-YYYYMMDD-HHMMSS" built from the current local time
#   INCIDENT_LOG - path of the log file for this incident

echo "================================================"
echo " FAIRDB EMERGENCY INCIDENT RESPONSE"
echo " Started: $(date '+%Y-%m-%d %H:%M:%S')"
echo "================================================"

# Create incident log
INCIDENT_ID="INC-$(date +%Y%m%d-%H%M%S)"
INCIDENT_LOG="/opt/fairdb/incidents/${INCIDENT_ID}.log"

# Report a failure here explicitly; otherwise every later `tee` to the log
# fails without explaining why. The response continues either way so that
# diagnostics still reach the terminal.
if ! mkdir -p "${INCIDENT_LOG%/*}"; then
  echo "WARNING: cannot create ${INCIDENT_LOG%/*}; output will not be saved to the incident log" >&2
fi

# `tee` without -a: this is the first write, so the log starts fresh.
{
  echo "Incident ID: $INCIDENT_ID"
  echo "Response started: $(date)"
  echo "Responding user: $(whoami)"
  echo "========================================"
} | tee "$INCIDENT_LOG"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Step 1: Service Status Check
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
# Step 1: verify the PostgreSQL service is up (attempting one restart if it
# is not) and check for exhausted disk space.
#
# Globals read:    INCIDENT_LOG (falls back to /dev/null when unset)
# Globals written: FULL_DISKS - number of filesystems at >= 95% use
log_file="${INCIDENT_LOG:-/dev/null}"

echo -e "\n[STEP 1] SERVICE STATUS CHECK" | tee -a "$log_file"
echo "------------------------------" | tee -a "$log_file"

# Check PostgreSQL service
if systemctl is-active --quiet postgresql; then
  echo "✅ PostgreSQL: RUNNING" | tee -a "$log_file"
else
  echo "❌ CRITICAL: PostgreSQL is DOWN" | tee -a "$log_file"
  echo "Attempting emergency restart..." | tee -a "$log_file"

  # Try to start the service
  sudo systemctl start postgresql 2>&1 | tee -a "$log_file"

  # Give the service a moment to come up before re-checking.
  sleep 5

  if systemctl is-active --quiet postgresql; then
    echo "✅ PostgreSQL restarted successfully" | tee -a "$log_file"
  else
    echo "❌ FAILED to restart PostgreSQL" | tee -a "$log_file"
    echo "Checking for port conflicts..." | tee -a "$log_file"
    # net-tools (netstat) is not installed everywhere; fall back to ss.
    if command -v netstat > /dev/null 2>&1; then
      sudo netstat -tulpn | grep :5432 | tee -a "$log_file"
    else
      sudo ss -tulpn | grep :5432 | tee -a "$log_file"
    fi

    # Check for corruption
    # NOTE: `postgres -C` only loads the configuration and prints one setting.
    # A failure here points at an unreadable configuration or data directory;
    # it is not a page-level integrity check.
    echo "Checking for data corruption..." | tee -a "$log_file"
    sudo -u postgres /usr/lib/postgresql/16/bin/postgres -D /var/lib/postgresql/16/main -C data_directory 2>&1 | tee -a "$log_file"
  fi
fi

# Check disk space
echo -e "\nDisk Space:" | tee -a "$log_file"
df -h | grep -E "^/dev|^Filesystem" | tee -a "$log_file"

# Check for full disks
# Only the Use% column is inspected, and squashfs (snap images, always 100%)
# and memory-backed filesystems are excluded so they cannot trigger the
# destructive cleanup below.
FULL_DISKS=$(df --output=pcent -x squashfs -x tmpfs -x devtmpfs 2>/dev/null \
  | awk 'NR > 1 && $1 + 0 >= 95 { n++ } END { print n + 0 }')
if [ "$FULL_DISKS" -gt 0 ]; then
  echo "⚠️ CRITICAL: Disk space exhausted!" | tee -a "$log_file"
  echo "Emergency cleanup required..." | tee -a "$log_file"

  # Emergency log cleanup: remove *.log files older than 7 days.
  # Errors (missing directory, permissions) are deliberately ignored.
  find /var/log/postgresql -name "*.log" -mtime +7 -delete 2>/dev/null
  find /opt/fairdb/logs -name "*.log" -mtime +7 -delete 2>/dev/null

  echo "Old logs cleared. New disk usage:" | tee -a "$log_file"
  df -h | grep -E "^/dev" | tee -a "$log_file"
fi
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Step 2: Connection Diagnostics
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# Step 2: test local connectivity, report connection usage, and terminate
# long-idle sessions when client connections exceed 90% of max_connections.
#
# Globals read:    INCIDENT_LOG (falls back to /dev/null when unset)
# Globals written: MAX_CONN, CURRENT_CONN
log_file="${INCIDENT_LOG:-/dev/null}"

echo -e "\n[STEP 2] CONNECTION DIAGNOSTICS" | tee -a "$log_file"
echo "--------------------------------" | tee -a "$log_file"

# Test local connection
echo "Testing local connection..." | tee -a "$log_file"
if sudo -u postgres psql -c "SELECT 1;" > /dev/null 2>&1; then
  echo "✅ Local connections: OK" | tee -a "$log_file"

  # Get connection stats
  sudo -u postgres psql -t -c "
    SELECT 'Active connections: ' || count(*)
    FROM pg_stat_activity
    WHERE state != 'idle';" | tee -a "$log_file"

  # Check for connection exhaustion
  # -At yields the bare value with no padding. Only client backends are
  # counted because background processes do not use max_connections slots.
  MAX_CONN=$(sudo -u postgres psql -At -c "SHOW max_connections;")
  CURRENT_CONN=$(sudo -u postgres psql -At -c "SELECT count(*) FROM pg_stat_activity WHERE backend_type = 'client backend';")

  echo "Connections: $CURRENT_CONN / $MAX_CONN" | tee -a "$log_file"

  # Guard the arithmetic: either query may have failed and returned nothing.
  if [[ "$MAX_CONN" =~ ^[0-9]+$ && "$CURRENT_CONN" =~ ^[0-9]+$ ]]; then
    if (( CURRENT_CONN > MAX_CONN * 90 / 100 )); then
      echo "⚠️ WARNING: Connection pool nearly exhausted" | tee -a "$log_file"
      echo "Terminating idle connections..." | tee -a "$log_file"

      # Kill idle connections older than 10 minutes
      sudo -u postgres psql << 'EOF' | tee -a "$log_file"
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'idle'
AND state_change < NOW() - INTERVAL '10 minutes'
AND pid != pg_backend_pid();
EOF
    fi
  else
    echo "⚠️ WARNING: Could not read connection counts; skipping exhaustion check" | tee -a "$log_file"
  fi
else
  echo "❌ CRITICAL: Cannot connect to PostgreSQL" | tee -a "$log_file"
  echo "Checking PostgreSQL logs..." | tee -a "$log_file"
  tail -50 /var/log/postgresql/postgresql-*.log | tee -a "$log_file"
fi

# Check network connectivity
echo -e "\nNetwork status:" | tee -a "$log_file"
ip addr show | grep "inet " | tee -a "$log_file"
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Step 3: Performance Emergency Response
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
# Step 3 (part 1): list statements running longer than 5 minutes and cancel
# client statements running longer than 30 minutes.
#
# Globals read: INCIDENT_LOG (falls back to /dev/null when unset)
log_file="${INCIDENT_LOG:-/dev/null}"

echo -e "\n[STEP 3] PERFORMANCE TRIAGE" | tee -a "$log_file"
echo "----------------------------" | tee -a "$log_file"

# Find and cancel long-running queries
echo "Checking for blocked/long queries..." | tee -a "$log_file"

# The heredoc delimiter is quoted so the shell passes the SQL through verbatim.
sudo -u postgres psql << 'EOF' | tee -a "$log_file"
-- Queries running longer than 5 minutes
SELECT
    pid,
    now() - query_start as duration,
    state,
    LEFT(query, 100) as query_preview
FROM pg_stat_activity
WHERE state != 'idle'
AND now() - query_start > interval '5 minutes'
ORDER BY duration DESC;

-- Cancel queries running longer than 30 minutes.
-- pg_cancel_backend cancels the current statement; the session stays connected.
-- Restricted to client backends so autovacuum workers and replication
-- processes are not interrupted.
SELECT pg_cancel_backend(pid)
FROM pg_stat_activity
WHERE state != 'idle'
AND backend_type = 'client backend'
AND now() - query_start > interval '30 minutes'
AND pid != pg_backend_pid();
EOF
|
|
168
|
+
|
|
169
|
+
# Step 3 (part 2): report sessions that are waiting on a lock together with
# the sessions blocking them.
#
# Globals read: INCIDENT_LOG (falls back to /dev/null when unset)
log_file="${INCIDENT_LOG:-/dev/null}"

# Check for locks
echo -e "\nChecking for lock conflicts..." | tee -a "$log_file"
# pg_blocking_pids() (PostgreSQL 9.6+) lets the server decide who blocks whom.
# A manual pg_locks self-join ignores lock-mode compatibility and whether the
# other lock is granted, so it can list sessions that are not actually blocking.
sudo -u postgres psql << 'EOF' | tee -a "$log_file"
SELECT
    blocked_activity.pid AS blocked_pid,
    blocked_activity.usename AS blocked_user,
    blocking_activity.pid AS blocking_pid,
    blocking_activity.usename AS blocking_user,
    blocked_activity.query AS blocked_statement,
    blocking_activity.query AS blocking_statement
FROM pg_catalog.pg_stat_activity blocked_activity
JOIN pg_catalog.pg_stat_activity blocking_activity
    ON blocking_activity.pid = ANY (pg_catalog.pg_blocking_pids(blocked_activity.pid))
ORDER BY blocked_activity.pid, blocking_activity.pid;
EOF
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
## Step 4: Data Integrity Check
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
# Step 4: look for recovery/WAL warning signs in the data directory and
# confirm every connectable database answers a trivial query.
#
# Globals read:    INCIDENT_LOG (falls back to /dev/null when unset)
# Globals written: DATA_DIR, WAL_COUNT, DB
log_file="${INCIDENT_LOG:-/dev/null}"

echo -e "\n[STEP 4] DATA INTEGRITY CHECK" | tee -a "$log_file"
echo "------------------------------" | tee -a "$log_file"

# Check for corruption indicators
echo "Checking for corruption indicators..." | tee -a "$log_file"

# Check PostgreSQL data directory
# The data directory is only readable by the postgres user, so the file
# tests run as postgres; as the invoking user they would never match.
DATA_DIR="/var/lib/postgresql/16/main"
if sudo -u postgres test -d "$DATA_DIR"; then
  echo "Data directory exists: $DATA_DIR" | tee -a "$log_file"

  # Check for recovery in progress
  if sudo -u postgres test -f "$DATA_DIR/recovery.signal"; then
    echo "⚠️ Recovery in progress!" | tee -a "$log_file"
  fi

  # Check WAL status
  # find replaces `ls | wc -l`; -maxdepth 1 keeps the original non-recursive scope.
  WAL_COUNT=$(sudo -u postgres find "$DATA_DIR/pg_wal" -maxdepth 1 -name '*.partial' 2>/dev/null | wc -l)
  if [ "$WAL_COUNT" -gt 0 ]; then
    echo "⚠️ Partial WAL files detected: $WAL_COUNT" | tee -a "$log_file"
  fi
else
  echo "❌ CRITICAL: Data directory not found!" | tee -a "$log_file"
fi

# Run basic integrity check
echo -e "\nRunning integrity checks..." | tee -a "$log_file"
# Read one database name per line so names containing spaces stay intact.
# Databases that refuse connections by design (datallowconn = false) are skipped.
while IFS= read -r DB; do
  [ -n "$DB" ] || continue
  echo "Checking database: $DB" | tee -a "$log_file"
  # stdin comes from /dev/null so psql cannot consume the loop's input.
  if sudo -u postgres psql -d "$DB" -c "SELECT 1;" > /dev/null 2>&1 < /dev/null; then
    echo " ✅ Database $DB is accessible" | tee -a "$log_file"
  else
    echo " ❌ Database $DB has issues!" | tee -a "$log_file"
  fi
done < <(sudo -u postgres psql -At -c "SELECT datname FROM pg_database WHERE datistemplate = false AND datallowconn;")
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## Step 5: Emergency Recovery Actions
|
|
239
|
+
|
|
240
|
+
```bash
|
|
241
|
+
# Step 5: interactive recovery menu (single-user repair, pgBackRest restore,
# or point-in-time recovery). Nothing runs unless the operator answers "yes".
#
# Globals read:    INCIDENT_LOG (falls back to /dev/null when unset),
#                  DATA_DIR (falls back to the default cluster path)
# Globals written: NEED_RECOVERY, RECOVERY_OPTION, BACKUP_LABEL, TARGET_TIME
log_file="${INCIDENT_LOG:-/dev/null}"
# DATA_DIR is normally set by Step 4; default it so this step works alone.
DATA_DIR="${DATA_DIR:-/var/lib/postgresql/16/main}"

echo -e "\n[STEP 5] RECOVERY ACTIONS" | tee -a "$log_file"
echo "-------------------------" | tee -a "$log_file"

# Determine if recovery is needed
read -r -p "Do you need to initiate emergency recovery? (yes/no): " NEED_RECOVERY

if [ "$NEED_RECOVERY" = "yes" ]; then
  echo "Starting emergency recovery procedures..." | tee -a "$log_file"

  # Option 1: Restart in single-user mode for repairs
  echo "Option 1: Single-user mode repair" | tee -a "$log_file"
  echo "Command: sudo -u postgres /usr/lib/postgresql/16/bin/postgres --single -D $DATA_DIR" | tee -a "$log_file"

  # Option 2: Restore from backup
  echo "Option 2: Restore from backup" | tee -a "$log_file"

  # Check available backups
  if command -v pgbackrest &> /dev/null; then
    echo "Available backups:" | tee -a "$log_file"
    sudo -u postgres pgbackrest --stanza=fairdb info 2>&1 | tee -a "$log_file"
  fi

  # Option 3: Point-in-time recovery
  echo "Option 3: Point-in-time recovery" | tee -a "$log_file"
  echo "Use: /opt/fairdb/scripts/restore-pitr.sh 'YYYY-MM-DD HH:MM:SS'" | tee -a "$log_file"

  read -r -p "Select recovery option (1/2/3/none): " RECOVERY_OPTION

  case "$RECOVERY_OPTION" in
    1)
      echo "Starting single-user mode..." | tee -a "$log_file"
      # Single-user mode must not run against a live cluster.
      if sudo systemctl stop postgresql; then
        sudo -u postgres /usr/lib/postgresql/16/bin/postgres --single -D "$DATA_DIR"
      else
        echo "❌ Could not stop PostgreSQL; single-user mode not started" | tee -a "$log_file"
      fi
      ;;
    2)
      echo "Starting backup restore..." | tee -a "$log_file"
      read -r -p "Enter backup label to restore: " BACKUP_LABEL
      if [ -z "$BACKUP_LABEL" ]; then
        echo "❌ No backup label entered; restore aborted" | tee -a "$log_file"
      elif ! sudo systemctl stop postgresql; then
        echo "❌ Could not stop PostgreSQL; restore aborted" | tee -a "$log_file"
      else
        # NOTE(review): pgBackRest expects an empty data directory unless
        # --delta is given; confirm which behaviour is wanted here.
        sudo -u postgres pgbackrest --stanza=fairdb --set="$BACKUP_LABEL" restore 2>&1 | tee -a "$log_file"
        # PIPESTATUS[0] is pgbackrest's own status; $? would be tee's.
        if [ "${PIPESTATUS[0]}" -eq 0 ]; then
          sudo systemctl start postgresql
        else
          echo "❌ Restore failed; PostgreSQL left stopped" | tee -a "$log_file"
        fi
      fi
      ;;
    3)
      echo "Starting PITR..." | tee -a "$log_file"
      read -r -p "Enter target time (YYYY-MM-DD HH:MM:SS): " TARGET_TIME
      /opt/fairdb/scripts/restore-pitr.sh "$TARGET_TIME"
      ;;
    *)
      echo "No recovery action taken" | tee -a "$log_file"
      ;;
  esac
fi
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
## Step 6: Customer Communication
|
|
295
|
+
|
|
296
|
+
```bash
|
|
297
|
+
# Step 6: list every customer database with its reachability and print a
# customer notification template into the incident log.
#
# Globals read:    INCIDENT_LOG (falls back to /dev/null when unset), INCIDENT_ID
# Globals written: AFFECTED_DBS, DB
log_file="${INCIDENT_LOG:-/dev/null}"

echo -e "\n[STEP 6] CUSTOMER IMPACT ASSESSMENT" | tee -a "$log_file"
echo "------------------------------------" | tee -a "$log_file"

# Identify affected customers
echo "Affected customer databases:" | tee -a "$log_file"

# -At returns one unpadded database name per line.
AFFECTED_DBS=$(sudo -u postgres psql -At -c "
  SELECT datname FROM pg_database
  WHERE datname NOT IN ('postgres', 'template0', 'template1')
  ORDER BY datname;")

# Iterate line by line so database names containing spaces stay intact.
while IFS= read -r DB; do
  [ -n "$DB" ] || continue
  # Check if database is accessible
  # stdin comes from /dev/null so psql cannot consume the loop's input.
  if sudo -u postgres psql -d "$DB" -c "SELECT 1;" > /dev/null 2>&1 < /dev/null; then
    echo " ✅ $DB - Operational" | tee -a "$log_file"
  else
    echo " ❌ $DB - IMPACTED" | tee -a "$log_file"
  fi
done <<< "$AFFECTED_DBS"

# Generate customer notification
# Unquoted delimiter on purpose: $INCIDENT_ID and $(date) are expanded.
cat << EOF | tee -a "$log_file"

CUSTOMER NOTIFICATION TEMPLATE
===============================
Subject: FairDB Service Incident - $INCIDENT_ID

Dear Customer,

We are currently experiencing a service incident affecting FairDB PostgreSQL services.

Incident ID: $INCIDENT_ID
Start Time: $(date)
Severity: [P1/P2/P3/P4]
Status: Investigating / Identified / Monitoring / Resolved

Impact:
[Describe customer impact]

Current Actions:
[List recovery actions being taken]

Next Update:
We will provide an update within 30 minutes or sooner if the situation changes.

We apologize for any inconvenience and are working to resolve this as quickly as possible.

For urgent matters, please contact our emergency hotline: [PHONE]

Regards,
FairDB Operations Team
EOF
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
## Step 7: Post-Incident Checklist
|
|
352
|
+
|
|
353
|
+
```bash
|
|
354
|
+
# Step 7: print the post-recovery verification checklist and append the
# first 20 lines of the health-check output to the incident log.
#
# Globals read: INCIDENT_LOG (falls back to /dev/null when unset)
log_file="${INCIDENT_LOG:-/dev/null}"

echo -e "\n[STEP 7] STABILIZATION CHECKLIST" | tee -a "$log_file"
echo "---------------------------------" | tee -a "$log_file"

# Verification checklist
# Quoted delimiter: the checklist is emitted literally, with no expansion.
cat << 'EOF' | tee -a "$log_file"
Post-Recovery Verification:
[ ] PostgreSQL service running
[ ] All customer databases accessible
[ ] Backup system operational
[ ] Monitoring alerts cleared
[ ] Network connectivity verified
[ ] Disk space adequate (>20% free)
[ ] CPU usage normal (<80%)
[ ] Memory usage normal (<90%)
[ ] No blocking locks
[ ] No long-running queries
[ ] Recent backup available
[ ] Customer access verified
[ ] Incident documented
[ ] Root cause identified
[ ] Prevention plan created
EOF

# Final status
echo -e "\n[FINAL STATUS]" | tee -a "$log_file"
echo "==============" | tee -a "$log_file"
# Record explicitly when the health-check tool is missing, so the log does
# not end with an empty status section.
if [ -x /usr/local/bin/fairdb-health-check ]; then
  /usr/local/bin/fairdb-health-check | head -20 | tee -a "$log_file"
else
  echo "⚠️ /usr/local/bin/fairdb-health-check not found or not executable; final status not collected" | tee -a "$log_file"
fi
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
## Step 8: Root Cause Analysis
|
|
384
|
+
|
|
385
|
+
```bash
|
|
386
|
+
# Step 8: collect evidence (journal errors, PostgreSQL errors, CPU history)
# and write a root-cause-analysis template next to the incident log.
#
# Globals read: INCIDENT_LOG (falls back to /dev/null when unset), INCIDENT_ID
log_file="${INCIDENT_LOG:-/dev/null}"
rca_file="/opt/fairdb/incidents/${INCIDENT_ID}-rca.md"

echo -e "\n[STEP 8] ROOT CAUSE ANALYSIS" | tee -a "$log_file"
echo "-----------------------------" | tee -a "$log_file"

# Collect evidence
echo "Collecting evidence for RCA..." | tee -a "$log_file"

# System logs
echo -e "\nSystem logs (last hour):" | tee -a "$log_file"
sudo journalctl --since "1 hour ago" -p err --no-pager | tail -20 | tee -a "$log_file"

# PostgreSQL logs
# Only log files modified in the last 60 minutes are searched. -h keeps the
# output free of file-name prefixes while grep handles all files in one run.
echo -e "\nPostgreSQL error logs:" | tee -a "$log_file"
find /var/log/postgresql -name "*.log" -mmin -60 -exec grep -hi "error\|fatal\|panic" {} + | tail -20 | tee -a "$log_file"

# Resource history
echo -e "\nResource usage history:" | tee -a "$log_file"
# The stderr redirect belongs on sar (a missing data file is expected on
# hosts without sysstat collection), not on tee.
if command -v sar > /dev/null 2>&1; then
  sar -u -f "/var/log/sysstat/sa$(date +%d)" 2>/dev/null | tail -10 | tee -a "$log_file"
else
  echo "sar not installed; resource history unavailable" | tee -a "$log_file"
fi

# Create RCA document
# Unquoted delimiter on purpose: $INCIDENT_ID and $(date) are expanded.
# `tee` without -a: the template file is created fresh.
cat << EOF | tee "$rca_file"
# Root Cause Analysis - $INCIDENT_ID

## Incident Summary
- **Date/Time**: $(date)
- **Duration**: [TO BE FILLED]
- **Severity**: [P1/P2/P3/P4]
- **Impact**: [Number of customers/databases affected]

## Timeline
[Document sequence of events]

## Root Cause
[Identify primary cause]

## Contributing Factors
[List any contributing factors]

## Resolution
[Describe how the incident was resolved]

## Lessons Learned
[What was learned from this incident]

## Action Items
[ ] [Prevention measure 1]
[ ] [Prevention measure 2]
[ ] [Monitoring improvement]

## Metrics
- Time to Detection: [minutes]
- Time to Resolution: [minutes]
- Customer Impact Duration: [minutes]

Generated: $(date)
EOF

echo -e "\n================================================" | tee -a "$log_file"
echo " INCIDENT RESPONSE COMPLETED" | tee -a "$log_file"
echo " Incident ID: $INCIDENT_ID" | tee -a "$log_file"
echo " Log saved to: $INCIDENT_LOG" | tee -a "$log_file"
echo " RCA template: /opt/fairdb/incidents/${INCIDENT_ID}-rca.md" | tee -a "$log_file"
echo "================================================" | tee -a "$log_file"
|
|
448
|
+
```
|
|
449
|
+
|
|
450
|
+
## Emergency Contacts
|
|
451
|
+
|
|
452
|
+
Keep these contacts readily available:
|
|
453
|
+
- PostgreSQL Expert: [Contact info]
|
|
454
|
+
- Infrastructure Team: [Contact info]
|
|
455
|
+
- Customer Success: [Contact info]
|
|
456
|
+
- Management Escalation: [Contact info]
|
|
457
|
+
|
|
458
|
+
## Quick Reference Commands
|
|
459
|
+
|
|
460
|
+
```bash
|
|
461
|
+
# Quick reference: run these commands individually as needed. They are NOT
# a script to execute from top to bottom.

# Emergency service control
sudo systemctl stop postgresql
sudo systemctl start postgresql
sudo systemctl restart postgresql

# Kill all client connections
# Limited to client backends so autovacuum and replication processes keep running.
sudo -u postgres psql -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE backend_type = 'client backend' AND pid != pg_backend_pid();"

# Emergency single-user mode (stop the postgresql service first)
sudo -u postgres /usr/lib/postgresql/16/bin/postgres --single -D /var/lib/postgresql/16/main

# Force checkpoint
sudo -u postgres psql -c "CHECKPOINT;"

# Emergency statistics refresh
# --analyze-in-stages runs ANALYZE only (no VACUUM), in three passes of
# increasing accuracy. Use `vacuumdb --all --analyze` to vacuum as well.
sudo -u postgres vacuumdb --all --analyze-in-stages

# Check data checksums
# Requires a cleanly shut down cluster with data checksums enabled.
sudo -u postgres /usr/lib/postgresql/16/bin/pg_checksums -D /var/lib/postgresql/16/main --check
|
|
480
|
+
```
|