@intentsolutionsio/fairdb-operations-kit 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,459 @@
1
+ ---
2
+ name: fairdb-health-check
3
+ description: Comprehensive health check for FairDB PostgreSQL infrastructure
4
+ model: sonnet
5
+ ---
6
+
7
+ # FairDB System Health Check
8
+
9
+ Perform a comprehensive health check of the FairDB PostgreSQL infrastructure including server resources, database status, backup integrity, and customer databases.
10
+
11
+ ## System Health Overview
12
+
13
+ ```bash
14
+ #!/bin/bash
15
+ # FairDB Comprehensive Health Check
16
+
17
+ echo "================================================"
18
+ echo " FairDB System Health Check"
19
+ echo " $(date '+%Y-%m-%d %H:%M:%S')"
20
+ echo "================================================"
21
+ ```
22
+
23
+ ## Step 1: Server Resources Check
24
+
25
+ ```bash
26
+ echo -e "\n[1/10] SERVER RESOURCES"
27
+ echo "------------------------"
28
+
29
+ # CPU Usage
30
+ CPU_USAGE=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
31
+ echo "CPU Usage: ${CPU_USAGE}%"
32
+ if (( $(echo "$CPU_USAGE > 80" | bc -l) )); then
33
+ echo "⚠️ WARNING: High CPU usage detected"
34
+ fi
35
+
36
+ # Memory Usage
37
+ MEM_INFO=$(free -m | awk 'NR==2{printf "Memory: %s/%sMB (%.2f%%)\n", $3,$2,$3*100/$2 }')
38
+ echo "$MEM_INFO"
39
+ MEM_PERCENT=$(free | grep Mem | awk '{print $3/$2 * 100.0}')
40
+ if (( $(echo "$MEM_PERCENT > 90" | bc -l) )); then
41
+ echo "⚠️ WARNING: High memory usage detected"
42
+ fi
43
+
44
+ # Disk Usage
45
+ echo "Disk Usage:"
46
+ df -h | grep -E '^/dev/' | while read line; do
47
+ USAGE=$(echo $line | awk '{print $5}' | sed 's/%//')
48
+ MOUNT=$(echo $line | awk '{print $6}')
49
+ echo " $MOUNT: $line"
50
+ if [ $USAGE -gt 85 ]; then
51
+ echo " ⚠️ WARNING: Disk space critical on $MOUNT"
52
+ fi
53
+ done
54
+
55
+ # Load Average
56
+ LOAD=$(uptime | awk -F'load average:' '{print $2}')
57
+ echo "Load Average:$LOAD"
58
+ CORES=$(nproc)
59
+ LOAD_1=$(echo $LOAD | cut -d, -f1 | tr -d ' ')
60
+ if (( $(echo "$LOAD_1 > $CORES" | bc -l) )); then
61
+ echo "⚠️ WARNING: High load average detected"
62
+ fi
63
+ ```
64
+
65
+ ## Step 2: PostgreSQL Service Status
66
+
67
+ ```bash
68
+ echo -e "\n[2/10] POSTGRESQL SERVICE"
69
+ echo "-------------------------"
70
+
71
+ # Check if PostgreSQL is running
72
+ if systemctl is-active --quiet postgresql; then
73
+ echo "✅ PostgreSQL service: RUNNING"
74
+
75
+ # Get version and uptime
76
+ sudo -u postgres psql -t -c "SELECT version();" | head -1
77
+
78
+ UPTIME=$(sudo -u postgres psql -t -c "
79
+ SELECT now() - pg_postmaster_start_time() as uptime;")
80
+ echo "Uptime: $UPTIME"
81
+ else
82
+ echo "❌ CRITICAL: PostgreSQL service is NOT running!"
83
+ echo "Attempting to start..."
84
+ sudo systemctl start postgresql
85
+ sleep 5
86
+ if systemctl is-active --quiet postgresql; then
87
+ echo "✅ Service restarted successfully"
88
+ else
89
+ echo "❌ Failed to start PostgreSQL - manual intervention required!"
90
+ exit 1
91
+ fi
92
+ fi
93
+
94
+ # Check PostgreSQL cluster status
95
+ sudo pg_lsclusters
96
+ ```
97
+
98
+ ## Step 3: Database Connections
99
+
100
+ ```bash
101
+ echo -e "\n[3/10] DATABASE CONNECTIONS"
102
+ echo "---------------------------"
103
+
104
+ # Connection statistics
105
+ sudo -u postgres psql -t << EOF
106
+ SELECT
107
+ 'Total Connections: ' || count(*) || '/' || setting AS connection_info
108
+ FROM pg_stat_activity, pg_settings
109
+ WHERE pg_settings.name = 'max_connections'
110
+ GROUP BY setting;
111
+ EOF
112
+
113
+ # Connections by database
114
+ echo -e "\nConnections by database:"
115
+ sudo -u postgres psql -t -c "
116
+ SELECT datname, count(*) as connections
117
+ FROM pg_stat_activity
118
+ GROUP BY datname
119
+ ORDER BY connections DESC;"
120
+
121
+ # Connections by user
122
+ echo -e "\nConnections by user:"
123
+ sudo -u postgres psql -t -c "
124
+ SELECT usename, count(*) as connections
125
+ FROM pg_stat_activity
126
+ GROUP BY usename
127
+ ORDER BY connections DESC;"
128
+
129
+ # Check for idle connections
130
+ IDLE_COUNT=$(sudo -u postgres psql -t -c "
131
+ SELECT count(*)
132
+ FROM pg_stat_activity
133
+ WHERE state = 'idle'
134
+ AND state_change < NOW() - INTERVAL '10 minutes';")
135
+
136
+ if [ $IDLE_COUNT -gt 10 ]; then
137
+ echo "⚠️ WARNING: $IDLE_COUNT idle connections older than 10 minutes"
138
+ fi
139
+ ```
140
+
141
+ ## Step 4: Database Performance Metrics
142
+
143
+ ```bash
144
+ echo -e "\n[4/10] PERFORMANCE METRICS"
145
+ echo "--------------------------"
146
+
147
+ # Cache hit ratio
148
+ sudo -u postgres psql -t << 'EOF'
149
+ SELECT
150
+ 'Cache Hit Ratio: ' ||
151
+ ROUND(100.0 * sum(heap_blks_hit) /
152
+ NULLIF(sum(heap_blks_hit) + sum(heap_blks_read), 0), 2) || '%'
153
+ FROM pg_statio_user_tables;
154
+ EOF
155
+
156
+ # Transaction statistics
157
+ sudo -u postgres psql -t -c "
158
+ SELECT
159
+ 'Transactions: ' || xact_commit || ' commits, ' ||
160
+ xact_rollback || ' rollbacks, ' ||
161
+ ROUND(100.0 * xact_rollback / NULLIF(xact_commit + xact_rollback, 0), 2) || '% rollback rate'
162
+ FROM pg_stat_database
163
+ WHERE datname = 'postgres';"
164
+
165
+ # Longest running queries
166
+ echo -e "\nLong-running queries (>1 minute):"
167
+ sudo -u postgres psql -t -c "
168
+ SELECT pid, now() - query_start as duration,
169
+ LEFT(query, 50) as query_preview
170
+ FROM pg_stat_activity
171
+ WHERE state = 'active'
172
+ AND now() - query_start > interval '1 minute'
173
+ ORDER BY duration DESC
174
+ LIMIT 5;"
175
+
176
+ # Largest tables by total relation size, with each table's share of the combined size of all tables outside pg_catalog/information_schema (size only; dead-tuple bloat is not measured here)
177
+ echo -e "\nTable bloat (top 5):"
178
+ sudo -u postgres psql -t << 'EOF'
179
+ SELECT
180
+ schemaname || '.' || tablename AS table,
181
+ pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size,
182
+ ROUND(100 * pg_total_relation_size(schemaname||'.'||tablename) /
183
+ NULLIF(sum(pg_total_relation_size(schemaname||'.'||tablename))
184
+ OVER (), 0), 2) AS percentage
185
+ FROM pg_tables
186
+ WHERE schemaname NOT IN ('pg_catalog', 'information_schema')
187
+ ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC
188
+ LIMIT 5;
189
+ EOF
190
+ ```
191
+
192
+ ## Step 5: Backup Status
193
+
194
+ ```bash
195
+ echo -e "\n[5/10] BACKUP STATUS"
196
+ echo "--------------------"
197
+
198
+ # Check pgBackRest status
199
+ if command -v pgbackrest &> /dev/null; then
200
+ echo "pgBackRest Status:"
201
+
202
+ # Get all stanzas
203
+ STANZAS=$(sudo -u postgres pgbackrest info --output=json 2>/dev/null | jq -r '.[].name' 2>/dev/null)
204
+
205
+ if [ -z "$STANZAS" ]; then
206
+ echo "⚠️ WARNING: No backup stanzas configured"
207
+ else
208
+ for STANZA in $STANZAS; do
209
+ echo -e "\nStanza: $STANZA"
210
+
211
+ # Get last backup info
212
+ LAST_BACKUP=$(sudo -u postgres pgbackrest --stanza="$STANZA" info --output=json 2>/dev/null | \
213
+ jq -r --arg stanza "$STANZA" '.[] | select(.name==$stanza) | .backup[-1].timestamp.stop // empty' 2>/dev/null)
214
+
215
+ if [ -n "$LAST_BACKUP" ]; then
216
+ echo " Last backup: $(date -d "@$LAST_BACKUP" '+%Y-%m-%d %H:%M:%S')"
217
+
218
+ # timestamp.stop is epoch seconds, so subtract it directly; "// empty" above keeps a stanza without backups from yielding the string "null"
219
+ BACKUP_AGE=$(( ($(date +%s) - LAST_BACKUP) / 3600 ))
220
+
221
+ if [ $BACKUP_AGE -gt 25 ]; then
222
+ echo " ⚠️ WARNING: Last backup is $BACKUP_AGE hours old"
223
+ else
224
+ echo " ✅ Backup is current ($BACKUP_AGE hours old)"
225
+ fi
226
+ else
227
+ echo " ❌ ERROR: No backups found for this stanza"
228
+ fi
229
+ done
230
+ fi
231
+ else
232
+ echo "❌ ERROR: pgBackRest is not installed"
233
+ fi
234
+
235
+ # Check WAL archiving
236
+ WAL_STATUS=$(sudo -u postgres psql -t -c "SHOW archive_mode;")
237
+ echo -e "\nWAL Archiving: $WAL_STATUS"
238
+
239
+ if [ "$WAL_STATUS" = " on" ]; then
240
+ LAST_ARCHIVED=$(sudo -u postgres psql -t -c "
241
+ SELECT age(now(), last_archived_time)
242
+ FROM pg_stat_archiver;")
243
+ echo "Last WAL archived: $LAST_ARCHIVED ago"
244
+ fi
245
+ ```
246
+
247
+ ## Step 6: Replication Status
248
+
249
+ ```bash
250
+ echo -e "\n[6/10] REPLICATION STATUS"
251
+ echo "-------------------------"
252
+
253
+ # Check if this is a primary or replica
254
+ IS_PRIMARY=$(sudo -u postgres psql -t -c "SELECT pg_is_in_recovery();")
255
+
256
+ if [ "$IS_PRIMARY" = " f" ]; then
257
+ echo "Role: PRIMARY"
258
+
259
+ # Check replication slots
260
+ REP_SLOTS=$(sudo -u postgres psql -t -c "
261
+ SELECT count(*) FROM pg_replication_slots WHERE active = true;")
262
+ echo "Active replication slots: $REP_SLOTS"
263
+
264
+ # Check connected replicas
265
+ sudo -u postgres psql -t -c "
266
+ SELECT client_addr, state, sync_state,
267
+ pg_size_pretty(pg_wal_lsn_diff(sent_lsn, replay_lsn)) as lag
268
+ FROM pg_stat_replication;" 2>/dev/null
269
+ else
270
+ echo "Role: REPLICA"
271
+
272
+ # Check replication lag
273
+ LAG=$(sudo -u postgres psql -t -c "
274
+ SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) AS lag;")
275
+ echo "Replication lag: ${LAG} seconds"
276
+
277
+ if (( $(echo "$LAG > 60" | bc -l) )); then
278
+ echo "⚠️ WARNING: High replication lag detected"
279
+ fi
280
+ fi
281
+ ```
282
+
283
+ ## Step 7: Security Audit
284
+
285
+ ```bash
286
+ echo -e "\n[7/10] SECURITY AUDIT"
287
+ echo "---------------------"
288
+
289
+ # Common security checks follow (SSL, roles without passwords, firewall, fail2ban); NOTE(review): default passwords are not actually tested in this step
290
+ echo "Checking for common issues..."
291
+
292
+ # SSL status
293
+ SSL_STATUS=$(sudo -u postgres psql -t -c "SHOW ssl;")
294
+ echo "SSL: $SSL_STATUS"
295
+ if [ "$SSL_STATUS" != " on" ]; then
296
+ echo "⚠️ WARNING: SSL is not enabled"
297
+ fi
298
+
299
+ # Check for users without passwords
300
+ NO_PASS=$(sudo -u postgres psql -t -c "
301
+ SELECT count(*) FROM pg_shadow WHERE passwd IS NULL;")
302
+ if [ $NO_PASS -gt 0 ]; then
303
+ echo "⚠️ WARNING: $NO_PASS users without passwords"
304
+ fi
305
+
306
+ # Check firewall status
307
+ if sudo ufw status | grep -q "Status: active"; then
308
+ echo "✅ Firewall: ACTIVE"
309
+ else
310
+ echo "⚠️ WARNING: Firewall is not active"
311
+ fi
312
+
313
+ # Check fail2ban status
314
+ if systemctl is-active --quiet fail2ban; then
315
+ echo "✅ Fail2ban: RUNNING"
316
+ JAIL_STATUS=$(sudo fail2ban-client status postgresql 2>/dev/null | grep "Currently banned" || echo "Jail not configured")
317
+ echo " PostgreSQL jail: $JAIL_STATUS"
318
+ else
319
+ echo "⚠️ WARNING: Fail2ban is not running"
320
+ fi
321
+ ```
322
+
323
+ ## Step 8: Customer Database Health
324
+
325
+ ```bash
326
+ echo -e "\n[8/10] CUSTOMER DATABASES"
327
+ echo "-------------------------"
328
+
329
+ # Check each customer database
330
+ CUSTOMER_DBS=$(sudo -u postgres psql -t -c "
331
+ SELECT datname FROM pg_database
332
+ WHERE datname NOT IN ('postgres', 'template0', 'template1')
333
+ ORDER BY datname;")
334
+
335
+ for DB in $CUSTOMER_DBS; do
336
+ echo -e "\nDatabase: $DB"
337
+
338
+ # Size
339
+ SIZE=$(sudo -u postgres psql -t -c "
340
+ SELECT pg_size_pretty(pg_database_size('$DB'));")
341
+ echo " Size: $SIZE"
342
+
343
+ # Connection count
344
+ CONN=$(sudo -u postgres psql -t -c "
345
+ SELECT count(*) FROM pg_stat_activity WHERE datname = '$DB';")
346
+ echo " Connections: $CONN"
347
+
348
+ # Transaction rate
349
+ TPS=$(sudo -u postgres psql -t -c "
350
+ SELECT xact_commit + xact_rollback as transactions
351
+ FROM pg_stat_database WHERE datname = '$DB';")
352
+ echo " Total transactions: $TPS"
353
+
354
+ # Count ungranted locks held by sessions of this database only; pg_locks is cluster-wide, so join to pg_stat_activity and filter on the session's database
355
+ LOCKS=$(sudo -u postgres psql -tA -d "$DB" -c "
356
+ SELECT count(*) FROM pg_locks l JOIN pg_stat_activity a ON a.pid = l.pid WHERE NOT l.granted AND a.datname = current_database();")
357
+ if [ "${LOCKS:-0}" -gt 0 ]; then
358
+ echo " ⚠️ WARNING: $LOCKS blocked locks detected"
359
+ fi
360
+ done
361
+ ```
362
+
363
+ ## Step 9: System Logs Analysis
364
+
365
+ ```bash
366
+ echo -e "\n[9/10] LOG ANALYSIS"
367
+ echo "-------------------"
368
+
369
+ # Check PostgreSQL logs for errors
370
+ LOG_DIR="/var/log/postgresql"
371
+ if [ -d "$LOG_DIR" ]; then
372
+ echo "Recent PostgreSQL errors (last 24 hours):"
373
+ find "$LOG_DIR" -name "*.log" -mtime -1 -exec grep -i "error\|fatal\|panic" {} \; | \
374
+ tail -5  # the five most recent matches (tail -10 | head -5 skipped the newest five)
375
+
376
+ ERROR_COUNT=$(find $LOG_DIR -name "*.log" -mtime -1 -exec grep -i "error\|fatal\|panic" {} \; | wc -l)
377
+ echo "Total errors in last 24 hours: $ERROR_COUNT"
378
+
379
+ if [ $ERROR_COUNT -gt 100 ]; then
380
+ echo "⚠️ WARNING: High error rate detected"
381
+ fi
382
+ fi
383
+
384
+ # Check system logs
385
+ echo -e "\nRecent system issues:"
386
+ sudo journalctl -p err --since "24 hours ago" --no-pager | tail -5
387
+ ```
388
+
389
+ ## Step 10: Recommendations
390
+
391
+ ```bash
392
+ echo -e "\n[10/10] HEALTH SUMMARY & RECOMMENDATIONS"
393
+ echo "========================================="
394
+
395
+ # NOTE(review): no check in this file increments WARNINGS or CRITICAL, so both stay 0 and the overall health result below always reports GOOD
396
+ WARNINGS=0
397
+ CRITICAL=0
398
+
399
+ # Generate recommendations based on findings
400
+ echo -e "\nRecommendations:"
401
+
402
+ # Check if vacuum is needed
403
+ LAST_VACUUM=$(sudo -u postgres psql -t -c "
404
+ SELECT MAX(last_autovacuum) FROM pg_stat_user_tables;")
405
+ echo "- Last autovacuum: $LAST_VACUUM"
406
+
407
+ # Check if analyze is needed
408
+ LAST_ANALYZE=$(sudo -u postgres psql -t -c "
409
+ SELECT MAX(last_autoanalyze) FROM pg_stat_user_tables;")
410
+ echo "- Last autoanalyze: $LAST_ANALYZE"
411
+
412
+ # Generate overall health score
413
+ echo -e "\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
414
+ if [ $CRITICAL -eq 0 ] && [ $WARNINGS -lt 3 ]; then
415
+ echo "✅ OVERALL HEALTH: GOOD"
416
+ elif [ $CRITICAL -eq 0 ] && [ $WARNINGS -lt 10 ]; then
417
+ echo "⚠️ OVERALL HEALTH: FAIR - Review warnings"
418
+ else
419
+ echo "❌ OVERALL HEALTH: POOR - Immediate action required"
420
+ fi
421
+ echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
422
+
423
+ # Build a timestamped report path and announce it; NOTE(review): nothing in this file writes to REPORT_FILE, so the log only exists if the caller redirects the script's output there
424
+ REPORT_FILE="/opt/fairdb/logs/health-check-$(date +%Y%m%d-%H%M%S).log"
425
+ echo -e "\nFull report saved to: $REPORT_FILE"
426
+ ```
427
+
428
+ ## Actions Based on Results
429
+
430
+ ### If Critical Issues Found:
431
+ 1. Check PostgreSQL service status
432
+ 2. Review disk space availability
433
+ 3. Verify backup integrity
434
+ 4. Check for data corruption
435
+ 5. Review security vulnerabilities
436
+
437
+ ### If Warnings Found:
438
+ 1. Schedule maintenance window
439
+ 2. Plan capacity upgrades
440
+ 3. Review query performance
441
+ 4. Update monitoring thresholds
442
+ 5. Document issues for trending
443
+
444
+ ### Regular Maintenance Tasks:
445
+ 1. Run VACUUM ANALYZE on large tables
446
+ 2. Update table statistics
447
+ 3. Review and optimize slow queries
448
+ 4. Clean up old logs
449
+ 5. Test backup restoration
450
+
451
+ ## Schedule Next Health Check
452
+
453
+ ```bash
454
+ # Install a cron.d entry that runs the check as root at minute 30 of every sixth hour; the entry redirects all output to /dev/null. NOTE(review): installing /usr/local/bin/fairdb-health-check is not shown in this file
455
+ echo "30 */6 * * * root /usr/local/bin/fairdb-health-check > /dev/null 2>&1" | \
456
+ sudo tee /etc/cron.d/fairdb-health-check
457
+
458
+ echo "Health checks scheduled every 6 hours"
459
+ ```