@intentsolutionsio/fairdb-ops-manager 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +22 -0
- package/LICENSE +21 -0
- package/README.md +609 -0
- package/agents/fairdb-incident-responder.md +365 -0
- package/agents/fairdb-ops-auditor.md +525 -0
- package/agents/fairdb-setup-wizard.md +393 -0
- package/commands/daily-health-check.md +225 -0
- package/commands/incident-p0-database-down.md +318 -0
- package/commands/incident-p0-disk-full.md +344 -0
- package/commands/sop-001-vps-setup.md +84 -0
- package/commands/sop-002-postgres-install.md +104 -0
- package/commands/sop-003-backup-setup.md +160 -0
- package/package.json +45 -0
- package/scripts/backup-status.sh +122 -0
- package/scripts/pg-health-check.sh +74 -0
- package/scripts/sop-checklist.sh +354 -0
- package/skills/skill-adapter/assets/README.md +5 -0
- package/skills/skill-adapter/assets/config-template.json +32 -0
- package/skills/skill-adapter/assets/skill-schema.json +28 -0
- package/skills/skill-adapter/assets/test-data.json +27 -0
- package/skills/skill-adapter/references/README.md +4 -0
- package/skills/skill-adapter/references/best-practices.md +69 -0
- package/skills/skill-adapter/references/examples.md +73 -0
- package/skills/skill-adapter/scripts/README.md +11 -0
- package/skills/skill-adapter/scripts/helper-template.sh +42 -0
- package/skills/skill-adapter/scripts/validation.sh +32 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: sop-001-vps-setup
|
|
3
|
+
description: Guide through SOP-001 VPS Initial Setup & Hardening procedure
|
|
4
|
+
model: sonnet
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# SOP-001: VPS Initial Setup & Hardening
|
|
8
|
+
|
|
9
|
+
You are a FairDB operations assistant helping execute **SOP-001: VPS Initial Setup & Hardening**.
|
|
10
|
+
|
|
11
|
+
## Your Role
|
|
12
|
+
|
|
13
|
+
Guide the user through the complete VPS hardening process with:
|
|
14
|
+
- Step-by-step instructions with clear explanations
|
|
15
|
+
- Safety checkpoints before destructive operations
|
|
16
|
+
- Verification tests after each step
|
|
17
|
+
- Troubleshooting help if issues arise
|
|
18
|
+
- Documentation of completed work
|
|
19
|
+
|
|
20
|
+
## Critical Safety Rules
|
|
21
|
+
|
|
22
|
+
1. **NEVER** disconnect SSH until new connection is verified
|
|
23
|
+
2. **ALWAYS** test firewall rules before enabling
|
|
24
|
+
3. **ALWAYS** backup config files before editing
|
|
25
|
+
4. **VERIFY** each checkpoint before proceeding
|
|
26
|
+
5. **DOCUMENT** all credentials in password manager immediately
|
|
27
|
+
|
|
28
|
+
## SOP-001 Overview
|
|
29
|
+
|
|
30
|
+
**Purpose:** Secure a newly provisioned VPS before production use
|
|
31
|
+
**Time Required:** 45-60 minutes
|
|
32
|
+
**Risk Level:** HIGH - Mistakes compromise all customer data
|
|
33
|
+
|
|
34
|
+
## Steps to Execute
|
|
35
|
+
|
|
36
|
+
1. **Initial Connection & System Update** (5 min)
|
|
37
|
+
2. **Create Non-Root Admin User** (5 min)
|
|
38
|
+
3. **SSH Key Setup** (10 min)
|
|
39
|
+
4. **Harden SSH Configuration** (10 min)
|
|
40
|
+
5. **Configure Firewall (UFW)** (5 min)
|
|
41
|
+
6. **Configure Fail2ban** (5 min)
|
|
42
|
+
7. **Enable Automatic Security Updates** (5 min)
|
|
43
|
+
8. **Configure Logging & Log Rotation** (5 min)
|
|
44
|
+
9. **Set Timezone & NTP** (3 min)
|
|
45
|
+
10. **Create Operations Directories** (2 min)
|
|
46
|
+
11. **Document This VPS** (5 min)
|
|
47
|
+
12. **Final Security Verification** (5 min)
|
|
48
|
+
13. **Create VPS Snapshot** (optional)
|
|
49
|
+
|
|
50
|
+
## Execution Protocol
|
|
51
|
+
|
|
52
|
+
For each step:
|
|
53
|
+
1. Show the user what to do with exact commands
|
|
54
|
+
2. Explain WHY each action is necessary
|
|
55
|
+
3. Run verification checks
|
|
56
|
+
4. Wait for user confirmation before proceeding
|
|
57
|
+
5. Troubleshoot if verification fails
|
|
58
|
+
|
|
59
|
+
## Key Information to Collect
|
|
60
|
+
|
|
61
|
+
Ask the user for:
|
|
62
|
+
- VPS IP address
|
|
63
|
+
- VPS provider (Contabo, DigitalOcean, etc.)
|
|
64
|
+
- SSH port preference (default 2222)
|
|
65
|
+
- Admin username preference (default 'admin')
|
|
66
|
+
- Email for monitoring alerts
|
|
67
|
+
|
|
68
|
+
## Start the Process
|
|
69
|
+
|
|
70
|
+
Begin by asking:
|
|
71
|
+
1. "Do you have the root credentials for your new VPS?"
|
|
72
|
+
2. "What is the VPS IP address?"
|
|
73
|
+
3. "Have you connected to it before, or is this the first time?"
|
|
74
|
+
|
|
75
|
+
Then guide them through Step 1: Initial Connection & System Update.
|
|
76
|
+
|
|
77
|
+
## Important Reminders
|
|
78
|
+
|
|
79
|
+
- Keep the current SSH session open while testing the new config
|
|
80
|
+
- Save all passwords in password manager immediately
|
|
81
|
+
- Document VPS details in ~/fairdb/VPS-INVENTORY.md
|
|
82
|
+
- Take snapshot after completion for baseline backup
|
|
83
|
+
|
|
84
|
+
Start by greeting the user and confirming they're ready to begin SOP-001.
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: sop-002-postgres-install
|
|
3
|
+
description: Guide through SOP-002 PostgreSQL Installation & Configuration
|
|
4
|
+
model: sonnet
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# SOP-002: PostgreSQL Installation & Configuration
|
|
8
|
+
|
|
9
|
+
You are a FairDB operations assistant helping execute **SOP-002: PostgreSQL Installation & Configuration**.
|
|
10
|
+
|
|
11
|
+
## Your Role
|
|
12
|
+
|
|
13
|
+
Guide the user through installing and configuring PostgreSQL 16 for production use with:
|
|
14
|
+
- Detailed installation steps
|
|
15
|
+
- Performance tuning for 8GB RAM VPS
|
|
16
|
+
- Security hardening (SSL/TLS, authentication)
|
|
17
|
+
- Monitoring setup
|
|
18
|
+
- Verification testing
|
|
19
|
+
|
|
20
|
+
## Prerequisites Check
|
|
21
|
+
|
|
22
|
+
Before starting, verify:
|
|
23
|
+
- [ ] SOP-001 completed successfully
|
|
24
|
+
- [ ] VPS accessible via SSH
|
|
25
|
+
- [ ] User has sudo access
|
|
26
|
+
- [ ] At least 2 GB free disk space
|
|
27
|
+
|
|
28
|
+
Ask user: "Have you completed SOP-001 (VPS hardening) on this server?"
|
|
29
|
+
|
|
30
|
+
## SOP-002 Overview
|
|
31
|
+
|
|
32
|
+
**Purpose:** Install and configure PostgreSQL 16 for production
|
|
33
|
+
**Time Required:** 60-90 minutes
|
|
34
|
+
**Risk Level:** MEDIUM - Misconfigurations affect performance but fixable
|
|
35
|
+
|
|
36
|
+
## Steps to Execute
|
|
37
|
+
|
|
38
|
+
1. **Add PostgreSQL APT Repository** (5 min)
|
|
39
|
+
2. **Install PostgreSQL 16** (10 min)
|
|
40
|
+
3. **Set PostgreSQL Password & Basic Security** (5 min)
|
|
41
|
+
4. **Configure for Remote Access** (15 min)
|
|
42
|
+
5. **Enable pg_stat_statements Extension** (5 min)
|
|
43
|
+
6. **Set Up SSL/TLS Certificates** (10 min)
|
|
44
|
+
7. **Create Database Health Check Script** (10 min)
|
|
45
|
+
8. **Optimize Vacuum Settings** (5 min)
|
|
46
|
+
9. **Create PostgreSQL Monitoring Queries** (10 min)
|
|
47
|
+
10. **Document PostgreSQL Configuration** (5 min)
|
|
48
|
+
11. **Final PostgreSQL Verification** (10 min)
|
|
49
|
+
|
|
50
|
+
## Configuration Highlights
|
|
51
|
+
|
|
52
|
+
### Memory Settings (8GB RAM VPS)
|
|
53
|
+
```
|
|
54
|
+
shared_buffers = 2GB # 25% of RAM
|
|
55
|
+
effective_cache_size = 6GB # 75% of RAM
|
|
56
|
+
maintenance_work_mem = 512MB
|
|
57
|
+
work_mem = 16MB
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Security Settings
|
|
61
|
+
```
|
|
62
|
+
listen_addresses = '*'
|
|
63
|
+
ssl = on
|
|
64
|
+
max_connections = 100
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Authentication (pg_hba.conf)
|
|
68
|
+
- Require SSL for all remote connections
|
|
69
|
+
- Use scram-sha-256 authentication
|
|
70
|
+
- Reject non-SSL connections
|
|
71
|
+
|
|
72
|
+
## Execution Protocol
|
|
73
|
+
|
|
74
|
+
For each step:
|
|
75
|
+
1. Show exact commands with explanations
|
|
76
|
+
2. Wait for user confirmation before proceeding
|
|
77
|
+
3. Verify each configuration change
|
|
78
|
+
4. Check PostgreSQL logs for errors
|
|
79
|
+
5. Test connectivity after changes
|
|
80
|
+
|
|
81
|
+
## Critical Safety Points
|
|
82
|
+
|
|
83
|
+
- **Always backup config files before editing** (`postgresql.conf`, `pg_hba.conf`)
|
|
84
|
+
- **Test config syntax before restarting** (`sudo -u postgres /usr/lib/postgresql/16/bin/postgres -D /etc/postgresql/16/main -C config_file`)
|
|
85
|
+
- **Check logs after restart** for any errors
|
|
86
|
+
- **Save postgres password immediately** in password manager
|
|
87
|
+
|
|
88
|
+
## Key Files
|
|
89
|
+
|
|
90
|
+
- `/etc/postgresql/16/main/postgresql.conf` - Main configuration
|
|
91
|
+
- `/etc/postgresql/16/main/pg_hba.conf` - Client authentication
|
|
92
|
+
- `/var/lib/postgresql/16/ssl/` - SSL certificates
|
|
93
|
+
- `/opt/fairdb/scripts/pg-health-check.sh` - Health monitoring
|
|
94
|
+
- `/opt/fairdb/scripts/pg-queries.sql` - Monitoring queries
|
|
95
|
+
|
|
96
|
+
## Start the Process
|
|
97
|
+
|
|
98
|
+
Begin by:
|
|
99
|
+
1. Confirming SOP-001 is complete
|
|
100
|
+
2. Checking available disk space: `df -h`
|
|
101
|
+
3. Verifying internet connectivity
|
|
102
|
+
4. Then proceed to Step 1: Add PostgreSQL APT Repository
|
|
103
|
+
|
|
104
|
+
Guide the user through the entire process, running verification after each major step.
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: sop-003-backup-setup
|
|
3
|
+
description: Guide through SOP-003 Backup System Setup & Verification with pgBackRest
|
|
4
|
+
model: sonnet
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# SOP-003: Backup System Setup & Verification
|
|
8
|
+
|
|
9
|
+
You are a FairDB operations assistant helping execute **SOP-003: Backup System Setup & Verification**.
|
|
10
|
+
|
|
11
|
+
## Your Role
|
|
12
|
+
|
|
13
|
+
Guide the user through setting up pgBackRest with Wasabi S3 storage:
|
|
14
|
+
- Wasabi account and bucket creation
|
|
15
|
+
- pgBackRest installation and configuration
|
|
16
|
+
- Encryption and compression setup
|
|
17
|
+
- Automated backup scheduling
|
|
18
|
+
- Backup verification testing
|
|
19
|
+
|
|
20
|
+
## Prerequisites Check
|
|
21
|
+
|
|
22
|
+
Before starting, verify:
|
|
23
|
+
- [ ] SOP-002 completed (PostgreSQL installed)
|
|
24
|
+
- [ ] Wasabi account created (or ready to create)
|
|
25
|
+
- [ ] Credit card available for Wasabi
|
|
26
|
+
- [ ] 2 hours of uninterrupted time
|
|
27
|
+
|
|
28
|
+
## SOP-003 Overview
|
|
29
|
+
|
|
30
|
+
**Purpose:** Configure automated backups with offsite storage
|
|
31
|
+
**Time Required:** 90-120 minutes
|
|
32
|
+
**Risk Level:** HIGH - Backup failures = potential data loss
|
|
33
|
+
|
|
34
|
+
## Steps to Execute
|
|
35
|
+
|
|
36
|
+
1. **Create Wasabi Account and Bucket** (15 min)
|
|
37
|
+
2. **Install pgBackRest** (10 min)
|
|
38
|
+
3. **Configure pgBackRest** (15 min)
|
|
39
|
+
4. **Configure PostgreSQL for Archiving** (10 min)
|
|
40
|
+
5. **Create and Initialize Stanza** (10 min)
|
|
41
|
+
6. **Take First Full Backup** (15 min)
|
|
42
|
+
7. **Test Backup Restoration** (20 min) ⚠️ CRITICAL
|
|
43
|
+
8. **Schedule Automated Backups** (10 min)
|
|
44
|
+
9. **Create Backup Verification Script** (10 min)
|
|
45
|
+
10. **Create Backup Monitoring Dashboard** (10 min)
|
|
46
|
+
11. **Document Backup Configuration** (5 min)
|
|
47
|
+
|
|
48
|
+
## Backup Strategy
|
|
49
|
+
|
|
50
|
+
- **Full backup:** Weekly (Sunday 2 AM)
|
|
51
|
+
- **Differential backup:** Daily except Sunday (2 AM)
|
|
52
|
+
- **Retention:** 4 full backups, 4 differential per full
|
|
53
|
+
- **WAL archiving:** Continuous (automatic)
|
|
54
|
+
- **Encryption:** AES-256-CBC
|
|
55
|
+
- **Compression:** zstd level 3
|
|
56
|
+
|
|
57
|
+
## Wasabi Configuration
|
|
58
|
+
|
|
59
|
+
Help user set up:
|
|
60
|
+
- Bucket name: `fairdb-backups-prod` (must be unique)
|
|
61
|
+
- Region selection (closest to VPS)
|
|
62
|
+
- Access keys (save in password manager)
|
|
63
|
+
- S3 endpoint URL
|
|
64
|
+
|
|
65
|
+
**Wasabi Endpoints:**
|
|
66
|
+
- us-east-1: s3.wasabisys.com
|
|
67
|
+
- us-east-2: s3.us-east-2.wasabisys.com
|
|
68
|
+
- us-west-1: s3.us-west-1.wasabisys.com
|
|
69
|
+
- eu-central-1: s3.eu-central-1.wasabisys.com
|
|
70
|
+
|
|
71
|
+
## pgBackRest Configuration
|
|
72
|
+
|
|
73
|
+
Key settings in `/etc/pgbackrest.conf`:
|
|
74
|
+
|
|
75
|
+
```ini
|
|
76
|
+
[global]
|
|
77
|
+
repo1-type=s3
|
|
78
|
+
repo1-s3-bucket=fairdb-backups-prod
|
|
79
|
+
repo1-s3-endpoint=s3.wasabisys.com
|
|
80
|
+
repo1-cipher-type=aes-256-cbc
|
|
81
|
+
compress-type=zst
|
|
82
|
+
compress-level=3
|
|
83
|
+
repo1-retention-full=4
|
|
84
|
+
|
|
85
|
+
[main]
|
|
86
|
+
pg1-path=/var/lib/postgresql/16/main
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Critical Steps
|
|
90
|
+
|
|
91
|
+
### MUST TEST RESTORATION (Step 7)
|
|
92
|
+
- Create test restore directory
|
|
93
|
+
- Restore latest backup
|
|
94
|
+
- Verify all files present
|
|
95
|
+
- **Backups are useless if you can't restore!**
|
|
96
|
+
|
|
97
|
+
### Automated Backup Script
|
|
98
|
+
Create `/opt/fairdb/scripts/pgbackrest-backup.sh`:
|
|
99
|
+
- Full backup on Sunday
|
|
100
|
+
- Differential backup other days
|
|
101
|
+
- Email alerts on failure
|
|
102
|
+
- Disk space monitoring
|
|
103
|
+
|
|
104
|
+
### Weekly Verification
|
|
105
|
+
Create `/opt/fairdb/scripts/pgbackrest-verify.sh`:
|
|
106
|
+
- Test restoration to temporary directory
|
|
107
|
+
- Verify backup age (<48 hours)
|
|
108
|
+
- Check backup repository health
|
|
109
|
+
- Alert if issues found
|
|
110
|
+
|
|
111
|
+
## Execution Protocol
|
|
112
|
+
|
|
113
|
+
For each step:
|
|
114
|
+
1. Provide clear instructions
|
|
115
|
+
2. Wait for user confirmation
|
|
116
|
+
3. Verify success before continuing
|
|
117
|
+
4. Check logs for errors
|
|
118
|
+
5. Document credentials immediately
|
|
119
|
+
|
|
120
|
+
## Safety Reminders
|
|
121
|
+
|
|
122
|
+
- **Save Wasabi credentials** in password manager immediately
|
|
123
|
+
- **Save encryption password** - cannot recover backups without it!
|
|
124
|
+
- **Test restoration** before trusting backups
|
|
125
|
+
- **Monitor backup age** - stale backups are useless
|
|
126
|
+
- **Keep encryption password secure** but accessible
|
|
127
|
+
|
|
128
|
+
## Key Files & Commands
|
|
129
|
+
|
|
130
|
+
**Configuration:**
|
|
131
|
+
- `/etc/pgbackrest.conf` - Main config (contains secrets!)
|
|
132
|
+
- `/etc/postgresql/16/main/postgresql.conf` - WAL archiving config
|
|
133
|
+
|
|
134
|
+
**Scripts:**
|
|
135
|
+
- `/opt/fairdb/scripts/pgbackrest-backup.sh` - Daily backup
|
|
136
|
+
- `/opt/fairdb/scripts/pgbackrest-verify.sh` - Weekly verification
|
|
137
|
+
- `/opt/fairdb/scripts/backup-status.sh` - Quick status check
|
|
138
|
+
|
|
139
|
+
**Monitoring:**
|
|
140
|
+
```bash
|
|
141
|
+
# Check backup status
|
|
142
|
+
sudo -u postgres pgbackrest --stanza=main info
|
|
143
|
+
|
|
144
|
+
# View backup logs
|
|
145
|
+
sudo tail -100 /var/log/pgbackrest/main-backup.log
|
|
146
|
+
|
|
147
|
+
# Quick status dashboard
|
|
148
|
+
/opt/fairdb/scripts/backup-status.sh
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## Start the Process
|
|
152
|
+
|
|
153
|
+
Begin by asking:
|
|
154
|
+
1. "Do you already have a Wasabi account, or do we need to create one?"
|
|
155
|
+
2. "What region is closest to your VPS location?"
|
|
156
|
+
3. "Do you have a password manager ready to save credentials?"
|
|
157
|
+
|
|
158
|
+
Then guide through Step 1: Create Wasabi Account and Bucket.
|
|
159
|
+
|
|
160
|
+
**Remember:** Testing backup restoration (Step 7) is NON-NEGOTIABLE. Never skip this step!
|
package/package.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@intentsolutionsio/fairdb-ops-manager",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Comprehensive operations manager for FairDB managed PostgreSQL service - SOPs, incident response, monitoring, and automation",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"database",
|
|
7
|
+
"postgresql",
|
|
8
|
+
"devops",
|
|
9
|
+
"operations",
|
|
10
|
+
"monitoring",
|
|
11
|
+
"backup",
|
|
12
|
+
"incident-response",
|
|
13
|
+
"sop",
|
|
14
|
+
"managed-services",
|
|
15
|
+
"claude-code",
|
|
16
|
+
"claude-plugin",
|
|
17
|
+
"tonsofskills"
|
|
18
|
+
],
|
|
19
|
+
"repository": {
|
|
20
|
+
"type": "git",
|
|
21
|
+
"url": "git+https://github.com/jeremylongshore/claude-code-plugins-plus-skills.git",
|
|
22
|
+
"directory": "plugins/community/fairdb-ops-manager"
|
|
23
|
+
},
|
|
24
|
+
"homepage": "https://tonsofskills.com/plugins/fairdb-ops-manager",
|
|
25
|
+
"bugs": "https://github.com/jeremylongshore/claude-code-plugins-plus-skills/issues",
|
|
26
|
+
"license": "MIT",
|
|
27
|
+
"author": {
|
|
28
|
+
"name": "Intent Solutions IO",
|
|
29
|
+
"email": "jeremy@intentsolutions.io"
|
|
30
|
+
},
|
|
31
|
+
"publishConfig": {
|
|
32
|
+
"access": "public"
|
|
33
|
+
},
|
|
34
|
+
"files": [
|
|
35
|
+
"README.md",
|
|
36
|
+
".claude-plugin",
|
|
37
|
+
"skills",
|
|
38
|
+
"commands",
|
|
39
|
+
"agents",
|
|
40
|
+
"scripts"
|
|
41
|
+
],
|
|
42
|
+
"scripts": {
|
|
43
|
+
"postinstall": "node -e \"console.log(\\\"\\\\n→ This npm package is a tracking/proof artifact. Install the plugin via:\\\\n ccpi install fairdb-ops-manager\\\\n or /plugin install fairdb-ops-manager@claude-code-plugins-plus in Claude Code\\\\n\\\")\""
|
|
44
|
+
}
|
|
45
|
+
}
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
#!/bin/bash
# backup-status.sh - FairDB backup status dashboard.
# Prints a quick, human-readable check of backup health to stdout.
# Deploy to: /opt/fairdb/scripts/backup-status.sh

# Report banner: title plus the time this report was generated.
banner_rule="======================================"
printf '%s\n' \
  "$banner_rule" \
  " FairDB Backup Status" \
  " $(date +'%Y-%m-%d %H:%M:%S')" \
  "$banner_rule" \
  ""
|
|
11
|
+
|
|
12
|
+
# Repository status: show `pgbackrest info` for the "main" stanza (its stderr
# is discarded) and report whether the command succeeded.
printf '%s\n' "Repository Status:" "-----------------------------------"
sudo -u postgres pgbackrest --stanza=main info 2>/dev/null
repo_rc=$?
echo ""
if [ "$repo_rc" -eq 0 ]; then
  echo "✅ Backup repository accessible"
else
  printf '%s\n' \
    "❌ ERROR: Cannot access backup repository" \
    " Check Wasabi connectivity and credentials"
fi

printf '%s\n' "" "======================================"
|
|
26
|
+
|
|
27
|
+
# Recent backups from logs
#
# show_recent_backup_activity [LOG_FILE]
#   Summarizes a pgBackRest backup log on stdout:
#     - the last line containing "full backup size"
#     - the last line containing "diff backup size"
#     - up to five of the most recent lines containing "error" (any case)
#   LOG_FILE defaults to /var/log/pgbackrest/main-backup.log.
#   Always returns 0; a missing log is reported as a warning line.
show_recent_backup_activity() {
  local log_file="${1:-/var/log/pgbackrest/main-backup.log}"
  local last_full last_diff recent_errors

  echo "Recent Backup Activity:"
  echo "-----------------------------------"

  if [ ! -f "$log_file" ]; then
    echo " ⚠️ No backup logs found"
    return 0
  fi

  # Capture matches before deciding what to print: in `grep | tail || echo`
  # the pipeline status is tail's (0 even with no input), so the fallback
  # message could never be shown.
  last_full=$(grep "full backup size" "$log_file" | tail -n 1)
  last_diff=$(grep "diff backup size" "$log_file" | tail -n 1)
  # One case-insensitive match. Re-filtering the result with a lowercase-only
  # grep hid lines that contain only upper-case "ERROR".
  recent_errors=$(grep -i "error" "$log_file" | tail -n 5)

  echo ""
  echo "Last Full Backup:"
  if [ -n "$last_full" ]; then
    printf '%s\n' "$last_full"
  else
    echo " No full backups found"
  fi

  echo ""
  echo "Last Differential Backup:"
  if [ -n "$last_diff" ]; then
    printf '%s\n' "$last_diff"
  else
    echo " No differential backups found"
  fi

  echo ""
  echo "Recent Errors:"
  if [ -n "$recent_errors" ]; then
    printf '%s\n' "$recent_errors"
    echo " ⚠️ Errors detected - investigate!"
  else
    echo " ✅ No recent errors"
  fi
  return 0
}

show_recent_backup_activity

echo ""
echo "======================================"
|
|
52
|
+
|
|
53
|
+
# Storage usage: size of the PostgreSQL 16 data directory, followed by `df`
# for /var/lib/postgresql with the header row filtered out.
printf '%s\n' \
  "Storage Usage:" \
  "-----------------------------------" \
  "" \
  "PostgreSQL Data Directory:"
if ! du -sh /var/lib/postgresql/16/main 2>/dev/null; then
  echo " Cannot access data directory"
fi
printf '%s\n' "" "Local Disk Usage:"
df -h /var/lib/postgresql | grep -v Filesystem

printf '%s\n' "" "======================================"
|
|
65
|
+
|
|
66
|
+
# WAL archive status
#
# show_wal_archive_status
#   Prints one summary line built from pg_stat_archiver (archived count,
#   failed count, time of the last archived WAL segment). If the psql call
#   fails, prints a fallback message instead; psql's stderr is discarded.
show_wal_archive_status() {
  echo "WAL Archive Status:"
  echo "-----------------------------------"
  # last_archived_time is NULL until the first segment has been archived, and
  # concatenating NULL with || yields NULL, which blanked the whole line.
  # COALESCE keeps the counters visible in that state.
  sudo -u postgres psql -t -c "
SELECT
'Archived: ' || archived_count || ' | Failed: ' || failed_count || ' | Last: ' || COALESCE(last_archived_time::text, 'never')
FROM pg_stat_archiver;
" 2>/dev/null || echo " Cannot check WAL status"
}

show_wal_archive_status

echo ""
echo "======================================"
|
|
77
|
+
|
|
78
|
+
# Latest verification result: surface status lines found in the last five
# lines of the verification log, if that log exists.
if [ ! -f /opt/fairdb/logs/backup-verification.log ]; then
  echo "Backup Verification: Not configured"
else
  printf '%s\n' "Last Backup Verification:" "-----------------------------------"
  if ! tail -5 /opt/fairdb/logs/backup-verification.log \
    | grep -E "Verification Complete|SUCCESS|FAILED"; then
    echo " No recent verification"
  fi
fi

printf '%s\n' "" "======================================"
|
|
89
|
+
|
|
90
|
+
# Backup age check
#
# backup_age_hours STOP_TIME [NOW]
#   Prints the whole number of hours between STOP_TIME and NOW.
#   STOP_TIME: normally the epoch-seconds integer that pgBackRest reports in
#              its JSON output (backup[].timestamp.stop). Any non-numeric
#              string is handed to `date -d` for conversion.
#   NOW:       epoch seconds; defaults to the current time.
#   Returns 1 and prints nothing if STOP_TIME is empty, "null", or cannot be
#   converted.
backup_age_hours() {
  local stop_time="$1"
  local now="${2:-$(date +%s)}"
  local stop_epoch

  case "$stop_time" in
    '' | null)
      return 1
      ;;
    *[!0-9]*)
      stop_epoch=$(date -d "$stop_time" +%s 2>/dev/null) || return 1
      ;;
    *)
      # Already epoch seconds. Passing a bare integer to `date -d` fails, and
      # the old `|| echo 0` fallback then reported a huge, always-stale age.
      stop_epoch=$stop_time
      ;;
  esac

  echo $(( (now - stop_epoch) / 3600 ))
}

echo "Backup Age Analysis:"
echo "-----------------------------------"
if command -v jq &> /dev/null; then
  # Stop time of the last backup listed for the first stanza in the JSON.
  LAST_BACKUP_TIME=$(sudo -u postgres pgbackrest --stanza=main info --output=json 2>/dev/null | jq -r '.[0].backup[-1].timestamp.stop' 2>/dev/null)
  if BACKUP_AGE_HOURS=$(backup_age_hours "$LAST_BACKUP_TIME"); then
    echo "Last backup: $BACKUP_AGE_HOURS hours ago"
    if [ "$BACKUP_AGE_HOURS" -gt 48 ]; then
      echo "⚠️ WARNING: Backup is over 48 hours old!"
    elif [ "$BACKUP_AGE_HOURS" -gt 24 ]; then
      echo "⚠️ Backup is over 24 hours old"
    else
      echo "✅ Backup is recent"
    fi
  else
    echo "Cannot determine backup age (jq parsing failed)"
  fi
else
  echo "jq not installed - cannot calculate backup age"
  echo "Install with: sudo apt install jq"
fi

echo ""
echo "======================================"
echo ""
|
|
116
|
+
|
|
117
|
+
# Exit status reflects the one critical check: 0 when `pgbackrest info` for
# the "main" stanza succeeds, 1 otherwise. All command output is discarded.
sudo -u postgres pgbackrest --stanza=main info &>/dev/null && exit 0
exit 1
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
#!/bin/bash
# PostgreSQL Health Check Script
# Returns exit code 0 if healthy, 1 if unhealthy
# Deploy to: /opt/fairdb/scripts/pg-health-check.sh
#
# Environment overrides (both optional):
#   LOG_FILE     file that health-check log lines are appended to
#   ALERT_EMAIL  recipient address for alert mail

# Configuration
# NOTE(review): PG_USER and PG_DB are not referenced by the checks in this
# script, which call `sudo -u postgres psql` directly.
PG_USER="postgres"
PG_DB="postgres"
LOG_FILE="${LOG_FILE:-/opt/fairdb/logs/health-check.log}"
ALERT_EMAIL="${ALERT_EMAIL:-ops@fairdb.io}"

# Create the log directory if it doesn't exist. It is derived from LOG_FILE so
# the directory and the file path cannot drift apart.
mkdir -p "$(dirname "$LOG_FILE")"
|
|
14
|
+
|
|
15
|
+
# log MESSAGE
#   Appends MESSAGE to $LOG_FILE, prefixed with a [YYYY-MM-DD HH:MM:SS]
#   timestamp. Only the first argument is written.
log() {
  local stamp
  stamp=$(date +'%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" >> "$LOG_FILE"
}
|
|
19
|
+
|
|
20
|
+
# send_alert SUBJECT MESSAGE
#   Mails MESSAGE (followed by a newline) to $ALERT_EMAIL with the given
#   subject via `mail -s`. The function's status is that of `mail`.
send_alert() {
  local subject="$1" message="$2"
  mail -s "$subject" "$ALERT_EMAIL" <<<"$message"
}
|
|
26
|
+
|
|
27
|
+
# Check 1: the postgresql systemd unit must be active. If it is not, record
# the failure, send an alert, and stop with exit code 1.
systemctl is-active --quiet postgresql || {
  log "ERROR: PostgreSQL service is not running"
  send_alert "FairDB ALERT: PostgreSQL Down" "PostgreSQL service is not running on $(hostname)"
  exit 1
}
|
|
33
|
+
|
|
34
|
+
# Check 2: a trivial query must succeed as the postgres OS user. psql output
# is discarded; on failure, record it, send an alert, and stop with exit code 1.
sudo -u postgres psql -c "SELECT 1;" > /dev/null 2>&1 || {
  log "ERROR: Cannot connect to PostgreSQL"
  send_alert "FairDB ALERT: PostgreSQL Connection Failed" "Cannot connect to PostgreSQL on $(hostname)"
  exit 1
}
|
|
40
|
+
|
|
41
|
+
# Check 3: connection saturation. Compares the row count of pg_stat_activity
# with 90% of max_connections (integer arithmetic) and raises a warning once
# that threshold is reached. This check does not stop the script.
CONN_COUNT=$(sudo -u postgres psql -t -c "SELECT count(*) FROM pg_stat_activity;" | tr -d ' ')
MAX_CONN=$(sudo -u postgres psql -t -c "SHOW max_connections;" | tr -d ' ')

if test "$CONN_COUNT" -ge "$((MAX_CONN * 90 / 100))"; then
  log "WARNING: Connection usage at ${CONN_COUNT}/${MAX_CONN} (90%+)"
  send_alert \
    "FairDB WARNING: High Connection Usage" \
    "Connections: ${CONN_COUNT}/${MAX_CONN} on $(hostname)"
fi
|
|
49
|
+
|
|
50
|
+
# Check 4: disk usage of the filesystem holding /var/lib/postgresql. Takes the
# fifth column of the second `df -h` line, drops its "%" sign, and warns when
# the value is above 80. This check does not stop the script.
DISK_USAGE=$(df -h /var/lib/postgresql | awk 'NR==2 {sub(/%/, "", $5); print $5}')
if test "$DISK_USAGE" -gt 80; then
  log "WARNING: Disk usage at ${DISK_USAGE}%"
  send_alert \
    "FairDB WARNING: High Disk Usage" \
    "Disk at ${DISK_USAGE}% on $(hostname)"
fi
|
|
56
|
+
|
|
57
|
+
# Check 5: long-running queries. Counts pg_stat_activity rows in state
# 'active' whose query_start is more than 5 minutes ago, and warns when there
# is at least one. This check does not stop the script.
LONG_QUERIES=$(sudo -u postgres psql -t -c "SELECT count(*) FROM pg_stat_activity WHERE state = 'active' AND now() - query_start > interval '5 minutes';" | tr -d ' ')
if test "$LONG_QUERIES" -gt 0; then
  log "WARNING: ${LONG_QUERIES} queries running >5 minutes"
  send_alert \
    "FairDB WARNING: Long Running Queries" \
    "${LONG_QUERIES} queries running >5min on $(hostname)"
fi
|
|
63
|
+
|
|
64
|
+
# Check 6: Check for failed backups
#
# recent_backup_errors LOG_FILE
#   Returns 0 if any of the last 20 lines of LOG_FILE contains "ERROR";
#   non-zero otherwise, including when LOG_FILE cannot be read.
recent_backup_errors() {
  # tail must feed grep, not the other way round: in `grep -q ... | tail -20`
  # the pipeline status is tail's, which is 0 whether or not grep matched, so
  # the warning fired on every run once the log file existed.
  tail -n 20 "$1" 2>/dev/null | grep -q "ERROR"
}

if [ -f /var/log/pgbackrest/main-backup.log ]; then
  if recent_backup_errors /var/log/pgbackrest/main-backup.log; then
    log "WARNING: Recent backup errors detected"
    send_alert "FairDB WARNING: Backup Errors" "Check pgBackRest logs on $(hostname)"
  fi
fi
|
|
71
|
+
|
|
72
|
+
# Reached only when the service and connection checks did not exit early:
# record a summary line with the collected figures and report success.
summary="INFO: Health check passed - Connections: ${CONN_COUNT}/${MAX_CONN}, Disk: ${DISK_USAGE}%"
log "$summary"
exit 0
|