active_postgres 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +7 -0
  2. data/LICENSE +23 -0
  3. data/README.md +158 -0
  4. data/exe/activepostgres +5 -0
  5. data/lib/active_postgres/cli.rb +157 -0
  6. data/lib/active_postgres/cluster_deployment_flow.rb +85 -0
  7. data/lib/active_postgres/component_resolver.rb +24 -0
  8. data/lib/active_postgres/components/base.rb +38 -0
  9. data/lib/active_postgres/components/core.rb +158 -0
  10. data/lib/active_postgres/components/extensions.rb +99 -0
  11. data/lib/active_postgres/components/monitoring.rb +55 -0
  12. data/lib/active_postgres/components/pgbackrest.rb +94 -0
  13. data/lib/active_postgres/components/pgbouncer.rb +137 -0
  14. data/lib/active_postgres/components/repmgr.rb +651 -0
  15. data/lib/active_postgres/components/ssl.rb +86 -0
  16. data/lib/active_postgres/configuration.rb +190 -0
  17. data/lib/active_postgres/connection_pooler.rb +429 -0
  18. data/lib/active_postgres/credentials.rb +17 -0
  19. data/lib/active_postgres/deployment_flow.rb +154 -0
  20. data/lib/active_postgres/error_handler.rb +185 -0
  21. data/lib/active_postgres/failover.rb +83 -0
  22. data/lib/active_postgres/generators/active_postgres/install_generator.rb +186 -0
  23. data/lib/active_postgres/health_checker.rb +244 -0
  24. data/lib/active_postgres/installer.rb +114 -0
  25. data/lib/active_postgres/log_sanitizer.rb +67 -0
  26. data/lib/active_postgres/logger.rb +125 -0
  27. data/lib/active_postgres/performance_tuner.rb +246 -0
  28. data/lib/active_postgres/rails/database_config.rb +174 -0
  29. data/lib/active_postgres/rails/migration_guard.rb +25 -0
  30. data/lib/active_postgres/railtie.rb +28 -0
  31. data/lib/active_postgres/retry_helper.rb +80 -0
  32. data/lib/active_postgres/rollback_manager.rb +140 -0
  33. data/lib/active_postgres/secrets.rb +86 -0
  34. data/lib/active_postgres/ssh_executor.rb +288 -0
  35. data/lib/active_postgres/standby_deployment_flow.rb +122 -0
  36. data/lib/active_postgres/validator.rb +143 -0
  37. data/lib/active_postgres/version.rb +3 -0
  38. data/lib/active_postgres.rb +67 -0
  39. data/lib/tasks/postgres.rake +855 -0
  40. data/lib/tasks/rolling_update.rake +258 -0
  41. data/lib/tasks/rotate_credentials.rake +193 -0
  42. data/templates/pg_hba.conf.erb +47 -0
  43. data/templates/pgbackrest.conf.erb +43 -0
  44. data/templates/pgbouncer.ini.erb +55 -0
  45. data/templates/postgresql.conf.erb +157 -0
  46. data/templates/repmgr.conf.erb +40 -0
  47. metadata +224 -0
@@ -0,0 +1,855 @@
1
# Formats a replication lag value as a one-line, emoji-tagged status string.
#
# @param lag [Numeric, nil] replication lag in seconds; nil when it could not be measured
# @return [String] a line rated "excellent" (< 10s), "acceptable" (< 60s) or "high",
#   or an "unknown" line when +lag+ is nil
def format_lag_status(lag)
  # A failed lag query yields nil; report that instead of raising NoMethodError on `nil < 10`.
  return ' Lag: ⚠️ unknown' if lag.nil?

  if lag < 10
    " Lag: ✅ #{lag}s (excellent)"
  elsif lag < 60
    " Lag: ⚠️ #{lag}s (acceptable)"
  else
    " Lag: ❌ #{lag}s (high)"
  end
end
10
+
11
+ namespace :postgres do
12
desc 'Setup PostgreSQL HA cluster (use CLEAN=true for fresh install)'
task setup: :environment do
  require 'active_postgres'

  # CLEAN=true asks for the existing installation to be purged before installing.
  purge_first = ENV['CLEAN'] == 'true'

  if purge_first
    puts "\n🧹 CLEAN flag detected - purging existing installation first...\n"
    Rake::Task['postgres:purge'].invoke
    puts "\n✅ Purge complete, proceeding with fresh setup...\n"
    # Short pause so the purge result stays visible before setup output starts.
    sleep 2
  end

  ActivePostgres::Installer.new(ActivePostgres::Configuration.load).setup
end
28
+
29
# Tears the cluster down on the primary and every standby: after an interactive
# "yes" confirmation it stops and disables services, purges packages, deletes the
# data/config/log directories and removes the postgres user and group.
desc 'Destroy PostgreSQL cluster and remove all data (WARNING: destructive)'
task purge: :environment do
  require 'active_postgres'

  config = ActivePostgres::Configuration.load
  ssh_executor = ActivePostgres::SSHExecutor.new(config)

  puts "\n#{'=' * 80}"
  puts '⚠️ PostgreSQL Cluster Destruction'
  puts "#{'=' * 80}\n"

  # 1. Run validation (informational only: a failed validation does not stop the purge)
  puts '1. Running pre-flight validation'
  validator = ActivePostgres::Validator.new(config, ssh_executor)
  validation_result = validator.validate_all

  puts "\n⚠️ Validation found errors, but continuing with purge..." unless validation_result

  # 2. Show targets
  puts "\n2. Destruction targets"
  puts "Primary: #{config.primary_host}"
  if config.standby_hosts.any?
    puts "Standbys: #{config.standby_hosts.join(', ')}"
  else
    puts 'Standbys: None'
  end

  # 3. Show what will be destroyed
  puts "\n⚠️ This will PERMANENTLY DELETE:"
  puts " • All PostgreSQL installations (version #{config.version} and others)"
  puts ' • All databases and data in /var/lib/postgresql'
  puts ' • All configuration in /etc/postgresql'
  puts ' • PgBouncer installation and configuration' if config.component_enabled?(:pgbouncer)
  puts ' • Repmgr installation and configuration' if config.component_enabled?(:repmgr)
  puts ' • Monitoring (prometheus-postgres-exporter)' if config.component_enabled?(:monitoring)
  puts ' • SSL certificates and keys' if config.component_enabled?(:ssl)
  puts ' • All log files'
  puts ' • postgres system user and group'

  # 4. Interactive confirmation
  print "\n🚨 This action CANNOT be undone. Do you want to proceed? (yes/no): "
  # `gets` returns nil on EOF (closed or non-interactive stdin); `to_s` turns that
  # into "" so it is treated as a refusal instead of raising NoMethodError.
  confirmation = $stdin.gets.to_s.chomp.downcase

  unless confirmation == 'yes'
    puts "\n❌ Purge cancelled"
    exit 0
  end

  # 5. Execute purge
  puts "\n🗑️ Purging PostgreSQL cluster..."

  [config.primary_host, *config.standby_hosts].compact.each do |host|
    puts "\n📦 Purging #{host}..."

    ssh_executor.execute_on_host(host) do
      # Service, package and account removal is best-effort: a component that was
      # never installed must not stop the purge, so individual failures are ignored.
      best_effort = lambda do |*command|
        execute(*command)
      rescue StandardError
        nil
      end

      # Stop all services
      %w[postgresql pgbouncer repmgr prometheus-postgres-exporter].each do |service|
        best_effort.call(:sudo, 'systemctl', 'stop', service)
        best_effort.call(:sudo, 'systemctl', 'disable', service)
      end

      # Remove packages. The pattern is single-quoted so the remote shell passes it
      # to apt-get verbatim instead of expanding it against files in the working dir.
      best_effort.call(:sudo, 'DEBIAN_FRONTEND=noninteractive', 'apt-get', 'remove', '--purge', '-y',
                       "'postgresql*'", 'pgbouncer', 'repmgr', 'prometheus-postgres-exporter')
      best_effort.call(:sudo, 'apt-get', 'autoremove', '-y')

      # Remove data and config directories (not best-effort: a failure here raises)
      %w[
        /var/lib/postgresql
        /etc/postgresql
        /etc/pgbouncer
        /var/log/postgresql
        /var/log/pgbouncer
        /var/run/postgresql
      ].each do |dir|
        execute :sudo, 'rm', '-rf', dir
      end

      # Remove postgres system user and group
      best_effort.call(:sudo, 'userdel', '-r', 'postgres')
      best_effort.call(:sudo, 'groupdel', 'postgres')

      puts " ✓ Purged PostgreSQL from #{host}"
    end
  end

  puts "\n#{'=' * 80}"
  puts '✅ Cluster purged successfully'
  puts "#{'=' * 80}\n"
end
143
+
144
desc 'Check cluster status'
task status: :environment do
  require 'active_postgres'

  # Build a health checker from the loaded configuration and ask it for the status.
  ActivePostgres::HealthChecker.new(ActivePostgres::Configuration.load).show_status
end
152
+
153
# Prints the cluster topology: the primary, each standby with its replication
# state, and the list of enabled components. Host status is probed over SSH.
desc 'Visualize cluster nodes and topology'
task nodes: :environment do
  require 'active_postgres'

  config = ActivePostgres::Configuration.load
  ssh_executor = ActivePostgres::SSHExecutor.new(config, quiet: true)

  puts "\n#{'=' * 80}"
  puts 'PostgreSQL HA Cluster Topology'
  puts "#{'=' * 80}\n"

  # Primary node
  puts '📍 PRIMARY'
  puts " Host: #{config.primary_host}"
  puts " Private IP: #{config.primary&.dig('private_ip') || 'N/A'}"
  puts " Label: #{config.primary&.dig('label') || 'N/A'}"
  puts " Port: #{config.component_enabled?(:pgbouncer) ? '6432 (PgBouncer)' : '5432 (Direct)'}"

  # Check if running
  begin
    ssh_executor.execute_on_host(config.primary_host) do
      # Fourth column of the first `pg_lsclusters -h` line is the cluster status.
      status = begin
        capture(:pg_lsclusters, '-h').split("\n").first&.split&.[](3)
      rescue StandardError
        'unknown'
      end
      if status.to_s.match?(/online/)
        puts ' Status: ✅ Running'
      else
        puts ' Status: ❌ Offline'
      end
    end
  rescue StandardError
    puts ' Status: ⚠️ Unknown (cannot connect)'
  end

  # Standby nodes
  if config.standby_hosts&.any?
    puts "\n📍 STANDBYS (#{config.standby_hosts.size})"
    config.standby_hosts.each_with_index do |host, i|
      standby_config = config.standbys[i]
      puts "\n #{i + 1}. #{host}"
      puts " Private IP: #{standby_config&.dig('private_ip') || 'N/A'}"
      puts " Label: #{standby_config&.dig('label') || 'N/A'}"
      puts ' Port: 5432'

      begin
        ssh_executor.execute_on_host(host) do
          # Check if PostgreSQL is online
          pg_status = begin
            capture(:pg_lsclusters, '-h').split("\n").first&.split&.[](3)
          rescue StandardError
            'unknown'
          end

          unless pg_status.to_s.match?(/online/)
            puts ' Status: ❌ Offline'
            next
          end

          # Check if in recovery mode (standby)
          in_recovery = begin
            result = capture(:sudo, '-u', 'postgres', 'psql', '-tA', '-c', '"SELECT pg_is_in_recovery();"')
            result.strip == 't'
          rescue StandardError
            false
          end

          if in_recovery
            # Check WAL byte lag. Empty output (SQL NULL) or a failed query means
            # "unknown" and must not be read as 0 bytes.
            lag_bytes = begin
              raw_bytes = capture(:sudo, '-u', 'postgres', 'psql', '-tA', '-c',
                                  '"SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn());"').strip
              raw_bytes.empty? ? nil : raw_bytes.to_i.abs
            rescue StandardError
              nil
            end

            # Check time lag; same rule: empty output (NULL timestamp) is unknown, not 0s.
            lag_time = begin
              raw_time = capture(:sudo, '-u', 'postgres', 'psql', '-tA', '-c',
                                 '"SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::int;"').strip
              raw_time.empty? ? nil : raw_time.to_i
            rescue StandardError
              nil
            end

            lag_str = if lag_bytes.nil?
                        'lag unknown'
                      elsif lag_bytes.zero?
                        'synced'
                      else
                        "#{lag_bytes} bytes behind"
                      end
            time_str = lag_time ? "(last tx #{lag_time}s ago)" : ''
            puts " Status: ✅ Replicating - #{lag_str} #{time_str}".rstrip
          else
            puts ' Status: ⚠️ Running but not in recovery mode'
          end
        end
      rescue StandardError => e
        puts " Status: ⚠️ Unknown (#{e.message})"
      end
    end
  else
    puts "\n📍 STANDBYS"
    puts ' None configured (primary-only setup)'
  end

  # Components
  puts "\n📦 ENABLED COMPONENTS"
  components = []
  components << '✅ PgBouncer (connection pooling)' if config.component_enabled?(:pgbouncer)
  components << '✅ Repmgr (automatic failover)' if config.component_enabled?(:repmgr)
  components << '✅ PgBackrest (backups)' if config.component_enabled?(:pgbackrest)
  components << '✅ Monitoring (prometheus-postgres-exporter)' if config.component_enabled?(:monitoring)
  components << '✅ SSL/TLS (encrypted connections)' if config.component_enabled?(:ssl)
  components << '✅ Performance Tuning (auto-optimized)' if config.component_enabled?(:performance_tuning)

  if components.any?
    components.each { |c| puts " #{c}" }
  else
    puts ' None (minimal setup)'
  end

  puts "\n#{'=' * 80}\n"
end
272
+
273
desc 'Promote standby to primary'
task :promote, [:host] => :environment do |_task, task_args|
  require 'active_postgres'

  standby_host = task_args[:host]

  # The host argument is mandatory; print usage and stop when it is missing.
  if !standby_host
    puts 'Usage: rake postgres:promote[standby-host]'
    exit 1
  end

  ActivePostgres::Failover.new(ActivePostgres::Configuration.load).promote(standby_host)
end
286
+
287
desc 'Run migrations on primary only'
task migrate: :environment do
  # Invoke db:migrate inside a block bound to the :writing role.
  ActiveRecord::Base.connected_to(role: :writing) { Rake::Task['db:migrate'].invoke }
end
294
+
295
namespace :backup do
  # Builds an installer from the configuration loaded at call time.
  build_installer = -> { ActivePostgres::Installer.new(ActivePostgres::Configuration.load) }

  # Backup type => task description; each entry defines one task that runs that backup type.
  {
    'full' => 'Create full backup',
    'incremental' => 'Create incremental backup'
  }.each do |backup_type, summary|
    desc summary
    task backup_type.to_sym => :environment do
      require 'active_postgres'

      build_installer.call.run_backup(backup_type)
    end
  end

  desc 'Restore from backup'
  task :restore, [:backup_id] => :environment do |_task, task_args|
    require 'active_postgres'

    backup_id = task_args[:backup_id]

    # A backup id is mandatory; print usage and stop when it is missing.
    if !backup_id
      puts 'Usage: rake postgres:backup:restore[backup_id]'
      exit 1
    end

    build_installer.call.run_restore(backup_id)
  end

  desc 'List available backups'
  task list: :environment do
    require 'active_postgres'

    build_installer.call.list_backups
  end
end
337
+
338
namespace :setup do
  # Component name => task description; each entry defines one task that sets up
  # only that component.
  {
    'pgbouncer' => 'Setup only PgBouncer',
    'monitoring' => 'Setup only monitoring',
    'repmgr' => 'Setup only repmgr'
  }.each do |component, summary|
    desc summary
    task component.to_sym => :environment do
      require 'active_postgres'

      settings = ActivePostgres::Configuration.load
      ActivePostgres::Installer.new(settings).setup_component(component)
    end
  end
end
366
+
367
+ namespace :pgbouncer do
368
# Rebuilds /etc/pgbouncer/userlist.txt on the primary from the password hashes
# stored in pg_authid for the requested roles, then reloads PgBouncer.
# Roles come from the task arguments, or default to the configured postgres and
# app users. The file is replaced, so only the roles processed here remain in it.
desc 'Update PgBouncer userlist with current database users'
task :update_userlist, [:users] => :environment do |_t, args|
  require 'active_postgres'

  config = ActivePostgres::Configuration.load
  ssh_executor = ActivePostgres::SSHExecutor.new(config)
  host = config.primary_host

  # Rake splits task arguments on commas itself, so `update_userlist[a,b]` arrives
  # as users = "a" plus extras ["b"]. Merge both so no requested role is dropped;
  # an escaped comma inside a single argument is still split here.
  requested = [args[:users], *args.extras].compact
  users = if requested.any?
            requested.flat_map { |entry| entry.split(',') }.map(&:strip).reject(&:empty?)
          else
            [config.postgres_user, config.app_user].compact.uniq
          end

  puts "Updating PgBouncer userlist on #{host}..."
  puts " Users: #{users.join(', ')}"

  ssh_executor.execute_on_host(host) do
    postgres_user = config.postgres_user
    userlist_entries = []

    users.each do |user|
      # Double any single quote so the role name stays inside the SQL string literal.
      quoted_user = user.gsub("'", "''")
      sql = <<~SQL.strip
        SELECT concat('"', rolname, '" "', rolpassword, '"')
        FROM pg_authid
        WHERE rolname = '#{quoted_user}'
      SQL

      # The query goes through a file rather than -c to avoid shell-quoting the SQL.
      # NOTE(review): fixed, predictable /tmp paths are used here and below for the
      # userlist (which holds password hashes) — consider mktemp; confirm with owners.
      upload! StringIO.new(sql), '/tmp/get_user_hash.sql'
      execute :chmod, '644', '/tmp/get_user_hash.sql'
      user_hash = capture(:sudo, '-u', postgres_user, 'psql', '-t', '-f', '/tmp/get_user_hash.sql').strip
      execute :rm, '-f', '/tmp/get_user_hash.sql'

      if user_hash && !user_hash.empty?
        userlist_entries << user_hash
        puts " ✓ Added #{user}"
      else
        warn " ⚠ User #{user} not found in PostgreSQL"
      end
    rescue StandardError => e
      # One failing role must not prevent the remaining roles from being processed.
      warn " ✗ Error getting hash for #{user}: #{e.message}"
    end

    if userlist_entries.any?
      userlist_content = "#{userlist_entries.join("\n")}\n"
      # Upload to temp file first, then move to avoid stdin issues
      upload! StringIO.new(userlist_content), '/tmp/userlist.txt'
      execute :sudo, 'mv', '/tmp/userlist.txt', '/etc/pgbouncer/userlist.txt'
      execute :sudo, 'chmod', '640', '/etc/pgbouncer/userlist.txt'
      execute :sudo, 'chown', 'postgres:postgres', '/etc/pgbouncer/userlist.txt'
      execute :sudo, 'systemctl', 'reload', 'pgbouncer'
      puts "\n✅ Userlist updated with #{userlist_entries.size} user(s) and PgBouncer reloaded"
    else
      warn "\n⚠ No users added to userlist"
    end
  end
end
426
+
427
desc 'Show PgBouncer status and statistics'
task stats: :environment do
  require 'active_postgres'

  settings = ActivePostgres::Configuration.load
  primary_host = settings.primary_host

  # Run `systemctl status pgbouncer --no-pager` via sudo on the primary host.
  ActivePostgres::SSHExecutor.new(settings).execute_on_host(primary_host) do
    puts "PgBouncer Status on #{primary_host}:"
    execute :sudo, *%w[systemctl status pgbouncer --no-pager]
  end
end
440
+ end
441
+
442
# Runs a checklist against the primary and every standby (PostgreSQL status,
# tuning settings, SSL, replication, repmgr, PgBouncer, disk space, connectivity),
# prints a summary and exits with status 1 when any check was recorded as failed.
desc 'Verify cluster health and configuration (comprehensive checklist)'
task verify: :environment do
  require 'active_postgres'

  config = ActivePostgres::Configuration.load
  ssh_executor = ActivePostgres::SSHExecutor.new(config)

  puts "\n#{'=' * 80}\n🔍 PostgreSQL HA Cluster Verification Checklist\n#{'=' * 80}"

  # Outcome buckets; only entries in :failed make the task exit non-zero.
  results = { passed: [], failed: [], warnings: [] }

  [config.primary_host, *config.standby_hosts].compact.each do |host|
    label = host == config.primary_host ? 'PRIMARY' : 'STANDBY'
    puts "\n📊 #{label}: #{host}\n#{'-' * 80}"

    begin
      ssh_executor.execute_on_host(host) do
        # 1. PostgreSQL Installation & Status
        puts "\n1️⃣ PostgreSQL Installation"
        begin
          version = capture(:sudo, '-u', 'postgres', 'psql', '--version').strip
          info " Version: #{version}"
          # Fourth column of the first `pg_lsclusters -h` line is the cluster status.
          cluster_status = begin
            capture(:pg_lsclusters, '-h').split("\n").first.split[3]
          rescue StandardError
            'down'
          end
          if cluster_status == 'online' || cluster_status.start_with?('online,')
            info ' Status: Running ✅'
            results[:passed] << "#{label}: PostgreSQL running"
          else
            warn ' Status: Stopped ❌'
            results[:failed] << "#{label}: PostgreSQL not running"
          end
        rescue StandardError => e
          warn " ❌ PostgreSQL check failed: #{e.message}"
          results[:failed] << "#{label}: PostgreSQL check failed"
        end

        # 2. Performance Tuning
        puts "\n2️⃣ Performance Tuning"
        tuning_ok = true
        %w[shared_buffers effective_cache_size work_mem max_connections].each do |setting|
          val = begin
            capture(:sudo, '-u', 'postgres', 'psql', '-t', '-c', "'SHOW #{setting};'").strip
          rescue StandardError
            'N/A'
          end
          info " #{setting.ljust(20)}: #{val}"
          tuning_ok = false if val == 'N/A'
        end
        if tuning_ok
          results[:passed] << "#{label}: Performance tuning applied"
        else
          results[:warnings] << "#{label}: Some performance settings missing"
        end

        # 3. SSL/TLS
        puts "\n3️⃣ SSL/TLS Encryption"
        ssl = begin
          capture(:sudo, '-u', 'postgres', 'psql', '-t', '-c', "'SHOW ssl;'").strip
        rescue StandardError
          'off'
        end
        if ssl == 'on'
          info ' SSL: Enabled ✅'
          cert_valid = test('[ -f /etc/postgresql/*/main/server.crt ]')
          key_valid = test('[ -f /etc/postgresql/*/main/server.key ]')
          if cert_valid && key_valid
            info ' Certificates: Present ✅'
            results[:passed] << "#{label}: SSL enabled with certificates"
          else
            warn ' Certificates: Missing ⚠️'
            results[:warnings] << "#{label}: SSL enabled but certificates missing"
          end
        else
          warn ' SSL: Disabled'
          results[:warnings] << "#{label}: SSL not enabled"
        end

        # 4. Replication (standbys only)
        if label == 'STANDBY'
          puts "\n4️⃣ Replication"
          begin
            recovery = capture(:sudo, '-u', 'postgres', 'psql', '-t', '-c', "'SELECT pg_is_in_recovery();'").strip
            if recovery == 't'
              info ' Recovery mode: Yes ✅'

              # Check WAL receiver
              wal_status = capture(:sudo, '-u', 'postgres', 'psql', '-t', '-c', "'SELECT status FROM pg_stat_wal_receiver;'").strip
              if wal_status == 'streaming'
                info ' WAL receiver: Streaming ✅'
                results[:passed] << "#{label}: Replication streaming"
              else
                warn " WAL receiver: #{wal_status} ⚠️"
                results[:warnings] << "#{label}: Replication not streaming"
              end

              # WAL byte lag (actual replication delay)
              byte_lag = capture(:sudo, '-u', 'postgres', 'psql', '-t', '-c',
                                 "'SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn());'").strip.to_i
              if byte_lag.zero?
                info ' WAL lag: 0 bytes (fully synced) ✅'
                results[:passed] << "#{label}: Replication fully synced"
              else
                info " WAL lag: #{byte_lag} bytes"
                results[:passed] << "#{label}: Replication lag #{byte_lag} bytes"
              end

              # Time since last transaction (informational only)
              last_tx = capture(:sudo, '-u', 'postgres', 'psql', '-t', '-c',
                                "'SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::int;'").strip.to_i
              info " Last write: #{last_tx}s ago (primary idle time)"
            else
              warn ' Recovery mode: No ❌'
              results[:failed] << "#{label}: Not in recovery mode"
            end
          rescue StandardError => e
            warn " ❌ Replication check failed: #{e.message}"
            results[:failed] << "#{label}: Replication check failed"
          end
        end

        # 5. Repmgr (if enabled)
        if config.component_enabled?(:repmgr)
          puts "\n5️⃣ Repmgr"
          begin
            # Check if node is registered in cluster
            cluster_show = capture(:sudo, '-u', 'postgres', 'repmgr', 'cluster', 'show', '2>/dev/null')
            node_registered = cluster_show.include?(host) || cluster_show.match?(/\|\s*(primary|standby)\s*\|/)

            if node_registered
              info ' Node registered: Yes ✅'
              results[:passed] << "#{label}: Repmgr node registered"

              # Check if repmgrd daemon is running (for automatic failover)
              if test('systemctl is-active repmgrd')
                info ' Auto-failover daemon: Running ✅'
              else
                warn ' Auto-failover daemon: Not running (manual failover only)'
              end
            else
              warn ' Node registered: No ⚠️'
              results[:warnings] << "#{label}: Repmgr node not registered"
            end
          rescue StandardError => e
            warn " ❌ Repmgr check failed: #{e.message}"
            results[:failed] << "#{label}: Repmgr check failed"
          end
        end

        # 6. PgBouncer (primary only)
        if label == 'PRIMARY' && config.component_enabled?(:pgbouncer)
          puts "\n6️⃣ PgBouncer"
          if test('systemctl is-active pgbouncer')
            info ' Status: Running ✅'

            # Check userlist (need sudo for file access)
            if test(:sudo, 'test', '-s', '/etc/pgbouncer/userlist.txt')
              user_count = capture(:sudo, :wc, '-l', '/etc/pgbouncer/userlist.txt').split.first.to_i
              info " Userlist: #{user_count} user(s) configured ✅"
              results[:passed] << "#{label}: PgBouncer running with #{user_count} user(s)"
            else
              warn ' Userlist: Empty ⚠️'
              results[:warnings] << "#{label}: PgBouncer userlist empty"
            end
          else
            warn ' Status: Not running ❌'
            results[:failed] << "#{label}: PgBouncer not running"
          end
        end

        # 7. Disk Space
        puts "\n7️⃣ Disk Space"
        begin
          df_output = capture(:df, '-h', '/var/lib/postgresql')
          df_lines = df_output.split("\n")
          if df_lines.size > 1
            # Fifth column of the data row is the use percentage, e.g. "45%".
            usage = df_lines[1].split[4].to_i
            info " PostgreSQL data: #{df_lines[1].split[4]} used"
            if usage > 90
              warn ' ⚠️ Disk usage critical (>90%)'
              results[:warnings] << "#{label}: Disk usage high (#{usage}%)"
            elsif usage > 80
              info ' ⚠️ Disk usage high (>80%)'
              results[:warnings] << "#{label}: Disk usage moderate (#{usage}%)"
            else
              results[:passed] << "#{label}: Disk space OK (#{usage}%)"
            end
          end
        rescue StandardError => e
          # Guarded like the other checks: a failing `df` (e.g. missing data directory)
          # is recorded instead of aborting the rest of the checklist.
          warn " ❌ Disk space check failed: #{e.message}"
          results[:failed] << "#{label}: Disk space check failed"
        end

        # 8. Connectivity
        puts "\n8️⃣ Connectivity"
        if test(:sudo, '-u', 'postgres', 'psql', '-c', "'SELECT 1;'")
          info ' Database connection: OK ✅'
          results[:passed] << "#{label}: Database connectable"
        else
          warn ' Database connection: Failed ❌'
          results[:failed] << "#{label}: Cannot connect to database"
        end
      end
    rescue StandardError => e
      # An unreachable host (or any remote command that still raised) must not stop
      # verification of the remaining hosts; record it as a failure and continue.
      warn " ❌ Verification aborted on #{host}: #{e.message}"
      results[:failed] << "#{label}: Verification aborted (#{e.message})"
    end
  end

  # Summary
  puts "\n#{'=' * 80}\n📋 Verification Summary\n#{'=' * 80}"
  puts "\n✅ Passed (#{results[:passed].size}):"
  results[:passed].each { |r| puts " #{r}" }

  if results[:warnings].any?
    puts "\n⚠️ Warnings (#{results[:warnings].size}):"
    results[:warnings].each { |r| puts " #{r}" }
  end

  if results[:failed].any?
    puts "\n❌ Failed (#{results[:failed].size}):"
    results[:failed].each { |r| puts " #{r}" }
  end

  puts "\n#{'=' * 80}"
  if results[:failed].empty?
    puts '✅ Cluster verification complete - All critical checks passed!'
  else
    puts "⚠️ Cluster verification complete - #{results[:failed].size} critical issue(s) found"
  end
  puts "#{'=' * 80}\n"

  # Exit only after the closing rule has been printed so the report is complete.
  exit 1 unless results[:failed].empty?
end
667
+
668
+ namespace :test do
669
# Replication smoke/stress test: creates a scratch database on the primary, bulk
# inserts +rows+ rows (default 1000), compares row counts on every standby, prints
# per-replica byte lag, and always drops the scratch database afterwards.
desc 'Run replication stress test (creates temp DB, inserts rows, verifies replication, cleans up)'
task :replication, [:rows] => :environment do |_t, args|
  require 'active_postgres'

  rows = (args[:rows] || 1000).to_i
  config = ActivePostgres::Configuration.load
  ssh_executor = ActivePostgres::SSHExecutor.new(config, quiet: true)

  puts "\n#{'=' * 60}"
  puts '🧪 Replication Stress Test'
  puts "#{'=' * 60}\n"

  test_db = 'active_postgres_stress_test'
  primary = config.primary_host
  standbys = config.standby_hosts
  # Replication is asynchronous, so a standby that is still behind is re-checked
  # up to this many times (one second apart) before it is reported as out of sync.
  max_sync_checks = 10

  begin
    # Create test database
    puts '1️⃣ Creating test database...'
    ssh_executor.execute_on_host(primary) do
      execute :sudo, '-u', 'postgres', 'psql', '-c', "\"DROP DATABASE IF EXISTS #{test_db};\""
      execute :sudo, '-u', 'postgres', 'psql', '-c', "\"CREATE DATABASE #{test_db};\""
      execute :sudo, '-u', 'postgres', 'psql', '-d', test_db, '-c',
              '"CREATE TABLE test_inserts (id SERIAL PRIMARY KEY, data TEXT, created_at TIMESTAMP DEFAULT NOW());"'
    end
    puts ' ✓ Test database created'

    # Run insert stress test
    puts "\n2️⃣ Inserting #{rows} rows on primary..."
    # Monotonic clock: a wall-clock adjustment during the insert cannot skew the timing.
    started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    ssh_executor.execute_on_host(primary) do
      execute :sudo, '-u', 'postgres', 'psql', '-d', test_db, '-c',
              "\"INSERT INTO test_inserts (data) SELECT md5(random()::text) FROM generate_series(1, #{rows});\""
    end
    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
    insert_time = elapsed.round(2)
    # Guard the rate: dividing by a zero duration gives Infinity, and Infinity.round raises.
    rows_per_sec = elapsed.positive? ? (rows / elapsed).round : 0
    puts " ✓ Inserted #{rows} rows in #{insert_time}s (#{rows_per_sec} rows/sec)"

    # Verify on primary
    puts "\n3️⃣ Verifying row count..."
    primary_count = 0
    ssh_executor.execute_on_host(primary) do
      result = capture(:sudo, '-u', 'postgres', 'psql', '-t', '-d', test_db, '-c', '"SELECT COUNT(*) FROM test_inserts;"')
      primary_count = result.strip.to_i
    end
    puts " Primary: #{primary_count} rows"

    # Wait for replication
    sleep 1

    # Verify on standbys
    all_synced = true
    standbys.each do |standby|
      standby_count = 0
      max_sync_checks.times do |attempt|
        sleep 1 if attempt.positive?
        ssh_executor.execute_on_host(standby) do
          result = capture(:sudo, '-u', 'postgres', 'psql', '-t', '-d', test_db, '-c', '"SELECT COUNT(*) FROM test_inserts;"')
          standby_count = result.strip.to_i
        end
        break if standby_count == primary_count
      end

      if standby_count == primary_count
        puts " Standby #{standby}: #{standby_count} rows ✓"
      else
        puts " Standby #{standby}: #{standby_count} rows ✗ (expected #{primary_count})"
        all_synced = false
      end
    end

    # Check replication lag
    puts "\n4️⃣ Replication lag after test..."
    ssh_executor.execute_on_host(primary) do
      result = capture(:sudo, '-u', 'postgres', 'psql', '-t', '-c',
                       '"SELECT client_addr, pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) as lag_bytes FROM pg_stat_replication;"')
      result.strip.split("\n").each do |line|
        next if line.strip.empty?

        # psql separates the two columns with "|": client address, then lag in bytes.
        parts = line.split('|').map(&:strip)
        puts " #{parts[0]}: #{parts[1]} bytes"
      end
    end

    puts "\n#{'=' * 60}"
    if all_synced
      puts '✅ Replication stress test PASSED!'
    else
      puts '❌ Replication stress test FAILED - not all standbys synced'
    end
    puts "#{'=' * 60}\n"
  ensure
    # Cleanup runs even when a step above raised, so the scratch database is not left behind.
    puts "\n5️⃣ Cleaning up test database..."
    ssh_executor.execute_on_host(primary) do
      execute :sudo, '-u', 'postgres', 'psql', '-c', "\"DROP DATABASE IF EXISTS #{test_db};\""
    end
    puts ' ✓ Test database removed'
  end
end
764
+
765
# PgBouncer smoke test on the primary: service state, key config lines, configured
# users, a direct (5432) and a pooled (6432) connection, an optional pgbench run
# with +connections+ clients (default 50), and the pool statistics.
desc 'Test PgBouncer connection pooling'
task :pgbouncer, [:connections] => :environment do |_t, args|
  require 'active_postgres'

  connections = (args[:connections] || 50).to_i
  config = ActivePostgres::Configuration.load
  ssh_executor = ActivePostgres::SSHExecutor.new(config, quiet: true)

  unless config.component_enabled?(:pgbouncer)
    puts '❌ PgBouncer is not enabled in config'
    exit 1
  end

  puts "\n#{'=' * 60}"
  puts '🧪 PgBouncer Connection Test'
  puts "#{'=' * 60}\n"

  primary = config.primary_host

  ssh_executor.execute_on_host(primary) do
    # Check PgBouncer status. `systemctl is-active` exits non-zero for a stopped
    # unit, so `test` is used here: `capture` would raise on that exit status and
    # the "Not running" line could never be printed.
    puts '1️⃣ PgBouncer service status:'
    running = test(:systemctl, 'is-active', 'pgbouncer')
    puts " Service: #{running ? '✓ Running' : '✗ Not running'}"

    # Show config (guarded like the later steps: grep exits non-zero when nothing matches)
    puts "\n2️⃣ PgBouncer configuration:"
    begin
      config_output = capture(:sudo, :grep, '-E', '(listen_port|pool_mode|max_client|default_pool)', '/etc/pgbouncer/pgbouncer.ini')
      config_output.split("\n").each { |line| puts " #{line.strip}" }
    rescue StandardError
      puts ' ⚠️ Could not read PgBouncer configuration'
    end

    # Show users: second '"'-delimited field of each userlist line is the user name
    puts "\n3️⃣ Configured users:"
    begin
      users = capture(:sudo, :cut, "-d'\"'", '-f2', '/etc/pgbouncer/userlist.txt')
      users.split("\n").each { |user| puts " - #{user.strip}" unless user.strip.empty? }
    rescue StandardError
      puts ' ⚠️ Could not read PgBouncer userlist'
    end

    # Test direct PostgreSQL (port 5432)
    puts "\n4️⃣ Direct PostgreSQL test (port 5432):"
    begin
      execute :sudo, '-u', 'postgres', 'psql', '-p', '5432', '-c', '"SELECT 1;"'
      puts ' ✓ Direct connection works'
    rescue StandardError
      puts ' ✗ Direct connection failed'
    end

    # Test PgBouncer (port 6432)
    puts "\n5️⃣ PgBouncer connection test (port 6432):"
    begin
      execute :sudo, '-u', 'postgres', 'psql', '-h', '127.0.0.1', '-p', '6432', '-d', 'postgres', '-c', '"SELECT 1;"'
      puts ' ✓ PgBouncer connection works'
    rescue StandardError => e
      puts " ⚠️ PgBouncer connection test: #{e.message.split("\n").first}"
    end

    # Stress test - multiple concurrent connections
    puts "\n6️⃣ Connection stress test (#{connections} concurrent connections via PgBouncer):"
    begin
      # Use pgbench for stress testing
      if test('which pgbench')
        # Initialise pgbench tables; `|| true` keeps an init failure from aborting the run.
        execute :sudo, '-u', 'postgres', 'pgbench', '-i', '-s', '1', '-p', '6432', '-h', '127.0.0.1', 'postgres', '2>/dev/null', '||', 'true'
        result = capture(:sudo, '-u', 'postgres', 'pgbench', '-c', connections.to_s, '-j', '4', '-t', '10',
                         '-p', '6432', '-h', '127.0.0.1', 'postgres', '2>&1')
        tps_match = result.match(/tps = ([\d.]+)/)
        if tps_match
          puts " ✓ Stress test completed: #{tps_match[1]} TPS"
        else
          puts ' ✓ Stress test completed'
        end
      else
        puts ' ⚠️ pgbench not installed, skipping stress test'
      end
    rescue StandardError => e
      puts " ⚠️ Stress test failed: #{e.message.split("\n").first}"
    end

    # Show pool stats
    puts "\n7️⃣ Pool statistics:"
    begin
      stats = capture(:sudo, '-u', 'postgres', 'psql', '-h', '127.0.0.1', '-p', '6432', '-d', 'pgbouncer',
                      '-c', '"SHOW POOLS;"', '2>/dev/null')
      stats.split("\n").each { |line| puts " #{line}" }
    rescue StandardError
      puts ' ⚠️ Could not fetch pool stats'
    end

    puts "\n#{'=' * 60}"
    puts '✅ PgBouncer test complete'
    puts "#{'=' * 60}\n"
  end
end
854
+ end
855
+ end