pgbus 0.6.7 → 0.6.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,6 +21,8 @@ module Pgbus
21
21
 
22
22
  throughput = compute_throughput(queues)
23
23
 
24
+ health = queue_health_stats
25
+
24
26
  {
25
27
  total_queues: queues.size,
26
28
  total_depth: total_depth,
@@ -29,7 +31,10 @@ module Pgbus
29
31
  failed_count: failed_events_count,
30
32
  dlq_depth: dlq_depth,
31
33
  recurring_count: recurring_tasks_count,
32
- throughput_rate: throughput
34
+ throughput_rate: throughput,
35
+ total_dead_tuples: health[:total_dead_tuples],
36
+ tables_needing_vacuum: health[:tables_needing_vacuum],
37
+ oldest_transaction_age_sec: health[:oldest_transaction_age_sec]
33
38
  }
34
39
  end
35
40
 
@@ -629,6 +634,50 @@ module Pgbus
629
634
  []
630
635
  end
631
636
 
637
+ # Queue health — vacuum stats, dead tuples, bloat, MVCC horizon.
638
+ # Returns aggregate health across all queue and archive tables, plus
639
+ # the oldest open transaction age (MVCC horizon pinning risk).
640
# Aggregates the per-table rows from fetch_all_table_stats into one summary.
#
# @param bloat_threshold [Float] dead-tuple ratio above which a table is
#   counted in :tables_needing_vacuum. Defaults to 0.1, the value that was
#   previously hard-coded, so existing callers see no change.
# @return [Hash] with keys :total_dead_tuples, :total_live_tuples,
#   :worst_bloat_ratio, :tables_needing_vacuum, :oldest_vacuum_ago_sec,
#   :oldest_transaction_age_sec and :tables (the per-table rows).
#   On any StandardError a zeroed/empty hash of the same shape is returned.
def queue_health_stats(bloat_threshold: 0.1)
  tables = fetch_all_table_stats

  total_dead = tables.sum { |t| t[:dead_tuples] }
  total_live = tables.sum { |t| t[:live_tuples] }
  # `max` is nil for an empty list, hence the 0.0 fallback.
  worst_bloat = tables.map { |t| t[:bloat_ratio] }.max || 0.0
  needs_vacuum = tables.count { |t| t[:bloat_ratio] > bloat_threshold }
  # Tables with no recorded vacuum (nil age) are left out of this maximum.
  oldest_vacuum = tables.filter_map { |t| t[:last_vacuum_ago_sec] }.max

  {
    total_dead_tuples: total_dead,
    total_live_tuples: total_live,
    worst_bloat_ratio: worst_bloat.round(4),
    tables_needing_vacuum: needs_vacuum,
    oldest_vacuum_ago_sec: oldest_vacuum,
    oldest_transaction_age_sec: oldest_transaction_age,
    tables: tables
  }
rescue StandardError => e
  # Best-effort: a failure here must not break the caller, so log and
  # fall back to an empty summary.
  Pgbus.logger.debug { "[Pgbus::Web] Error fetching queue health stats: #{e.class}: #{e.message}" }
  {
    total_dead_tuples: 0, total_live_tuples: 0, worst_bloat_ratio: 0.0,
    tables_needing_vacuum: 0, oldest_vacuum_ago_sec: nil,
    oldest_transaction_age_sec: nil, tables: []
  }
end
666
+
667
+ # Per-queue health stats for the queue detail view.
668
# Health stats for one queue: its queue table and its archive table.
#
# @param queue_name [String] queue name; passed through sanitize_name
#   before being interpolated into the "q_"/"a_" table names.
# @return [Hash] { tables: [...], oldest_transaction_age_sec: Integer|nil }.
#   nil results from fetch_table_stats are dropped from :tables. On any
#   StandardError, { tables: [], oldest_transaction_age_sec: nil }.
def queue_health_detail(queue_name)
  sanitized = sanitize_name(queue_name)
  tables = [
    fetch_table_stats("pgmq", "q_#{sanitized}", "queue"),
    fetch_table_stats("pgmq", "a_#{sanitized}", "archive")
  ].compact

  { tables: tables, oldest_transaction_age_sec: oldest_transaction_age }
rescue StandardError => e
  # Include the exception class, as the other health helpers do, so the
  # log line identifies the failure type and not just its message.
  Pgbus.logger.debug do
    "[Pgbus::Web] Error fetching health detail for #{queue_name}: #{e.class}: #{e.message}"
  end
  { tables: [], oldest_transaction_age_sec: nil }
end
680
+
632
681
  # Stream stats — only populated when streams_stats_enabled is
633
682
  # true AND the migration has been run. Controllers should gate
634
683
  # rendering on `stream_stats_available?` to avoid showing empty
@@ -674,6 +723,89 @@ module Pgbus
674
723
  Pgbus::BusRecord.connection
675
724
  end
676
725
 
726
+ # Single query to fetch pg_stat_user_tables stats for all queue and
727
+ # archive tables. Avoids 2*N catalog queries on the dashboard.
728
# Runs one query joining pgmq.meta (expanded to the q_<name> and a_<name>
# relation names) against pg_stat_user_tables, then maps each result row
# through build_table_health_row.
#
# @return [Array<Hash>] one entry per row for which
#   build_table_health_row returned a non-nil value.
def fetch_all_table_stats
  # GREATEST (not COALESCE) so the age is measured from whichever of the
  # manual/auto vacuum timestamps is most recent. PostgreSQL's GREATEST
  # ignores NULL arguments and yields NULL only when both are NULL.
  rows = connection.select_all(<<~SQL, "Pgbus All Table Health")
    WITH rels AS (
      SELECT queue_name, 'q_' || queue_name AS relname, 'queue' AS kind FROM pgmq.meta
      UNION ALL
      SELECT queue_name, 'a_' || queue_name AS relname, 'archive' AS kind FROM pgmq.meta
    )
    SELECT
      'pgmq.' || r.relname AS table_name,
      r.kind,
      s.n_live_tup,
      s.n_dead_tup,
      EXTRACT(epoch FROM (NOW() - GREATEST(s.last_vacuum, s.last_autovacuum)))::int AS last_vacuum_ago_sec,
      s.last_vacuum,
      s.last_autovacuum
    FROM rels r
    LEFT JOIN pg_stat_user_tables s
      ON s.schemaname = 'pgmq' AND s.relname = r.relname
    ORDER BY r.queue_name, r.kind
  SQL

  rows.to_a.filter_map { |row| build_table_health_row(row) }
end
751
+
752
+ # Fetch pg_stat_user_tables stats for a single table (used by queue_health_detail).
753
# Looks up the pg_stat_user_tables row for one table and converts it with
# build_table_health_row.
#
# @param schema [String] bound to $1 (schemaname)
# @param table_name [String] bound to $2 (relname)
# @param kind [String] label copied into the row under "kind"
# @return [Hash, nil] nil when the query returns no row; otherwise whatever
#   build_table_health_row returns.
def fetch_table_stats(schema, table_name, kind)
  # GREATEST (not COALESCE) so the age is measured from whichever of the
  # manual/auto vacuum timestamps is most recent. PostgreSQL's GREATEST
  # ignores NULL arguments and yields NULL only when both are NULL.
  row = connection.select_one(<<~SQL, "Pgbus Table Health", [schema, table_name])
    SELECT
      n_live_tup,
      n_dead_tup,
      EXTRACT(epoch FROM (NOW() - GREATEST(last_vacuum, last_autovacuum)))::int AS last_vacuum_ago_sec,
      last_vacuum,
      last_autovacuum
    FROM pg_stat_user_tables
    WHERE schemaname = $1 AND relname = $2
  SQL

  return nil unless row

  build_table_health_row(row.merge("table_name" => "#{schema}.#{table_name}", "kind" => kind))
end
769
+
770
# Converts a string-keyed stats row into the symbol-keyed health hash.
#
# @param row [Hash] reads "n_live_tup", "n_dead_tup", "table_name", "kind",
#   "last_vacuum_ago_sec", "last_vacuum" and "last_autovacuum"
# @return [Hash, nil] nil when both tuple counters are missing/falsy
def build_table_health_row(row)
  live_raw = row["n_live_tup"]
  dead_raw = row["n_dead_tup"]
  return nil if !live_raw && !dead_raw

  live_count = live_raw.to_i
  dead_count = dead_raw.to_i
  tuple_total = live_count + dead_count

  # Ratio of dead tuples to all tuples; an empty table counts as 0.0.
  dead_share =
    if tuple_total.positive?
      Float(dead_count) / tuple_total
    else
      0.0
    end

  vacuum_age = row["last_vacuum_ago_sec"]

  {
    table: row["table_name"],
    kind: row["kind"],
    live_tuples: live_count,
    dead_tuples: dead_count,
    bloat_ratio: dead_share.round(4),
    last_vacuum_ago_sec: vacuum_age.nil? ? nil : vacuum_age.to_i,
    last_vacuum: row["last_vacuum"],
    last_autovacuum: row["last_autovacuum"]
  }
end
789
+
790
+ # Age of the oldest open transaction in seconds — indicates MVCC
791
+ # horizon pinning risk. Returns nil if no active transactions.
792
# Queries pg_stat_activity for the earliest xact_start among non-idle
# backends other than the current one, and returns its age in seconds.
#
# @return [Integer, nil] nil when the query yields no row, when the
#   "age_sec" column is nil, or when any StandardError is raised.
def oldest_transaction_age
  sql = <<~SQL
    SELECT EXTRACT(epoch FROM (NOW() - xact_start))::int AS age_sec
    FROM pg_stat_activity
    WHERE state != 'idle'
    AND xact_start IS NOT NULL
    AND pid != pg_backend_pid()
    ORDER BY xact_start ASC
    LIMIT 1
  SQL
  oldest = connection.select_one(sql, "Pgbus Oldest Transaction")
  return nil if oldest.nil?

  age = oldest.dig("age_sec")
  age.nil? ? nil : age.to_i
rescue StandardError => e
  Pgbus.logger.debug { "[Pgbus::Web] Error fetching oldest transaction age: #{e.class}: #{e.message}" }
  nil
end
808
+
677
809
  # name is the full PGMQ queue name (already prefixed)
678
810
  def query_queue_messages(name, limit, offset)
679
811
  query_queue_messages_raw(name, limit, offset).map { |m| m.merge(queue: name) }
@@ -24,6 +24,7 @@ module Pgbus
24
24
  append_process_metrics(lines)
25
25
  append_summary_metrics(lines)
26
26
  append_stream_metrics(lines)
27
+ append_health_metrics(lines)
27
28
  "#{lines.join("\n")}\n"
28
29
  end
29
30
 
@@ -97,9 +98,41 @@ module Pgbus
97
98
  end
98
99
 
99
100
  def append_process_metrics(lines)
100
- count = @data_source.processes.count
101
+ procs = @data_source.processes
101
102
  gauge(lines, "pgbus_active_processes", "Number of active pgbus worker processes") do
102
- [[count]]
103
+ [[procs.count]]
104
+ end
105
+
106
+ workers = procs.select { |p| p[:kind] == "worker" && p[:metadata].is_a?(Hash) }
107
+ unless workers.empty?
108
+ gauge(lines, "pgbus_worker_pool_capacity", "Total thread/async pool capacity per worker") do
109
+ workers.filter_map do |w|
110
+ capacity = w[:metadata]["capacity"]
111
+ next unless capacity
112
+
113
+ [capacity, { pid: w[:pid], hostname: w[:hostname] }]
114
+ end
115
+ end
116
+
117
+ gauge(lines, "pgbus_worker_pool_busy", "Number of busy threads/slots per worker") do
118
+ workers.filter_map do |w|
119
+ busy = w[:metadata]["busy"]
120
+ next unless busy
121
+
122
+ [busy, { pid: w[:pid], hostname: w[:hostname] }]
123
+ end
124
+ end
125
+
126
+ gauge(lines, "pgbus_worker_pool_utilization", "Pool utilization ratio (busy / capacity)") do
127
+ workers.filter_map do |w|
128
+ capacity = w[:metadata]["capacity"].to_i
129
+ busy = w[:metadata]["busy"].to_i
130
+ next unless capacity.positive?
131
+
132
+ ratio = (busy.to_f / capacity).round(4)
133
+ [ratio, { pid: w[:pid], hostname: w[:hostname] }]
134
+ end
135
+ end
103
136
  end
104
137
  rescue StandardError => e
105
138
  Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing process metrics: #{e.message}" }
@@ -141,6 +174,42 @@ module Pgbus
141
174
  Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing stream metrics: #{e.message}" }
142
175
  end
143
176
 
177
# Emits the table-health gauges and the oldest-transaction gauge from
# @data_source.queue_health_stats via the `gauge` helper.
#
# Emits nothing when there are no table rows and no transaction age.
# Per-table gauges carry `table` and `kind` labels. Any StandardError is
# logged at debug level and swallowed.
#
# @param lines [Array] output buffer handed through to `gauge`
def append_health_metrics(lines)
  stats = @data_source.queue_health_stats
  table_rows = stats[:tables]
  return if table_rows.empty? && stats[:oldest_transaction_age_sec].nil?

  # Builds the [value, labels] pairs for one field across the given rows.
  series = lambda do |rows, field|
    rows.map { |t| [t[field], { table: t[:table], kind: t[:kind] }] }
  end

  unless table_rows.empty?
    [
      ["pgbus_table_dead_tuples", "Number of dead tuples in queue/archive table", :dead_tuples],
      ["pgbus_table_live_tuples", "Number of live tuples in queue/archive table", :live_tuples],
      ["pgbus_table_bloat_ratio", "Dead tuple ratio (dead / total) per table", :bloat_ratio]
    ].each do |metric, help, field|
      gauge(lines, metric, help) { series.call(table_rows, field) }
    end

    # Only tables that have a vacuum age get a sample for this gauge.
    vacuumed = table_rows.select { |t| t[:last_vacuum_ago_sec] }
    unless vacuumed.empty?
      gauge(lines, "pgbus_table_last_vacuum_age_seconds", "Seconds since last vacuum") do
        series.call(vacuumed, :last_vacuum_ago_sec)
      end
    end
  end

  txn_age = stats[:oldest_transaction_age_sec]
  if txn_age
    gauge(lines, "pgbus_oldest_transaction_age_seconds",
          "Age of the oldest open transaction (MVCC horizon pin risk)") do
      [[txn_age]]
    end
  end
rescue StandardError => e
  Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing health metrics: #{e.message}" }
end
212
+
144
213
  # Emits a Prometheus gauge metric family. The block must return an array
145
214
  # of [value] or [value, { label: "val" }] pairs.
146
215
  def gauge(lines, name, help)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pgbus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.7
4
+ version: 0.6.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mikael Henriksson
@@ -162,6 +162,7 @@ files:
162
162
  - app/models/pgbus/uniqueness_key.rb
163
163
  - app/views/layouts/pgbus/application.html.erb
164
164
  - app/views/pgbus/dashboard/_processes_table.html.erb
165
+ - app/views/pgbus/dashboard/_queue_health.html.erb
165
166
  - app/views/pgbus/dashboard/_queues_table.html.erb
166
167
  - app/views/pgbus/dashboard/_recent_failures.html.erb
167
168
  - app/views/pgbus/dashboard/_stats_cards.html.erb
@@ -230,12 +231,15 @@ files:
230
231
  - lib/generators/pgbus/templates/pgbus.yml.erb
231
232
  - lib/generators/pgbus/templates/pgbus_binstub.erb
232
233
  - lib/generators/pgbus/templates/recurring.yml.erb
234
+ - lib/generators/pgbus/templates/tune_autovacuum.rb.erb
233
235
  - lib/generators/pgbus/templates/upgrade_pgmq.rb.erb
236
+ - lib/generators/pgbus/tune_autovacuum_generator.rb
234
237
  - lib/generators/pgbus/update_generator.rb
235
238
  - lib/generators/pgbus/upgrade_pgmq_generator.rb
236
239
  - lib/pgbus.rb
237
240
  - lib/pgbus/active_job/adapter.rb
238
241
  - lib/pgbus/active_job/executor.rb
242
+ - lib/pgbus/autovacuum_tuning.rb
239
243
  - lib/pgbus/batch.rb
240
244
  - lib/pgbus/bus_record.rb
241
245
  - lib/pgbus/circuit_breaker.rb