pgbus 0.6.7 → 0.6.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/pgbus/dashboard_controller.rb +4 -0
- data/app/controllers/pgbus/queues_controller.rb +1 -0
- data/app/views/pgbus/dashboard/_queue_health.html.erb +78 -0
- data/app/views/pgbus/dashboard/show.html.erb +2 -0
- data/app/views/pgbus/queues/show.html.erb +37 -0
- data/config/locales/da.yml +35 -4
- data/config/locales/de.yml +35 -4
- data/config/locales/en.yml +35 -4
- data/config/locales/es.yml +35 -4
- data/config/locales/fi.yml +35 -4
- data/config/locales/fr.yml +35 -4
- data/config/locales/it.yml +35 -4
- data/config/locales/ja.yml +35 -4
- data/config/locales/nb.yml +35 -4
- data/config/locales/nl.yml +35 -4
- data/config/locales/pt.yml +35 -4
- data/config/locales/sv.yml +35 -4
- data/lib/generators/pgbus/templates/migration.rb.erb +6 -0
- data/lib/generators/pgbus/templates/tune_autovacuum.rb.erb +38 -0
- data/lib/generators/pgbus/tune_autovacuum_generator.rb +55 -0
- data/lib/pgbus/autovacuum_tuning.rb +93 -0
- data/lib/pgbus/client.rb +13 -1
- data/lib/pgbus/generators/migration_detector.rb +38 -3
- data/lib/pgbus/version.rb +1 -1
- data/lib/pgbus/web/data_source.rb +133 -1
- data/lib/pgbus/web/metrics_serializer.rb +71 -2
- metadata +5 -1
|
@@ -21,6 +21,8 @@ module Pgbus
|
|
|
21
21
|
|
|
22
22
|
throughput = compute_throughput(queues)
|
|
23
23
|
|
|
24
|
+
health = queue_health_stats
|
|
25
|
+
|
|
24
26
|
{
|
|
25
27
|
total_queues: queues.size,
|
|
26
28
|
total_depth: total_depth,
|
|
@@ -29,7 +31,10 @@ module Pgbus
|
|
|
29
31
|
failed_count: failed_events_count,
|
|
30
32
|
dlq_depth: dlq_depth,
|
|
31
33
|
recurring_count: recurring_tasks_count,
|
|
32
|
-
throughput_rate: throughput
|
|
34
|
+
throughput_rate: throughput,
|
|
35
|
+
total_dead_tuples: health[:total_dead_tuples],
|
|
36
|
+
tables_needing_vacuum: health[:tables_needing_vacuum],
|
|
37
|
+
oldest_transaction_age_sec: health[:oldest_transaction_age_sec]
|
|
33
38
|
}
|
|
34
39
|
end
|
|
35
40
|
|
|
@@ -629,6 +634,50 @@ module Pgbus
|
|
|
629
634
|
[]
|
|
630
635
|
end
|
|
631
636
|
|
|
637
|
+
# Queue health — vacuum stats, dead tuples, bloat, MVCC horizon.
|
|
638
|
+
# Returns aggregate health across all queue and archive tables, plus
|
|
639
|
+
# the oldest open transaction age (MVCC horizon pinning risk).
|
|
640
|
+
def queue_health_stats
|
|
641
|
+
tables = fetch_all_table_stats
|
|
642
|
+
|
|
643
|
+
total_dead = tables.sum { |t| t[:dead_tuples] }
|
|
644
|
+
total_live = tables.sum { |t| t[:live_tuples] }
|
|
645
|
+
worst_bloat = tables.map { |t| t[:bloat_ratio] }.max || 0.0
|
|
646
|
+
needs_vacuum = tables.count { |t| t[:bloat_ratio] > 0.1 }
|
|
647
|
+
oldest_vacuum = tables.filter_map { |t| t[:last_vacuum_ago_sec] }.max
|
|
648
|
+
|
|
649
|
+
{
|
|
650
|
+
total_dead_tuples: total_dead,
|
|
651
|
+
total_live_tuples: total_live,
|
|
652
|
+
worst_bloat_ratio: worst_bloat.round(4),
|
|
653
|
+
tables_needing_vacuum: needs_vacuum,
|
|
654
|
+
oldest_vacuum_ago_sec: oldest_vacuum,
|
|
655
|
+
oldest_transaction_age_sec: oldest_transaction_age,
|
|
656
|
+
tables: tables
|
|
657
|
+
}
|
|
658
|
+
rescue StandardError => e
|
|
659
|
+
Pgbus.logger.debug { "[Pgbus::Web] Error fetching queue health stats: #{e.class}: #{e.message}" }
|
|
660
|
+
{
|
|
661
|
+
total_dead_tuples: 0, total_live_tuples: 0, worst_bloat_ratio: 0.0,
|
|
662
|
+
tables_needing_vacuum: 0, oldest_vacuum_ago_sec: nil,
|
|
663
|
+
oldest_transaction_age_sec: nil, tables: []
|
|
664
|
+
}
|
|
665
|
+
end
|
|
666
|
+
|
|
667
|
+
# Per-queue health stats for the queue detail view.
|
|
668
|
+
def queue_health_detail(queue_name)
|
|
669
|
+
sanitized = sanitize_name(queue_name)
|
|
670
|
+
tables = [
|
|
671
|
+
fetch_table_stats("pgmq", "q_#{sanitized}", "queue"),
|
|
672
|
+
fetch_table_stats("pgmq", "a_#{sanitized}", "archive")
|
|
673
|
+
].compact
|
|
674
|
+
|
|
675
|
+
{ tables: tables, oldest_transaction_age_sec: oldest_transaction_age }
|
|
676
|
+
rescue StandardError => e
|
|
677
|
+
Pgbus.logger.debug { "[Pgbus::Web] Error fetching health detail for #{queue_name}: #{e.message}" }
|
|
678
|
+
{ tables: [], oldest_transaction_age_sec: nil }
|
|
679
|
+
end
|
|
680
|
+
|
|
632
681
|
# Stream stats — only populated when streams_stats_enabled is
|
|
633
682
|
# true AND the migration has been run. Controllers should gate
|
|
634
683
|
# rendering on `stream_stats_available?` to avoid showing empty
|
|
@@ -674,6 +723,89 @@ module Pgbus
|
|
|
674
723
|
Pgbus::BusRecord.connection
|
|
675
724
|
end
|
|
676
725
|
|
|
726
|
+
# Single query to fetch pg_stat_user_tables stats for all queue and
|
|
727
|
+
# archive tables. Avoids 2*N catalog queries on the dashboard.
|
|
728
|
+
def fetch_all_table_stats
|
|
729
|
+
rows = connection.select_all(<<~SQL, "Pgbus All Table Health")
|
|
730
|
+
WITH rels AS (
|
|
731
|
+
SELECT queue_name, 'q_' || queue_name AS relname, 'queue' AS kind FROM pgmq.meta
|
|
732
|
+
UNION ALL
|
|
733
|
+
SELECT queue_name, 'a_' || queue_name AS relname, 'archive' AS kind FROM pgmq.meta
|
|
734
|
+
)
|
|
735
|
+
SELECT
|
|
736
|
+
'pgmq.' || r.relname AS table_name,
|
|
737
|
+
r.kind,
|
|
738
|
+
s.n_live_tup,
|
|
739
|
+
s.n_dead_tup,
|
|
740
|
+
EXTRACT(epoch FROM (NOW() - COALESCE(s.last_vacuum, s.last_autovacuum)))::int AS last_vacuum_ago_sec,
|
|
741
|
+
s.last_vacuum,
|
|
742
|
+
s.last_autovacuum
|
|
743
|
+
FROM rels r
|
|
744
|
+
LEFT JOIN pg_stat_user_tables s
|
|
745
|
+
ON s.schemaname = 'pgmq' AND s.relname = r.relname
|
|
746
|
+
ORDER BY r.queue_name, r.kind
|
|
747
|
+
SQL
|
|
748
|
+
|
|
749
|
+
rows.to_a.filter_map { |row| build_table_health_row(row) }
|
|
750
|
+
end
|
|
751
|
+
|
|
752
|
+
# Fetch pg_stat_user_tables stats for a single table (used by queue_health_detail).
|
|
753
|
+
def fetch_table_stats(schema, table_name, kind)
|
|
754
|
+
row = connection.select_one(<<~SQL, "Pgbus Table Health", [schema, table_name])
|
|
755
|
+
SELECT
|
|
756
|
+
n_live_tup,
|
|
757
|
+
n_dead_tup,
|
|
758
|
+
EXTRACT(epoch FROM (NOW() - COALESCE(last_vacuum, last_autovacuum)))::int AS last_vacuum_ago_sec,
|
|
759
|
+
last_vacuum,
|
|
760
|
+
last_autovacuum
|
|
761
|
+
FROM pg_stat_user_tables
|
|
762
|
+
WHERE schemaname = $1 AND relname = $2
|
|
763
|
+
SQL
|
|
764
|
+
|
|
765
|
+
return nil unless row
|
|
766
|
+
|
|
767
|
+
build_table_health_row(row.merge("table_name" => "#{schema}.#{table_name}", "kind" => kind))
|
|
768
|
+
end
|
|
769
|
+
|
|
770
|
+
def build_table_health_row(row)
|
|
771
|
+
return nil unless row["n_live_tup"] || row["n_dead_tup"]
|
|
772
|
+
|
|
773
|
+
live = row["n_live_tup"].to_i
|
|
774
|
+
dead = row["n_dead_tup"].to_i
|
|
775
|
+
total = live + dead
|
|
776
|
+
bloat = total.positive? ? (dead.to_f / total) : 0.0
|
|
777
|
+
|
|
778
|
+
{
|
|
779
|
+
table: row["table_name"],
|
|
780
|
+
kind: row["kind"],
|
|
781
|
+
live_tuples: live,
|
|
782
|
+
dead_tuples: dead,
|
|
783
|
+
bloat_ratio: bloat.round(4),
|
|
784
|
+
last_vacuum_ago_sec: row["last_vacuum_ago_sec"]&.to_i,
|
|
785
|
+
last_vacuum: row["last_vacuum"],
|
|
786
|
+
last_autovacuum: row["last_autovacuum"]
|
|
787
|
+
}
|
|
788
|
+
end
|
|
789
|
+
|
|
790
|
+
# Age of the oldest open transaction in seconds — indicates MVCC
|
|
791
|
+
# horizon pinning risk. Returns nil if no active transactions.
|
|
792
|
+
def oldest_transaction_age
|
|
793
|
+
row = connection.select_one(<<~SQL, "Pgbus Oldest Transaction")
|
|
794
|
+
SELECT EXTRACT(epoch FROM (NOW() - xact_start))::int AS age_sec
|
|
795
|
+
FROM pg_stat_activity
|
|
796
|
+
WHERE state != 'idle'
|
|
797
|
+
AND xact_start IS NOT NULL
|
|
798
|
+
AND pid != pg_backend_pid()
|
|
799
|
+
ORDER BY xact_start ASC
|
|
800
|
+
LIMIT 1
|
|
801
|
+
SQL
|
|
802
|
+
|
|
803
|
+
row&.dig("age_sec")&.to_i
|
|
804
|
+
rescue StandardError => e
|
|
805
|
+
Pgbus.logger.debug { "[Pgbus::Web] Error fetching oldest transaction age: #{e.class}: #{e.message}" }
|
|
806
|
+
nil
|
|
807
|
+
end
|
|
808
|
+
|
|
677
809
|
# name is the full PGMQ queue name (already prefixed)
|
|
678
810
|
def query_queue_messages(name, limit, offset)
|
|
679
811
|
query_queue_messages_raw(name, limit, offset).map { |m| m.merge(queue: name) }
|
|
@@ -24,6 +24,7 @@ module Pgbus
|
|
|
24
24
|
append_process_metrics(lines)
|
|
25
25
|
append_summary_metrics(lines)
|
|
26
26
|
append_stream_metrics(lines)
|
|
27
|
+
append_health_metrics(lines)
|
|
27
28
|
"#{lines.join("\n")}\n"
|
|
28
29
|
end
|
|
29
30
|
|
|
@@ -97,9 +98,41 @@ module Pgbus
|
|
|
97
98
|
end
|
|
98
99
|
|
|
99
100
|
def append_process_metrics(lines)
|
|
100
|
-
|
|
101
|
+
procs = @data_source.processes
|
|
101
102
|
gauge(lines, "pgbus_active_processes", "Number of active pgbus worker processes") do
|
|
102
|
-
[[count]]
|
|
103
|
+
[[procs.count]]
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
workers = procs.select { |p| p[:kind] == "worker" && p[:metadata].is_a?(Hash) }
|
|
107
|
+
unless workers.empty?
|
|
108
|
+
gauge(lines, "pgbus_worker_pool_capacity", "Total thread/async pool capacity per worker") do
|
|
109
|
+
workers.filter_map do |w|
|
|
110
|
+
capacity = w[:metadata]["capacity"]
|
|
111
|
+
next unless capacity
|
|
112
|
+
|
|
113
|
+
[capacity, { pid: w[:pid], hostname: w[:hostname] }]
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
gauge(lines, "pgbus_worker_pool_busy", "Number of busy threads/slots per worker") do
|
|
118
|
+
workers.filter_map do |w|
|
|
119
|
+
busy = w[:metadata]["busy"]
|
|
120
|
+
next unless busy
|
|
121
|
+
|
|
122
|
+
[busy, { pid: w[:pid], hostname: w[:hostname] }]
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
gauge(lines, "pgbus_worker_pool_utilization", "Pool utilization ratio (busy / capacity)") do
|
|
127
|
+
workers.filter_map do |w|
|
|
128
|
+
capacity = w[:metadata]["capacity"].to_i
|
|
129
|
+
busy = w[:metadata]["busy"].to_i
|
|
130
|
+
next unless capacity.positive?
|
|
131
|
+
|
|
132
|
+
ratio = (busy.to_f / capacity).round(4)
|
|
133
|
+
[ratio, { pid: w[:pid], hostname: w[:hostname] }]
|
|
134
|
+
end
|
|
135
|
+
end
|
|
103
136
|
end
|
|
104
137
|
rescue StandardError => e
|
|
105
138
|
Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing process metrics: #{e.message}" }
|
|
@@ -141,6 +174,42 @@ module Pgbus
|
|
|
141
174
|
Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing stream metrics: #{e.message}" }
|
|
142
175
|
end
|
|
143
176
|
|
|
177
|
+
def append_health_metrics(lines)
|
|
178
|
+
health = @data_source.queue_health_stats
|
|
179
|
+
return if health[:tables].empty? && health[:oldest_transaction_age_sec].nil?
|
|
180
|
+
|
|
181
|
+
tables = health[:tables]
|
|
182
|
+
unless tables.empty?
|
|
183
|
+
gauge(lines, "pgbus_table_dead_tuples", "Number of dead tuples in queue/archive table") do
|
|
184
|
+
tables.map { |t| [t[:dead_tuples], { table: t[:table], kind: t[:kind] }] }
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
gauge(lines, "pgbus_table_live_tuples", "Number of live tuples in queue/archive table") do
|
|
188
|
+
tables.map { |t| [t[:live_tuples], { table: t[:table], kind: t[:kind] }] }
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
gauge(lines, "pgbus_table_bloat_ratio", "Dead tuple ratio (dead / total) per table") do
|
|
192
|
+
tables.map { |t| [t[:bloat_ratio], { table: t[:table], kind: t[:kind] }] }
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
vacuum_tables = tables.select { |t| t[:last_vacuum_ago_sec] }
|
|
196
|
+
unless vacuum_tables.empty?
|
|
197
|
+
gauge(lines, "pgbus_table_last_vacuum_age_seconds", "Seconds since last vacuum") do
|
|
198
|
+
vacuum_tables.map { |t| [t[:last_vacuum_ago_sec], { table: t[:table], kind: t[:kind] }] }
|
|
199
|
+
end
|
|
200
|
+
end
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
if health[:oldest_transaction_age_sec]
|
|
204
|
+
gauge(lines, "pgbus_oldest_transaction_age_seconds",
|
|
205
|
+
"Age of the oldest open transaction (MVCC horizon pin risk)") do
|
|
206
|
+
[[health[:oldest_transaction_age_sec]]]
|
|
207
|
+
end
|
|
208
|
+
end
|
|
209
|
+
rescue StandardError => e
|
|
210
|
+
Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing health metrics: #{e.message}" }
|
|
211
|
+
end
|
|
212
|
+
|
|
144
213
|
# Emits a Prometheus gauge metric family. The block must return an array
|
|
145
214
|
# of [value] or [value, { label: "val" }] pairs.
|
|
146
215
|
def gauge(lines, name, help)
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: pgbus
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.7
|
|
4
|
+
version: 0.6.8
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Mikael Henriksson
|
|
@@ -162,6 +162,7 @@ files:
|
|
|
162
162
|
- app/models/pgbus/uniqueness_key.rb
|
|
163
163
|
- app/views/layouts/pgbus/application.html.erb
|
|
164
164
|
- app/views/pgbus/dashboard/_processes_table.html.erb
|
|
165
|
+
- app/views/pgbus/dashboard/_queue_health.html.erb
|
|
165
166
|
- app/views/pgbus/dashboard/_queues_table.html.erb
|
|
166
167
|
- app/views/pgbus/dashboard/_recent_failures.html.erb
|
|
167
168
|
- app/views/pgbus/dashboard/_stats_cards.html.erb
|
|
@@ -230,12 +231,15 @@ files:
|
|
|
230
231
|
- lib/generators/pgbus/templates/pgbus.yml.erb
|
|
231
232
|
- lib/generators/pgbus/templates/pgbus_binstub.erb
|
|
232
233
|
- lib/generators/pgbus/templates/recurring.yml.erb
|
|
234
|
+
- lib/generators/pgbus/templates/tune_autovacuum.rb.erb
|
|
233
235
|
- lib/generators/pgbus/templates/upgrade_pgmq.rb.erb
|
|
236
|
+
- lib/generators/pgbus/tune_autovacuum_generator.rb
|
|
234
237
|
- lib/generators/pgbus/update_generator.rb
|
|
235
238
|
- lib/generators/pgbus/upgrade_pgmq_generator.rb
|
|
236
239
|
- lib/pgbus.rb
|
|
237
240
|
- lib/pgbus/active_job/adapter.rb
|
|
238
241
|
- lib/pgbus/active_job/executor.rb
|
|
242
|
+
- lib/pgbus/autovacuum_tuning.rb
|
|
239
243
|
- lib/pgbus/batch.rb
|
|
240
244
|
- lib/pgbus/bus_record.rb
|
|
241
245
|
- lib/pgbus/circuit_breaker.rb
|