pgbus 0.6.1 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/pgbus/api/metrics_controller.rb +16 -0
- data/app/models/pgbus/job_stat.rb +18 -5
- data/config/routes.rb +1 -0
- data/lib/pgbus/active_job/executor.rb +30 -0
- data/lib/pgbus/client/read_after.rb +51 -0
- data/lib/pgbus/configuration.rb +16 -3
- data/lib/pgbus/process/worker.rb +7 -2
- data/lib/pgbus/queue_name_validator.rb +17 -0
- data/lib/pgbus/retry_backoff.rb +91 -0
- data/lib/pgbus/streams.rb +1 -1
- data/lib/pgbus/version.rb +1 -1
- data/lib/pgbus/web/data_source.rb +19 -23
- data/lib/pgbus/web/metrics_serializer.rb +175 -0
- data/lib/pgbus/web/streamer/heartbeat.rb +2 -2
- data/lib/pgbus/web/streamer/instance.rb +2 -2
- data/lib/pgbus/web/streamer/{dispatcher.rb → stream_event_dispatcher.rb} +11 -7
- metadata +5 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 97572298add12c78b7d9f3ebb6a50af36a9803380d42202a66031ce7f1018d4c
|
|
4
|
+
data.tar.gz: b5498b051a4c1d134c2944d1b0c556124fef49a7276ee32cbe0172fac12f6c8d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ec8b0af9bac7c6156b12effa0ce3d3ecf9a29f06c091f5117c5567114c00f44c6dea26666306ff9b084c0b616aa3e47b5080d986c1159fcedf64e5fbfe1f909a
|
|
7
|
+
data.tar.gz: 0227ab4f57df35bd89c4dac8a022df20a16d8c30eaaff0cbc75c8a49e8debf9dba62e1fb54b1ac26c327aab6199398ac2fe1575db9b1ad1062fb0d6b21083ffc
|
data/app/controllers/pgbus/api/metrics_controller.rb
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pgbus
|
|
4
|
+
module Api
|
|
5
|
+
class MetricsController < ApplicationController
|
|
6
|
+
PROMETHEUS_CONTENT_TYPE = "text/plain; version=0.0.4; charset=utf-8"
|
|
7
|
+
|
|
8
|
+
def show
|
|
9
|
+
return head(:not_found) unless Pgbus.configuration.metrics_enabled
|
|
10
|
+
|
|
11
|
+
body = Web::MetricsSerializer.new(data_source).serialize
|
|
12
|
+
render plain: body, content_type: PROMETHEUS_CONTENT_TYPE
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
data/app/models/pgbus/job_stat.rb
CHANGED
|
@@ -30,21 +30,34 @@ module Pgbus
|
|
|
30
30
|
# Memoized — intentionally never invalidated at runtime. If the
|
|
31
31
|
# pgbus_job_stats migration runs while the app is already running,
|
|
32
32
|
# a restart is required for stat recording to begin.
|
|
33
|
+
#
|
|
34
|
+
# We only memoize a *successful* probe. A transient error (PG
|
|
35
|
+
# hiccup during boot, connection refused during a failover) is
|
|
36
|
+
# treated as "don't know yet" — the next call retries. Caching
|
|
37
|
+
# false on the first hiccup would permanently disable job stats
|
|
38
|
+
# for the process lifetime, which is a worse failure mode than
|
|
39
|
+
# a few retries. See issue #98 / PR #91 (StreamStat fix).
|
|
33
40
|
def self.table_exists?
|
|
34
41
|
return @table_exists if defined?(@table_exists)
|
|
35
42
|
|
|
36
43
|
@table_exists = connection.table_exists?(table_name)
|
|
37
|
-
rescue StandardError
|
|
38
|
-
|
|
44
|
+
rescue StandardError => e
|
|
45
|
+
Pgbus.logger.debug { "[Pgbus] Failed to check job stat table: #{e.message}" }
|
|
46
|
+
false
|
|
39
47
|
end
|
|
40
48
|
|
|
41
49
|
# Memoized — checks if the latency migration has been applied.
|
|
50
|
+
# Same transient-error handling as `table_exists?`: a failed
|
|
51
|
+
# probe is not cached, so a later successful probe can still
|
|
52
|
+
# enable latency recording.
|
|
42
53
|
def self.latency_columns?
|
|
43
54
|
return @latency_columns if defined?(@latency_columns)
|
|
55
|
+
return false unless table_exists?
|
|
44
56
|
|
|
45
|
-
@latency_columns =
|
|
46
|
-
rescue StandardError
|
|
47
|
-
|
|
57
|
+
@latency_columns = column_names.include?("enqueue_latency_ms")
|
|
58
|
+
rescue StandardError => e
|
|
59
|
+
Pgbus.logger.debug { "[Pgbus] Failed to check job stat latency columns: #{e.message}" }
|
|
60
|
+
false
|
|
48
61
|
end
|
|
49
62
|
|
|
50
63
|
# Throughput: jobs per minute bucketed by minute for the last N minutes
|
data/config/routes.rb
CHANGED
|
@@ -82,6 +82,7 @@ Pgbus::Engine.routes.draw do
|
|
|
82
82
|
namespace :api do
|
|
83
83
|
get :stats, to: "stats#show"
|
|
84
84
|
get :insights, to: "insights#show"
|
|
85
|
+
get :metrics, to: "metrics#show"
|
|
85
86
|
end
|
|
86
87
|
|
|
87
88
|
scope :frontend, controller: :frontends, defaults: { version: Pgbus::VERSION.tr(".", "-") } do
|
data/lib/pgbus/active_job/executor.rb
CHANGED
|
@@ -161,6 +161,36 @@ module Pgbus
|
|
|
161
161
|
error: error,
|
|
162
162
|
retry_count: [message.read_ct.to_i - 1, 0].max
|
|
163
163
|
)
|
|
164
|
+
|
|
165
|
+
apply_retry_backoff(message, queue_name, payload)
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
# Extend the message's visibility timeout with exponential backoff
|
|
169
|
+
# so retries aren't all bunched at the default flat VT interval.
|
|
170
|
+
# Skipped on the first read (read_ct=1) — that's the initial
|
|
171
|
+
# attempt, not a retry.
|
|
172
|
+
def apply_retry_backoff(message, queue_name, payload)
|
|
173
|
+
attempt = message.read_ct.to_i - 1
|
|
174
|
+
return if attempt < 1
|
|
175
|
+
|
|
176
|
+
job_class = resolve_job_class(payload)
|
|
177
|
+
delay = if job_class
|
|
178
|
+
RetryBackoff.compute_delay_for_job(job_class, attempt: attempt)
|
|
179
|
+
else
|
|
180
|
+
RetryBackoff.compute_delay(attempt: attempt)
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
client.set_visibility_timeout(queue_name, message.msg_id.to_i, vt: delay)
|
|
184
|
+
rescue StandardError => e
|
|
185
|
+
Pgbus.logger.debug { "[Pgbus] Retry backoff VT update failed: #{e.message}" }
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
def resolve_job_class(payload)
|
|
189
|
+
return unless payload.is_a?(Hash) && payload["job_class"]
|
|
190
|
+
|
|
191
|
+
payload["job_class"].constantize
|
|
192
|
+
rescue NameError
|
|
193
|
+
nil
|
|
164
194
|
end
|
|
165
195
|
|
|
166
196
|
def instrument(event_name, payload = {})
|
data/lib/pgbus/client/read_after.rb
CHANGED
|
@@ -24,6 +24,10 @@ module Pgbus
|
|
|
24
24
|
end
|
|
25
25
|
|
|
26
26
|
rows.map { |row| build_envelope(row) }
|
|
27
|
+
rescue StandardError => e
|
|
28
|
+
raise unless missing_stream_queue?(e, sanitized)
|
|
29
|
+
|
|
30
|
+
[]
|
|
27
31
|
end
|
|
28
32
|
|
|
29
33
|
def stream_current_msg_id(stream_name)
|
|
@@ -34,6 +38,10 @@ module Pgbus
|
|
|
34
38
|
conn.exec(sql).first.fetch("max").to_i
|
|
35
39
|
end
|
|
36
40
|
end
|
|
41
|
+
rescue StandardError => e
|
|
42
|
+
raise unless missing_stream_queue?(e, sanitized)
|
|
43
|
+
|
|
44
|
+
0
|
|
37
45
|
end
|
|
38
46
|
|
|
39
47
|
def stream_oldest_msg_id(stream_name)
|
|
@@ -50,10 +58,53 @@ module Pgbus
|
|
|
50
58
|
value&.to_i
|
|
51
59
|
end
|
|
52
60
|
end
|
|
61
|
+
rescue StandardError => e
|
|
62
|
+
raise unless missing_stream_queue?(e, sanitized)
|
|
63
|
+
|
|
64
|
+
nil
|
|
53
65
|
end
|
|
54
66
|
|
|
55
67
|
private
|
|
56
68
|
|
|
69
|
+
# True if +error+ is a PG::UndefinedTable (or an
|
|
70
|
+
# ActiveRecord::StatementInvalid wrapping one) complaining about
|
|
71
|
+
# the stream's own PGMQ queue table (pgmq.q_<sanitized> or
|
|
72
|
+
# pgmq.a_<sanitized>).
|
|
73
|
+
#
|
|
74
|
+
# The stream-watermark and replay SQL above run on every page render
|
|
75
|
+
# for streams like `pgbus_stream_from Current.user`, but the queue
|
|
76
|
+
# table is only created on the FIRST broadcast via
|
|
77
|
+
# `ensure_stream_queue`. On a fresh database the very first page
|
|
78
|
+
# render therefore reads from a table that doesn't exist yet —
|
|
79
|
+
# semantically, "no queue" means "no messages" and must translate
|
|
80
|
+
# to a 0 watermark / empty replay rather than an exception. Any
|
|
81
|
+
# OTHER UndefinedTable (wrong schema, typo, operator error) still
|
|
82
|
+
# propagates so real bugs don't get swallowed.
|
|
83
|
+
#
|
|
84
|
+
# See issues #101 and #104. The comparison is case-insensitive because
|
|
85
|
+
# Postgres downcases unquoted identifiers in its error output, while
|
|
86
|
+
# `sanitized` can contain uppercase characters for GlobalID-keyed streams
|
|
87
|
+
# (e.g. `pgbus_stream_Z2lkOi8vY29zbW9zL1VzZXIvMQ` from
|
|
88
|
+
# `pgbus_stream_from Current.user`). A case-sensitive substring match
|
|
89
|
+
# would miss the downcased relation name and let the exception escape.
|
|
90
|
+
#
|
|
91
|
+
# The regex uses `\b` word boundaries so `pgmq.q_<needle>` doesn't
|
|
92
|
+
# accidentally match longer related identifiers like
|
|
93
|
+
# `pgmq.q_<needle>_archive` — a PGMQ internal object we'd want to
|
|
94
|
+
# propagate as a real error rather than silently swallow.
|
|
95
|
+
def missing_stream_queue?(error, sanitized)
|
|
96
|
+
pg_error = pg_undefined_table?(error) ? error : error.cause
|
|
97
|
+
return false unless pg_undefined_table?(pg_error)
|
|
98
|
+
|
|
99
|
+
message = pg_error.message.to_s.downcase
|
|
100
|
+
needle = Regexp.escape(sanitized.downcase)
|
|
101
|
+
message.match?(/\bpgmq\.(?:q|a)_#{needle}\b/)
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def pg_undefined_table?(error)
|
|
105
|
+
defined?(::PG::UndefinedTable) && error.is_a?(::PG::UndefinedTable)
|
|
106
|
+
end
|
|
107
|
+
|
|
57
108
|
# Builds the union of live and archive tables. The outer ORDER BY + LIMIT
|
|
58
109
|
# ensures we never return more than `limit` rows total even if both
|
|
59
110
|
# subqueries hit it. The 'live'/'archive' constants are how the streamer
|
data/lib/pgbus/configuration.rb
CHANGED
|
@@ -37,6 +37,10 @@ module Pgbus
|
|
|
37
37
|
# Dead letter queue
|
|
38
38
|
attr_accessor :max_retries
|
|
39
39
|
|
|
40
|
+
# Retry backoff for the VT-based retry path (unhandled exceptions).
|
|
41
|
+
# Jobs can override these per-class via Pgbus::RetryBackoff::JobMixin.
|
|
42
|
+
attr_accessor :retry_backoff, :retry_backoff_max, :retry_backoff_jitter
|
|
43
|
+
|
|
40
44
|
# Priority queues
|
|
41
45
|
attr_accessor :priority_levels, :default_priority
|
|
42
46
|
|
|
@@ -82,7 +86,8 @@ module Pgbus
|
|
|
82
86
|
|
|
83
87
|
# Web dashboard
|
|
84
88
|
attr_accessor :web_auth, :web_refresh_interval, :web_per_page, :web_live_updates, :web_data_source,
|
|
85
|
-
:insights_default_minutes, :base_controller_class, :return_to_app_url
|
|
89
|
+
:insights_default_minutes, :base_controller_class, :return_to_app_url,
|
|
90
|
+
:metrics_enabled
|
|
86
91
|
|
|
87
92
|
# Streams (turbo-rails replacement, SSE-based)
|
|
88
93
|
attr_accessor :streams_enabled, :streams_queue_prefix, :streams_signed_name_secret,
|
|
@@ -116,6 +121,9 @@ module Pgbus
|
|
|
116
121
|
@circuit_breaker_enabled = true
|
|
117
122
|
|
|
118
123
|
@max_retries = 5
|
|
124
|
+
@retry_backoff = 5 # seconds — first VT-retry delay
|
|
125
|
+
@retry_backoff_max = 300 # 5 minutes cap
|
|
126
|
+
@retry_backoff_jitter = 0.15
|
|
119
127
|
|
|
120
128
|
@priority_levels = nil
|
|
121
129
|
@default_priority = 1
|
|
@@ -157,6 +165,7 @@ module Pgbus
|
|
|
157
165
|
@insights_default_minutes = 30 * 24 * 60 # 30 days
|
|
158
166
|
@base_controller_class = "::ActionController::Base"
|
|
159
167
|
@return_to_app_url = nil
|
|
168
|
+
@metrics_enabled = true
|
|
160
169
|
|
|
161
170
|
@streams_enabled = true
|
|
162
171
|
@streams_queue_prefix = "pgbus_stream"
|
|
@@ -189,8 +198,7 @@ module Pgbus
|
|
|
189
198
|
|
|
190
199
|
def queue_name(name)
|
|
191
200
|
full = "#{queue_prefix}_#{name}"
|
|
192
|
-
QueueNameValidator.validate!(full)
|
|
193
|
-
full
|
|
201
|
+
QueueNameValidator.normalize(full)
|
|
194
202
|
end
|
|
195
203
|
|
|
196
204
|
def dead_letter_queue_name(name)
|
|
@@ -227,6 +235,11 @@ module Pgbus
|
|
|
227
235
|
raise ArgumentError, "polling_interval must be > 0" unless polling_interval.is_a?(Numeric) && polling_interval.positive?
|
|
228
236
|
raise ArgumentError, "visibility_timeout must be > 0" unless visibility_timeout.is_a?(Numeric) && visibility_timeout.positive?
|
|
229
237
|
raise ArgumentError, "max_retries must be >= 0" unless max_retries.is_a?(Integer) && max_retries >= 0
|
|
238
|
+
raise ArgumentError, "retry_backoff must be > 0" unless retry_backoff.is_a?(Numeric) && retry_backoff.positive?
|
|
239
|
+
raise ArgumentError, "retry_backoff_max must be > 0" unless retry_backoff_max.is_a?(Numeric) && retry_backoff_max.positive?
|
|
240
|
+
unless retry_backoff_jitter.is_a?(Numeric) && retry_backoff_jitter >= 0 && retry_backoff_jitter <= 1
|
|
241
|
+
raise ArgumentError, "retry_backoff_jitter must be between 0 and 1"
|
|
242
|
+
end
|
|
230
243
|
|
|
231
244
|
Array(workers).each do |w|
|
|
232
245
|
threads = w[:threads] || w["threads"] || 5
|
data/lib/pgbus/process/worker.rb
CHANGED
|
@@ -87,6 +87,11 @@ module Pgbus
|
|
|
87
87
|
|
|
88
88
|
WILDCARD_REFRESH_INTERVAL = 30 # seconds
|
|
89
89
|
|
|
90
|
+
# Matches the physical queue name inside a "relation \"pgmq.q_foo\" does
|
|
91
|
+
# not exist" error. Frozen module constant to avoid recompiling the
|
|
92
|
+
# regex on every queue-missing error in hot fetch/read paths.
|
|
93
|
+
MISSING_QUEUE_REGEX = /pgmq\.q_(\w+)/
|
|
94
|
+
|
|
90
95
|
private
|
|
91
96
|
|
|
92
97
|
def claim_and_execute
|
|
@@ -255,8 +260,8 @@ module Pgbus
|
|
|
255
260
|
# Extract the queue name from the error and remove it from the active list.
|
|
256
261
|
def evict_missing_queues(error)
|
|
257
262
|
prefix = "#{config.queue_prefix}_"
|
|
258
|
-
if error.message
|
|
259
|
-
physical_name =
|
|
263
|
+
if (match = MISSING_QUEUE_REGEX.match(error.message))
|
|
264
|
+
physical_name = match[1]
|
|
260
265
|
logical_name = physical_name.delete_prefix(prefix)
|
|
261
266
|
if @queues.delete(logical_name)
|
|
262
267
|
Pgbus.logger.warn { "[Pgbus] Evicted deleted queue '#{logical_name}' (#{physical_name}) from worker" }
|
data/lib/pgbus/queue_name_validator.rb
CHANGED
|
@@ -34,6 +34,23 @@ module Pgbus
|
|
|
34
34
|
name
|
|
35
35
|
end
|
|
36
36
|
|
|
37
|
+
# Normalizes a queue name by replacing common separators (hyphens, dots)
|
|
38
|
+
# with underscores, stripping remaining invalid characters, and collapsing
|
|
39
|
+
# consecutive underscores. Use this for names from external sources
|
|
40
|
+
# (e.g., Turbo stream names like "hotwire-livereload") where the intent
|
|
41
|
+
# is to derive a valid PGMQ queue name that preserves readability.
|
|
42
|
+
def normalize(name)
|
|
43
|
+
name = name.to_s
|
|
44
|
+
return validate!(name) if VALID_QUEUE_NAME_PATTERN.match?(name)
|
|
45
|
+
|
|
46
|
+
normalized = name.gsub(/[-.]/, "_") # hyphens/dots → underscores
|
|
47
|
+
.gsub(/[^a-zA-Z0-9_]/, "") # strip remaining invalid chars
|
|
48
|
+
.gsub(/_+/, "_") # collapse consecutive underscores
|
|
49
|
+
.gsub(/\A_|_\z/, "") # strip leading/trailing underscores
|
|
50
|
+
validate!(normalized)
|
|
51
|
+
normalized
|
|
52
|
+
end
|
|
53
|
+
|
|
37
54
|
# Sanitizes a queue name by removing invalid characters, then validates.
|
|
38
55
|
# Use this for names from untrusted sources (e.g., URL params).
|
|
39
56
|
def sanitize!(name)
|
data/lib/pgbus/retry_backoff.rb
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pgbus
|
|
4
|
+
# Computes exponential backoff delays for the VT-based retry path
|
|
5
|
+
# (unhandled exceptions that fall through ActiveJob's retry_on).
|
|
6
|
+
#
|
|
7
|
+
# The formula mirrors the CircuitBreaker pattern already used
|
|
8
|
+
# elsewhere in pgbus: base * 2^(attempt-1), capped at max,
|
|
9
|
+
# with optional jitter to prevent thundering-herd retries.
|
|
10
|
+
#
|
|
11
|
+
# Jobs can override the global config via the pgbus_retry_backoff
|
|
12
|
+
# class-level DSL (see JobMixin below).
|
|
13
|
+
module RetryBackoff
|
|
14
|
+
# Mixin for ActiveJob classes to declare per-job backoff config.
|
|
15
|
+
#
|
|
16
|
+
# class ImportJob < ApplicationJob
|
|
17
|
+
# include Pgbus::RetryBackoff::JobMixin
|
|
18
|
+
# pgbus_retry_backoff base: 15, max: 120, jitter: 0.2
|
|
19
|
+
# end
|
|
20
|
+
module JobMixin
|
|
21
|
+
extend ActiveSupport::Concern
|
|
22
|
+
|
|
23
|
+
class_methods do
|
|
24
|
+
def pgbus_retry_backoff(base: nil, max: nil, jitter: nil)
|
|
25
|
+
raise ArgumentError, "retry_backoff base must be > 0" if !base.nil? && (!base.is_a?(Numeric) || base <= 0)
|
|
26
|
+
raise ArgumentError, "retry_backoff max must be > 0" if !max.nil? && (!max.is_a?(Numeric) || max <= 0)
|
|
27
|
+
if !jitter.nil? && (!jitter.is_a?(Numeric) || jitter.negative? || jitter > 1)
|
|
28
|
+
raise ArgumentError, "retry_backoff jitter must be between 0 and 1"
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
@pgbus_retry_backoff = {
|
|
32
|
+
base: base,
|
|
33
|
+
max: max,
|
|
34
|
+
jitter: jitter
|
|
35
|
+
}.compact.freeze
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def pgbus_retry_backoff_config
|
|
39
|
+
@pgbus_retry_backoff
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
class << self
|
|
45
|
+
# Compute delay for a specific job class, falling back to global
|
|
46
|
+
# config for any options not overridden at the job level.
|
|
47
|
+
def compute_delay_for_job(job_class, attempt:, jitter: nil)
|
|
48
|
+
overrides = (job_class.respond_to?(:pgbus_retry_backoff_config) &&
|
|
49
|
+
job_class.pgbus_retry_backoff_config) || {}
|
|
50
|
+
|
|
51
|
+
config = Pgbus.configuration
|
|
52
|
+
compute_delay(
|
|
53
|
+
attempt: attempt,
|
|
54
|
+
base: overrides[:base] || config.retry_backoff,
|
|
55
|
+
max: overrides[:max] || config.retry_backoff_max,
|
|
56
|
+
jitter: jitter || overrides[:jitter] || config.retry_backoff_jitter
|
|
57
|
+
)
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
# Core backoff computation.
|
|
61
|
+
#
|
|
62
|
+
# @param attempt [Integer] 1-based retry attempt number (read_ct - 1)
|
|
63
|
+
# @param base [Numeric] base delay in seconds (default: config.retry_backoff)
|
|
64
|
+
# @param max [Numeric] maximum delay cap (default: config.retry_backoff_max)
|
|
65
|
+
# @param jitter [Numeric] jitter factor 0..1 (default: config.retry_backoff_jitter)
|
|
66
|
+
# @return [Integer] delay in seconds
|
|
67
|
+
def compute_delay(attempt:, base: nil, max: nil, jitter: nil)
|
|
68
|
+
config = Pgbus.configuration
|
|
69
|
+
base ||= config.retry_backoff
|
|
70
|
+
max ||= config.retry_backoff_max
|
|
71
|
+
jitter = config.retry_backoff_jitter if jitter.nil?
|
|
72
|
+
|
|
73
|
+
exponent = [attempt - 1, 0].max
|
|
74
|
+
delay = base * (2**exponent)
|
|
75
|
+
delay = [delay, max].min
|
|
76
|
+
|
|
77
|
+
[apply_jitter(delay, jitter), max].min
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
private
|
|
81
|
+
|
|
82
|
+
def apply_jitter(delay, jitter)
|
|
83
|
+
return delay.to_i if jitter.nil? || jitter.zero?
|
|
84
|
+
|
|
85
|
+
spread = delay * jitter
|
|
86
|
+
jittered = delay + rand(-spread..spread)
|
|
87
|
+
[jittered.round, 0].max
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
end
|
data/lib/pgbus/streams.rb
CHANGED
|
@@ -38,7 +38,7 @@ module Pgbus
|
|
|
38
38
|
# Broadcasts a Turbo Stream HTML payload through the pgbus streamer.
|
|
39
39
|
# PGMQ's `message` column is JSONB, so raw HTML strings can't be passed
|
|
40
40
|
# directly. We wrap as `{"html": "..."}` on the way in and unwrap in
|
|
41
|
-
# Pgbus::Web::Streamer::Dispatcher before delivering to the SSE client.
|
|
41
|
+
# Pgbus::Web::Streamer::StreamEventDispatcher before delivering to the SSE client.
|
|
42
42
|
# Callers pass a plain HTML string; the wrapping is an implementation
|
|
43
43
|
# detail.
|
|
44
44
|
#
|
data/lib/pgbus/version.rb
CHANGED
|
data/lib/pgbus/web/data_source.rb
CHANGED
|
@@ -875,28 +875,14 @@ module Pgbus
|
|
|
875
875
|
|
|
876
876
|
# Extract uniqueness key from a JSON payload string and release its lock.
|
|
877
877
|
def release_lock_for_payload(payload_str)
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
payload = payload_str.is_a?(String) ? JSON.parse(payload_str) : payload_str
|
|
881
|
-
key = payload[Uniqueness::METADATA_KEY]
|
|
878
|
+
key = extract_uniqueness_key_from_payload_str(payload_str)
|
|
882
879
|
UniquenessKey.release!(key) if key
|
|
883
|
-
rescue JSON::ParserError => e
|
|
884
|
-
Pgbus.logger.debug { "[Pgbus::Web] Error parsing payload for lock release: #{e.message}" }
|
|
885
880
|
end
|
|
886
881
|
|
|
887
882
|
# Extract uniqueness keys from a collection of formatted messages and
|
|
888
883
|
# release all associated locks in a single query.
|
|
889
884
|
def release_locks_for_messages(messages)
|
|
890
|
-
keys = messages.filter_map do |m|
|
|
891
|
-
payload = m[:message]
|
|
892
|
-
next unless payload
|
|
893
|
-
|
|
894
|
-
parsed = payload.is_a?(String) ? JSON.parse(payload) : payload
|
|
895
|
-
parsed[Uniqueness::METADATA_KEY]
|
|
896
|
-
rescue JSON::ParserError
|
|
897
|
-
nil
|
|
898
|
-
end
|
|
899
|
-
|
|
885
|
+
keys = messages.filter_map { |m| extract_uniqueness_key_from_payload_str(m[:message]) }
|
|
900
886
|
UniquenessKey.where(lock_key: keys).delete_all if keys.any?
|
|
901
887
|
rescue StandardError => e
|
|
902
888
|
Pgbus.logger.debug { "[Pgbus::Web] Error releasing locks for messages: #{e.message}" }
|
|
@@ -908,18 +894,28 @@ module Pgbus
|
|
|
908
894
|
"SELECT payload FROM pgbus_failed_events", "Pgbus Collect Failed Keys"
|
|
909
895
|
)
|
|
910
896
|
|
|
911
|
-
keys = rows.to_a.filter_map do |row|
|
|
912
|
-
payload = JSON.parse(row["payload"])
|
|
913
|
-
payload[Uniqueness::METADATA_KEY]
|
|
914
|
-
rescue JSON::ParserError
|
|
915
|
-
nil
|
|
916
|
-
end
|
|
917
|
-
|
|
897
|
+
keys = rows.to_a.filter_map { |row| extract_uniqueness_key_from_payload_str(row["payload"]) }
|
|
918
898
|
UniquenessKey.where(lock_key: keys).delete_all if keys.any?
|
|
919
899
|
rescue StandardError => e
|
|
920
900
|
Pgbus.logger.debug { "[Pgbus::Web] Error releasing locks for failed events: #{e.message}" }
|
|
921
901
|
end
|
|
922
902
|
|
|
903
|
+
# Single unwrap point for PGMQ message / failed_event payload strings.
|
|
904
|
+
# Accepts a raw JSON string or an already-parsed Hash and returns the
|
|
905
|
+
# uniqueness metadata key, or nil when the payload is blank, unparseable,
|
|
906
|
+
# or carries no uniqueness metadata. Parse errors are swallowed at debug
|
|
907
|
+
# level because callers treat missing keys and malformed payloads
|
|
908
|
+
# identically (no lock to release).
|
|
909
|
+
def extract_uniqueness_key_from_payload_str(payload_str)
|
|
910
|
+
return nil unless payload_str
|
|
911
|
+
|
|
912
|
+
payload = payload_str.is_a?(String) ? JSON.parse(payload_str) : payload_str
|
|
913
|
+
payload[Uniqueness::METADATA_KEY]
|
|
914
|
+
rescue JSON::ParserError => e
|
|
915
|
+
Pgbus.logger.debug { "[Pgbus::Web] Error parsing payload for uniqueness key: #{e.message}" }
|
|
916
|
+
nil
|
|
917
|
+
end
|
|
918
|
+
|
|
923
919
|
# Archive the queue message a failed_event row points to. Idempotent —
|
|
924
920
|
# silently no-ops if the message no longer exists in the queue.
|
|
925
921
|
def archive_failed_message(event)
|
data/lib/pgbus/web/metrics_serializer.rb
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pgbus
|
|
4
|
+
module Web
|
|
5
|
+
# Converts DataSource output into Prometheus text exposition format
|
|
6
|
+
# (Content-Type: text/plain; version=0.0.4; charset=utf-8).
|
|
7
|
+
#
|
|
8
|
+
# Each metric family gets a HELP line, a TYPE line, and one or more
|
|
9
|
+
# sample lines. Labels are double-quoted per the Prometheus spec.
|
|
10
|
+
# All timing values are converted from milliseconds to seconds.
|
|
11
|
+
#
|
|
12
|
+
# Resilient by design: each section rescues StandardError independently
|
|
13
|
+
# so a failure in one data source method doesn't blank the entire
|
|
14
|
+
# scrape response.
|
|
15
|
+
class MetricsSerializer
|
|
16
|
+
def initialize(data_source)
|
|
17
|
+
@data_source = data_source
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def serialize
|
|
21
|
+
lines = []
|
|
22
|
+
append_queue_metrics(lines)
|
|
23
|
+
append_job_metrics(lines)
|
|
24
|
+
append_process_metrics(lines)
|
|
25
|
+
append_summary_metrics(lines)
|
|
26
|
+
append_stream_metrics(lines)
|
|
27
|
+
"#{lines.join("\n")}\n"
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
private
|
|
31
|
+
|
|
32
|
+
def append_queue_metrics(lines)
|
|
33
|
+
queues = @data_source.queues_with_metrics
|
|
34
|
+
return if queues.empty?
|
|
35
|
+
|
|
36
|
+
gauge(lines, "pgbus_queue_depth", "Number of messages in the queue (including invisible)") do
|
|
37
|
+
queues.map { |q| [q[:queue_length], { queue: q[:name] }] }
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
gauge(lines, "pgbus_queue_visible_depth", "Number of visible (ready to read) messages") do
|
|
41
|
+
queues.map { |q| [q[:queue_visible_length], { queue: q[:name] }] }
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
gauge(lines, "pgbus_queue_total_messages", "Total messages ever enqueued") do
|
|
45
|
+
queues.map { |q| [q[:total_messages], { queue: q[:name] }] }
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
gauge(lines, "pgbus_queue_oldest_message_age_seconds", "Age of the oldest message in seconds") do
|
|
49
|
+
queues.filter_map do |q|
|
|
50
|
+
next unless q[:oldest_msg_age_sec]
|
|
51
|
+
|
|
52
|
+
[q[:oldest_msg_age_sec], { queue: q[:name] }]
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
gauge(lines, "pgbus_queue_paused", "Whether the queue is paused (1) or active (0)") do
|
|
57
|
+
queues.map { |q| [q[:paused] ? 1 : 0, { queue: q[:name] }] }
|
|
58
|
+
end
|
|
59
|
+
rescue StandardError => e
|
|
60
|
+
Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing queue metrics: #{e.message}" }
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def append_job_metrics(lines)
|
|
64
|
+
counts = @data_source.job_status_counts
|
|
65
|
+
unless counts.empty?
|
|
66
|
+
gauge(lines, "pgbus_jobs_total", "Number of jobs by status in the stats window") do
|
|
67
|
+
counts.map { |status, count| [count, { status: status }] }
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
summary = @data_source.job_stats_summary
|
|
72
|
+
if summary[:total].positive?
|
|
73
|
+
gauge(lines, "pgbus_job_duration_avg_seconds", "Average job duration in seconds") do
|
|
74
|
+
[[ms_to_s(summary[:avg_duration_ms])]]
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
gauge(lines, "pgbus_job_duration_max_seconds", "Maximum job duration in seconds") do
|
|
78
|
+
[[ms_to_s(summary[:max_duration_ms])]]
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
return unless Pgbus::JobStat.latency_columns? && summary[:avg_latency_ms]
|
|
83
|
+
|
|
84
|
+
gauge(lines, "pgbus_job_enqueue_latency_seconds", "Enqueue latency percentiles in seconds") do
|
|
85
|
+
[
|
|
86
|
+
[ms_to_s(summary[:p50_latency_ms]), { quantile: "0.5" }],
|
|
87
|
+
[ms_to_s(summary[:p95_latency_ms]), { quantile: "0.95" }],
|
|
88
|
+
[ms_to_s(summary[:p99_latency_ms]), { quantile: "0.99" }]
|
|
89
|
+
]
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
gauge(lines, "pgbus_job_avg_retries", "Average retry count per job") do
|
|
93
|
+
[[summary[:avg_retries]]]
|
|
94
|
+
end
|
|
95
|
+
rescue StandardError => e
|
|
96
|
+
Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing job metrics: #{e.message}" }
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
def append_process_metrics(lines)
|
|
100
|
+
count = @data_source.processes.count
|
|
101
|
+
gauge(lines, "pgbus_active_processes", "Number of active pgbus worker processes") do
|
|
102
|
+
[[count]]
|
|
103
|
+
end
|
|
104
|
+
rescue StandardError => e
|
|
105
|
+
Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing process metrics: #{e.message}" }
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
def append_summary_metrics(lines)
|
|
109
|
+
stats = @data_source.summary_stats
|
|
110
|
+
gauge(lines, "pgbus_failed_events_total", "Total failed events") do
|
|
111
|
+
[[stats[:failed_count]]]
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
gauge(lines, "pgbus_dlq_depth", "Total messages across all dead letter queues") do
|
|
115
|
+
[[stats[:dlq_depth]]]
|
|
116
|
+
end
|
|
117
|
+
rescue StandardError => e
|
|
118
|
+
Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing summary metrics: #{e.message}" }
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def append_stream_metrics(lines)
|
|
122
|
+
return unless @data_source.stream_stats_available?
|
|
123
|
+
|
|
124
|
+
summary = @data_source.stream_stats_summary
|
|
125
|
+
gauge(lines, "pgbus_stream_events_total", "Stream events by type in the stats window") do
|
|
126
|
+
[
|
|
127
|
+
[summary[:broadcasts], { event_type: "broadcast" }],
|
|
128
|
+
[summary[:connects], { event_type: "connect" }],
|
|
129
|
+
[summary[:disconnects], { event_type: "disconnect" }]
|
|
130
|
+
]
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
gauge(lines, "pgbus_stream_active_connections", "Estimated active SSE connections") do
|
|
134
|
+
[[summary[:active_estimate]]]
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
gauge(lines, "pgbus_stream_avg_fanout", "Average broadcast fanout (subscribers per broadcast)") do
|
|
138
|
+
[[summary[:avg_fanout]]]
|
|
139
|
+
end
|
|
140
|
+
rescue StandardError => e
|
|
141
|
+
Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing stream metrics: #{e.message}" }
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
# Emits a Prometheus gauge metric family. The block must return an array
|
|
145
|
+
# of [value] or [value, { label: "val" }] pairs.
|
|
146
|
+
def gauge(lines, name, help)
|
|
147
|
+
samples = yield
|
|
148
|
+
return if samples.empty?
|
|
149
|
+
|
|
150
|
+
lines << "# HELP #{name} #{help}"
|
|
151
|
+
lines << "# TYPE #{name} gauge"
|
|
152
|
+
samples.each do |value, labels|
|
|
153
|
+
lines << format_sample(name, value, labels)
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
def format_sample(name, value, labels = nil)
|
|
158
|
+
if labels && !labels.empty?
|
|
159
|
+
label_str = labels.map { |k, v| "#{k}=\"#{v}\"" }.join(",")
|
|
160
|
+
"#{name}{#{label_str}} #{format_value(value)}"
|
|
161
|
+
else
|
|
162
|
+
"#{name} #{format_value(value)}"
|
|
163
|
+
end
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
def format_value(value)
|
|
167
|
+
value.is_a?(Float) ? value.to_s : value.to_i.to_s
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
def ms_to_s(milliseconds)
|
|
171
|
+
(milliseconds.to_f / 1000).round(4)
|
|
172
|
+
end
|
|
173
|
+
end
|
|
174
|
+
end
|
|
175
|
+
end
|
data/lib/pgbus/web/streamer/heartbeat.rb
CHANGED
|
@@ -13,7 +13,7 @@ module Pgbus
|
|
|
13
13
|
# SSE clients.
|
|
14
14
|
#
|
|
15
15
|
# 2. Mark connections that have been idle longer than the
|
|
16
|
-
# configured idle_timeout as dead. The Dispatcher's next pass
|
|
16
|
+
# configured idle_timeout as dead. The StreamEventDispatcher's next pass
|
|
17
17
|
# picks them up via its disconnect path.
|
|
18
18
|
#
|
|
19
19
|
# 3. Post a DisconnectMessage for any connection already flagged
|
|
@@ -97,7 +97,7 @@ module Pgbus
|
|
|
97
97
|
end
|
|
98
98
|
|
|
99
99
|
def enqueue_disconnect(connection)
|
|
100
|
-
@queue << Dispatcher::DisconnectMessage.new(connection: connection)
|
|
100
|
+
@queue << StreamEventDispatcher::DisconnectMessage.new(connection: connection)
|
|
101
101
|
end
|
|
102
102
|
end
|
|
103
103
|
end
|
data/lib/pgbus/web/streamer/instance.rb
CHANGED
|
@@ -43,7 +43,7 @@ module Pgbus
|
|
|
43
43
|
health_check_ms: @config.streams_listen_health_check_ms,
|
|
44
44
|
logger: @logger
|
|
45
45
|
)
|
|
46
|
-
@dispatcher = Dispatcher.new(
|
|
46
|
+
@dispatcher = StreamEventDispatcher.new(
|
|
47
47
|
client: @client,
|
|
48
48
|
registry: @registry,
|
|
49
49
|
listener: @listener,
|
|
@@ -92,7 +92,7 @@ module Pgbus
|
|
|
92
92
|
return
|
|
93
93
|
end
|
|
94
94
|
|
|
95
|
-
@dispatch_queue << Dispatcher::ConnectMessage.new(connection: connection)
|
|
95
|
+
@dispatch_queue << StreamEventDispatcher::ConnectMessage.new(connection: connection)
|
|
96
96
|
end
|
|
97
97
|
end
|
|
98
98
|
|
data/lib/pgbus/web/streamer/{dispatcher.rb → stream_event_dispatcher.rb}
CHANGED
|
@@ -26,8 +26,12 @@ module Pgbus
|
|
|
26
26
|
#
|
|
27
27
|
# All state ownership lives on this one thread: the registry is
|
|
28
28
|
# thread-safe (Phase 2.1) but the in-flight buffers are local to
|
|
29
|
-
# the
|
|
30
|
-
|
|
29
|
+
# the dispatcher and accessed only from this thread, so no locks.
|
|
30
|
+
#
|
|
31
|
+
# Named StreamEventDispatcher (rather than just "Dispatcher") to
|
|
32
|
+
# disambiguate from Pgbus::Process::Dispatcher, which is an
|
|
33
|
+
# unrelated worker-side pool coordinator. See issue #98 item 8.
|
|
34
|
+
class StreamEventDispatcher
|
|
31
35
|
WakeMessage = Listener::WakeMessage
|
|
32
36
|
ConnectMessage = Data.define(:connection)
|
|
33
37
|
DisconnectMessage = Data.define(:connection)
|
|
@@ -110,7 +114,7 @@ module Pgbus
|
|
|
110
114
|
# than calling Thread#kill, which leaves IO state corrupt.
|
|
111
115
|
# The orphaned thread will exit on its own once the blocking
|
|
112
116
|
# call returns and it sees @running == false on the next loop.
|
|
113
|
-
@logger.warn { "[Pgbus::Streamer::Dispatcher] thread did not terminate within 5s" }
|
|
117
|
+
@logger.warn { "[Pgbus::Streamer::StreamEventDispatcher] thread did not terminate within 5s" }
|
|
114
118
|
end
|
|
115
119
|
@thread = nil
|
|
116
120
|
self
|
|
@@ -141,7 +145,7 @@ module Pgbus
|
|
|
141
145
|
end
|
|
142
146
|
end
|
|
143
147
|
rescue StandardError => e
|
|
144
|
-
@logger.error { "[Pgbus::Streamer::Dispatcher] crashed: #{e.class}: #{e.message}" }
|
|
148
|
+
@logger.error { "[Pgbus::Streamer::StreamEventDispatcher] crashed: #{e.class}: #{e.message}" }
|
|
145
149
|
raise
|
|
146
150
|
end
|
|
147
151
|
|
|
@@ -176,7 +180,7 @@ module Pgbus
|
|
|
176
180
|
when ConnectMessage then handle_connect(msg)
|
|
177
181
|
when DisconnectMessage then handle_disconnect(msg)
|
|
178
182
|
else
|
|
179
|
-
@logger.warn { "[Pgbus::Streamer::Dispatcher] unknown message: #{msg.class}" }
|
|
183
|
+
@logger.warn { "[Pgbus::Streamer::StreamEventDispatcher] unknown message: #{msg.class}" }
|
|
180
184
|
end
|
|
181
185
|
rescue StandardError => e
|
|
182
186
|
# Intentionally swallows per-message failures so one bad
|
|
@@ -184,7 +188,7 @@ module Pgbus
|
|
|
184
188
|
# connected client. The top-level run_loop rescue (below)
|
|
185
189
|
# does re-raise — a crash *between* messages is a real bug
|
|
186
190
|
# and the supervisor should see it.
|
|
187
|
-
@logger.error { "[Pgbus::Streamer::Dispatcher] handling #{msg.class} raised #{e.class}: #{e.message}" }
|
|
191
|
+
@logger.error { "[Pgbus::Streamer::StreamEventDispatcher] handling #{msg.class} raised #{e.class}: #{e.message}" }
|
|
188
192
|
end
|
|
189
193
|
|
|
190
194
|
def handle_wake(msg)
|
|
@@ -310,7 +314,7 @@ module Pgbus
|
|
|
310
314
|
@scanned_cursor.delete(connection)
|
|
311
315
|
cleanup_stream_if_unused(stream)
|
|
312
316
|
connection.mark_dead!
|
|
313
|
-
@logger.error { "[Pgbus::Streamer::Dispatcher] connect failed for #{connection.id}: #{e.class}: #{e.message}" }
|
|
317
|
+
@logger.error { "[Pgbus::Streamer::StreamEventDispatcher] connect failed for #{connection.id}: #{e.class}: #{e.message}" }
|
|
314
318
|
end
|
|
315
319
|
|
|
316
320
|
def handle_disconnect(msg)
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: pgbus
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.1
|
|
4
|
+
version: 0.6.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Mikael Henriksson
|
|
@@ -122,6 +122,7 @@ files:
|
|
|
122
122
|
- README.md
|
|
123
123
|
- Rakefile
|
|
124
124
|
- app/controllers/pgbus/api/insights_controller.rb
|
|
125
|
+
- app/controllers/pgbus/api/metrics_controller.rb
|
|
125
126
|
- app/controllers/pgbus/api/stats_controller.rb
|
|
126
127
|
- app/controllers/pgbus/application_controller.rb
|
|
127
128
|
- app/controllers/pgbus/dashboard_controller.rb
|
|
@@ -283,6 +284,7 @@ files:
|
|
|
283
284
|
- lib/pgbus/recurring/schedule.rb
|
|
284
285
|
- lib/pgbus/recurring/scheduler.rb
|
|
285
286
|
- lib/pgbus/recurring/task.rb
|
|
287
|
+
- lib/pgbus/retry_backoff.rb
|
|
286
288
|
- lib/pgbus/serializer.rb
|
|
287
289
|
- lib/pgbus/stat_buffer.rb
|
|
288
290
|
- lib/pgbus/streams.rb
|
|
@@ -297,15 +299,16 @@ files:
|
|
|
297
299
|
- lib/pgbus/version.rb
|
|
298
300
|
- lib/pgbus/web/authentication.rb
|
|
299
301
|
- lib/pgbus/web/data_source.rb
|
|
302
|
+
- lib/pgbus/web/metrics_serializer.rb
|
|
300
303
|
- lib/pgbus/web/stream_app.rb
|
|
301
304
|
- lib/pgbus/web/streamer.rb
|
|
302
305
|
- lib/pgbus/web/streamer/connection.rb
|
|
303
|
-
- lib/pgbus/web/streamer/dispatcher.rb
|
|
304
306
|
- lib/pgbus/web/streamer/heartbeat.rb
|
|
305
307
|
- lib/pgbus/web/streamer/instance.rb
|
|
306
308
|
- lib/pgbus/web/streamer/io_writer.rb
|
|
307
309
|
- lib/pgbus/web/streamer/listener.rb
|
|
308
310
|
- lib/pgbus/web/streamer/registry.rb
|
|
311
|
+
- lib/pgbus/web/streamer/stream_event_dispatcher.rb
|
|
309
312
|
- lib/puma/plugin/pgbus_streams.rb
|
|
310
313
|
- lib/tasks/pgbus_pgmq.rake
|
|
311
314
|
- lib/tasks/pgbus_streams.rake
|