pgbus 0.6.2 → 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3dea01adf92fdfda132e0ce03189733ca9d3e4a60e26517abe81d53d64b02b19
4
- data.tar.gz: f539018648e02a5ea4753e058a3fe383b26d7f09a00d53124489eb640e1a2cf8
3
+ metadata.gz: 29a8cea3845cee18f561a257dd07a7793b44b2b7f49cdfa7cf520de2bf95cde2
4
+ data.tar.gz: bbc2e2bddf762b5e8add4bf5d83767afa089aaf2345fbb5c0c3f0018e8be62df
5
5
  SHA512:
6
- metadata.gz: a125013c9c84b87a276f7dce8a6bb3c5c1df7c5299dd34c9addcca030da0d2983dab7c61e0576eed4de813b25fc841a632a74baaaa05a0239d758bc651f9bdcc
7
- data.tar.gz: 2610ca4fb946d381ed594fb56479c7ceb56d6dc46b2f492ce6bc0a3e966af6647ddb83a3dfb679bf4dc4d6e0e152362464f6a62706f4d4e6b29646be6f4af6f7
6
+ metadata.gz: c539792bc228074715f4903a34cfd2e84d7145d08676049dc7b8ef3ef26edc1f66f6fab8c3f5f527ca8b51e84570217d36499e810e6290fb96d1f76575e711e2
7
+ data.tar.gz: 20bd68684e632377de4691845bf26e92848095aa9cf528a7d401931d239b133354d491f90f32b3c294ca03240760d746624f408d50667a562062e563ce484c7e
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
module Pgbus
  module Api
    # Exposes pgbus metrics in the Prometheus text exposition format.
    class MetricsController < ApplicationController
      # Content type announced for the Prometheus text format, version 0.0.4.
      PROMETHEUS_CONTENT_TYPE = "text/plain; version=0.0.4; charset=utf-8"

      # Renders the serialized metrics as plain text. Responds with
      # 404 Not Found when `metrics_enabled` is off in the configuration.
      def show
        if Pgbus.configuration.metrics_enabled
          payload = Web::MetricsSerializer.new(data_source).serialize
          render plain: payload, content_type: PROMETHEUS_CONTENT_TYPE
        else
          head(:not_found)
        end
      end
    end
  end
end
@@ -75,22 +75,34 @@ module Pgbus
75
75
  end
76
76
  end
77
77
 
78
- # Build the SSE endpoint URL by asking the engine where its
79
- # `:streams` mount point lives, then appending the signed name.
80
- # The base comes from Pgbus::Engine.routes.url_helpers.streams_path
81
- # so the URL follows whatever mount point the host app chose for
82
- # the engine ("/pgbus", "/admin/dashboard", etc.). A
83
- # `NoMethodError` fallback covers the test-only context where
84
- # the helper is included in a plain class outside a Rails
85
- # request and the engine's url_helpers aren't wired in.
78
+ # Build the SSE endpoint URL for the given signed stream name.
79
+ #
80
+ # Resolution order:
81
+ # 1. `config.streams_path` explicit override, useful when the
82
+ # engine is mounted behind an auth constraint but the SSE
83
+ # endpoint is mounted publicly at a separate path:
84
+ #
85
+ # # config/routes.rb
86
+ # authenticate :user, ->(u) { u.admin? } do
87
+ # mount Pgbus::Engine => "/admin/jobs"
88
+ # end
89
+ # mount Pgbus::Web::StreamApp.new => "/pgbus/streams"
90
+ #
91
+ # # config/initializers/pgbus.rb
92
+ # Pgbus.configure { |c| c.streams_path = "/pgbus/streams" }
93
+ #
94
+ # 2. Engine route helper — derives the path from wherever the
95
+ # host app mounted the engine.
96
+ #
97
+ # 3. Fallback `/pgbus/streams` — test-only context where the
98
+ # engine's url_helpers aren't wired in.
86
99
  def pgbus_stream_src(signed_name)
100
+ base = Pgbus.configuration.streams_path
101
+ return "#{base.delete_suffix("/")}/#{signed_name}" if base
102
+
87
103
  base = Pgbus::Engine.routes.url_helpers.streams_path
88
104
  "#{base}/#{signed_name}"
89
105
  rescue NameError
90
- # NameError covers both uninitialized-constant (Pgbus::Engine
91
- # not loaded, e.g. plain-Ruby unit specs) and NoMethodError
92
- # (a NameError subclass) when the routes helper chain isn't
93
- # wired in.
94
106
  "/pgbus/streams/#{signed_name}"
95
107
  end
96
108
 
@@ -30,21 +30,34 @@ module Pgbus
30
30
  # Memoized — intentionally never invalidated at runtime. If the
31
31
  # pgbus_job_stats migration runs while the app is already running,
32
32
  # a restart is required for stat recording to begin.
33
+ #
34
+ # We only memoize a *successful* probe. A transient error (PG
35
+ # hiccup during boot, connection refused during a failover) is
36
+ # treated as "don't know yet" — the next call retries. Caching
37
+ # false on the first hiccup would permanently disable job stats
38
+ # for the process lifetime, which is a worse failure mode than
39
+ # a few retries. See issue #98 / PR #91 (StreamStat fix).
33
40
  def self.table_exists?
34
41
  return @table_exists if defined?(@table_exists)
35
42
 
36
43
  @table_exists = connection.table_exists?(table_name)
37
- rescue StandardError
38
- @table_exists = false
44
+ rescue StandardError => e
45
+ Pgbus.logger.debug { "[Pgbus] Failed to check job stat table: #{e.message}" }
46
+ false
39
47
  end
40
48
 
41
49
  # Memoized — checks if the latency migration has been applied.
50
+ # Same transient-error handling as `table_exists?`: a failed
51
+ # probe is not cached, so a later successful probe can still
52
+ # enable latency recording.
42
53
  def self.latency_columns?
43
54
  return @latency_columns if defined?(@latency_columns)
55
+ return false unless table_exists?
44
56
 
45
- @latency_columns = table_exists? && column_names.include?("enqueue_latency_ms")
46
- rescue StandardError
47
- @latency_columns = false
57
+ @latency_columns = column_names.include?("enqueue_latency_ms")
58
+ rescue StandardError => e
59
+ Pgbus.logger.debug { "[Pgbus] Failed to check job stat latency columns: #{e.message}" }
60
+ false
48
61
  end
49
62
 
50
63
  # Throughput: jobs per minute bucketed by minute for the last N minutes
data/config/routes.rb CHANGED
@@ -82,6 +82,7 @@ Pgbus::Engine.routes.draw do
82
82
  namespace :api do
83
83
  get :stats, to: "stats#show"
84
84
  get :insights, to: "insights#show"
85
+ get :metrics, to: "metrics#show"
85
86
  end
86
87
 
87
88
  scope :frontend, controller: :frontends, defaults: { version: Pgbus::VERSION.tr(".", "-") } do
@@ -161,6 +161,36 @@ module Pgbus
161
161
  error: error,
162
162
  retry_count: [message.read_ct.to_i - 1, 0].max
163
163
  )
164
+
165
+ apply_retry_backoff(message, queue_name, payload)
166
+ end
167
+
168
+ # Extend the message's visibility timeout with exponential backoff
169
+ # so retries aren't all bunched at the default flat VT interval.
170
+ # Skipped on the first read (read_ct=1) — that's the initial
171
+ # attempt, not a retry.
172
def apply_retry_backoff(message, queue_name, payload)
  # read_ct counts reads, so the retry number is one less; the first
  # read (retry number 0) is the initial attempt and gets no backoff.
  retry_number = message.read_ct.to_i - 1
  return unless retry_number.positive?

  klass = resolve_job_class(payload)
  backoff = if klass
              RetryBackoff.compute_delay_for_job(klass, attempt: retry_number)
            else
              RetryBackoff.compute_delay(attempt: retry_number)
            end

  client.set_visibility_timeout(queue_name, message.msg_id.to_i, vt: backoff)
rescue StandardError => e
  # Best effort: a failed VT update is only logged at debug level.
  Pgbus.logger.debug { "[Pgbus] Retry backoff VT update failed: #{e.message}" }
end
187
+
188
def resolve_job_class(payload)
  # Only Hash payloads can carry a "job_class" entry.
  class_name = payload["job_class"] if payload.is_a?(Hash)
  class_name ? class_name.constantize : nil
rescue NameError
  # Unknown constant (or a value that cannot be constantized).
  nil
end
165
195
 
166
196
  def instrument(event_name, payload = {})
@@ -81,13 +81,24 @@ module Pgbus
81
81
  # OTHER UndefinedTable (wrong schema, typo, operator error) still
82
82
  # propagates so real bugs don't get swallowed.
83
83
  #
84
- # See issue #101.
84
+ # See issues #101 and #104. The comparison is case-insensitive because
85
+ # Postgres downcases unquoted identifiers in its error output, while
86
+ # `sanitized` can contain uppercase characters for GlobalID-keyed streams
87
+ # (e.g. `pgbus_stream_Z2lkOi8vY29zbW9zL1VzZXIvMQ` from
88
+ # `pgbus_stream_from Current.user`). A case-sensitive substring match
89
+ # would miss the downcased relation name and let the exception escape.
90
+ #
91
+ # The regex uses `\b` word boundaries so `pgmq.q_<needle>` doesn't
92
+ # accidentally match longer related identifiers like
93
+ # `pgmq.q_<needle>_archive` — a PGMQ internal object we'd want to
94
+ # propagate as a real error rather than silently swallow.
85
95
  def missing_stream_queue?(error, sanitized)
86
96
  pg_error = pg_undefined_table?(error) ? error : error.cause
87
97
  return false unless pg_undefined_table?(pg_error)
88
98
 
89
- message = pg_error.message.to_s
90
- message.include?("pgmq.q_#{sanitized}") || message.include?("pgmq.a_#{sanitized}")
99
+ message = pg_error.message.to_s.downcase
100
+ needle = Regexp.escape(sanitized.downcase)
101
+ message.match?(/\bpgmq\.(?:q|a)_#{needle}\b/)
91
102
  end
92
103
 
93
104
  def pg_undefined_table?(error)
@@ -37,6 +37,10 @@ module Pgbus
37
37
  # Dead letter queue
38
38
  attr_accessor :max_retries
39
39
 
40
+ # Retry backoff for the VT-based retry path (unhandled exceptions).
41
+ # Jobs can override these per-class via Pgbus::RetryBackoff::JobMixin.
42
+ attr_accessor :retry_backoff, :retry_backoff_max, :retry_backoff_jitter
43
+
40
44
  # Priority queues
41
45
  attr_accessor :priority_levels, :default_priority
42
46
 
@@ -82,10 +86,11 @@ module Pgbus
82
86
 
83
87
  # Web dashboard
84
88
  attr_accessor :web_auth, :web_refresh_interval, :web_per_page, :web_live_updates, :web_data_source,
85
- :insights_default_minutes, :base_controller_class, :return_to_app_url
89
+ :insights_default_minutes, :base_controller_class, :return_to_app_url,
90
+ :metrics_enabled
86
91
 
87
92
  # Streams (turbo-rails replacement, SSE-based)
88
- attr_accessor :streams_enabled, :streams_queue_prefix, :streams_signed_name_secret,
93
+ attr_accessor :streams_enabled, :streams_path, :streams_queue_prefix, :streams_signed_name_secret,
89
94
  :streams_default_retention, :streams_retention, :streams_heartbeat_interval,
90
95
  :streams_max_connections, :streams_idle_timeout, :streams_listen_health_check_ms,
91
96
  :streams_write_deadline_ms, :streams_falcon_streaming_body,
@@ -116,6 +121,9 @@ module Pgbus
116
121
  @circuit_breaker_enabled = true
117
122
 
118
123
  @max_retries = 5
124
+ @retry_backoff = 5 # seconds — first VT-retry delay
125
+ @retry_backoff_max = 300 # 5 minutes cap
126
+ @retry_backoff_jitter = 0.15
119
127
 
120
128
  @priority_levels = nil
121
129
  @default_priority = 1
@@ -157,8 +165,10 @@ module Pgbus
157
165
  @insights_default_minutes = 30 * 24 * 60 # 30 days
158
166
  @base_controller_class = "::ActionController::Base"
159
167
  @return_to_app_url = nil
168
+ @metrics_enabled = true
160
169
 
161
170
  @streams_enabled = true
171
+ @streams_path = nil
162
172
  @streams_queue_prefix = "pgbus_stream"
163
173
  @streams_signed_name_secret = nil
164
174
  @streams_default_retention = 5 * 60 # 5 minutes
@@ -189,8 +199,7 @@ module Pgbus
189
199
 
190
200
  def queue_name(name)
191
201
  full = "#{queue_prefix}_#{name}"
192
- QueueNameValidator.validate!(full)
193
- full
202
+ QueueNameValidator.normalize(full)
194
203
  end
195
204
 
196
205
  def dead_letter_queue_name(name)
@@ -227,6 +236,11 @@ module Pgbus
227
236
  raise ArgumentError, "polling_interval must be > 0" unless polling_interval.is_a?(Numeric) && polling_interval.positive?
228
237
  raise ArgumentError, "visibility_timeout must be > 0" unless visibility_timeout.is_a?(Numeric) && visibility_timeout.positive?
229
238
  raise ArgumentError, "max_retries must be >= 0" unless max_retries.is_a?(Integer) && max_retries >= 0
239
+ raise ArgumentError, "retry_backoff must be > 0" unless retry_backoff.is_a?(Numeric) && retry_backoff.positive?
240
+ raise ArgumentError, "retry_backoff_max must be > 0" unless retry_backoff_max.is_a?(Numeric) && retry_backoff_max.positive?
241
+ unless retry_backoff_jitter.is_a?(Numeric) && retry_backoff_jitter >= 0 && retry_backoff_jitter <= 1
242
+ raise ArgumentError, "retry_backoff_jitter must be between 0 and 1"
243
+ end
230
244
 
231
245
  Array(workers).each do |w|
232
246
  threads = w[:threads] || w["threads"] || 5
@@ -87,6 +87,11 @@ module Pgbus
87
87
 
88
88
  WILDCARD_REFRESH_INTERVAL = 30 # seconds
89
89
 
90
+ # Matches the physical queue name inside a "relation \"pgmq.q_foo\" does
91
+ # not exist" error. Frozen module constant to avoid recompiling the
92
+ # regex on every queue-missing error in hot fetch/read paths.
93
+ MISSING_QUEUE_REGEX = /pgmq\.q_(\w+)/
94
+
90
95
  private
91
96
 
92
97
  def claim_and_execute
@@ -255,8 +260,8 @@ module Pgbus
255
260
  # Extract the queue name from the error and remove it from the active list.
256
261
  def evict_missing_queues(error)
257
262
  prefix = "#{config.queue_prefix}_"
258
- if error.message =~ /pgmq\.q_(\w+)/
259
- physical_name = Regexp.last_match(1)
263
+ if (match = MISSING_QUEUE_REGEX.match(error.message))
264
+ physical_name = match[1]
260
265
  logical_name = physical_name.delete_prefix(prefix)
261
266
  if @queues.delete(logical_name)
262
267
  Pgbus.logger.warn { "[Pgbus] Evicted deleted queue '#{logical_name}' (#{physical_name}) from worker" }
@@ -34,6 +34,23 @@ module Pgbus
34
34
  name
35
35
  end
36
36
 
37
+ # Normalizes a queue name by replacing common separators (hyphens, dots)
38
+ # with underscores, stripping remaining invalid characters, and collapsing
39
+ # consecutive underscores. Use this for names from external sources
40
+ # (e.g., Turbo stream names like "hotwire-livereload") where the intent
41
+ # is to derive a valid PGMQ queue name that preserves readability.
42
def normalize(name)
  candidate = name.to_s
  # Already valid names skip the rewrite and go straight to validation.
  return validate!(candidate) if VALID_QUEUE_NAME_PATTERN.match?(candidate)

  cleaned = candidate
            .tr("-.", "_")           # hyphens/dots → underscores
            .delete("^a-zA-Z0-9_")   # strip remaining invalid chars
            .squeeze("_")            # collapse consecutive underscores
            .delete_prefix("_")      # strip leading underscore
            .delete_suffix("_")      # strip trailing underscore
  validate!(cleaned)
  cleaned
end
53
+
37
54
  # Sanitizes a queue name by removing invalid characters, then validates.
38
55
  # Use this for names from untrusted sources (e.g., URL params).
39
56
  def sanitize!(name)
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
module Pgbus
  # Computes exponential backoff delays for the VT-based retry path
  # (unhandled exceptions that fall through ActiveJob's retry_on).
  #
  # The formula is base * 2^(attempt-1), capped at max, with optional
  # jitter to prevent thundering-herd retries.
  #
  # Jobs can override the global config via the pgbus_retry_backoff
  # class-level DSL (see JobMixin below).
  module RetryBackoff
    # Mixin for ActiveJob classes to declare per-job backoff config.
    #
    #   class ImportJob < ApplicationJob
    #     include Pgbus::RetryBackoff::JobMixin
    #     pgbus_retry_backoff base: 15, max: 120, jitter: 0.2
    #   end
    module JobMixin
      extend ActiveSupport::Concern

      class_methods do
        # Declares per-class overrides. Every option is optional; omitted
        # options fall back to the global configuration at compute time.
        #
        # @param base [Numeric, nil] first retry delay in seconds (> 0)
        # @param max [Numeric, nil] delay cap in seconds (> 0)
        # @param jitter [Numeric, nil] jitter factor between 0 and 1
        # @raise [ArgumentError] when a supplied option is out of range
        def pgbus_retry_backoff(base: nil, max: nil, jitter: nil)
          raise ArgumentError, "retry_backoff base must be > 0" if !base.nil? && (!base.is_a?(Numeric) || base <= 0)
          raise ArgumentError, "retry_backoff max must be > 0" if !max.nil? && (!max.is_a?(Numeric) || max <= 0)
          if !jitter.nil? && (!jitter.is_a?(Numeric) || jitter.negative? || jitter > 1)
            raise ArgumentError, "retry_backoff jitter must be between 0 and 1"
          end

          @pgbus_retry_backoff = {
            base: base,
            max: max,
            jitter: jitter
          }.compact.freeze
        end

        # Returns the frozen override hash declared on this class. The
        # hash is stored in a class-level instance variable, which Ruby
        # does not share with subclasses, so a class without its own
        # declaration asks its superclass. Returns nil when nothing in
        # the ancestry declared overrides.
        def pgbus_retry_backoff_config
          return @pgbus_retry_backoff if @pgbus_retry_backoff

          parent = superclass if respond_to?(:superclass)
          parent.pgbus_retry_backoff_config if parent.respond_to?(:pgbus_retry_backoff_config)
        end
      end
    end

    class << self
      # Compute delay for a specific job class, falling back to global
      # config for any options not overridden at the job level.
      #
      # @param job_class [Class] job class, optionally using JobMixin
      # @param attempt [Integer] 1-based retry attempt number
      # @param jitter [Numeric, nil] explicit jitter factor; wins over
      #   both the job-level and the global setting when given
      # @return [Integer] delay in seconds
      def compute_delay_for_job(job_class, attempt:, jitter: nil)
        overrides = (job_class.respond_to?(:pgbus_retry_backoff_config) &&
                     job_class.pgbus_retry_backoff_config) || {}

        config = Pgbus.configuration
        compute_delay(
          attempt: attempt,
          base: overrides[:base] || config.retry_backoff,
          max: overrides[:max] || config.retry_backoff_max,
          jitter: jitter || overrides[:jitter] || config.retry_backoff_jitter
        )
      end

      # Core backoff computation.
      #
      # @param attempt [Integer] 1-based retry attempt number (read_ct - 1)
      # @param base [Numeric] base delay in seconds (default: config.retry_backoff)
      # @param max [Numeric] maximum delay cap (default: config.retry_backoff_max)
      # @param jitter [Numeric] jitter factor 0..1 (default: config.retry_backoff_jitter)
      # @return [Integer] delay in seconds
      def compute_delay(attempt:, base: nil, max: nil, jitter: nil)
        config = Pgbus.configuration
        base ||= config.retry_backoff
        max ||= config.retry_backoff_max
        jitter = config.retry_backoff_jitter if jitter.nil?

        exponent = [attempt - 1, 0].max
        delay = [base * (2**exponent), max].min

        # Jitter can push the delay above the cap, so clamp again. The
        # final to_i keeps the documented Integer return even when a
        # Float `max` is the value that wins the clamp.
        [apply_jitter(delay, jitter), max].min.to_i
      end

      private

      # Spreads `delay` by up to +/- (delay * jitter), rounded to whole
      # seconds and never negative. Without jitter the delay is truncated.
      def apply_jitter(delay, jitter)
        return delay.to_i if jitter.nil? || jitter.zero?

        spread = delay * jitter
        jittered = delay + rand(-spread..spread)
        [jittered.round, 0].max
      end
    end
  end
end
data/lib/pgbus/streams.rb CHANGED
@@ -38,7 +38,7 @@ module Pgbus
38
38
  # Broadcasts a Turbo Stream HTML payload through the pgbus streamer.
39
39
  # PGMQ's `message` column is JSONB, so raw HTML strings can't be passed
40
40
  # directly. We wrap as `{"html": "..."}` on the way in and unwrap in
41
- # Pgbus::Web::Streamer::Dispatcher before delivering to the SSE client.
41
+ # Pgbus::Web::Streamer::StreamEventDispatcher before delivering to the SSE client.
42
42
  # Callers pass a plain HTML string; the wrapping is an implementation
43
43
  # detail.
44
44
  #
data/lib/pgbus/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Pgbus
4
- VERSION = "0.6.2"
4
+ VERSION = "0.6.4"
5
5
  end
@@ -875,28 +875,14 @@ module Pgbus
875
875
 
876
876
  # Extract uniqueness key from a JSON payload string and release its lock.
877
877
  def release_lock_for_payload(payload_str)
878
- return unless payload_str
879
-
880
- payload = payload_str.is_a?(String) ? JSON.parse(payload_str) : payload_str
881
- key = payload[Uniqueness::METADATA_KEY]
878
+ key = extract_uniqueness_key_from_payload_str(payload_str)
882
879
  UniquenessKey.release!(key) if key
883
- rescue JSON::ParserError => e
884
- Pgbus.logger.debug { "[Pgbus::Web] Error parsing payload for lock release: #{e.message}" }
885
880
  end
886
881
 
887
882
  # Extract uniqueness keys from a collection of formatted messages and
888
883
  # release all associated locks in a single query.
889
884
  def release_locks_for_messages(messages)
890
- keys = messages.filter_map do |m|
891
- payload = m[:message]
892
- next unless payload
893
-
894
- parsed = payload.is_a?(String) ? JSON.parse(payload) : payload
895
- parsed[Uniqueness::METADATA_KEY]
896
- rescue JSON::ParserError
897
- nil
898
- end
899
-
885
+ keys = messages.filter_map { |m| extract_uniqueness_key_from_payload_str(m[:message]) }
900
886
  UniquenessKey.where(lock_key: keys).delete_all if keys.any?
901
887
  rescue StandardError => e
902
888
  Pgbus.logger.debug { "[Pgbus::Web] Error releasing locks for messages: #{e.message}" }
@@ -908,18 +894,28 @@ module Pgbus
908
894
  "SELECT payload FROM pgbus_failed_events", "Pgbus Collect Failed Keys"
909
895
  )
910
896
 
911
- keys = rows.to_a.filter_map do |row|
912
- payload = JSON.parse(row["payload"])
913
- payload[Uniqueness::METADATA_KEY]
914
- rescue JSON::ParserError
915
- nil
916
- end
917
-
897
+ keys = rows.to_a.filter_map { |row| extract_uniqueness_key_from_payload_str(row["payload"]) }
918
898
  UniquenessKey.where(lock_key: keys).delete_all if keys.any?
919
899
  rescue StandardError => e
920
900
  Pgbus.logger.debug { "[Pgbus::Web] Error releasing locks for failed events: #{e.message}" }
921
901
  end
922
902
 
903
+ # Single unwrap point for PGMQ message / failed_event payload strings.
904
+ # Accepts a raw JSON string or an already-parsed Hash and returns the
905
+ # uniqueness metadata key, or nil when the payload is blank, unparseable,
906
+ # or carries no uniqueness metadata. Parse errors are swallowed at debug
907
+ # level because callers treat missing keys and malformed payloads
908
+ # identically (no lock to release).
909
# Returns the uniqueness metadata key carried by a payload, or nil.
#
# @param payload_str [String, Hash, nil] raw JSON text or an
#   already-parsed payload Hash
# @return [Object, nil] the value stored under Uniqueness::METADATA_KEY;
#   nil when the payload is nil, is not valid JSON, does not decode to a
#   Hash, or has no such key
def extract_uniqueness_key_from_payload_str(payload_str)
  return nil unless payload_str

  payload = payload_str.is_a?(String) ? JSON.parse(payload_str) : payload_str
  # Valid JSON is not necessarily an object: "[1,2]", "42" or "null"
  # parse without error, and indexing the result with a string key
  # would raise instead of yielding nil.
  return nil unless payload.is_a?(Hash)

  payload[Uniqueness::METADATA_KEY]
rescue JSON::ParserError => e
  Pgbus.logger.debug { "[Pgbus::Web] Error parsing payload for uniqueness key: #{e.message}" }
  nil
end
918
+
923
919
  # Archive the queue message a failed_event row points to. Idempotent —
924
920
  # silently no-ops if the message no longer exists in the queue.
925
921
  def archive_failed_message(event)
@@ -0,0 +1,175 @@
1
+ # frozen_string_literal: true
2
+
3
module Pgbus
  module Web
    # Converts DataSource output into Prometheus text exposition format
    # (Content-Type: text/plain; version=0.0.4; charset=utf-8).
    #
    # Each metric family gets a HELP line, a TYPE line, and one or more
    # sample lines. Label values are double-quoted and escaped per the
    # Prometheus spec. Job timing values are converted from milliseconds
    # to seconds.
    #
    # Resilient by design: each section rescues StandardError independently
    # so a failure in one data source method doesn't blank the entire
    # scrape response.
    class MetricsSerializer
      # Characters that must be backslash-escaped inside a label value.
      LABEL_ESCAPES = { "\\" => "\\\\", "\"" => "\\\"", "\n" => "\\n" }.freeze

      # @param data_source [#queues_with_metrics, #job_status_counts,
      #   #job_stats_summary, #processes, #summary_stats,
      #   #stream_stats_available?, #stream_stats_summary]
      def initialize(data_source)
        @data_source = data_source
      end

      # @return [String] the full scrape body, newline-terminated
      def serialize
        lines = []
        append_queue_metrics(lines)
        append_job_metrics(lines)
        append_process_metrics(lines)
        append_summary_metrics(lines)
        append_stream_metrics(lines)
        "#{lines.join("\n")}\n"
      end

      private

      def append_queue_metrics(lines)
        queues = @data_source.queues_with_metrics
        return if queues.empty?

        gauge(lines, "pgbus_queue_depth", "Number of messages in the queue (including invisible)") do
          queues.map { |q| [q[:queue_length], { queue: q[:name] }] }
        end

        gauge(lines, "pgbus_queue_visible_depth", "Number of visible (ready to read) messages") do
          queues.map { |q| [q[:queue_visible_length], { queue: q[:name] }] }
        end

        gauge(lines, "pgbus_queue_total_messages", "Total messages ever enqueued") do
          queues.map { |q| [q[:total_messages], { queue: q[:name] }] }
        end

        gauge(lines, "pgbus_queue_oldest_message_age_seconds", "Age of the oldest message in seconds") do
          # Queues without an age value are left out of this family.
          queues.filter_map do |q|
            next unless q[:oldest_msg_age_sec]

            [q[:oldest_msg_age_sec], { queue: q[:name] }]
          end
        end

        gauge(lines, "pgbus_queue_paused", "Whether the queue is paused (1) or active (0)") do
          queues.map { |q| [q[:paused] ? 1 : 0, { queue: q[:name] }] }
        end
      rescue StandardError => e
        Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing queue metrics: #{e.message}" }
      end

      def append_job_metrics(lines)
        counts = @data_source.job_status_counts
        unless counts.empty?
          gauge(lines, "pgbus_jobs_total", "Number of jobs by status in the stats window") do
            counts.map { |status, count| [count, { status: status }] }
          end
        end

        summary = @data_source.job_stats_summary
        if summary[:total].positive?
          gauge(lines, "pgbus_job_duration_avg_seconds", "Average job duration in seconds") do
            [[ms_to_s(summary[:avg_duration_ms])]]
          end

          gauge(lines, "pgbus_job_duration_max_seconds", "Maximum job duration in seconds") do
            [[ms_to_s(summary[:max_duration_ms])]]
          end
        end

        # NOTE(review): this early return also skips pgbus_job_avg_retries
        # below when latency data is unavailable — confirm that is intended.
        return unless Pgbus::JobStat.latency_columns? && summary[:avg_latency_ms]

        gauge(lines, "pgbus_job_enqueue_latency_seconds", "Enqueue latency percentiles in seconds") do
          [
            [ms_to_s(summary[:p50_latency_ms]), { quantile: "0.5" }],
            [ms_to_s(summary[:p95_latency_ms]), { quantile: "0.95" }],
            [ms_to_s(summary[:p99_latency_ms]), { quantile: "0.99" }]
          ]
        end

        gauge(lines, "pgbus_job_avg_retries", "Average retry count per job") do
          [[summary[:avg_retries]]]
        end
      rescue StandardError => e
        Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing job metrics: #{e.message}" }
      end

      def append_process_metrics(lines)
        count = @data_source.processes.count
        gauge(lines, "pgbus_active_processes", "Number of active pgbus worker processes") do
          [[count]]
        end
      rescue StandardError => e
        Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing process metrics: #{e.message}" }
      end

      def append_summary_metrics(lines)
        stats = @data_source.summary_stats
        gauge(lines, "pgbus_failed_events_total", "Total failed events") do
          [[stats[:failed_count]]]
        end

        gauge(lines, "pgbus_dlq_depth", "Total messages across all dead letter queues") do
          [[stats[:dlq_depth]]]
        end
      rescue StandardError => e
        Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing summary metrics: #{e.message}" }
      end

      def append_stream_metrics(lines)
        return unless @data_source.stream_stats_available?

        summary = @data_source.stream_stats_summary
        gauge(lines, "pgbus_stream_events_total", "Stream events by type in the stats window") do
          [
            [summary[:broadcasts], { event_type: "broadcast" }],
            [summary[:connects], { event_type: "connect" }],
            [summary[:disconnects], { event_type: "disconnect" }]
          ]
        end

        gauge(lines, "pgbus_stream_active_connections", "Estimated active SSE connections") do
          [[summary[:active_estimate]]]
        end

        gauge(lines, "pgbus_stream_avg_fanout", "Average broadcast fanout (subscribers per broadcast)") do
          [[summary[:avg_fanout]]]
        end
      rescue StandardError => e
        Pgbus.logger.debug { "[Pgbus::Metrics] Error serializing stream metrics: #{e.message}" }
      end

      # Emits a Prometheus gauge metric family. The block must return an array
      # of [value] or [value, { label: "val" }] pairs. Nothing is emitted
      # (not even HELP/TYPE) when the block returns no samples.
      def gauge(lines, name, help)
        samples = yield
        return if samples.empty?

        lines << "# HELP #{name} #{help}"
        lines << "# TYPE #{name} gauge"
        samples.each do |value, labels|
          lines << format_sample(name, value, labels)
        end
      end

      # Builds one sample line: `name{label="value",...} number`, or
      # `name number` when there are no labels.
      def format_sample(name, value, labels = nil)
        if labels && !labels.empty?
          label_str = labels.map { |k, v| "#{k}=\"#{escape_label_value(v)}\"" }.join(",")
          "#{name}{#{label_str}} #{format_value(value)}"
        else
          "#{name} #{format_value(value)}"
        end
      end

      # Escapes backslash, double quote and line feed so an arbitrary
      # label value cannot terminate the quoted string or the line.
      def escape_label_value(value)
        value.to_s.gsub(/[\\"\n]/, LABEL_ESCAPES)
      end

      # Renders a sample value. Integers print as-is and non-numeric
      # values (nil, numeric strings) go through to_i as before. Other
      # Numeric types keep their fractional part instead of being
      # truncated, and non-finite values use the exposition format's
      # NaN / +Inf / -Inf spellings.
      def format_value(value)
        return value.to_s if value.is_a?(Integer)
        return value.to_i.to_s unless value.is_a?(Numeric)

        float = value.to_f
        return "NaN" if float.nan?
        return float.positive? ? "+Inf" : "-Inf" if float.infinite?
        return float.to_s if value.is_a?(Float)

        # BigDecimal/Rational: integral values still print without ".0".
        float == float.to_i ? float.to_i.to_s : float.to_s
      end

      # Converts milliseconds to seconds, rounded to 4 decimal places.
      def ms_to_s(milliseconds)
        (milliseconds.to_f / 1000).round(4)
      end
    end
  end
end
@@ -13,7 +13,7 @@ module Pgbus
13
13
  # SSE clients.
14
14
  #
15
15
  # 2. Mark connections that have been idle longer than the
16
- # configured idle_timeout as dead. The Dispatcher's next pass
16
+ # configured idle_timeout as dead. The StreamEventDispatcher's next pass
17
17
  # picks them up via its disconnect path.
18
18
  #
19
19
  # 3. Post a DisconnectMessage for any connection already flagged
@@ -97,7 +97,7 @@ module Pgbus
97
97
  end
98
98
 
99
99
  def enqueue_disconnect(connection)
100
- @queue << Dispatcher::DisconnectMessage.new(connection: connection)
100
+ @queue << StreamEventDispatcher::DisconnectMessage.new(connection: connection)
101
101
  end
102
102
  end
103
103
  end
@@ -43,7 +43,7 @@ module Pgbus
43
43
  health_check_ms: @config.streams_listen_health_check_ms,
44
44
  logger: @logger
45
45
  )
46
- @dispatcher = Dispatcher.new(
46
+ @dispatcher = StreamEventDispatcher.new(
47
47
  client: @client,
48
48
  registry: @registry,
49
49
  listener: @listener,
@@ -92,7 +92,7 @@ module Pgbus
92
92
  return
93
93
  end
94
94
 
95
- @dispatch_queue << Dispatcher::ConnectMessage.new(connection: connection)
95
+ @dispatch_queue << StreamEventDispatcher::ConnectMessage.new(connection: connection)
96
96
  end
97
97
  end
98
98
 
@@ -26,8 +26,12 @@ module Pgbus
26
26
  #
27
27
  # All state ownership lives on this one thread: the registry is
28
28
  # thread-safe (Phase 2.1) but the in-flight buffers are local to
29
- # the Dispatcher and accessed only from this thread, so no locks.
30
- class Dispatcher
29
+ # the dispatcher and accessed only from this thread, so no locks.
30
+ #
31
+ # Named StreamEventDispatcher (rather than just "Dispatcher") to
32
+ # disambiguate from Pgbus::Process::Dispatcher, which is an
33
+ # unrelated worker-side pool coordinator. See issue #98 item 8.
34
+ class StreamEventDispatcher
31
35
  WakeMessage = Listener::WakeMessage
32
36
  ConnectMessage = Data.define(:connection)
33
37
  DisconnectMessage = Data.define(:connection)
@@ -110,7 +114,7 @@ module Pgbus
110
114
  # than calling Thread#kill, which leaves IO state corrupt.
111
115
  # The orphaned thread will exit on its own once the blocking
112
116
  # call returns and it sees @running == false on the next loop.
113
- @logger.warn { "[Pgbus::Streamer::Dispatcher] thread did not terminate within 5s" }
117
+ @logger.warn { "[Pgbus::Streamer::StreamEventDispatcher] thread did not terminate within 5s" }
114
118
  end
115
119
  @thread = nil
116
120
  self
@@ -141,7 +145,7 @@ module Pgbus
141
145
  end
142
146
  end
143
147
  rescue StandardError => e
144
- @logger.error { "[Pgbus::Streamer::Dispatcher] crashed: #{e.class}: #{e.message}" }
148
+ @logger.error { "[Pgbus::Streamer::StreamEventDispatcher] crashed: #{e.class}: #{e.message}" }
145
149
  raise
146
150
  end
147
151
 
@@ -176,7 +180,7 @@ module Pgbus
176
180
  when ConnectMessage then handle_connect(msg)
177
181
  when DisconnectMessage then handle_disconnect(msg)
178
182
  else
179
- @logger.warn { "[Pgbus::Streamer::Dispatcher] unknown message: #{msg.class}" }
183
+ @logger.warn { "[Pgbus::Streamer::StreamEventDispatcher] unknown message: #{msg.class}" }
180
184
  end
181
185
  rescue StandardError => e
182
186
  # Intentionally swallows per-message failures so one bad
@@ -184,7 +188,7 @@ module Pgbus
184
188
  # connected client. The top-level run_loop rescue (below)
185
189
  # does re-raise — a crash *between* messages is a real bug
186
190
  # and the supervisor should see it.
187
- @logger.error { "[Pgbus::Streamer::Dispatcher] handling #{msg.class} raised #{e.class}: #{e.message}" }
191
+ @logger.error { "[Pgbus::Streamer::StreamEventDispatcher] handling #{msg.class} raised #{e.class}: #{e.message}" }
188
192
  end
189
193
 
190
194
  def handle_wake(msg)
@@ -310,7 +314,7 @@ module Pgbus
310
314
  @scanned_cursor.delete(connection)
311
315
  cleanup_stream_if_unused(stream)
312
316
  connection.mark_dead!
313
- @logger.error { "[Pgbus::Streamer::Dispatcher] connect failed for #{connection.id}: #{e.class}: #{e.message}" }
317
+ @logger.error { "[Pgbus::Streamer::StreamEventDispatcher] connect failed for #{connection.id}: #{e.class}: #{e.message}" }
314
318
  end
315
319
 
316
320
  def handle_disconnect(msg)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pgbus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.6.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mikael Henriksson
@@ -122,6 +122,7 @@ files:
122
122
  - README.md
123
123
  - Rakefile
124
124
  - app/controllers/pgbus/api/insights_controller.rb
125
+ - app/controllers/pgbus/api/metrics_controller.rb
125
126
  - app/controllers/pgbus/api/stats_controller.rb
126
127
  - app/controllers/pgbus/application_controller.rb
127
128
  - app/controllers/pgbus/dashboard_controller.rb
@@ -283,6 +284,7 @@ files:
283
284
  - lib/pgbus/recurring/schedule.rb
284
285
  - lib/pgbus/recurring/scheduler.rb
285
286
  - lib/pgbus/recurring/task.rb
287
+ - lib/pgbus/retry_backoff.rb
286
288
  - lib/pgbus/serializer.rb
287
289
  - lib/pgbus/stat_buffer.rb
288
290
  - lib/pgbus/streams.rb
@@ -297,15 +299,16 @@ files:
297
299
  - lib/pgbus/version.rb
298
300
  - lib/pgbus/web/authentication.rb
299
301
  - lib/pgbus/web/data_source.rb
302
+ - lib/pgbus/web/metrics_serializer.rb
300
303
  - lib/pgbus/web/stream_app.rb
301
304
  - lib/pgbus/web/streamer.rb
302
305
  - lib/pgbus/web/streamer/connection.rb
303
- - lib/pgbus/web/streamer/dispatcher.rb
304
306
  - lib/pgbus/web/streamer/heartbeat.rb
305
307
  - lib/pgbus/web/streamer/instance.rb
306
308
  - lib/pgbus/web/streamer/io_writer.rb
307
309
  - lib/pgbus/web/streamer/listener.rb
308
310
  - lib/pgbus/web/streamer/registry.rb
311
+ - lib/pgbus/web/streamer/stream_event_dispatcher.rb
309
312
  - lib/puma/plugin/pgbus_streams.rb
310
313
  - lib/tasks/pgbus_pgmq.rake
311
314
  - lib/tasks/pgbus_streams.rake