pgbus 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,7 +11,17 @@ module Pgbus
11
11
  attr_accessor :default_queue, :queue_prefix
12
12
 
13
13
  # Worker settings
14
- attr_accessor :workers, :polling_interval, :visibility_timeout, :prefetch_limit
14
+ attr_accessor :polling_interval, :prefetch_limit
15
+ attr_reader :workers, :visibility_timeout # rubocop:disable Style/AccessorGrouping
16
+
17
+ # Supervisor role selection.
18
+ # nil = boot all roles (default behavior).
19
+ # Array of role symbols = boot only the listed roles.
20
+ # Set via the CLI flags --workers-only / --scheduler-only / --dispatcher-only,
21
+ # or directly in an initializer for advanced cases.
22
+ attr_reader :roles
23
+
24
+ VALID_ROLES = %i[workers dispatcher scheduler consumers outbox].freeze
15
25
 
16
26
  # Worker recycling
17
27
  attr_accessor :max_jobs_per_worker, :max_memory_mb, :max_worker_lifetime
@@ -19,30 +29,37 @@ module Pgbus
19
29
  # Dispatcher settings
20
30
  attr_accessor :dispatch_interval
21
31
 
22
- # Circuit breaker
23
- attr_accessor :circuit_breaker_enabled, :circuit_breaker_threshold,
24
- :circuit_breaker_base_backoff, :circuit_breaker_max_backoff
32
+ # Circuit breaker. Only `enabled` is user-facing — the trip threshold and
33
+ # backoff curve are tuned via constants on Pgbus::CircuitBreaker because
34
+ # they are implementation details that have never been worth exposing.
35
+ attr_accessor :circuit_breaker_enabled
25
36
 
26
37
  # Dead letter queue
27
- attr_accessor :max_retries, :dead_letter_queue_suffix
38
+ attr_accessor :max_retries
28
39
 
29
40
  # Priority queues
30
41
  attr_accessor :priority_levels, :default_priority
31
42
 
32
- # Archive compaction
33
- attr_accessor :archive_retention, :archive_compaction_interval, :archive_compaction_batch_size
43
+ # Archive compaction. Only the user-facing retention window is configurable;
44
+ # the loop interval and batch size are tuned via constants on
45
+ # Pgbus::Process::Dispatcher.
46
+ attr_reader :archive_retention
34
47
 
35
48
  # Transactional outbox
36
- attr_accessor :outbox_enabled, :outbox_poll_interval, :outbox_batch_size, :outbox_retention
49
+ attr_accessor :outbox_enabled, :outbox_poll_interval, :outbox_batch_size
50
+ attr_reader :outbox_retention # rubocop:disable Style/AccessorGrouping
37
51
 
38
52
  # Event bus
39
- attr_accessor :idempotency_ttl, :allowed_global_id_models
53
+ attr_accessor :allowed_global_id_models
54
+ attr_reader :idempotency_ttl # rubocop:disable Style/AccessorGrouping
40
55
 
41
56
  # Logging
42
57
  attr_accessor :logger
43
58
 
44
- # LISTEN/NOTIFY
45
- attr_accessor :listen_notify, :notify_throttle_ms
59
+ # LISTEN/NOTIFY. Only the on/off switch is user-facing — the throttle
60
+ # interval is a Postgres-side tuning knob that lives as a constant on
61
+ # Pgbus::Client (NOTIFY_THROTTLE_MS).
62
+ attr_accessor :listen_notify
46
63
 
47
64
  # PGMQ schema installation mode (:auto, :extension, :embedded)
48
65
  attr_reader :pgmq_schema_mode
@@ -51,8 +68,8 @@ module Pgbus
51
68
  attr_accessor :event_consumers
52
69
 
53
70
  # Recurring jobs
54
- attr_accessor :recurring_tasks, :recurring_schedule_interval, :recurring_tasks_file,
55
- :skip_recurring, :recurring_execution_retention
71
+ attr_accessor :recurring_tasks, :recurring_schedule_interval, :recurring_tasks_file, :skip_recurring
72
+ attr_reader :recurring_execution_retention # rubocop:disable Style/AccessorGrouping
56
73
 
57
74
  # Multi-database support (optional separate database for pgbus tables)
58
75
  # Set to { database: { writing: :pgbus, reading: :pgbus } } to use a separate database.
@@ -60,7 +77,8 @@ module Pgbus
60
77
  attr_accessor :connects_to
61
78
 
62
79
  # Job stats
63
- attr_accessor :stats_retention, :stats_enabled
80
+ attr_accessor :stats_enabled
81
+ attr_reader :stats_retention # rubocop:disable Style/AccessorGrouping
64
82
 
65
83
  # Web dashboard
66
84
  attr_accessor :web_auth, :web_refresh_interval, :web_per_page, :web_live_updates, :web_data_source,
@@ -69,13 +87,14 @@ module Pgbus
69
87
  def initialize
70
88
  @database_url = nil
71
89
  @connection_params = nil
72
- @pool_size = 5
90
+ @pool_size = nil
73
91
  @pool_timeout = 5
74
92
 
75
93
  @default_queue = "default"
76
94
  @queue_prefix = "pgbus"
77
95
 
78
96
  @workers = [{ queues: %w[default], threads: 5 }]
97
+ @roles = nil
79
98
  @polling_interval = 0.1
80
99
  @visibility_timeout = 30
81
100
 
@@ -88,19 +107,13 @@ module Pgbus
88
107
  @dispatch_interval = 1.0
89
108
 
90
109
  @circuit_breaker_enabled = true
91
- @circuit_breaker_threshold = 5
92
- @circuit_breaker_base_backoff = 30
93
- @circuit_breaker_max_backoff = 600
94
110
 
95
111
  @max_retries = 5
96
- @dead_letter_queue_suffix = "_dlq"
97
112
 
98
113
  @priority_levels = nil
99
114
  @default_priority = 1
100
115
 
101
116
  @archive_retention = 7 * 24 * 3600 # 7 days
102
- @archive_compaction_interval = 3600
103
- @archive_compaction_batch_size = 1000
104
117
 
105
118
  @outbox_enabled = false
106
119
  @outbox_poll_interval = 1.0
@@ -113,7 +126,6 @@ module Pgbus
113
126
  @logger = (defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger) || Logger.new($stdout)
114
127
 
115
128
  @listen_notify = true
116
- @notify_throttle_ms = 250
117
129
 
118
130
  @pgmq_schema_mode = :auto
119
131
 
@@ -147,7 +159,7 @@ module Pgbus
147
159
  end
148
160
 
149
161
  def dead_letter_queue_name(name)
150
- "#{queue_name(name)}#{dead_letter_queue_suffix}"
162
+ "#{queue_name(name)}#{Pgbus::DEAD_LETTER_SUFFIX}"
151
163
  end
152
164
 
153
165
  def priority_queue_name(name, priority)
@@ -172,13 +184,16 @@ module Pgbus
172
184
  end
173
185
 
174
186
  def validate!
175
- raise ArgumentError, "pool_size must be > 0" unless pool_size.is_a?(Numeric) && pool_size.positive?
187
+ if pool_size && !(pool_size.is_a?(Numeric) && pool_size.positive?)
188
+ raise ArgumentError, "pool_size must be a positive number or nil (auto-tune)"
189
+ end
190
+
176
191
  raise ArgumentError, "pool_timeout must be > 0" unless pool_timeout.is_a?(Numeric) && pool_timeout.positive?
177
192
  raise ArgumentError, "polling_interval must be > 0" unless polling_interval.is_a?(Numeric) && polling_interval.positive?
178
193
  raise ArgumentError, "visibility_timeout must be > 0" unless visibility_timeout.is_a?(Numeric) && visibility_timeout.positive?
179
194
  raise ArgumentError, "max_retries must be >= 0" unless max_retries.is_a?(Integer) && max_retries >= 0
180
195
 
181
- workers.each do |w|
196
+ Array(workers).each do |w|
182
197
  threads = w[:threads] || w["threads"] || 5
183
198
  raise ArgumentError, "worker threads must be > 0" unless threads.is_a?(Integer) && threads.positive?
184
199
  end
@@ -196,6 +211,177 @@ module Pgbus
196
211
  self
197
212
  end
198
213
 
214
+ # Set the worker capsule list. Accepts:
215
+ #
216
+ # String — parsed via Pgbus::Configuration::CapsuleDSL into capsules
217
+ # with auto-generated names (each capsule's :name is its
218
+ # first queue token).
219
+ #
220
+ # c.workers = "*: 5"
221
+ # c.workers = "critical: 5; default, mailers: 10"
222
+ #
223
+ # Array — legacy explicit form. Each entry is a Hash with :queues
224
+ # and :threads (and optionally :name, :single_active_consumer,
225
+ # :consumer_priority, :prefetch_limit).
226
+ #
227
+ # c.workers = [{ queues: %w[default], threads: 5 }]
228
+ #
229
+ # nil — no workers configured (used when running scheduler-only or
230
+ # dispatcher-only processes).
231
+ #
232
+ # Raises ArgumentError for any other type.
233
+ def workers=(value)
234
+ @workers = case value
235
+ when nil
236
+ nil
237
+ when String
238
+ CapsuleDSL.parse(value).map { |entry| entry.merge(name: entry[:queues].first.to_s) }
239
+ when Array
240
+ value
241
+ else
242
+ raise ArgumentError,
243
+ "workers must be a String (DSL), Array (legacy form), or nil — got #{value.class}"
244
+ end
245
+ end
246
+
247
+ # Define a named capsule and append it to the workers list.
248
+ #
249
+ # c.capsule :critical, queues: %w[critical], threads: 5
250
+ # c.capsule :gated, queues: %w[gated], threads: 1, single_active_consumer: true
251
+ #
252
+ # Names must be unique. Queues must not overlap with capsules already
253
+ # defined (would cause double-processing). Composes with the string DSL —
254
+ # +c.workers = "..."+ followed by +c.capsule :name, ...+ appends the
255
+ # named capsule to the list parsed from the string.
256
+ def capsule(name, queues:, threads:, **)
257
+ raise ArgumentError, "capsule queues must be a non-empty Array" unless queues.is_a?(Array) && queues.any?
258
+ raise ArgumentError, "capsule threads must be a positive Integer" unless threads.is_a?(Integer) && threads.positive?
259
+
260
+ normalized_name = name.to_s
261
+ @workers ||= []
262
+
263
+ raise ArgumentError, "capsule #{name.inspect} is already defined" if @workers.any? { |c| capsule_name(c) == normalized_name }
264
+
265
+ validate_no_queue_overlap!(queues)
266
+
267
+ @workers << { name: normalized_name, queues: queues, threads: threads, ** }
268
+ end
269
+
270
+ # Look up a capsule by its name. Accepts symbol or string. Returns the
271
+ # matching Hash, or nil. Used by the CLI's --capsule selector.
272
+ def capsule_named(name)
273
+ return nil unless @workers
274
+
275
+ key = name.to_s
276
+ @workers.find { |c| capsule_name(c) == key }
277
+ end
278
+
279
+ # Returns true if the given role should be booted by the supervisor.
280
+ # When +roles+ is nil (the default), every role is enabled — this matches
281
+ # the legacy single-process behavior. When +roles+ is set (e.g. via the
282
+ # CLI's --workers-only / --scheduler-only / --dispatcher-only flags),
283
+ # only the listed roles boot.
284
+ #
285
+ # Accepts a symbol or string; the comparison is case-insensitive.
286
+ def role_enabled?(role)
287
+ return true if @roles.nil?
288
+
289
+ @roles.include?(role.to_s.downcase.to_sym)
290
+ end
291
+
292
+ # Set the supervisor role filter. Accepts:
293
+ #
294
+ # nil — boot all roles (default)
295
+ # Symbol/String — wraps into a single-element array
296
+ # Array — list of roles to boot
297
+ #
298
+ # Each role is normalized to a downcased symbol and validated against
299
+ # VALID_ROLES. Unknown role names raise ArgumentError immediately so
300
+ # typos like `[:workres]` fail loud at boot rather than leaving the
301
+ # supervisor idling with no children.
302
+ def roles=(value)
303
+ if value.nil?
304
+ @roles = nil
305
+ return
306
+ end
307
+
308
+ normalized = Array(value).map { |r| r.to_s.downcase.to_sym }.uniq
309
+ invalid = normalized - VALID_ROLES
310
+ if invalid.any?
311
+ raise ArgumentError,
312
+ "invalid role(s) #{invalid.inspect} — valid roles are: #{VALID_ROLES.join(", ")}"
313
+ end
314
+
315
+ @roles = normalized
316
+ end
317
+
318
+ # Duration setters: each accepts either a Numeric (seconds) or an
319
+ # ActiveSupport::Duration (e.g. 10.minutes, 7.days). Validation runs
320
+ # immediately on assignment so misconfigurations crash at boot rather
321
+ # than leaving stale state until a `validate!` call somewhere.
322
+ #
323
+ # Numeric values are stored unchanged (preserving Float for sub-second
324
+ # values). Duration values are coerced to Integer seconds via .to_i. nil is accepted and stored as nil.
325
+
326
+ def visibility_timeout=(value)
327
+ @visibility_timeout = coerce_duration!(value, :visibility_timeout)
328
+ end
329
+
330
+ def archive_retention=(value)
331
+ @archive_retention = coerce_duration!(value, :archive_retention)
332
+ end
333
+
334
+ def outbox_retention=(value)
335
+ @outbox_retention = coerce_duration!(value, :outbox_retention)
336
+ end
337
+
338
+ def idempotency_ttl=(value)
339
+ @idempotency_ttl = coerce_duration!(value, :idempotency_ttl)
340
+ end
341
+
342
+ def stats_retention=(value)
343
+ @stats_retention = coerce_duration!(value, :stats_retention)
344
+ end
345
+
346
+ def recurring_execution_retention=(value)
347
+ @recurring_execution_retention = coerce_duration!(value, :recurring_execution_retention)
348
+ end
349
+
350
+ # Returns the connection pool size to use for the PGMQ client.
351
+ #
352
+ # If +pool_size+ was explicitly set, returns that value unchanged. Otherwise
353
+ # auto-derives from the threads needed by the roles this process actually
354
+ # runs (respects +Configuration#roles+ from --workers-only / --scheduler-only
355
+ # / --dispatcher-only):
356
+ #
357
+ # workers role → sum(workers.threads)
358
+ # consumers role → sum(event_consumers.threads)
359
+ # dispatcher role → +1
360
+ # scheduler role → +1
361
+ #
362
+ # A --scheduler-only deployment that has 50 worker threads configured
363
+ # only needs 1 connection (for the scheduler), not 52.
364
+ #
365
+ # Auto-tune protects users from the common pitfall of running 15 worker
366
+ # threads with a hand-set pool_size of 5 (resulting in ConnectionPool
367
+ # timeouts under load). Setting pool_size explicitly is still supported
368
+ # for advanced cases where you need a tighter or looser pool than the
369
+ # default formula provides.
370
+ POOL_SIZE_WARN_THRESHOLD = 50
371
+
372
+ def resolved_pool_size
373
+ return pool_size if pool_size
374
+
375
+ total = 0
376
+ total += sum_thread_counts(workers, default_threads: 5, group: "worker") if role_enabled?(:workers)
377
+ total += sum_thread_counts(event_consumers, default_threads: 3, group: "event_consumer") if role_enabled?(:consumers)
378
+ total += 1 if role_enabled?(:dispatcher)
379
+ total += 1 if role_enabled?(:scheduler)
380
+
381
+ warn_if_oversized(total)
382
+ total
383
+ end
384
+
199
385
  def connection_options
200
386
  if database_url
201
387
  database_url
@@ -217,6 +403,100 @@ module Pgbus
217
403
 
218
404
  private
219
405
 
406
+ # Coerce a duration setting value to a positive Numeric.
407
+ #
408
+ # Accepts an ActiveSupport::Duration (coerced to Integer seconds via .to_i)
409
+ # or a Numeric (stored as-is, preserving Float for sub-second values).
410
+ # Returns nil unchanged for nil input. Raises ArgumentError immediately for
410
+ # zero, negative, or non-numeric input — callers crash at boot rather than carrying silently-broken state.
412
+ def coerce_duration!(value, name)
413
+ # nil is a valid sentinel for "feature disabled" (e.g. archive_retention,
414
+ # idempotency_ttl, recurring_execution_retention all use nil to skip the
415
+ # corresponding maintenance task in the dispatcher).
416
+ return nil if value.nil?
417
+
418
+ # Check Duration FIRST because ActiveSupport::Duration overrides #is_a?
419
+ # to also answer true for the classes of its wrapped numeric value, so a duration would otherwise be caught
420
+ # by the Numeric branch and stored as-is (uncoerced).
421
+ duration_class_loaded = defined?(ActiveSupport::Duration)
422
+ return validate_positive_duration!(value.to_i, name) if duration_class_loaded && value.is_a?(ActiveSupport::Duration)
423
+
424
+ # Plain Numeric (Integer, Float, Rational). Use class identity rather
425
+ # than is_a? for the Duration exclusion because ActiveSupport overrides
426
+ # is_a? — see comment above.
427
+ if value.is_a?(Numeric) && (!defined?(ActiveSupport::Duration) || value.class != ActiveSupport::Duration)
428
+ return validate_positive_duration!(value, name)
429
+ end
430
+
431
+ raise ArgumentError,
432
+ "#{name} must be a Numeric (seconds), ActiveSupport::Duration, or nil to disable, got #{value.inspect}"
433
+ end
434
+
435
+ def validate_positive_duration!(numeric, name)
436
+ raise ArgumentError, "#{name} must be a positive number, got #{numeric}" unless numeric.positive?
437
+
438
+ numeric
439
+ end
440
+
441
+ # Read a capsule's name from either symbol or string key, normalized
442
+ # to a string for comparison. Returns nil for unnamed (legacy) entries.
443
+ def capsule_name(entry)
444
+ raw = entry[:name] || entry["name"]
445
+ raw&.to_s
446
+ end
447
+
448
+ # Validates that no queue in +new_queues+ would overlap with any
449
+ # existing capsule. The wildcard '*' counts as overlapping with EVERY
450
+ # other queue (and vice versa) because at runtime '*' is expanded to
451
+ # all known queues. Raises ArgumentError on overlap.
452
+ def validate_no_queue_overlap!(new_queues)
453
+ existing = (@workers || []).flat_map { |c| c[:queues] || c["queues"] || [] }
454
+ return if existing.empty?
455
+
456
+ if existing.include?(CapsuleDSL::WILDCARD)
457
+ raise ArgumentError,
458
+ "an existing capsule already uses '*' (matches every queue) — " \
459
+ "the new capsule's queues #{new_queues.inspect} would overlap with it"
460
+ end
461
+
462
+ if new_queues.include?(CapsuleDSL::WILDCARD)
463
+ raise ArgumentError,
464
+ "the new capsule uses '*' (matches every queue) but other capsules " \
465
+ "are already defined with queues #{existing.inspect} — " \
466
+ "the wildcard would overlap with all of them"
467
+ end
468
+
469
+ conflict = new_queues.find { |q| existing.include?(q) }
470
+ return unless conflict
471
+
472
+ raise ArgumentError,
473
+ "queue #{conflict.inspect} is already assigned to another capsule — " \
474
+ "each queue can only belong to one capsule"
475
+ end
476
+
477
+ def sum_thread_counts(entries, default_threads:, group:)
478
+ return 0 unless entries
479
+
480
+ entries.sum do |entry|
481
+ threads = entry[:threads] || entry["threads"] || default_threads
482
+ unless threads.is_a?(Integer) && threads.positive?
483
+ raise ArgumentError,
484
+ "#{group} threads must be a positive integer, got #{threads.inspect}"
485
+ end
486
+ threads
487
+ end
488
+ end
489
+
490
+ def warn_if_oversized(size)
491
+ return unless size > POOL_SIZE_WARN_THRESHOLD
492
+
493
+ Pgbus.logger.warn do
494
+ "[Pgbus] Auto-tuned pool_size is #{size} (over #{POOL_SIZE_WARN_THRESHOLD}). " \
495
+ "Verify your worker thread counts are intentional. " \
496
+ "Set Pgbus.configuration.pool_size explicitly to override."
497
+ end
498
+ end
499
+
220
500
  def extract_ar_connection_hash
221
501
  base = connects_to ? Pgbus::BusRecord : ActiveRecord::Base
222
502
  db_config = base.connection_db_config
@@ -32,7 +32,14 @@ module Pgbus
32
32
  ]
33
33
  )
34
34
  rescue StandardError => e
35
- Pgbus.logger.debug { "[Pgbus] Failed to record failed event: #{e.message}" }
35
+ # ERROR-level: silent loss of failure-tracking data defeats the
36
+ # purpose of the dashboard's "Failed Jobs" section. If recording
37
+ # fails, surface it loudly so the broken state can be diagnosed
38
+ # rather than silently masked.
39
+ Pgbus.logger.error do
40
+ "[Pgbus] Failed to record failed event for queue=#{queue_name} msg_id=#{msg_id}: " \
41
+ "#{e.class}: #{e.message}"
42
+ end
36
43
  end
37
44
 
38
45
  def clear!(queue_name:, msg_id:)
@@ -42,7 +49,13 @@ module Pgbus
42
49
  [queue_name, msg_id.to_i]
43
50
  )
44
51
  rescue StandardError => e
45
- Pgbus.logger.debug { "[Pgbus] Failed to clear failed event: #{e.message}" }
52
+ # ERROR-level: a failed clear leaves a stale row in the dashboard
53
+ # AFTER the job actually succeeded — confusing and load-bearing
54
+ # for users debugging recurring duplicates.
55
+ Pgbus.logger.error do
56
+ "[Pgbus] Failed to clear failed event for queue=#{queue_name} msg_id=#{msg_id}: " \
57
+ "#{e.class}: #{e.message}"
58
+ end
46
59
  end
47
60
 
48
61
  private