pgbus 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dca12989cdb8d34cd026facd0ecb71cc9df51fb966135806a858e1c4bdb0f7cc
4
- data.tar.gz: 4f94274ec1bda644087f5711f2a75f8b1c70f5840f5e0d1d9a1b7ea137f6e3ea
3
+ metadata.gz: b1a647a2e485cace147ca201861ff4613e9c14f72f8fd8be9060f64d6eccfff5
4
+ data.tar.gz: bf3ca37850d549c6e9ed86bedbad1d88f9a50462b86a2d8a09113de7c1227d0f
5
5
  SHA512:
6
- metadata.gz: decefd5c55ee2f4ca0c1aa6d52e1e1732f89ba3e789a0ae64c6b0bbd1cf60fe0d0c138dc87398f0a80b5e28d7e5cd7cd09eb44e2265265265adcfd9c0ee829f5
7
- data.tar.gz: ad140f317e5de14558842881f2dd428281a8fb141986e7d8099976969a2d85b87dd08e7d01f64ebf5128f5b3174dd485b7af5c1f8d00b07bb4037ec095256c8c
6
+ metadata.gz: b32dac214071005fa5974478f2cb8bd4d8a270efc3e6bc94141b74c28cebfea0dfa84e41ad9bf1ec2a10c8ab0d09ae08d0df989f0894b8d5cbc6a461698ecf5b
7
+ data.tar.gz: ea55e1a29e53e491063c34aaea3a643498d1567968e64833d9a38deef2bcd7a48dfb35e377601d347fa8e05e55a5647a77dbe958d9584fcdc8669a362ff253a8
@@ -13,9 +13,17 @@ module Pgbus
13
13
  @stat_buffer = stat_buffer
14
14
  end
15
15
 
16
+ # Exceptions we never want to swallow — let the process die/signal propagate.
17
+ FATAL_EXCEPTIONS = [SystemExit, Interrupt, SignalException, NoMemoryError, SystemStackError].freeze
18
+ private_constant :FATAL_EXCEPTIONS
19
+
16
20
  def execute(message, queue_name, source_queue: nil)
17
21
  execution_start = monotonic_now
22
+ tag = "msg_id=#{message.msg_id} queue=#{queue_name} read_ct=#{message.read_ct}"
23
+ Pgbus.logger.debug { "[Pgbus::Executor] start #{tag}" }
24
+
18
25
  payload = JSON.parse(message.message)
26
+ job_class = payload["job_class"]
19
27
  read_count = message.read_ct.to_i
20
28
 
21
29
  if read_count > config.max_retries
@@ -25,10 +33,9 @@ module Pgbus
25
33
  signal_batch_discarded(payload)
26
34
  Uniqueness.release_lock(Uniqueness.extract_key(payload))
27
35
  record_stat(payload, queue_name, "dead_lettered", execution_start, message: message)
36
+ Pgbus.logger.debug { "[Pgbus::Executor] dead_lettered #{tag} job_class=#{job_class}" }
28
37
  return :dead_lettered
29
38
  end
30
-
31
- job_class = payload["job_class"]
32
39
  uniqueness_key = Uniqueness.extract_key(payload)
33
40
  uniqueness_strategy = Uniqueness.extract_strategy(payload)
34
41
 
@@ -49,23 +56,38 @@ module Pgbus
49
56
  end
50
57
  end
51
58
 
59
+ Pgbus.logger.debug { "[Pgbus::Executor] deserialized #{tag} job_class=#{job_class}" }
52
60
  job_succeeded = false
53
61
 
62
+ msg_id = message.msg_id.to_i
54
63
  Instrumentation.instrument("pgbus.executor.execute", queue: queue_name, job_class: job_class) do
55
64
  job = ::ActiveJob::Base.deserialize(payload)
65
+ Pgbus.logger.debug { "[Pgbus::Executor] running #{tag} job_class=#{job_class}" }
56
66
  execute_job(job)
57
- archive_from(queue_name, message.msg_id.to_i, source_queue: source_queue)
58
- FailedEventRecorder.clear!(queue_name: queue_name, msg_id: message.msg_id.to_i)
67
+ Pgbus.logger.debug { "[Pgbus::Executor] perform_returned #{tag} job_class=#{job_class}" }
68
+ archive_from(queue_name, msg_id, source_queue: source_queue)
69
+ Pgbus.logger.debug { "[Pgbus::Executor] archived #{tag} job_class=#{job_class}" }
70
+ FailedEventRecorder.clear!(queue_name: queue_name, msg_id: msg_id)
59
71
  job_succeeded = true
60
72
  end
61
73
 
62
74
  instrument("pgbus.job_completed", queue: queue_name, job_class: job_class)
63
75
  record_stat(payload, queue_name, "success", execution_start, message: message)
76
+ Pgbus.logger.debug { "[Pgbus::Executor] done #{tag} job_class=#{job_class}" }
64
77
  :success
65
- rescue StandardError => e
78
+ rescue *FATAL_EXCEPTIONS
79
+ # Process-fatal: propagate so the supervisor/OS can react.
80
+ raise
81
+ rescue Exception => e # rubocop:disable Lint/RescueException
82
+ # Widened from StandardError to catch Async::Stop / Async::Cancel
83
+ # (both inherit from Exception, not StandardError) under execution_mode: :async.
84
+ # Before this, a fiber interruption between perform_now and archive_from
85
+ # silently lost control flow — no failed event row, no job_failed
86
+ # notification, uniqueness lock held until VT expired. See issue #126.
66
87
  handle_failure(message, queue_name, e, payload: payload)
67
88
  instrument("pgbus.job_failed", queue: queue_name, job_class: payload&.dig("job_class"), error: e.class.name)
68
89
  record_stat(payload, queue_name, "failed", execution_start, message: message)
90
+ Pgbus.logger.debug { "[Pgbus::Executor] failed #{tag} job_class=#{payload&.dig("job_class")} error=#{e.class}" }
69
91
  # Don't signal concurrency on transient failure — the job will be retried.
70
92
  # Semaphore is released only on success or dead-lettering.
71
93
  :failed
@@ -27,7 +27,9 @@ module Pgbus
27
27
  # sensitive and need every broadcast to fire a NOTIFY, even
28
28
  # when several are batched within a single millisecond.
29
29
  # Override the throttle to 0 specifically for stream queues.
30
- synchronized { @pgmq.enable_notify_insert(full_name, throttle_interval_ms: 0) } if config.listen_notify
30
+ # Use the idempotent path to avoid deadlocks when multiple
31
+ # processes race to set up the same stream queue.
32
+ synchronized { enable_notify_if_needed(full_name, 0) }
31
33
 
32
34
  # CREATE INDEX IF NOT EXISTS is idempotent in Postgres but still
33
35
  # requires a roundtrip and a brief ACCESS SHARE lock on the archive
data/lib/pgbus/client.rb CHANGED
@@ -457,12 +457,48 @@ module Pgbus
457
457
  synchronized do
458
458
  @pgmq.create(full_name)
459
459
  tune_autovacuum(full_name)
460
- @pgmq.enable_notify_insert(full_name, throttle_interval_ms: NOTIFY_THROTTLE_MS) if config.listen_notify
460
+ enable_notify_if_needed(full_name, NOTIFY_THROTTLE_MS)
461
461
  end
462
462
  true
463
463
  end
464
464
  end
465
465
 
466
+ def enable_notify_if_needed(full_name, throttle_ms)
467
+ return unless config.listen_notify
468
+ return if notify_trigger_current?(full_name, throttle_ms)
469
+
470
+ @pgmq.enable_notify_insert(full_name, throttle_interval_ms: throttle_ms)
471
+ end
472
+
473
+ # Check whether the NOTIFY trigger already exists on this queue with the
474
+ # expected throttle interval. When it does, we can skip the destructive
475
+ # DROP TRIGGER + CREATE TRIGGER cycle that causes deadlocks when multiple
476
+ # forked processes race during bootstrap.
477
+ def notify_trigger_current?(full_name, throttle_ms)
478
+ with_raw_connection do |conn|
479
+ result = conn.exec_params(<<~SQL, [full_name, throttle_ms])
480
+ SELECT 1
481
+ FROM pg_trigger t
482
+ JOIN pg_class c ON t.tgrelid = c.oid
483
+ JOIN pg_namespace n ON c.relnamespace = n.oid
484
+ WHERE n.nspname = 'pgmq'
485
+ AND c.relname = pgmq.format_table_name($1, 'q')
486
+ AND t.tgname = 'trigger_notify_queue_insert_listeners'
487
+ AND EXISTS (
488
+ SELECT 1 FROM pgmq.notify_insert_throttle
489
+ WHERE queue_name = $1
490
+ AND throttle_interval_ms = $2
491
+ )
492
+ LIMIT 1
493
+ SQL
494
+ result.ntuples.positive?
495
+ end
496
+ rescue StandardError
497
+ # If we can't check (e.g. pgmq schema not fully ready), fall back to
498
+ # the unconditional path — same behavior as before this fix.
499
+ false
500
+ end
501
+
466
502
  def tune_autovacuum(queue_name)
467
503
  with_raw_connection do |conn|
468
504
  conn.exec(AutovacuumTuning.sql_for_queue(queue_name))
@@ -85,6 +85,10 @@ module Pgbus
85
85
  # Requires a matching entry in config/database.yml under the "pgbus" key.
86
86
  attr_accessor :connects_to
87
87
 
88
+ # Zombie message detection — logs a warning when a message is redelivered
89
+ # (read_ct > 1) without any prior failure recorded in pgbus_failed_events.
90
+ attr_accessor :zombie_detection
91
+
88
92
  # Job stats
89
93
  attr_accessor :stats_enabled
90
94
  attr_reader :stats_retention # rubocop:disable Style/AccessorGrouping
@@ -160,6 +164,8 @@ module Pgbus
160
164
  @skip_recurring = false
161
165
  @recurring_execution_retention = 7 * 24 * 3600 # 7 days
162
166
 
167
+ @zombie_detection = true
168
+
163
169
  @stats_enabled = true
164
170
  @stats_retention = 30 * 24 * 3600 # 30 days
165
171
 
@@ -128,9 +128,20 @@ module Pgbus
128
128
  nil
129
129
  end
130
130
 
131
+ # Supervisor-level rescue: catch any Exception raised from the user
132
+ # block so capacity is always restored and the failure is logged.
133
+ # The `async` gem uses Async::Stop / Async::Cancel (Exception subclasses,
134
+ # NOT StandardError) to cancel tasks, and prior to issue #126 those
135
+ # would leak past `rescue StandardError` and silently vanish.
136
+ # Process-fatal signals still propagate so the supervisor can react.
137
+ FATAL_EXCEPTIONS = [SystemExit, Interrupt, SignalException, NoMemoryError, SystemStackError].freeze
138
+ private_constant :FATAL_EXCEPTIONS
139
+
131
140
  def perform(block)
132
141
  block.call
133
- rescue StandardError => e
142
+ rescue *FATAL_EXCEPTIONS
143
+ raise
144
+ rescue Exception => e # rubocop:disable Lint/RescueException
134
145
  Pgbus.logger.error { "[Pgbus] Async pool fiber error: #{e.class}: #{e.message}" }
135
146
  ensure
136
147
  restore_capacity
@@ -35,6 +35,18 @@ module Pgbus
35
35
  ErrorReporter.report(e, { action: "record_failed_event", queue: queue_name, msg_id: msg_id })
36
36
  end
37
37
 
38
+ def exists?(queue_name:, msg_id:)
39
+ result = connection.select_value(
40
+ "SELECT 1 FROM pgbus_failed_events WHERE queue_name = $1 AND msg_id = $2 LIMIT 1",
41
+ "FailedEvent Exists",
42
+ [queue_name, msg_id.to_i]
43
+ )
44
+ !result.nil?
45
+ rescue StandardError => e
46
+ Pgbus.logger.debug { "[Pgbus] FailedEvent exists? check failed: #{e.class}: #{e.message}" }
47
+ false
48
+ end
49
+
38
50
  def clear!(queue_name:, msg_id:)
39
51
  connection.exec_delete(
40
52
  "DELETE FROM pgbus_failed_events WHERE queue_name = $1 AND msg_id = $2",
@@ -21,6 +21,14 @@ module Pgbus
21
21
 
22
22
  Pgbus.logger.info { "[Pgbus] Supervisor starting pid=#{::Process.pid}" }
23
23
 
24
+ # Bootstrap queues once in the parent process before forking children.
25
+ # This avoids the deadlock that occurs when multiple forked children
26
+ # race to call enable_notify_insert (DROP TRIGGER + CREATE TRIGGER)
27
+ # concurrently on the same queue tables. Children still call
28
+ # bootstrap_queues post-fork but the idempotent check in
29
+ # notify_trigger_current? makes those calls cheap no-ops.
30
+ bootstrap_queues
31
+
24
32
  boot_processes
25
33
  monitor_loop
26
34
  ensure
@@ -126,6 +126,7 @@ module Pgbus
126
126
 
127
127
  @rate_counter.increment(:dequeued, tagged_messages.size)
128
128
  tagged_messages.each do |queue_name, message, source_queue|
129
+ detect_zombie(queue_name, message)
129
130
  @in_flight.increment
130
131
  @pool.post { process_message(message, queue_name, source_queue: source_queue) }
131
132
  end
@@ -285,6 +286,21 @@ module Pgbus
285
286
  Pgbus.logger.error { "[Pgbus] Queue table missing: #{error.message}" }
286
287
  end
287
288
 
289
+ def detect_zombie(queue_name, message)
290
+ return unless config.zombie_detection
291
+ return unless message.read_ct.to_i > 1
292
+
293
+ return if FailedEventRecorder.exists?(queue_name: queue_name, msg_id: message.msg_id.to_i)
294
+
295
+ Pgbus.logger.warn do
296
+ "[Pgbus] Zombie message redelivered: queue=#{queue_name} msg_id=#{message.msg_id} " \
297
+ "read_ct=#{message.read_ct} — previous read did not record a failure. " \
298
+ "The worker may have crashed mid-execute or the executor silently dropped the job."
299
+ end
300
+ rescue StandardError => e
301
+ Pgbus.logger.debug { "[Pgbus] Zombie detection failed: #{e.class}: #{e.message}" }
302
+ end
303
+
288
304
  def check_recycle
289
305
  return unless @lifecycle.running? && recycle_needed?
290
306
 
data/lib/pgbus/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Pgbus
4
- VERSION = "0.7.0"
4
+ VERSION = "0.7.2"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pgbus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mikael Henriksson