pgbus 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dca12989cdb8d34cd026facd0ecb71cc9df51fb966135806a858e1c4bdb0f7cc
4
- data.tar.gz: 4f94274ec1bda644087f5711f2a75f8b1c70f5840f5e0d1d9a1b7ea137f6e3ea
3
+ metadata.gz: c45dd364f341b6819b7901f583e058dbe761b375210e4162f19faf75917e3043
4
+ data.tar.gz: ed5ee189a3ff3d7fe0610deada3b2daba3726a2647d7762c58735e0771c9e0eb
5
5
  SHA512:
6
- metadata.gz: decefd5c55ee2f4ca0c1aa6d52e1e1732f89ba3e789a0ae64c6b0bbd1cf60fe0d0c138dc87398f0a80b5e28d7e5cd7cd09eb44e2265265265adcfd9c0ee829f5
7
- data.tar.gz: ad140f317e5de14558842881f2dd428281a8fb141986e7d8099976969a2d85b87dd08e7d01f64ebf5128f5b3174dd485b7af5c1f8d00b07bb4037ec095256c8c
6
+ metadata.gz: e28c032dc7b4f2cba37bd709c4a45030bcedd90274b86a1499d55dd4f4e255769e580983023602a42267b8728068a7eb11f7fdcbc8d12bd3efd83f35a50f241b
7
+ data.tar.gz: dc6d8d5d2e4feebbf7d940c173f53700549b5364c0e15df3f6afac1d72ecc6b1222127d85649016e9c6590e3c184eb9365da95c424a21a98be683862f4a24e67
@@ -13,6 +13,10 @@ module Pgbus
13
13
  @stat_buffer = stat_buffer
14
14
  end
15
15
 
16
+ # Exceptions we never want to swallow — let the process die/signal propagate.
17
+ FATAL_EXCEPTIONS = [SystemExit, Interrupt, SignalException, NoMemoryError, SystemStackError].freeze
18
+ private_constant :FATAL_EXCEPTIONS
19
+
16
20
  def execute(message, queue_name, source_queue: nil)
17
21
  execution_start = monotonic_now
18
22
  payload = JSON.parse(message.message)
@@ -51,18 +55,36 @@ module Pgbus
51
55
 
52
56
  job_succeeded = false
53
57
 
58
+ # Debug-level phase markers. Silent at INFO+, but invaluable when a
59
+ # fiber interrupt or connection issue loses control flow between phases
60
+ # (issue #126). Each line identifies msg_id + phase so the gap is
61
+ # visible in logs: "phase=archive" without "phase=succeeded" means the job
62
+ # ran but archiving its message (or the cleanup after it) never completed.
63
+ msg_id = message.msg_id.to_i
54
64
  Instrumentation.instrument("pgbus.executor.execute", queue: queue_name, job_class: job_class) do
65
+ Pgbus.logger.debug { "[Pgbus] Executor phase=deserialize msg_id=#{msg_id} job=#{job_class}" }
55
66
  job = ::ActiveJob::Base.deserialize(payload)
67
+ Pgbus.logger.debug { "[Pgbus] Executor phase=perform msg_id=#{msg_id} job=#{job_class}" }
56
68
  execute_job(job)
57
- archive_from(queue_name, message.msg_id.to_i, source_queue: source_queue)
58
- FailedEventRecorder.clear!(queue_name: queue_name, msg_id: message.msg_id.to_i)
69
+ Pgbus.logger.debug { "[Pgbus] Executor phase=archive msg_id=#{msg_id} job=#{job_class}" }
70
+ archive_from(queue_name, msg_id, source_queue: source_queue)
71
+ FailedEventRecorder.clear!(queue_name: queue_name, msg_id: msg_id)
59
72
  job_succeeded = true
73
+ Pgbus.logger.debug { "[Pgbus] Executor phase=succeeded msg_id=#{msg_id} job=#{job_class}" }
60
74
  end
61
75
 
62
76
  instrument("pgbus.job_completed", queue: queue_name, job_class: job_class)
63
77
  record_stat(payload, queue_name, "success", execution_start, message: message)
64
78
  :success
65
- rescue StandardError => e
79
+ rescue *FATAL_EXCEPTIONS
80
+ # Process-fatal: propagate so the supervisor/OS can react.
81
+ raise
82
+ rescue Exception => e # rubocop:disable Lint/RescueException
83
+ # Widened from StandardError to catch Async::Stop / Async::Cancel
84
+ # (both inherit from Exception, not StandardError) under execution_mode: :async.
85
+ # Before this, a fiber interruption between perform_now and archive_from
86
+ # silently lost control flow — no failed event row, no job_failed
87
+ # notification, uniqueness lock held until VT expired. See issue #126.
66
88
  handle_failure(message, queue_name, e, payload: payload)
67
89
  instrument("pgbus.job_failed", queue: queue_name, job_class: payload&.dig("job_class"), error: e.class.name)
68
90
  record_stat(payload, queue_name, "failed", execution_start, message: message)
@@ -27,7 +27,9 @@ module Pgbus
27
27
  # sensitive and need every broadcast to fire a NOTIFY, even
28
28
  # when several are batched within a single millisecond.
29
29
  # Override the throttle to 0 specifically for stream queues.
30
- synchronized { @pgmq.enable_notify_insert(full_name, throttle_interval_ms: 0) } if config.listen_notify
30
+ # Use the idempotent path to avoid deadlocks when multiple
31
+ # processes race to set up the same stream queue.
32
+ synchronized { enable_notify_if_needed(full_name, 0) }
31
33
 
32
34
  # CREATE INDEX IF NOT EXISTS is idempotent in Postgres but still
33
35
  # requires a roundtrip and a brief ACCESS SHARE lock on the archive
data/lib/pgbus/client.rb CHANGED
@@ -457,12 +457,48 @@ module Pgbus
457
457
  synchronized do
458
458
  @pgmq.create(full_name)
459
459
  tune_autovacuum(full_name)
460
- @pgmq.enable_notify_insert(full_name, throttle_interval_ms: NOTIFY_THROTTLE_MS) if config.listen_notify
460
+ enable_notify_if_needed(full_name, NOTIFY_THROTTLE_MS)
461
461
  end
462
462
  true
463
463
  end
464
464
  end
465
465
 
466
+ def enable_notify_if_needed(full_name, throttle_ms)
467
+ return unless config.listen_notify
468
+ return if notify_trigger_current?(full_name, throttle_ms)
469
+
470
+ @pgmq.enable_notify_insert(full_name, throttle_interval_ms: throttle_ms)
471
+ end
472
+
473
+ # Check whether the NOTIFY trigger already exists on this queue with the
474
+ # expected throttle interval. When it does, we can skip the destructive
475
+ # DROP TRIGGER + CREATE TRIGGER cycle that causes deadlocks when multiple
476
+ # forked processes race during bootstrap.
477
+ def notify_trigger_current?(full_name, throttle_ms)
478
+ with_raw_connection do |conn|
479
+ result = conn.exec_params(<<~SQL, [full_name, throttle_ms])
480
+ SELECT 1
481
+ FROM pg_trigger t
482
+ JOIN pg_class c ON t.tgrelid = c.oid
483
+ JOIN pg_namespace n ON c.relnamespace = n.oid
484
+ WHERE n.nspname = 'pgmq'
485
+ AND c.relname = pgmq.format_table_name($1, 'q')
486
+ AND t.tgname = 'trigger_notify_queue_insert_listeners'
487
+ AND EXISTS (
488
+ SELECT 1 FROM pgmq.notify_insert_throttle
489
+ WHERE queue_name = $1
490
+ AND throttle_interval_ms = $2
491
+ )
492
+ LIMIT 1
493
+ SQL
494
+ result.ntuples.positive?
495
+ end
496
+ rescue StandardError
497
+ # If we can't check (e.g. pgmq schema not fully ready), fall back to
498
+ # the unconditional path — same behavior as before this fix.
499
+ false
500
+ end
501
+
466
502
  def tune_autovacuum(queue_name)
467
503
  with_raw_connection do |conn|
468
504
  conn.exec(AutovacuumTuning.sql_for_queue(queue_name))
@@ -128,9 +128,20 @@ module Pgbus
128
128
  nil
129
129
  end
130
130
 
131
+ # Supervisor-level rescue: catch any Exception raised from the user
132
+ # block so capacity is always restored and the failure is logged.
133
+ # The `async` gem uses Async::Stop / Async::Cancel (Exception subclasses,
134
+ # NOT StandardError) to cancel tasks, and before the fix for issue #126 those
135
+ # would leak past `rescue StandardError` and silently vanish.
136
+ # Process-fatal signals still propagate so the supervisor can react.
137
+ FATAL_EXCEPTIONS = [SystemExit, Interrupt, SignalException, NoMemoryError, SystemStackError].freeze
138
+ private_constant :FATAL_EXCEPTIONS
139
+
131
140
  def perform(block)
132
141
  block.call
133
- rescue StandardError => e
142
+ rescue *FATAL_EXCEPTIONS
143
+ raise
144
+ rescue Exception => e # rubocop:disable Lint/RescueException
134
145
  Pgbus.logger.error { "[Pgbus] Async pool fiber error: #{e.class}: #{e.message}" }
135
146
  ensure
136
147
  restore_capacity
@@ -21,6 +21,14 @@ module Pgbus
21
21
 
22
22
  Pgbus.logger.info { "[Pgbus] Supervisor starting pid=#{::Process.pid}" }
23
23
 
24
+ # Bootstrap queues once in the parent process before forking children.
25
+ # This avoids the deadlock that occurs when multiple forked children
26
+ # race to call enable_notify_insert (DROP TRIGGER + CREATE TRIGGER)
27
+ # concurrently on the same queue tables. Children still call
28
+ # bootstrap_queues post-fork but the idempotent check in
29
+ # notify_trigger_current? makes those calls cheap no-ops.
30
+ bootstrap_queues
31
+
24
32
  boot_processes
25
33
  monitor_loop
26
34
  ensure
data/lib/pgbus/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Pgbus
4
- VERSION = "0.7.0"
4
+ VERSION = "0.7.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pgbus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mikael Henriksson