pgbus 0.7.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pgbus/active_job/executor.rb +27 -5
- data/lib/pgbus/client/ensure_stream_queue.rb +3 -1
- data/lib/pgbus/client.rb +37 -1
- data/lib/pgbus/configuration.rb +6 -0
- data/lib/pgbus/execution_pools/async_pool.rb +12 -1
- data/lib/pgbus/failed_event_recorder.rb +12 -0
- data/lib/pgbus/process/supervisor.rb +8 -0
- data/lib/pgbus/process/worker.rb +16 -0
- data/lib/pgbus/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b1a647a2e485cace147ca201861ff4613e9c14f72f8fd8be9060f64d6eccfff5
|
|
4
|
+
data.tar.gz: bf3ca37850d549c6e9ed86bedbad1d88f9a50462b86a2d8a09113de7c1227d0f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b32dac214071005fa5974478f2cb8bd4d8a270efc3e6bc94141b74c28cebfea0dfa84e41ad9bf1ec2a10c8ab0d09ae08d0df989f0894b8d5cbc6a461698ecf5b
|
|
7
|
+
data.tar.gz: ea55e1a29e53e491063c34aaea3a643498d1567968e64833d9a38deef2bcd7a48dfb35e377601d347fa8e05e55a5647a77dbe958d9584fcdc8669a362ff253a8
|
|
@@ -13,9 +13,17 @@ module Pgbus
|
|
|
13
13
|
@stat_buffer = stat_buffer
|
|
14
14
|
end
|
|
15
15
|
|
|
16
|
+
# Exceptions we never want to swallow — let the process die/signal propagate.
|
|
17
|
+
FATAL_EXCEPTIONS = [SystemExit, Interrupt, SignalException, NoMemoryError, SystemStackError].freeze
|
|
18
|
+
private_constant :FATAL_EXCEPTIONS
|
|
19
|
+
|
|
16
20
|
def execute(message, queue_name, source_queue: nil)
|
|
17
21
|
execution_start = monotonic_now
|
|
22
|
+
tag = "msg_id=#{message.msg_id} queue=#{queue_name} read_ct=#{message.read_ct}"
|
|
23
|
+
Pgbus.logger.debug { "[Pgbus::Executor] start #{tag}" }
|
|
24
|
+
|
|
18
25
|
payload = JSON.parse(message.message)
|
|
26
|
+
job_class = payload["job_class"]
|
|
19
27
|
read_count = message.read_ct.to_i
|
|
20
28
|
|
|
21
29
|
if read_count > config.max_retries
|
|
@@ -25,10 +33,9 @@ module Pgbus
|
|
|
25
33
|
signal_batch_discarded(payload)
|
|
26
34
|
Uniqueness.release_lock(Uniqueness.extract_key(payload))
|
|
27
35
|
record_stat(payload, queue_name, "dead_lettered", execution_start, message: message)
|
|
36
|
+
Pgbus.logger.debug { "[Pgbus::Executor] dead_lettered #{tag} job_class=#{job_class}" }
|
|
28
37
|
return :dead_lettered
|
|
29
38
|
end
|
|
30
|
-
|
|
31
|
-
job_class = payload["job_class"]
|
|
32
39
|
uniqueness_key = Uniqueness.extract_key(payload)
|
|
33
40
|
uniqueness_strategy = Uniqueness.extract_strategy(payload)
|
|
34
41
|
|
|
@@ -49,23 +56,38 @@ module Pgbus
|
|
|
49
56
|
end
|
|
50
57
|
end
|
|
51
58
|
|
|
59
|
+
Pgbus.logger.debug { "[Pgbus::Executor] deserialized #{tag} job_class=#{job_class}" }
|
|
52
60
|
job_succeeded = false
|
|
53
61
|
|
|
62
|
+
msg_id = message.msg_id.to_i
|
|
54
63
|
Instrumentation.instrument("pgbus.executor.execute", queue: queue_name, job_class: job_class) do
|
|
55
64
|
job = ::ActiveJob::Base.deserialize(payload)
|
|
65
|
+
Pgbus.logger.debug { "[Pgbus::Executor] running #{tag} job_class=#{job_class}" }
|
|
56
66
|
execute_job(job)
|
|
57
|
-
|
|
58
|
-
|
|
67
|
+
Pgbus.logger.debug { "[Pgbus::Executor] perform_returned #{tag} job_class=#{job_class}" }
|
|
68
|
+
archive_from(queue_name, msg_id, source_queue: source_queue)
|
|
69
|
+
Pgbus.logger.debug { "[Pgbus::Executor] archived #{tag} job_class=#{job_class}" }
|
|
70
|
+
FailedEventRecorder.clear!(queue_name: queue_name, msg_id: msg_id)
|
|
59
71
|
job_succeeded = true
|
|
60
72
|
end
|
|
61
73
|
|
|
62
74
|
instrument("pgbus.job_completed", queue: queue_name, job_class: job_class)
|
|
63
75
|
record_stat(payload, queue_name, "success", execution_start, message: message)
|
|
76
|
+
Pgbus.logger.debug { "[Pgbus::Executor] done #{tag} job_class=#{job_class}" }
|
|
64
77
|
:success
|
|
65
|
-
rescue
|
|
78
|
+
rescue *FATAL_EXCEPTIONS
|
|
79
|
+
# Process-fatal: propagate so the supervisor/OS can react.
|
|
80
|
+
raise
|
|
81
|
+
rescue Exception => e # rubocop:disable Lint/RescueException
|
|
82
|
+
# Widened from StandardError to catch Async::Stop / Async::Cancel
|
|
83
|
+
# (both inherit from Exception, not StandardError) under execution_mode: :async.
|
|
84
|
+
# Before this, a fiber interruption between perform_now and archive_from
|
|
85
|
+
# silently lost control flow — no failed event row, no job_failed
|
|
86
|
+
# notification, uniqueness lock held until VT expired. See issue #126.
|
|
66
87
|
handle_failure(message, queue_name, e, payload: payload)
|
|
67
88
|
instrument("pgbus.job_failed", queue: queue_name, job_class: payload&.dig("job_class"), error: e.class.name)
|
|
68
89
|
record_stat(payload, queue_name, "failed", execution_start, message: message)
|
|
90
|
+
Pgbus.logger.debug { "[Pgbus::Executor] failed #{tag} job_class=#{payload&.dig("job_class")} error=#{e.class}" }
|
|
69
91
|
# Don't signal concurrency on transient failure — the job will be retried.
|
|
70
92
|
# Semaphore is released only on success or dead-lettering.
|
|
71
93
|
:failed
|
|
@@ -27,7 +27,9 @@ module Pgbus
|
|
|
27
27
|
# sensitive and need every broadcast to fire a NOTIFY, even
|
|
28
28
|
# when several are batched within a single millisecond.
|
|
29
29
|
# Override the throttle to 0 specifically for stream queues.
|
|
30
|
-
|
|
30
|
+
# Use the idempotent path to avoid deadlocks when multiple
|
|
31
|
+
# processes race to set up the same stream queue.
|
|
32
|
+
synchronized { enable_notify_if_needed(full_name, 0) }
|
|
31
33
|
|
|
32
34
|
# CREATE INDEX IF NOT EXISTS is idempotent in Postgres but still
|
|
33
35
|
# requires a roundtrip and a brief ACCESS SHARE lock on the archive
|
data/lib/pgbus/client.rb
CHANGED
|
@@ -457,12 +457,48 @@ module Pgbus
|
|
|
457
457
|
synchronized do
|
|
458
458
|
@pgmq.create(full_name)
|
|
459
459
|
tune_autovacuum(full_name)
|
|
460
|
-
|
|
460
|
+
enable_notify_if_needed(full_name, NOTIFY_THROTTLE_MS)
|
|
461
461
|
end
|
|
462
462
|
true
|
|
463
463
|
end
|
|
464
464
|
end
|
|
465
465
|
|
|
466
|
+
def enable_notify_if_needed(full_name, throttle_ms)
|
|
467
|
+
return unless config.listen_notify
|
|
468
|
+
return if notify_trigger_current?(full_name, throttle_ms)
|
|
469
|
+
|
|
470
|
+
@pgmq.enable_notify_insert(full_name, throttle_interval_ms: throttle_ms)
|
|
471
|
+
end
|
|
472
|
+
|
|
473
|
+
# Check whether the NOTIFY trigger already exists on this queue with the
|
|
474
|
+
# expected throttle interval. When it does, we can skip the destructive
|
|
475
|
+
# DROP TRIGGER + CREATE TRIGGER cycle that causes deadlocks when multiple
|
|
476
|
+
# forked processes race during bootstrap.
|
|
477
|
+
def notify_trigger_current?(full_name, throttle_ms)
|
|
478
|
+
with_raw_connection do |conn|
|
|
479
|
+
result = conn.exec_params(<<~SQL, [full_name, throttle_ms])
|
|
480
|
+
SELECT 1
|
|
481
|
+
FROM pg_trigger t
|
|
482
|
+
JOIN pg_class c ON t.tgrelid = c.oid
|
|
483
|
+
JOIN pg_namespace n ON c.relnamespace = n.oid
|
|
484
|
+
WHERE n.nspname = 'pgmq'
|
|
485
|
+
AND c.relname = pgmq.format_table_name($1, 'q')
|
|
486
|
+
AND t.tgname = 'trigger_notify_queue_insert_listeners'
|
|
487
|
+
AND EXISTS (
|
|
488
|
+
SELECT 1 FROM pgmq.notify_insert_throttle
|
|
489
|
+
WHERE queue_name = $1
|
|
490
|
+
AND throttle_interval_ms = $2
|
|
491
|
+
)
|
|
492
|
+
LIMIT 1
|
|
493
|
+
SQL
|
|
494
|
+
result.ntuples.positive?
|
|
495
|
+
end
|
|
496
|
+
rescue StandardError
|
|
497
|
+
# If we can't check (e.g. pgmq schema not fully ready), fall back to
|
|
498
|
+
# the unconditional path — same behavior as before this fix.
|
|
499
|
+
false
|
|
500
|
+
end
|
|
501
|
+
|
|
466
502
|
def tune_autovacuum(queue_name)
|
|
467
503
|
with_raw_connection do |conn|
|
|
468
504
|
conn.exec(AutovacuumTuning.sql_for_queue(queue_name))
|
data/lib/pgbus/configuration.rb
CHANGED
|
@@ -85,6 +85,10 @@ module Pgbus
|
|
|
85
85
|
# Requires a matching entry in config/database.yml under the "pgbus" key.
|
|
86
86
|
attr_accessor :connects_to
|
|
87
87
|
|
|
88
|
+
# Zombie message detection — logs a warning when a message is redelivered
|
|
89
|
+
# (read_ct > 1) without any prior failure recorded in pgbus_failed_events.
|
|
90
|
+
attr_accessor :zombie_detection
|
|
91
|
+
|
|
88
92
|
# Job stats
|
|
89
93
|
attr_accessor :stats_enabled
|
|
90
94
|
attr_reader :stats_retention # rubocop:disable Style/AccessorGrouping
|
|
@@ -160,6 +164,8 @@ module Pgbus
|
|
|
160
164
|
@skip_recurring = false
|
|
161
165
|
@recurring_execution_retention = 7 * 24 * 3600 # 7 days
|
|
162
166
|
|
|
167
|
+
@zombie_detection = true
|
|
168
|
+
|
|
163
169
|
@stats_enabled = true
|
|
164
170
|
@stats_retention = 30 * 24 * 3600 # 30 days
|
|
165
171
|
|
|
@@ -128,9 +128,20 @@ module Pgbus
|
|
|
128
128
|
nil
|
|
129
129
|
end
|
|
130
130
|
|
|
131
|
+
# Supervisor-level rescue: catch any Exception raised from the user
|
|
132
|
+
# block so capacity is always restored and the failure is logged.
|
|
133
|
+
# The `async` gem uses Async::Stop / Async::Cancel (Exception subclasses,
|
|
134
|
+
# NOT StandardError) to cancel tasks, and prior to issue #126 those
|
|
135
|
+
# would leak past `rescue StandardError` and silently vanish.
|
|
136
|
+
# Process-fatal signals still propagate so the supervisor can react.
|
|
137
|
+
FATAL_EXCEPTIONS = [SystemExit, Interrupt, SignalException, NoMemoryError, SystemStackError].freeze
|
|
138
|
+
private_constant :FATAL_EXCEPTIONS
|
|
139
|
+
|
|
131
140
|
def perform(block)
|
|
132
141
|
block.call
|
|
133
|
-
rescue
|
|
142
|
+
rescue *FATAL_EXCEPTIONS
|
|
143
|
+
raise
|
|
144
|
+
rescue Exception => e # rubocop:disable Lint/RescueException
|
|
134
145
|
Pgbus.logger.error { "[Pgbus] Async pool fiber error: #{e.class}: #{e.message}" }
|
|
135
146
|
ensure
|
|
136
147
|
restore_capacity
|
|
@@ -35,6 +35,18 @@ module Pgbus
|
|
|
35
35
|
ErrorReporter.report(e, { action: "record_failed_event", queue: queue_name, msg_id: msg_id })
|
|
36
36
|
end
|
|
37
37
|
|
|
38
|
+
def exists?(queue_name:, msg_id:)
|
|
39
|
+
result = connection.select_value(
|
|
40
|
+
"SELECT 1 FROM pgbus_failed_events WHERE queue_name = $1 AND msg_id = $2 LIMIT 1",
|
|
41
|
+
"FailedEvent Exists",
|
|
42
|
+
[queue_name, msg_id.to_i]
|
|
43
|
+
)
|
|
44
|
+
!result.nil?
|
|
45
|
+
rescue StandardError => e
|
|
46
|
+
Pgbus.logger.debug { "[Pgbus] FailedEvent exists? check failed: #{e.class}: #{e.message}" }
|
|
47
|
+
false
|
|
48
|
+
end
|
|
49
|
+
|
|
38
50
|
def clear!(queue_name:, msg_id:)
|
|
39
51
|
connection.exec_delete(
|
|
40
52
|
"DELETE FROM pgbus_failed_events WHERE queue_name = $1 AND msg_id = $2",
|
|
@@ -21,6 +21,14 @@ module Pgbus
|
|
|
21
21
|
|
|
22
22
|
Pgbus.logger.info { "[Pgbus] Supervisor starting pid=#{::Process.pid}" }
|
|
23
23
|
|
|
24
|
+
# Bootstrap queues once in the parent process before forking children.
|
|
25
|
+
# This avoids the deadlock that occurs when multiple forked children
|
|
26
|
+
# race to call enable_notify_insert (DROP TRIGGER + CREATE TRIGGER)
|
|
27
|
+
# concurrently on the same queue tables. Children still call
|
|
28
|
+
# bootstrap_queues post-fork but the idempotent check in
|
|
29
|
+
# notify_trigger_current? makes those calls cheap no-ops.
|
|
30
|
+
bootstrap_queues
|
|
31
|
+
|
|
24
32
|
boot_processes
|
|
25
33
|
monitor_loop
|
|
26
34
|
ensure
|
data/lib/pgbus/process/worker.rb
CHANGED
|
@@ -126,6 +126,7 @@ module Pgbus
|
|
|
126
126
|
|
|
127
127
|
@rate_counter.increment(:dequeued, tagged_messages.size)
|
|
128
128
|
tagged_messages.each do |queue_name, message, source_queue|
|
|
129
|
+
detect_zombie(queue_name, message)
|
|
129
130
|
@in_flight.increment
|
|
130
131
|
@pool.post { process_message(message, queue_name, source_queue: source_queue) }
|
|
131
132
|
end
|
|
@@ -285,6 +286,21 @@ module Pgbus
|
|
|
285
286
|
Pgbus.logger.error { "[Pgbus] Queue table missing: #{error.message}" }
|
|
286
287
|
end
|
|
287
288
|
|
|
289
|
+
def detect_zombie(queue_name, message)
|
|
290
|
+
return unless config.zombie_detection
|
|
291
|
+
return unless message.read_ct.to_i > 1
|
|
292
|
+
|
|
293
|
+
return if FailedEventRecorder.exists?(queue_name: queue_name, msg_id: message.msg_id.to_i)
|
|
294
|
+
|
|
295
|
+
Pgbus.logger.warn do
|
|
296
|
+
"[Pgbus] Zombie message redelivered: queue=#{queue_name} msg_id=#{message.msg_id} " \
|
|
297
|
+
"read_ct=#{message.read_ct} — previous read did not record a failure. " \
|
|
298
|
+
"The worker may have crashed mid-execute or the executor silently dropped the job."
|
|
299
|
+
end
|
|
300
|
+
rescue StandardError => e
|
|
301
|
+
Pgbus.logger.debug { "[Pgbus] Zombie detection failed: #{e.class}: #{e.message}" }
|
|
302
|
+
end
|
|
303
|
+
|
|
288
304
|
def check_recycle
|
|
289
305
|
return unless @lifecycle.running? && recycle_needed?
|
|
290
306
|
|
data/lib/pgbus/version.rb
CHANGED