ruby_reactor 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.release-please-manifest.json +1 -1
- data/CHANGELOG.md +7 -0
- data/README.md +54 -14
- data/lib/ruby_reactor/dsl/compose_builder.rb +20 -0
- data/lib/ruby_reactor/dsl/lockable.rb +41 -1
- data/lib/ruby_reactor/executor/ordered_lock_support.rb +307 -0
- data/lib/ruby_reactor/executor.rb +82 -50
- data/lib/ruby_reactor/ordered_lock.rb +158 -0
- data/lib/ruby_reactor/reactor.rb +41 -0
- data/lib/ruby_reactor/rspec/helpers.rb +6 -0
- data/lib/ruby_reactor/rspec/matchers.rb +66 -0
- data/lib/ruby_reactor/rspec/sidekiq_helpers.rb +70 -0
- data/lib/ruby_reactor/rspec/storage_reset.rb +23 -0
- data/lib/ruby_reactor/rspec/test_subject.rb +14 -28
- data/lib/ruby_reactor/rspec.rb +37 -0
- data/lib/ruby_reactor/sidekiq_workers/worker.rb +46 -8
- data/lib/ruby_reactor/storage/redis_adapter.rb +1 -0
- data/lib/ruby_reactor/storage/redis_ordered_locking.rb +382 -0
- data/lib/ruby_reactor/version.rb +1 -1
- data/lib/ruby_reactor.rb +1 -0
- metadata +6 -1
data/lib/ruby_reactor/rspec.rb
CHANGED
|
@@ -2,17 +2,54 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative "rspec/helpers"
|
|
4
4
|
require_relative "rspec/matchers"
|
|
5
|
+
require_relative "rspec/sidekiq_helpers"
|
|
6
|
+
require_relative "rspec/storage_reset"
|
|
5
7
|
require_relative "rspec/test_subject"
|
|
6
8
|
|
|
7
9
|
module RubyReactor
|
|
8
10
|
module RSpec
|
|
11
|
+
# Examples opt into RubyReactor's RSpec setup (Sidekiq fake mode,
|
|
12
|
+
# storage wipe, snooze knob reset) by declaring `type: :reactor`.
|
|
13
|
+
REACTOR_METADATA = { type: :reactor }.freeze
|
|
14
|
+
|
|
15
|
+
DEFAULT_SNOOZE_BASE_DELAY = 5
|
|
16
|
+
DEFAULT_SNOOZE_JITTER = 5
|
|
17
|
+
DEFAULT_SNOOZE_MAX_ATTEMPTS = 20
|
|
18
|
+
|
|
9
19
|
def self.configure(config)
|
|
10
20
|
require_relative "rspec/step_executor_patch"
|
|
11
21
|
|
|
12
22
|
config.include RubyReactor::RSpec::Helpers
|
|
13
23
|
config.include RubyReactor::RSpec::Matchers
|
|
24
|
+
config.include RubyReactor::RSpec::SidekiqHelpers, REACTOR_METADATA
|
|
14
25
|
|
|
15
26
|
::RubyReactor::Executor::StepExecutor.prepend(RubyReactor::RSpec::StepExecutorPatch)
|
|
27
|
+
StorageReset.install!
|
|
28
|
+
|
|
29
|
+
config.before(:each, REACTOR_METADATA) do
|
|
30
|
+
RubyReactor::RSpec.prepare_example!
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
# Idempotent setup invoked before each `type: :reactor` example. Restores
|
|
35
|
+
# Sidekiq fake mode, clears the queues, wipes the storage adapter, and
|
|
36
|
+
# rolls back snooze knobs so cross-example bleed-through can't happen.
|
|
37
|
+
def self.prepare_example!
|
|
38
|
+
if defined?(::Sidekiq::Testing)
|
|
39
|
+
begin
|
|
40
|
+
::Sidekiq::Testing.fake! unless ::Sidekiq::Testing.fake?
|
|
41
|
+
rescue ::Sidekiq::Testing::TestModeAlreadySetError
|
|
42
|
+
# Nested fake!/inline! block already active in this thread; leave it.
|
|
43
|
+
end
|
|
44
|
+
::Sidekiq::Worker.clear_all
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
adapter = ::RubyReactor.configuration.storage_adapter
|
|
48
|
+
adapter.reset! if adapter.respond_to?(:reset!)
|
|
49
|
+
|
|
50
|
+
::RubyReactor.configuration.lock_snooze_base_delay = DEFAULT_SNOOZE_BASE_DELAY
|
|
51
|
+
::RubyReactor.configuration.lock_snooze_jitter = DEFAULT_SNOOZE_JITTER
|
|
52
|
+
::RubyReactor.configuration.lock_snooze_max_attempts = DEFAULT_SNOOZE_MAX_ATTEMPTS
|
|
16
53
|
end
|
|
17
54
|
end
|
|
18
55
|
end
|
data/lib/ruby_reactor/sidekiq_workers/worker.rb
CHANGED
|
@@ -48,17 +48,24 @@ module RubyReactor
|
|
|
48
48
|
# Resume execution from the failed step
|
|
49
49
|
executor = Executor.new(context.reactor_class, {}, context)
|
|
50
50
|
executor.resume_execution
|
|
51
|
-
executor
|
|
51
|
+
# Skip the post-run save when the executor deliberately suppressed
|
|
52
|
+
# persistence (stale-batch redelivery of an already-terminal context)
|
|
53
|
+
# — re-saving here would clobber the stored terminal record with this
|
|
54
|
+
# run's stale in-memory status.
|
|
55
|
+
executor.save_context unless executor.skip_context_persist?
|
|
52
56
|
|
|
53
57
|
# Return the executor (which now has the result stored in it)
|
|
54
58
|
executor
|
|
55
59
|
rescue RubyReactor::Lock::AcquisitionError,
|
|
56
60
|
RubyReactor::Semaphore::AcquisitionError,
|
|
57
|
-
RubyReactor::RateLimit::ExceededError => e
|
|
58
|
-
|
|
59
|
-
#
|
|
60
|
-
#
|
|
61
|
-
#
|
|
61
|
+
RubyReactor::RateLimit::ExceededError,
|
|
62
|
+
RubyReactor::OrderedLock::WaitError => e
|
|
63
|
+
# Snooze on expected concurrency, rate, or ordering contention.
|
|
64
|
+
# OrderedLock::WaitError also carries a poison-pill-derived retry hint,
|
|
65
|
+
# but compute_snooze_delay below ignores it. We avoid Sidekiq's native
|
|
66
|
+
# retry path so this doesn't burn the job's retry budget or appear
|
|
67
|
+
# as an error in dashboards. After the configured cap is reached we
|
|
68
|
+
# escalate by marking the reactor as failed (WaitError is never capped).
|
|
62
69
|
handle_snooze(serialized_context, reactor_class_name, context, snooze_count, e)
|
|
63
70
|
rescue RubyReactor::RateLimitRegistry::UnknownLimitError => e
|
|
64
71
|
# Permanent configuration error — snoozing or retrying the same job
|
|
@@ -73,7 +80,15 @@ module RubyReactor
|
|
|
73
80
|
config = RubyReactor.configuration
|
|
74
81
|
max = config.lock_snooze_max_attempts
|
|
75
82
|
|
|
76
|
-
|
|
83
|
+
# OrderedLock::WaitError bypasses the snooze cap. The gate's
|
|
84
|
+
# poison_pill_timeout is the only meaningful upper bound on how long a
|
|
85
|
+
# nonce can legitimately wait; capping snoozes would either fail jobs
|
|
86
|
+
# prematurely or strand the nonce in `assigned_at` until poison_pill
|
|
87
|
+
# eventually advances past it. Snooze until the gate passes (or poison
|
|
88
|
+
# auto-advance moves the cursor past us).
|
|
89
|
+
capped = !error.is_a?(RubyReactor::OrderedLock::WaitError)
|
|
90
|
+
|
|
91
|
+
if capped && max != :infinity && snooze_count >= max
|
|
77
92
|
escalate_snooze(context, snooze_count, error)
|
|
78
93
|
return
|
|
79
94
|
end
|
|
@@ -86,17 +101,32 @@ module RubyReactor
|
|
|
86
101
|
# (RateLimit::ExceededError carries the time until the bucket rolls);
|
|
87
102
|
# otherwise fall back to the configured base + jitter for lock/semaphore
|
|
88
103
|
# contention which has no precise hint.
|
|
104
|
+
#
|
|
105
|
+
# OrderedLock::WaitError is deliberately excluded from the hint path: its
|
|
106
|
+
# `retry_after_seconds` is the poison-pill window (the upper bound before
|
|
107
|
+
# a *dead* blocker is force-advanced), NOT how long the *live* blocker
|
|
108
|
+
# will take — which is usually milliseconds. Snoozing for the full window
|
|
109
|
+
# would make every out-of-order nonce sleep up to poison_pill_timeout even
|
|
110
|
+
# though its blocker finishes immediately, collapsing throughput. Re-poll
|
|
111
|
+
# at the base delay instead; poison auto-advance still clears a genuinely
|
|
112
|
+
# dead blocker on a later gate.
|
|
89
113
|
def compute_snooze_delay(config, error)
|
|
90
114
|
jitter = config.lock_snooze_jitter.to_f
|
|
91
115
|
jitter_amount = jitter.positive? ? rand(0.0..jitter) : 0.0
|
|
92
116
|
|
|
93
|
-
if error.respond_to?(:retry_after_seconds) && error.retry_after_seconds
|
|
117
|
+
if hinted_retry?(error)
|
|
94
118
|
[error.retry_after_seconds.to_f, 0.1].max + jitter_amount
|
|
95
119
|
else
|
|
96
120
|
config.lock_snooze_base_delay.to_f + jitter_amount
|
|
97
121
|
end
|
|
98
122
|
end
|
|
99
123
|
|
|
124
|
+
def hinted_retry?(error)
|
|
125
|
+
return false if error.is_a?(RubyReactor::OrderedLock::WaitError)
|
|
126
|
+
|
|
127
|
+
error.respond_to?(:retry_after_seconds) && error.retry_after_seconds
|
|
128
|
+
end
|
|
129
|
+
|
|
100
130
|
def escalate_snooze(context, snooze_count, error)
|
|
101
131
|
RubyReactor.configuration.logger.warn(
|
|
102
132
|
"RubyReactor snooze limit reached after #{snooze_count} attempts " \
|
|
@@ -117,6 +147,14 @@ module RubyReactor
|
|
|
117
147
|
serialized,
|
|
118
148
|
reactor_class_name
|
|
119
149
|
)
|
|
150
|
+
|
|
151
|
+
# Escalation is a terminal Failure that never reaches the Executor's
|
|
152
|
+
# ensure path, so advance the ordered-lock cursor here. Without this
|
|
153
|
+
# the nonce stays stranded in assigned_at (successors stall for the
|
|
154
|
+
# full poison_pill_timeout) and, worse, the strict-mode chain marker
|
|
155
|
+
# is never recorded — successors would RUN instead of being skipped.
|
|
156
|
+
info = Executor::OrderedLockSupport.info_from(context)
|
|
157
|
+
Executor::OrderedLockSupport.advance_with_retry(info, failed: true) if info
|
|
120
158
|
end
|
|
121
159
|
|
|
122
160
|
def log_infrastructure_failure(msg, exception)
|
data/lib/ruby_reactor/storage/redis_ordered_locking.rb
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyReactor
|
|
4
|
+
module Storage
|
|
5
|
+
# Ordered Lock Primitives — used by `with_ordered_lock` to enforce
|
|
6
|
+
# strict transaction ordering via a monotonically increasing nonce
|
|
7
|
+
# assigned at enqueue time. See `RubyReactor::OrderedLock`.
|
|
8
|
+
#
|
|
9
|
+
# Storage layout (hash-tagged so a Redis cluster keeps them on the same
|
|
10
|
+
# shard):
|
|
11
|
+
# STRING ordered_lock:{<key>}:next — last-assigned nonce
|
|
12
|
+
# STRING ordered_lock:{<key>}:last_completed — last-advanced nonce
|
|
13
|
+
# HASH ordered_lock:{<key>}:assigned_at — { nonce => unix_ts }
|
|
14
|
+
# STRING ordered_lock:{<key>}:first_failed — nonce of the FIRST run
|
|
15
|
+
# whose terminal status
|
|
16
|
+
# was Failure (strict mode
|
|
17
|
+
# poison marker; 0 / unset
|
|
18
|
+
# if no failure yet).
|
|
19
|
+
# STRING ordered_lock:{<key>}:epoch — generation counter,
|
|
20
|
+
# bumped each time a fresh
|
|
21
|
+
# batch starts (nonce 1).
|
|
22
|
+
# Captured at assign and
|
|
23
|
+
# carried by every gate /
|
|
24
|
+
# advance call so a stale
|
|
25
|
+
# straggler from a drained
|
|
26
|
+
# batch (whose nonce numbers
|
|
27
|
+
# the next batch reuses)
|
|
28
|
+
# is fenced out as a no-op.
|
|
29
|
+
#
|
|
30
|
+
# When `last_completed == next`, the next/last_completed/assigned_at/
|
|
31
|
+
# first_failed keys are garbage-collected by the ADVANCE script and the next
|
|
32
|
+
# assign starts at 1 again — a fresh batch always starts un-poisoned. The
|
|
33
|
+
# `epoch` key is deliberately NOT GC'd (only TTL-expires when fully idle) so
|
|
34
|
+
# the generation keeps incrementing across back-to-back batches.
|
|
35
|
+
module RedisOrderedLocking # rubocop:disable Metrics/ModuleLength
|
|
36
|
+
ASSIGN_SCRIPT = <<~LUA
|
|
37
|
+
local next_key = KEYS[1]
|
|
38
|
+
local last_key = KEYS[2]
|
|
39
|
+
local at_key = KEYS[3]
|
|
40
|
+
local epoch_key = KEYS[4]
|
|
41
|
+
local now = ARGV[1]
|
|
42
|
+
local ttl = tonumber(ARGV[2])
|
|
43
|
+
|
|
44
|
+
local nonce = redis.call('incr', next_key)
|
|
45
|
+
redis.call('expire', next_key, ttl)
|
|
46
|
+
|
|
47
|
+
if redis.call('exists', last_key) == 0 then
|
|
48
|
+
redis.call('set', last_key, 0, 'EX', ttl)
|
|
49
|
+
else
|
|
50
|
+
redis.call('expire', last_key, ttl)
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
-- nonce == 1 means `next` was absent (first ever, or GC'd after a full
|
|
54
|
+
-- drain), so this is the start of a fresh batch -> bump the generation.
|
|
55
|
+
-- Later nonces in the same batch read the epoch the nonce-1 caller set.
|
|
56
|
+
local epoch
|
|
57
|
+
if nonce == 1 then
|
|
58
|
+
epoch = redis.call('incr', epoch_key)
|
|
59
|
+
else
|
|
60
|
+
epoch = tonumber(redis.call('get', epoch_key) or '1')
|
|
61
|
+
end
|
|
62
|
+
redis.call('expire', epoch_key, ttl)
|
|
63
|
+
|
|
64
|
+
redis.call('hset', at_key, nonce, now)
|
|
65
|
+
redis.call('expire', at_key, ttl)
|
|
66
|
+
return {nonce, epoch}
|
|
67
|
+
LUA
|
|
68
|
+
|
|
69
|
+
CAN_PROCEED_SCRIPT = <<~LUA
|
|
70
|
+
local next_key = KEYS[1]
|
|
71
|
+
local last_key = KEYS[2]
|
|
72
|
+
local at_key = KEYS[3]
|
|
73
|
+
local fail_key = KEYS[4]
|
|
74
|
+
local epoch_key = KEYS[5]
|
|
75
|
+
local my = tonumber(ARGV[1])
|
|
76
|
+
local now = tonumber(ARGV[2])
|
|
77
|
+
local pp = tonumber(ARGV[3])
|
|
78
|
+
local my_epoch = tonumber(ARGV[4] or '0')
|
|
79
|
+
|
|
80
|
+
local last = tonumber(redis.call('get', last_key) or '0')
|
|
81
|
+
local first_failed = tonumber(redis.call('get', fail_key) or '0')
|
|
82
|
+
|
|
83
|
+
-- Stale-batch fence: a caller carrying an epoch from a drained batch
|
|
84
|
+
-- (whose nonce numbers the current batch reused) must not gate against
|
|
85
|
+
-- or poison-advance this batch. my_epoch == 0 is a legacy/no-epoch
|
|
86
|
+
-- caller (e.g. an in-flight job from before this field existed) — skip.
|
|
87
|
+
local cur_epoch = tonumber(redis.call('get', epoch_key) or '0')
|
|
88
|
+
if my_epoch > 0 and my_epoch ~= cur_epoch then
|
|
89
|
+
return {'stale', 0, last, first_failed}
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
-- Drained-batch fence: both counters absent means this caller's batch
|
|
93
|
+
-- fully drained and was GC'd (or wholly TTL-expired) while it slept —
|
|
94
|
+
-- e.g. a poison-passed straggler waking between the drain and the next
|
|
95
|
+
-- batch's first assign (epoch not yet bumped, so the stale fence can't
|
|
96
|
+
-- catch it). It may run late (poison semantics) but must NOT enter the
|
|
97
|
+
-- poison loop below: SET on the missing last_key would resurrect the
|
|
98
|
+
-- cursor with no TTL and let every nonce of the NEXT batch gate
|
|
99
|
+
-- straight through. Only both-absent is conclusive — a mid-batch
|
|
100
|
+
-- next_key TTL hiccup leaves last_key in place and proceeds normally.
|
|
101
|
+
-- Returns a DISTINCT 'drained_go' (not plain 'go') so the executor can
|
|
102
|
+
-- tell this apart: a genuine late straggler should run, but a Sidekiq
|
|
103
|
+
-- at-least-once redelivery of an already-terminal context must NOT
|
|
104
|
+
-- re-execute. The executor consults the stored context status to decide.
|
|
105
|
+
if redis.call('exists', next_key) == 0 and redis.call('exists', last_key) == 0 then
|
|
106
|
+
return {'drained_go', 0, last, first_failed}
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
-- Liveness heartbeat: a gate check proves this caller is alive (about
|
|
110
|
+
-- to run, or snoozing on lock/rate contention and re-checking), so
|
|
111
|
+
-- restamp its own assigned_at. assigned_at is otherwise set once at
|
|
112
|
+
-- ENQUEUE, meaning a job that merely sat in a deep queue longer than
|
|
113
|
+
-- poison_pill_timeout would be poison-passed the moment a successor
|
|
114
|
+
-- gates — this restamp gives it a full pp window from the time it
|
|
115
|
+
-- actually starts. hexists guard: never resurrect an entry that an
|
|
116
|
+
-- out-of-order terminal advance already deleted.
|
|
117
|
+
if my > last and redis.call('hexists', at_key, my) == 1 then
|
|
118
|
+
redis.call('hset', at_key, my, now)
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
if my <= last then
|
|
122
|
+
return {'go', 0, last, first_failed}
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
if my == last + 1 then
|
|
126
|
+
return {'go', 0, last, first_failed}
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
-- Drain consecutive poisoned blockers in one shot. Without this loop a
|
|
130
|
+
-- cluster of N dead blockers takes N snooze rounds to clear (one round
|
|
131
|
+
-- per blocker); with it, a single can_proceed call sweeps them all.
|
|
132
|
+
-- Bounded by `my` so the loop runs at most O(stream length) per call.
|
|
133
|
+
local advanced_via_poison = false
|
|
134
|
+
while last + 1 < my do
|
|
135
|
+
local blocker = last + 1
|
|
136
|
+
local at = tonumber(redis.call('hget', at_key, blocker) or '0')
|
|
137
|
+
-- Only a blocker with a recent assigned_at timestamp is genuinely in
|
|
138
|
+
-- flight; stop draining there. `at == 0` means the timer is gone (an
|
|
139
|
+
-- out-of-order advance deleted it, or the assigned_at hash expired),
|
|
140
|
+
-- so the blocker can never make progress on its own — advance past it
|
|
141
|
+
-- rather than stalling forever (the original `break` here was a
|
|
142
|
+
-- permanent head-of-line hang, the exact thing poison-pill prevents).
|
|
143
|
+
if at > 0 and (now - at) <= pp then
|
|
144
|
+
break
|
|
145
|
+
end
|
|
146
|
+
redis.call('set', last_key, blocker, 'KEEPTTL')
|
|
147
|
+
redis.call('hdel', at_key, blocker)
|
|
148
|
+
last = blocker
|
|
149
|
+
advanced_via_poison = true
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
if my <= last then
|
|
153
|
+
return {advanced_via_poison and 'poison_advance' or 'go', 0, last, first_failed}
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
if my == last + 1 then
|
|
157
|
+
return {advanced_via_poison and 'poison_advance' or 'go', 0, last, first_failed}
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
local blocker = last + 1
|
|
161
|
+
local blocker_assigned = tonumber(redis.call('hget', at_key, blocker) or '0')
|
|
162
|
+
local hint
|
|
163
|
+
if blocker_assigned > 0 then
|
|
164
|
+
hint = pp - (now - blocker_assigned)
|
|
165
|
+
else
|
|
166
|
+
hint = pp
|
|
167
|
+
end
|
|
168
|
+
if hint < 1 then hint = 1 end
|
|
169
|
+
return {'wait', hint, last, first_failed}
|
|
170
|
+
LUA
|
|
171
|
+
|
|
172
|
+
ADVANCE_SCRIPT = <<~LUA
|
|
173
|
+
local next_key = KEYS[1]
|
|
174
|
+
local last_key = KEYS[2]
|
|
175
|
+
local at_key = KEYS[3]
|
|
176
|
+
local fail_key = KEYS[4]
|
|
177
|
+
local epoch_key = KEYS[5]
|
|
178
|
+
local my = tonumber(ARGV[1])
|
|
179
|
+
local failed = tonumber(ARGV[2]) == 1
|
|
180
|
+
local ttl = tonumber(ARGV[3])
|
|
181
|
+
local my_epoch = tonumber(ARGV[4] or '0')
|
|
182
|
+
|
|
183
|
+
-- Stale-batch fence: an advance carrying an epoch from a drained batch
|
|
184
|
+
-- whose nonce numbers were reused must not mutate the current batch's
|
|
185
|
+
-- counters. This is the core protection against a slow straggler from a
|
|
186
|
+
-- prior batch corrupting a later one. my_epoch == 0 = legacy caller.
|
|
187
|
+
local cur_epoch = tonumber(redis.call('get', epoch_key) or '0')
|
|
188
|
+
if my_epoch > 0 and my_epoch ~= cur_epoch then
|
|
189
|
+
return tonumber(redis.call('get', last_key) or '0')
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
-- Drained-batch fence (mirrors CAN_PROCEED): a late terminal from a
|
|
193
|
+
-- batch that already drained and GC'd must be a complete no-op. Without
|
|
194
|
+
-- it, an in-order straggler advance (my == 0 + 1) would SET last_key
|
|
195
|
+
-- with KEEPTTL on a missing key — resurrecting a TTL-less cursor that
|
|
196
|
+
-- un-gates every nonce of the next batch — and a failed straggler
|
|
197
|
+
-- would write fail_key, strict-poisoning a batch that hasn't started.
|
|
198
|
+
if redis.call('exists', next_key) == 0 and redis.call('exists', last_key) == 0 then
|
|
199
|
+
return 0
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
local last = tonumber(redis.call('get', last_key) or '0')
|
|
203
|
+
|
|
204
|
+
-- Record the chain poison marker for ANY terminal failure ahead of the
|
|
205
|
+
-- cursor (my > last), not just the in-order successor; strict-mode
|
|
206
|
+
-- chain-skip relies on the SMALLEST failed nonce being recorded. A
|
|
207
|
+
-- failure at or behind the cursor (my <= last) is deliberately NOT
|
|
208
|
+
-- recorded: that guard is what keeps a Sidekiq duplicate redelivery of
|
|
209
|
+
-- an already-terminated failure from re-poisoning a chain that moved
|
|
210
|
+
-- on. Cost of the trade-off: a run the poison-advance already passed
|
|
211
|
+
-- (cursor moved beyond it) that later fails does NOT poison the chain
|
|
212
|
+
-- — by then ordering was already ceded for that nonce and successors
|
|
213
|
+
-- may have run.
|
|
214
|
+
if failed and my > last then
|
|
215
|
+
local existing = tonumber(redis.call('get', fail_key) or '0')
|
|
216
|
+
if existing == 0 or my < existing then
|
|
217
|
+
redis.call('set', fail_key, my, 'EX', ttl)
|
|
218
|
+
end
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
if my == last + 1 then
|
|
222
|
+
redis.call('set', last_key, my, 'KEEPTTL')
|
|
223
|
+
redis.call('hdel', at_key, my)
|
|
224
|
+
last = my
|
|
225
|
+
|
|
226
|
+
local nxt = tonumber(redis.call('get', next_key) or '0')
|
|
227
|
+
-- Guard with `nxt > 0` (mirroring SKIP_SCRIPT): a missing/expired
|
|
228
|
+
-- next_key reads as 0 and `last >= 0` would otherwise GC live
|
|
229
|
+
-- counters mid-sequence, resetting numbering and dropping the marker.
|
|
230
|
+
if last >= nxt and nxt > 0 then
|
|
231
|
+
redis.call('del', next_key)
|
|
232
|
+
redis.call('del', last_key)
|
|
233
|
+
redis.call('del', at_key)
|
|
234
|
+
redis.call('del', fail_key)
|
|
235
|
+
end
|
|
236
|
+
return last
|
|
237
|
+
end
|
|
238
|
+
|
|
239
|
+
redis.call('hdel', at_key, my)
|
|
240
|
+
return last
|
|
241
|
+
LUA
|
|
242
|
+
|
|
243
|
+
# Liveness restamp for a nonce that is actively executing its steps. The
|
|
244
|
+
# CAN_PROCEED heartbeat only fires when a job runs its gate; a blocker that
|
|
245
|
+
# passed the gate and is now running long steps never re-gates, so without
|
|
246
|
+
# this a successor would poison-advance past a still-running blocker once
|
|
247
|
+
# its steps outlast poison_pill_timeout — a silent ordering violation. A
|
|
248
|
+
# background thread calls this every pp/3 seconds while steps run.
|
|
249
|
+
#
|
|
250
|
+
# Guards: epoch-fenced (a stale-batch straggler must not touch the current
|
|
251
|
+
# batch) and hexists-guarded (never resurrect a timer a terminal advance
|
|
252
|
+
# already deleted — if we were already poison-passed, stay passed).
|
|
253
|
+
HEARTBEAT_SCRIPT = <<~LUA
|
|
254
|
+
local at_key = KEYS[1]
|
|
255
|
+
local epoch_key = KEYS[2]
|
|
256
|
+
local my = ARGV[1]
|
|
257
|
+
local now = ARGV[2]
|
|
258
|
+
local my_epoch = tonumber(ARGV[3] or '0')
|
|
259
|
+
|
|
260
|
+
local cur_epoch = tonumber(redis.call('get', epoch_key) or '0')
|
|
261
|
+
if my_epoch > 0 and my_epoch ~= cur_epoch then
|
|
262
|
+
return 0
|
|
263
|
+
end
|
|
264
|
+
|
|
265
|
+
if redis.call('hexists', at_key, my) == 1 then
|
|
266
|
+
redis.call('hset', at_key, my, now)
|
|
267
|
+
return 1
|
|
268
|
+
end
|
|
269
|
+
return 0
|
|
270
|
+
LUA
|
|
271
|
+
|
|
272
|
+
SKIP_SCRIPT = <<~LUA
|
|
273
|
+
local next_key = KEYS[1]
|
|
274
|
+
local last_key = KEYS[2]
|
|
275
|
+
local at_key = KEYS[3]
|
|
276
|
+
local fail_key = KEYS[4]
|
|
277
|
+
local my = tonumber(ARGV[1])
|
|
278
|
+
|
|
279
|
+
-- Drained-batch fence (mirrors CAN_PROCEED/ADVANCE): an ops `skip!` of a
|
|
280
|
+
-- nonce whose batch already drained must not SET last_key on a missing
|
|
281
|
+
-- key — KEEPTTL on an absent key would create a TTL-less cursor and
|
|
282
|
+
-- un-gate the next batch. Nothing to skip in a drained batch anyway.
|
|
283
|
+
if redis.call('exists', next_key) == 0 and redis.call('exists', last_key) == 0 then
|
|
284
|
+
return 0
|
|
285
|
+
end
|
|
286
|
+
|
|
287
|
+
local last = tonumber(redis.call('get', last_key) or '0')
|
|
288
|
+
|
|
289
|
+
if my > last then
|
|
290
|
+
-- KEEPTTL: forcing the cursor forward must not strip the sequence TTL
|
|
291
|
+
-- and leave a persistent key behind for a sequence that never drains.
|
|
292
|
+
redis.call('set', last_key, my, 'KEEPTTL')
|
|
293
|
+
end
|
|
294
|
+
redis.call('hdel', at_key, my)
|
|
295
|
+
|
|
296
|
+
last = tonumber(redis.call('get', last_key) or '0')
|
|
297
|
+
local nxt = tonumber(redis.call('get', next_key) or '0')
|
|
298
|
+
if last >= nxt and nxt > 0 then
|
|
299
|
+
redis.call('del', next_key)
|
|
300
|
+
redis.call('del', last_key)
|
|
301
|
+
redis.call('del', at_key)
|
|
302
|
+
redis.call('del', fail_key)
|
|
303
|
+
end
|
|
304
|
+
return last
|
|
305
|
+
LUA
|
|
306
|
+
|
|
307
|
+
# Returns `[nonce, epoch]`: the assigned nonce plus the generation it
|
|
308
|
+
# belongs to. The caller stashes both so later gate/advance calls can be
|
|
309
|
+
# fenced if the batch drains and its numbers get reused.
|
|
310
|
+
def ordered_lock_assign(key, ttl: 86_400, now: Time.now.to_i)
|
|
311
|
+
next_k, last_k, at_k, _fail_k, epoch_k = ordered_lock_keys(key)
|
|
312
|
+
nonce, epoch = @redis.eval(
|
|
313
|
+
ASSIGN_SCRIPT, keys: [next_k, last_k, at_k, epoch_k], argv: [now.to_s, ttl]
|
|
314
|
+
)
|
|
315
|
+
[nonce.to_i, epoch.to_i]
|
|
316
|
+
end
|
|
317
|
+
|
|
318
|
+
def ordered_lock_can_proceed(key, nonce:, poison_pill_timeout:, epoch: 0, now: Time.now.to_i)
|
|
319
|
+
state, retry_after, last_completed, first_failed = @redis.eval(
|
|
320
|
+
CAN_PROCEED_SCRIPT,
|
|
321
|
+
keys: ordered_lock_keys(key),
|
|
322
|
+
argv: [nonce.to_i, now.to_i, poison_pill_timeout.to_i, epoch.to_i]
|
|
323
|
+
)
|
|
324
|
+
[state.to_s, retry_after.to_i, last_completed.to_i, first_failed.to_i]
|
|
325
|
+
end
|
|
326
|
+
|
|
327
|
+
def ordered_lock_advance(key, nonce:, failed: false, epoch: 0, ttl: 86_400)
|
|
328
|
+
@redis.eval(
|
|
329
|
+
ADVANCE_SCRIPT,
|
|
330
|
+
keys: ordered_lock_keys(key),
|
|
331
|
+
argv: [nonce.to_i, failed ? 1 : 0, ttl.to_i, epoch.to_i]
|
|
332
|
+
).to_i
|
|
333
|
+
end
|
|
334
|
+
|
|
335
|
+
def ordered_lock_skip(key, nonce:)
|
|
336
|
+
@redis.eval(SKIP_SCRIPT, keys: ordered_lock_keys(key), argv: [nonce.to_i]).to_i
|
|
337
|
+
end
|
|
338
|
+
|
|
339
|
+
# Restamp a running nonce's assigned_at to keep it from being poison-passed
|
|
340
|
+
# while its steps execute. Returns 1 if restamped, 0 if fenced (stale
|
|
341
|
+
# epoch) or the timer was already gone. `now` is injectable for testing.
|
|
342
|
+
def ordered_lock_heartbeat(key, nonce:, epoch: 0, now: Time.now.to_i)
|
|
343
|
+
_next_k, _last_k, at_k, _fail_k, epoch_k = ordered_lock_keys(key)
|
|
344
|
+
@redis.eval(
|
|
345
|
+
HEARTBEAT_SCRIPT, keys: [at_k, epoch_k], argv: [nonce.to_i, now.to_i, epoch.to_i]
|
|
346
|
+
).to_i
|
|
347
|
+
end
|
|
348
|
+
|
|
349
|
+
def ordered_lock_reset(key)
|
|
350
|
+
@redis.del(*ordered_lock_keys(key))
|
|
351
|
+
end
|
|
352
|
+
|
|
353
|
+
def ordered_lock_peek(key)
|
|
354
|
+
next_k, last_k, at_k, fail_k = ordered_lock_keys(key)
|
|
355
|
+
next_v = @redis.get(next_k)
|
|
356
|
+
last_v = @redis.get(last_k)
|
|
357
|
+
fail_v = @redis.get(fail_k)
|
|
358
|
+
in_flight = @redis.hkeys(at_k).map(&:to_i).sort
|
|
359
|
+
{
|
|
360
|
+
next: next_v ? next_v.to_i : 0,
|
|
361
|
+
last_completed: last_v ? last_v.to_i : 0,
|
|
362
|
+
in_flight: in_flight,
|
|
363
|
+
first_failed: fail_v ? fail_v.to_i : 0
|
|
364
|
+
}
|
|
365
|
+
end
|
|
366
|
+
|
|
367
|
+
# Order matters: callers destructure positionally and some scripts take a
|
|
368
|
+
# subset. `epoch` is appended last so existing 4-key destructures keep
|
|
369
|
+
# working unchanged.
|
|
370
|
+
def ordered_lock_keys(key)
|
|
371
|
+
tag = "{#{key}}"
|
|
372
|
+
[
|
|
373
|
+
"ordered_lock:#{tag}:next",
|
|
374
|
+
"ordered_lock:#{tag}:last_completed",
|
|
375
|
+
"ordered_lock:#{tag}:assigned_at",
|
|
376
|
+
"ordered_lock:#{tag}:first_failed",
|
|
377
|
+
"ordered_lock:#{tag}:epoch"
|
|
378
|
+
]
|
|
379
|
+
end
|
|
380
|
+
end
|
|
381
|
+
end
|
|
382
|
+
end
|
data/lib/ruby_reactor/version.rb
CHANGED
data/lib/ruby_reactor.rb
CHANGED
|
@@ -6,6 +6,7 @@ require_relative "ruby_reactor/registry"
|
|
|
6
6
|
require_relative "ruby_reactor/utils/code_extractor"
|
|
7
7
|
require_relative "ruby_reactor/dsl/lockable" # Add this
|
|
8
8
|
require_relative "ruby_reactor/lock"
|
|
9
|
+
require_relative "ruby_reactor/ordered_lock"
|
|
9
10
|
require_relative "ruby_reactor/semaphore"
|
|
10
11
|
require_relative "ruby_reactor/period"
|
|
11
12
|
require_relative "ruby_reactor/rate_limit"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby_reactor
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.1
|
|
4
|
+
version: 0.5.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Artur
|
|
@@ -125,6 +125,7 @@ files:
|
|
|
125
125
|
- lib/ruby_reactor/executor/compensation_manager.rb
|
|
126
126
|
- lib/ruby_reactor/executor/graph_manager.rb
|
|
127
127
|
- lib/ruby_reactor/executor/input_validator.rb
|
|
128
|
+
- lib/ruby_reactor/executor/ordered_lock_support.rb
|
|
128
129
|
- lib/ruby_reactor/executor/result_handler.rb
|
|
129
130
|
- lib/ruby_reactor/executor/retry_manager.rb
|
|
130
131
|
- lib/ruby_reactor/executor/step_executor.rb
|
|
@@ -139,6 +140,7 @@ files:
|
|
|
139
140
|
- lib/ruby_reactor/middleware.rb
|
|
140
141
|
- lib/ruby_reactor/middleware_runner.rb
|
|
141
142
|
- lib/ruby_reactor/open_telemetry.rb
|
|
143
|
+
- lib/ruby_reactor/ordered_lock.rb
|
|
142
144
|
- lib/ruby_reactor/period.rb
|
|
143
145
|
- lib/ruby_reactor/rate_limit.rb
|
|
144
146
|
- lib/ruby_reactor/rate_limit_registry.rb
|
|
@@ -149,7 +151,9 @@ files:
|
|
|
149
151
|
- lib/ruby_reactor/rspec.rb
|
|
150
152
|
- lib/ruby_reactor/rspec/helpers.rb
|
|
151
153
|
- lib/ruby_reactor/rspec/matchers.rb
|
|
154
|
+
- lib/ruby_reactor/rspec/sidekiq_helpers.rb
|
|
152
155
|
- lib/ruby_reactor/rspec/step_executor_patch.rb
|
|
156
|
+
- lib/ruby_reactor/rspec/storage_reset.rb
|
|
153
157
|
- lib/ruby_reactor/rspec/test_subject.rb
|
|
154
158
|
- lib/ruby_reactor/semaphore.rb
|
|
155
159
|
- lib/ruby_reactor/sidekiq_adapter.rb
|
|
@@ -163,6 +167,7 @@ files:
|
|
|
163
167
|
- lib/ruby_reactor/storage/configuration.rb
|
|
164
168
|
- lib/ruby_reactor/storage/redis_adapter.rb
|
|
165
169
|
- lib/ruby_reactor/storage/redis_locking.rb
|
|
170
|
+
- lib/ruby_reactor/storage/redis_ordered_locking.rb
|
|
166
171
|
- lib/ruby_reactor/template/base.rb
|
|
167
172
|
- lib/ruby_reactor/template/dynamic_source.rb
|
|
168
173
|
- lib/ruby_reactor/template/element.rb
|