ruby_reactor 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.release-please-manifest.json +1 -1
- data/CHANGELOG.md +7 -0
- data/README.md +147 -34
- data/lib/ruby_reactor/configuration.rb +66 -2
- data/lib/ruby_reactor/context_serializer.rb +9 -4
- data/lib/ruby_reactor/executor/ordered_lock_support.rb +1 -1
- data/lib/ruby_reactor/executor/retry_manager.rb +7 -2
- data/lib/ruby_reactor/executor/step_executor.rb +25 -5
- data/lib/ruby_reactor/executor.rb +85 -3
- data/lib/ruby_reactor/lock.rb +13 -0
- data/lib/ruby_reactor/map/collector.rb +41 -0
- data/lib/ruby_reactor/map/dispatcher.rb +42 -0
- data/lib/ruby_reactor/map/element_executor.rb +39 -0
- data/lib/ruby_reactor/map/helpers.rb +10 -3
- data/lib/ruby_reactor/map/sweeper.rb +110 -0
- data/lib/ruby_reactor/reactor.rb +7 -5
- data/lib/ruby_reactor/sidekiq_adapter.rb +9 -8
- data/lib/ruby_reactor/sidekiq_workers/sweeper_worker.rb +73 -0
- data/lib/ruby_reactor/sidekiq_workers/worker.rb +42 -34
- data/lib/ruby_reactor/step/map_step.rb +18 -2
- data/lib/ruby_reactor/storage/redis_adapter.rb +83 -60
- data/lib/ruby_reactor/storage/redis_locking.rb +8 -0
- data/lib/ruby_reactor/sweeper.rb +58 -0
- data/lib/ruby_reactor/version.rb +1 -1
- data/lib/ruby_reactor.rb +42 -0
- metadata +4 -1
|
@@ -8,6 +8,42 @@ module RubyReactor
|
|
|
8
8
|
def self.perform(arguments)
|
|
9
9
|
arguments = arguments.transform_keys(&:to_sym)
|
|
10
10
|
map_id = arguments[:map_id]
|
|
11
|
+
|
|
12
|
+
# Serialize concurrent collector deliveries for the SAME map (eager queue +
|
|
13
|
+
# counter-zero trigger + sweeper re-trigger could otherwise all resume the
|
|
14
|
+
# parent at once and all write its context). A dedicated map_collect lock
|
|
15
|
+
# is used rather than the parent's own lock so it never conflicts with the
|
|
16
|
+
# context lock the parent's resume_execution acquires for itself.
|
|
17
|
+
lock = acquire_collect_lock(map_id)
|
|
18
|
+
return if lock == :contended
|
|
19
|
+
|
|
20
|
+
begin
|
|
21
|
+
perform_collection(arguments)
|
|
22
|
+
ensure
|
|
23
|
+
lock.release if lock.respond_to?(:release)
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def self.acquire_collect_lock(map_id)
|
|
28
|
+
return :inline if inline_testing_mode?
|
|
29
|
+
|
|
30
|
+
lock = RubyReactor::Lock.new(
|
|
31
|
+
"map_collect:#{map_id}",
|
|
32
|
+
owner: SecureRandom.uuid, ttl: RubyReactor.configuration.context_lock_ttl,
|
|
33
|
+
wait: 0, auto_extend: true
|
|
34
|
+
)
|
|
35
|
+
lock.acquire
|
|
36
|
+
lock
|
|
37
|
+
rescue RubyReactor::Lock::AcquisitionError
|
|
38
|
+
:contended
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def self.inline_testing_mode?
|
|
42
|
+
defined?(Sidekiq::Testing) && Sidekiq::Testing.respond_to?(:inline?) && Sidekiq::Testing.inline?
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def self.perform_collection(arguments)
|
|
46
|
+
map_id = arguments[:map_id]
|
|
11
47
|
parent_context_id = arguments[:parent_context_id]
|
|
12
48
|
parent_reactor_class_name = arguments[:parent_reactor_class_name]
|
|
13
49
|
step_name = arguments[:step_name]
|
|
@@ -18,6 +54,11 @@ module RubyReactor
|
|
|
18
54
|
parent_context_data = storage.retrieve_context(parent_context_id, parent_reactor_class_name)
|
|
19
55
|
parent_context = RubyReactor::Context.deserialize_from_retry(parent_context_data)
|
|
20
56
|
|
|
57
|
+
# Idempotency: if the parent already recorded this map step's result, a
|
|
58
|
+
# prior collector already resumed it. Re-resuming would double-execute the
|
|
59
|
+
# steps after the map. Skip.
|
|
60
|
+
return if parent_context.intermediate_results.key?(step_name.to_sym)
|
|
61
|
+
|
|
21
62
|
# Check if all tasks are completed
|
|
22
63
|
metadata = storage.retrieve_map_metadata(map_id, parent_reactor_class_name)
|
|
23
64
|
total_count = metadata ? metadata["count"].to_i : 0
|
|
@@ -104,6 +104,48 @@ module RubyReactor
|
|
|
104
104
|
end
|
|
105
105
|
end
|
|
106
106
|
|
|
107
|
+
# Re-dispatch a SPECIFIC index whose result slot is missing (Phase 5c, used
|
|
108
|
+
# by the map sweeper). Index-driven rather than offset-driven: resolve the
|
|
109
|
+
# source from the stored parent context and pick source[index]. Idempotent
|
|
110
|
+
# because store_map_result HSETs by index — a re-run overwrites slot `index`,
|
|
111
|
+
# never duplicates.
|
|
112
|
+
def self.requeue_index(map_meta, index)
|
|
113
|
+
storage = RubyReactor.configuration.storage_adapter
|
|
114
|
+
parent_class_name = map_meta["parent_reactor_class_name"]
|
|
115
|
+
parent_context = load_parent_context_from_storage(map_meta["parent_context_id"], parent_class_name, storage)
|
|
116
|
+
|
|
117
|
+
arguments = {
|
|
118
|
+
map_id: map_meta["map_id"],
|
|
119
|
+
step_name: map_meta["step_name"],
|
|
120
|
+
strict_ordering: map_meta["strict_ordering"],
|
|
121
|
+
parent_context_id: map_meta["parent_context_id"],
|
|
122
|
+
parent_reactor_class_name: parent_class_name,
|
|
123
|
+
fail_fast: map_meta["fail_fast"],
|
|
124
|
+
batch_size: map_meta["batch_size"]
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
source = resolve_source(arguments, parent_context)
|
|
128
|
+
element = element_at(source, index)
|
|
129
|
+
|
|
130
|
+
queue_element_job(element, index, {
|
|
131
|
+
map_id: map_meta["map_id"],
|
|
132
|
+
arguments: arguments,
|
|
133
|
+
context: parent_context,
|
|
134
|
+
reactor_class_info: map_meta["reactor_class_info"],
|
|
135
|
+
step_name: map_meta["step_name"]
|
|
136
|
+
})
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
def self.element_at(source, index)
|
|
140
|
+
if source.is_a?(Array)
|
|
141
|
+
source[index]
|
|
142
|
+
elsif source.respond_to?(:offset) && source.respond_to?(:limit)
|
|
143
|
+
source.offset(index).limit(1).to_a.first
|
|
144
|
+
else
|
|
145
|
+
source.drop(index).first
|
|
146
|
+
end
|
|
147
|
+
end
|
|
148
|
+
|
|
107
149
|
def self.queue_element_job(element, index, options)
|
|
108
150
|
arguments = options[:arguments]
|
|
109
151
|
context = options[:context]
|
|
@@ -8,6 +8,45 @@ module RubyReactor
|
|
|
8
8
|
def self.perform(arguments)
|
|
9
9
|
arguments = arguments.transform_keys(&:to_sym)
|
|
10
10
|
|
|
11
|
+
# Per-element liveness lock (Phase 5b): its presence is the map sweeper's
|
|
12
|
+
# "element alive" signal, and it serializes duplicate deliveries so a
|
|
13
|
+
# re-run can't double-decrement the counter (M3). A duplicate of a live
|
|
14
|
+
# element is dropped — the live original stores the result and finalizes.
|
|
15
|
+
lock = acquire_element_lock(arguments)
|
|
16
|
+
return if lock == :contended
|
|
17
|
+
|
|
18
|
+
begin
|
|
19
|
+
perform_element(arguments)
|
|
20
|
+
ensure
|
|
21
|
+
lock.release if lock.respond_to?(:release)
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def self.acquire_element_lock(arguments)
|
|
26
|
+
# In Sidekiq::Testing.inline! an element's async-retry perform_map_element_in
|
|
27
|
+
# re-enters synchronously inside this frame; the lock would self-contend.
|
|
28
|
+
# It only guards concurrent cross-process delivery, impossible inline.
|
|
29
|
+
return :inline if inline_testing_mode?
|
|
30
|
+
|
|
31
|
+
lock = RubyReactor::Lock.new(
|
|
32
|
+
"map_element:#{arguments[:map_id]}:#{arguments[:index]}",
|
|
33
|
+
owner: SecureRandom.uuid, ttl: RubyReactor.configuration.context_lock_ttl,
|
|
34
|
+
wait: 0, auto_extend: true
|
|
35
|
+
)
|
|
36
|
+
lock.acquire
|
|
37
|
+
lock
|
|
38
|
+
rescue RubyReactor::Lock::AcquisitionError
|
|
39
|
+
RubyReactor.configuration.logger.info(
|
|
40
|
+
"RubyReactor map element #{arguments[:map_id]}:#{arguments[:index]} already in flight; dropping duplicate"
|
|
41
|
+
)
|
|
42
|
+
:contended
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def self.inline_testing_mode?
|
|
46
|
+
defined?(Sidekiq::Testing) && Sidekiq::Testing.respond_to?(:inline?) && Sidekiq::Testing.inline?
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
def self.perform_element(arguments)
|
|
11
50
|
context = hydrate_or_create_context(arguments)
|
|
12
51
|
# The element already runs inside its own background worker, so any async
|
|
13
52
|
# steps (and async retries) must execute inline here rather than handing
|
|
@@ -108,10 +108,17 @@ module RubyReactor
|
|
|
108
108
|
executor.resume_execution
|
|
109
109
|
end
|
|
110
110
|
|
|
111
|
+
# Checkpoint the ROOT, not the sub (F9/C2). When the map is embedded in a
|
|
112
|
+
# composed sub-reactor, parent_context is the *sub*; storing only the sub
|
|
113
|
+
# would leave the root blob stale and a rehydrate-by-root-id resume would
|
|
114
|
+
# lose the map's completion. Resolve the root (which embeds the sub's
|
|
115
|
+
# post-map state via composed_contexts) and store that. For a top-level
|
|
116
|
+
# map parent_context IS the root, so this is unchanged.
|
|
117
|
+
root = parent_context.root_context || parent_context
|
|
111
118
|
storage.store_context(
|
|
112
|
-
|
|
113
|
-
ContextSerializer.serialize(
|
|
114
|
-
|
|
119
|
+
root.context_id,
|
|
120
|
+
ContextSerializer.serialize(root),
|
|
121
|
+
RubyReactor.reactor_storage_name(root.reactor_class)
|
|
115
122
|
)
|
|
116
123
|
end
|
|
117
124
|
end
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyReactor
|
|
4
|
+
module Map
|
|
5
|
+
# Recovers map fan-out from a hard kill (Phase 5d). Maps are the path most
|
|
6
|
+
# exposed to a lost job: one missing element result hangs the whole map and
|
|
7
|
+
# its parent forever. The unifying signal is the results hash — index-keyed
|
|
8
|
+
# and idempotent (HSET) — so completion is authoritative on `missing`, not on
|
|
9
|
+
# the fragile counter:
|
|
10
|
+
#
|
|
11
|
+
# missing = (0...count) - HKEYS(results)
|
|
12
|
+
#
|
|
13
|
+
# For each active map:
|
|
14
|
+
# * missing indices with NO live element lock are re-dispatched (M1/M4/M5).
|
|
15
|
+
# * if nothing is missing but the parent never resumed, the collector is
|
|
16
|
+
# re-triggered (M2) — gated so it never fires while a collector or the
|
|
17
|
+
# parent is alive, or after the parent already collected.
|
|
18
|
+
#
|
|
19
|
+
# `run_once` performs a single sweep pass and is idempotent; the host wires the cadence (same contract
|
|
20
|
+
# as RubyReactor::Sweeper).
|
|
21
|
+
class Sweeper
|
|
22
|
+
def self.run_once(limit: 1000)
|
|
23
|
+
new.run_once(limit: limit)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def initialize(storage: nil, async_router: nil, logger: nil)
|
|
27
|
+
@storage = storage || RubyReactor.configuration.storage_adapter
|
|
28
|
+
@async_router = async_router || RubyReactor.configuration.async_router
|
|
29
|
+
@logger = logger || RubyReactor.configuration.logger
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
# Returns { redispatched:, recollected: } counts.
|
|
33
|
+
def run_once(limit: 1000)
|
|
34
|
+
redispatched = 0
|
|
35
|
+
recollected = 0
|
|
36
|
+
|
|
37
|
+
@storage.scan_maps(count: limit).each do |meta|
|
|
38
|
+
missing = missing_indices(meta)
|
|
39
|
+
if missing.any?
|
|
40
|
+
redispatched += redispatch_missing(meta, missing)
|
|
41
|
+
elsif recollect?(meta)
|
|
42
|
+
retrigger_collector(meta)
|
|
43
|
+
recollected += 1
|
|
44
|
+
end
|
|
45
|
+
rescue StandardError => e
|
|
46
|
+
@logger.warn("RubyReactor::Map::Sweeper failed on map #{meta["map_id"]}: #{e.class}: #{e.message}")
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
{ redispatched: redispatched, recollected: recollected }
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
private
|
|
53
|
+
|
|
54
|
+
def missing_indices(meta)
|
|
55
|
+
@storage.missing_map_indices(meta["map_id"], meta["count"].to_i, meta["parent_reactor_class_name"])
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def redispatch_missing(meta, missing)
|
|
59
|
+
count = 0
|
|
60
|
+
missing.each do |index|
|
|
61
|
+
next if @storage.lock_held?("map_element:#{meta["map_id"]}:#{index}") # element alive
|
|
62
|
+
|
|
63
|
+
RubyReactor::Map::Dispatcher.requeue_index(meta, index)
|
|
64
|
+
count += 1
|
|
65
|
+
end
|
|
66
|
+
count
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
# All results are in. Re-trigger the collector only if no collector/parent is
|
|
70
|
+
# alive and the parent has not already collected this step.
|
|
71
|
+
def recollect?(meta)
|
|
72
|
+
return false if @storage.lock_held?("map_collect:#{meta["map_id"]}") # a collector is running
|
|
73
|
+
return false if parent_live_lock?(meta) # parent execution alive
|
|
74
|
+
return false if parent_already_collected?(meta)
|
|
75
|
+
|
|
76
|
+
true
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# N1: a nested map's parent is a map element running under a `map_element:`
|
|
80
|
+
# lock, not an `async:` lock. Derive the right key from metadata.
|
|
81
|
+
def parent_live_lock?(meta)
|
|
82
|
+
if meta["parent_is_map_element"]
|
|
83
|
+
@storage.lock_held?("map_element:#{meta["outer_map_id"]}:#{meta["outer_index"]}")
|
|
84
|
+
else
|
|
85
|
+
@storage.lock_held?("async:#{meta["parent_context_id"]}")
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
def parent_already_collected?(meta)
|
|
90
|
+
data = @storage.retrieve_context(meta["parent_context_id"], meta["parent_reactor_class_name"])
|
|
91
|
+
return false unless data
|
|
92
|
+
|
|
93
|
+
results = data["intermediate_results"] || {}
|
|
94
|
+
status = data["status"].to_s
|
|
95
|
+
results.key?(meta["step_name"].to_s) || %w[completed failed skipped].include?(status)
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def retrigger_collector(meta)
|
|
99
|
+
@async_router.perform_map_collection_async(
|
|
100
|
+
parent_context_id: meta["parent_context_id"],
|
|
101
|
+
map_id: meta["map_id"],
|
|
102
|
+
parent_reactor_class_name: meta["parent_reactor_class_name"],
|
|
103
|
+
step_name: meta["step_name"],
|
|
104
|
+
strict_ordering: meta["strict_ordering"],
|
|
105
|
+
timeout: 3600
|
|
106
|
+
)
|
|
107
|
+
end
|
|
108
|
+
end
|
|
109
|
+
end
|
|
110
|
+
end
|
data/lib/ruby_reactor/reactor.rb
CHANGED
|
@@ -111,10 +111,11 @@ module RubyReactor
|
|
|
111
111
|
# For async reactors, queue a job for the whole reactor
|
|
112
112
|
@context.status = :running
|
|
113
113
|
Executor.middlewares_for(self.class).on(:before_async_enqueue, @context)
|
|
114
|
+
# Persist BEFORE enqueue — the job payload is identity-only (F2).
|
|
114
115
|
save_context
|
|
115
116
|
|
|
116
|
-
|
|
117
|
-
|
|
117
|
+
@result = configuration.async_router.perform_async(@context.context_id,
|
|
118
|
+
RubyReactor.reactor_storage_name(self.class),
|
|
118
119
|
intermediate_results: @context.intermediate_results)
|
|
119
120
|
|
|
120
121
|
# Even if it's an AsyncResult, it might have finished inline (e.g. Sidekiq::Testing.inline!)
|
|
@@ -312,10 +313,11 @@ module RubyReactor
|
|
|
312
313
|
|
|
313
314
|
def perform_async_run
|
|
314
315
|
@context.status = :running
|
|
316
|
+
# Persist BEFORE enqueue — the job payload is identity-only (F2).
|
|
315
317
|
save_context
|
|
316
318
|
|
|
317
|
-
|
|
318
|
-
|
|
319
|
+
@result = configuration.async_router.perform_async(@context.context_id,
|
|
320
|
+
RubyReactor.reactor_storage_name(self.class),
|
|
319
321
|
intermediate_results: @context.intermediate_results)
|
|
320
322
|
|
|
321
323
|
check_for_inline_completion
|
|
@@ -424,7 +426,7 @@ module RubyReactor
|
|
|
424
426
|
|
|
425
427
|
def save_context
|
|
426
428
|
storage = configuration.storage_adapter
|
|
427
|
-
reactor_class_name =
|
|
429
|
+
reactor_class_name = RubyReactor.reactor_storage_name(self.class)
|
|
428
430
|
serialized_context = ContextSerializer.serialize(@context)
|
|
429
431
|
storage.store_context(@context.context_id, serialized_context, reactor_class_name)
|
|
430
432
|
end
|
|
@@ -2,18 +2,19 @@
|
|
|
2
2
|
|
|
3
3
|
module RubyReactor
|
|
4
4
|
class SidekiqAdapter
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
5
|
+
# Identity-only payload: the worker rehydrates the live context from storage
|
|
6
|
+
# by (context_id, reactor_class_name). The caller already holds context_id, so
|
|
7
|
+
# there is no blob to deserialize here.
|
|
8
|
+
def self.perform_async(context_id, reactor_class_name = nil, intermediate_results: {})
|
|
9
|
+
job_id = SidekiqWorkers::Worker.perform_async(context_id, reactor_class_name)
|
|
8
10
|
RubyReactor::AsyncResult.new(job_id: job_id, intermediate_results: intermediate_results,
|
|
9
|
-
execution_id:
|
|
11
|
+
execution_id: context_id)
|
|
10
12
|
end
|
|
11
13
|
|
|
12
|
-
def self.perform_in(delay,
|
|
13
|
-
job_id = SidekiqWorkers::Worker.perform_in(delay,
|
|
14
|
-
context = ContextSerializer.deserialize(serialized_context)
|
|
14
|
+
def self.perform_in(delay, context_id, reactor_class_name = nil, intermediate_results: {})
|
|
15
|
+
job_id = SidekiqWorkers::Worker.perform_in(delay, context_id, reactor_class_name)
|
|
15
16
|
RubyReactor::AsyncResult.new(job_id: job_id, intermediate_results: intermediate_results,
|
|
16
|
-
execution_id:
|
|
17
|
+
execution_id: context_id)
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
# rubocop:disable Metrics/ParameterLists
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "sidekiq"
|
|
4
|
+
require "securerandom"
|
|
5
|
+
|
|
6
|
+
module RubyReactor
|
|
7
|
+
module SidekiqWorkers
|
|
8
|
+
# Self-rescheduling recovery tick. Each run sweeps both the top-level reactor
|
|
9
|
+
# sweeper and the map sweeper, then schedules the next tick — a perpetual
|
|
10
|
+
# chain the host kicks once via `RubyReactor.start_sweeper!`.
|
|
11
|
+
#
|
|
12
|
+
# super_fetch safety. Sidekiq Pro `super_fetch` reliably re-runs a job
|
|
13
|
+
# whose worker died mid-execution. For a self-rescheduling chain that is a
|
|
14
|
+
# hazard: a tick can crash AFTER enqueuing its successor but BEFORE acking, so
|
|
15
|
+
# super_fetch recovers the crashed tick *alongside* the successor it already
|
|
16
|
+
# scheduled — the chain forks and then doubles every interval. We therefore do
|
|
17
|
+
# NOT rely on "exactly one job exists". The next tick is claimed by a
|
|
18
|
+
# per-time-window lock: every duplicate computes the SAME target window and
|
|
19
|
+
# only one wins the claim, so recovered/duplicated ticks collapse back to a
|
|
20
|
+
# single chain. The claim lock is never released — it simply expires — so no
|
|
21
|
+
# delete can race two duplicates into both winning.
|
|
22
|
+
class SweeperWorker
|
|
23
|
+
include ::Sidekiq::Worker
|
|
24
|
+
|
|
25
|
+
# retry: false — the sweep is idempotent and self-rescheduling, so a failed
|
|
26
|
+
# tick must not pile up Sidekiq retries; the next tick (or a super_fetch
|
|
27
|
+
# recovery) re-runs it anyway.
|
|
28
|
+
sidekiq_options retry: false, queue: RubyReactor.configuration.sidekiq_queue
|
|
29
|
+
|
|
30
|
+
def perform
|
|
31
|
+
config = RubyReactor.configuration
|
|
32
|
+
return unless config.sweeper_enabled
|
|
33
|
+
|
|
34
|
+
run_sweeps(config)
|
|
35
|
+
ensure
|
|
36
|
+
# Always chain forward (unless disabled), even after an error above, so a
|
|
37
|
+
# single bad sweep can't kill recovery. The window lock keeps this from
|
|
38
|
+
# forking under super_fetch.
|
|
39
|
+
self.class.schedule_next if RubyReactor.configuration.sweeper_enabled
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def run_sweeps(config)
|
|
43
|
+
RubyReactor::Sweeper.run_once(limit: config.sweeper_limit)
|
|
44
|
+
RubyReactor::Map::Sweeper.run_once(limit: config.sweeper_limit)
|
|
45
|
+
rescue StandardError => e
|
|
46
|
+
config.logger.error("RubyReactor::SweeperWorker sweep failed: #{e.class}: #{e.message}")
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# Enqueue the next tick for the upcoming time window, claiming that window
|
|
50
|
+
# so concurrent/duplicate/recovered ticks produce exactly one successor.
|
|
51
|
+
# Idempotent: also safe to call from `start_sweeper!` on every process boot.
|
|
52
|
+
def self.schedule_next
|
|
53
|
+
interval = RubyReactor.configuration.sweeper_interval
|
|
54
|
+
window = (Time.now.to_i / interval) + 1
|
|
55
|
+
|
|
56
|
+
lock = RubyReactor::Lock.new(
|
|
57
|
+
"sweeper:window:#{window}",
|
|
58
|
+
owner: SecureRandom.uuid,
|
|
59
|
+
ttl: interval * 2, # outlive the window; expires on its own (never released)
|
|
60
|
+
wait: 0,
|
|
61
|
+
auto_extend: false
|
|
62
|
+
)
|
|
63
|
+
lock.acquire # raises AcquisitionError if this window is already claimed
|
|
64
|
+
|
|
65
|
+
delay = (window * interval) - Time.now.to_i
|
|
66
|
+
perform_in([delay, 1].max)
|
|
67
|
+
rescue RubyReactor::Lock::AcquisitionError
|
|
68
|
+
# Another tick already scheduled this window — collapse the duplicate.
|
|
69
|
+
nil
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
@@ -17,15 +17,26 @@ module RubyReactor
|
|
|
17
17
|
# Handle infrastructure failures (network, Redis, etc.)
|
|
18
18
|
end
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
# Identity-only payload: storage is the source of truth. Rehydrate the live
|
|
21
|
+
# context from storage by id, then resume. A nil read means the context was
|
|
22
|
+
# swept, expired, or already terminal-and-collected — nothing to resume.
|
|
23
|
+
def perform(context_id, reactor_class_name = nil, snooze_count = 0)
|
|
24
|
+
# Normalize so a nil/omitted name resolves to the same storage key the
|
|
25
|
+
# enqueue path wrote (always via reactor_storage_name). Without this a
|
|
26
|
+
# nil here builds "reactor::context:<id>" and misses the stored
|
|
27
|
+
# "reactor:AnonymousReactor:context:<id>", silently no-op'ing.
|
|
28
|
+
reactor_class_name ||= RubyReactor.reactor_storage_name(nil)
|
|
29
|
+
data = RubyReactor.configuration.storage_adapter.retrieve_context(context_id, reactor_class_name)
|
|
30
|
+
return if data.nil?
|
|
31
|
+
|
|
21
32
|
begin
|
|
22
|
-
context = ContextSerializer.
|
|
33
|
+
context = ContextSerializer.deserialize_hash(data)
|
|
23
34
|
rescue RubyReactor::Error::DeserializationError,
|
|
24
35
|
RubyReactor::Error::SchemaVersionError => e
|
|
25
|
-
# Permanent failures —
|
|
26
|
-
# Mark the context as failed (best-effort) and return so
|
|
27
|
-
# does not burn its retry budget.
|
|
28
|
-
handle_deserialization_failure(
|
|
36
|
+
# Permanent failures — re-reading the same stored blob will keep
|
|
37
|
+
# failing. Mark the context as failed (best-effort) and return so
|
|
38
|
+
# Sidekiq does not burn its retry budget.
|
|
39
|
+
handle_deserialization_failure(context_id, reactor_class_name, e)
|
|
29
40
|
return
|
|
30
41
|
end
|
|
31
42
|
|
|
@@ -48,11 +59,12 @@ module RubyReactor
|
|
|
48
59
|
# Resume execution from the failed step
|
|
49
60
|
executor = Executor.new(context.reactor_class, {}, context)
|
|
50
61
|
executor.resume_execution
|
|
51
|
-
#
|
|
52
|
-
#
|
|
53
|
-
#
|
|
54
|
-
#
|
|
55
|
-
|
|
62
|
+
# No explicit save here: resume_execution's ensure block already persists
|
|
63
|
+
# the final root state (`save_context unless skip_context_persist?`), and
|
|
64
|
+
# in the worker the executor's context IS the root, so an extra checkpoint!
|
|
65
|
+
# would just re-write the identical blob to the identical key. The
|
|
66
|
+
# skip_context_persist? guard (stale-batch redelivery of an already-terminal
|
|
67
|
+
# context) is likewise honored there.
|
|
56
68
|
|
|
57
69
|
# Return the executor (which now has the result stored in it)
|
|
58
70
|
executor
|
|
@@ -66,7 +78,7 @@ module RubyReactor
|
|
|
66
78
|
# retry path so this doesn't burn the job's retry budget or appear
|
|
67
79
|
# as an error in dashboards. After the configured cap is reached we
|
|
68
80
|
# escalate by marking the reactor as failed.
|
|
69
|
-
handle_snooze(
|
|
81
|
+
handle_snooze(context_id, reactor_class_name, context, snooze_count, e)
|
|
70
82
|
rescue RubyReactor::RateLimitRegistry::UnknownLimitError => e
|
|
71
83
|
# Permanent configuration error — snoozing or retrying the same job
|
|
72
84
|
# will keep failing. Mark the context failed immediately.
|
|
@@ -76,7 +88,7 @@ module RubyReactor
|
|
|
76
88
|
|
|
77
89
|
private
|
|
78
90
|
|
|
79
|
-
def handle_snooze(
|
|
91
|
+
def handle_snooze(context_id, reactor_class_name, context, snooze_count, error)
|
|
80
92
|
config = RubyReactor.configuration
|
|
81
93
|
max = config.lock_snooze_max_attempts
|
|
82
94
|
|
|
@@ -86,7 +98,12 @@ module RubyReactor
|
|
|
86
98
|
# prematurely or strand the nonce in `assigned_at` until poison_pill
|
|
87
99
|
# eventually advances past it. Snooze until the gate passes (or poison
|
|
88
100
|
# auto-advance moves the cursor past us).
|
|
89
|
-
|
|
101
|
+
# The per-context liveness lock (`async:<id>`) is also uncapped: a
|
|
102
|
+
# duplicate of the *same* execution may wait arbitrarily long for the
|
|
103
|
+
# live original to finish (e.g. a sweeper re-enqueue racing a slow but
|
|
104
|
+
# alive worker). Capping it would fail a legitimately-waiting duplicate.
|
|
105
|
+
capped = !(error.is_a?(RubyReactor::OrderedLock::WaitError) ||
|
|
106
|
+
error.is_a?(RubyReactor::Lock::ContextLockContention))
|
|
90
107
|
|
|
91
108
|
if capped && max != :infinity && snooze_count >= max
|
|
92
109
|
escalate_snooze(context, snooze_count, error)
|
|
@@ -94,7 +111,9 @@ module RubyReactor
|
|
|
94
111
|
end
|
|
95
112
|
|
|
96
113
|
delay = compute_snooze_delay(config, error)
|
|
97
|
-
|
|
114
|
+
# Re-enqueue by id: the context is already persisted in storage, so the
|
|
115
|
+
# rescheduled job rehydrates fresh state (no stale blob).
|
|
116
|
+
self.class.perform_in(delay, context_id, reactor_class_name, snooze_count + 1)
|
|
98
117
|
end
|
|
99
118
|
|
|
100
119
|
# Use the error's `retry_after_seconds` hint when available
|
|
@@ -141,7 +160,7 @@ module RubyReactor
|
|
|
141
160
|
}
|
|
142
161
|
|
|
143
162
|
serialized = ContextSerializer.serialize(context)
|
|
144
|
-
reactor_class_name = context.reactor_class
|
|
163
|
+
reactor_class_name = RubyReactor.reactor_storage_name(context.reactor_class)
|
|
145
164
|
RubyReactor.configuration.storage_adapter.store_context(
|
|
146
165
|
context.context_id,
|
|
147
166
|
serialized,
|
|
@@ -162,23 +181,22 @@ module RubyReactor
|
|
|
162
181
|
RubyReactor.configuration.logger.error("Job details: #{msg.inspect}")
|
|
163
182
|
end
|
|
164
183
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
184
|
+
# The id-only payload already carries context_id and reactor_class_name, so
|
|
185
|
+
# there is no blob to parse for metadata — just mark the stored context
|
|
186
|
+
# failed (best-effort) so the job stops retrying a permanently-broken blob.
|
|
187
|
+
def handle_deserialization_failure(context_id, reactor_class_name, error)
|
|
170
188
|
RubyReactor.configuration.logger.error(
|
|
171
189
|
"RubyReactor deserialization failure for context " \
|
|
172
190
|
"#{context_id || "unknown"}: #{error.class.name}: #{error.message}"
|
|
173
191
|
)
|
|
174
192
|
|
|
175
|
-
return unless context_id &&
|
|
193
|
+
return unless context_id && reactor_class_name
|
|
176
194
|
|
|
177
|
-
payload = build_failed_context_payload(context_id,
|
|
195
|
+
payload = build_failed_context_payload(context_id, reactor_class_name, error)
|
|
178
196
|
RubyReactor.configuration.storage_adapter.store_context(
|
|
179
197
|
context_id,
|
|
180
198
|
payload,
|
|
181
|
-
|
|
199
|
+
reactor_class_name
|
|
182
200
|
)
|
|
183
201
|
rescue StandardError => e
|
|
184
202
|
# Don't let a persistence failure mask the original deserialization error.
|
|
@@ -187,16 +205,6 @@ module RubyReactor
|
|
|
187
205
|
)
|
|
188
206
|
end
|
|
189
207
|
|
|
190
|
-
def extract_failure_metadata(serialized_context)
|
|
191
|
-
data = JSON.parse(serialized_context)
|
|
192
|
-
{
|
|
193
|
-
context_id: data["context_id"],
|
|
194
|
-
reactor_class_name: data["reactor_class"]
|
|
195
|
-
}
|
|
196
|
-
rescue StandardError
|
|
197
|
-
{}
|
|
198
|
-
end
|
|
199
|
-
|
|
200
208
|
def build_failed_context_payload(context_id, reactor_class_name, error)
|
|
201
209
|
JSON.generate(
|
|
202
210
|
"schema_version" => ContextSerializer::SCHEMA_VERSION,
|
|
@@ -179,10 +179,25 @@ module RubyReactor
|
|
|
179
179
|
storage = RubyReactor.configuration.storage_adapter
|
|
180
180
|
storage.initialize_map_operation(
|
|
181
181
|
map_id, arguments[:source].size, context.reactor_class.name,
|
|
182
|
-
strict_ordering: arguments[:strict_ordering], reactor_class_info: reactor_class_info
|
|
182
|
+
strict_ordering: arguments[:strict_ordering], reactor_class_info: reactor_class_info,
|
|
183
|
+
**map_recovery_metadata(context, arguments[:step_name] || context.current_step)
|
|
183
184
|
)
|
|
184
185
|
end
|
|
185
186
|
|
|
187
|
+
# Recovery metadata for the map sweeper. When this map runs inside a map
|
|
188
|
+
# element (context.map_metadata present), it is a NESTED map: its parent
|
|
189
|
+
# holds the element's `map_element:` lock, not an `async:` lock (N1).
|
|
190
|
+
def map_recovery_metadata(context, step_name)
|
|
191
|
+
outer = context.map_metadata
|
|
192
|
+
{
|
|
193
|
+
parent_context_id: context.context_id,
|
|
194
|
+
step_name: step_name.to_s,
|
|
195
|
+
parent_is_map_element: !outer.nil?,
|
|
196
|
+
outer_map_id: outer && (outer[:map_id] || outer["map_id"]),
|
|
197
|
+
outer_index: outer && (outer[:index] || outer["index"])
|
|
198
|
+
}
|
|
199
|
+
end
|
|
200
|
+
|
|
186
201
|
def dispatch_async_map(map_id, arguments, context, _reactor_class_info, step_name)
|
|
187
202
|
# Every async map runs through the per-element Dispatcher path. When no
|
|
188
203
|
# batch_size is given we default to the full source size (one fan-out
|
|
@@ -231,7 +246,8 @@ module RubyReactor
|
|
|
231
246
|
storage = RubyReactor.configuration.storage_adapter
|
|
232
247
|
storage.initialize_map_operation(
|
|
233
248
|
map_id, arguments[:source].size, context.reactor_class.name,
|
|
234
|
-
strict_ordering: arguments[:strict_ordering], reactor_class_info: reactor_class_info
|
|
249
|
+
strict_ordering: arguments[:strict_ordering], reactor_class_info: reactor_class_info,
|
|
250
|
+
**map_recovery_metadata(context, step_name)
|
|
235
251
|
)
|
|
236
252
|
|
|
237
253
|
limit ||= arguments[:source].size
|