ruby_reactor 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,42 @@ module RubyReactor
8
8
  def self.perform(arguments)
9
9
  arguments = arguments.transform_keys(&:to_sym)
10
10
  map_id = arguments[:map_id]
11
+
12
+ # Serialize concurrent collector deliveries for the SAME map (eager queue +
13
+ # counter-zero trigger + sweeper re-trigger could otherwise all resume the
14
+ # parent at once and all write its context). A dedicated map_collect lock
15
+ # is used rather than the parent's own lock so it never conflicts with the
16
+ # context lock the parent's resume_execution acquires for itself.
17
+ lock = acquire_collect_lock(map_id)
18
+ return if lock == :contended
19
+
20
+ begin
21
+ perform_collection(arguments)
22
+ ensure
23
+ lock.release if lock.respond_to?(:release)
24
+ end
25
+ end
26
+
27
+ def self.acquire_collect_lock(map_id)
28
+ return :inline if inline_testing_mode?
29
+
30
+ lock = RubyReactor::Lock.new(
31
+ "map_collect:#{map_id}",
32
+ owner: SecureRandom.uuid, ttl: RubyReactor.configuration.context_lock_ttl,
33
+ wait: 0, auto_extend: true
34
+ )
35
+ lock.acquire
36
+ lock
37
+ rescue RubyReactor::Lock::AcquisitionError
38
+ :contended
39
+ end
40
+
41
+ def self.inline_testing_mode?
42
+ defined?(Sidekiq::Testing) && Sidekiq::Testing.respond_to?(:inline?) && Sidekiq::Testing.inline?
43
+ end
44
+
45
+ def self.perform_collection(arguments)
46
+ map_id = arguments[:map_id]
11
47
  parent_context_id = arguments[:parent_context_id]
12
48
  parent_reactor_class_name = arguments[:parent_reactor_class_name]
13
49
  step_name = arguments[:step_name]
@@ -18,6 +54,11 @@ module RubyReactor
18
54
  parent_context_data = storage.retrieve_context(parent_context_id, parent_reactor_class_name)
19
55
  parent_context = RubyReactor::Context.deserialize_from_retry(parent_context_data)
20
56
 
57
+ # Idempotency: if the parent already recorded this map step's result, a
58
+ # prior collector already resumed it. Re-resuming would double-execute the
59
+ # steps after the map. Skip.
60
+ return if parent_context.intermediate_results.key?(step_name.to_sym)
61
+
21
62
  # Check if all tasks are completed
22
63
  metadata = storage.retrieve_map_metadata(map_id, parent_reactor_class_name)
23
64
  total_count = metadata ? metadata["count"].to_i : 0
@@ -104,6 +104,48 @@ module RubyReactor
104
104
  end
105
105
  end
106
106
 
107
+ # Re-dispatch a SPECIFIC index whose result slot is missing (Phase 5c, used
108
+ # by the map sweeper). Index-driven rather than offset-driven: resolve the
109
+ # source from the stored parent context and pick source[index]. Idempotent
110
+ # because store_map_result HSETs by index — a re-run overwrites slot `index`,
111
+ # never duplicates.
112
+ def self.requeue_index(map_meta, index)
113
+ storage = RubyReactor.configuration.storage_adapter
114
+ parent_class_name = map_meta["parent_reactor_class_name"]
115
+ parent_context = load_parent_context_from_storage(map_meta["parent_context_id"], parent_class_name, storage)
116
+
117
+ arguments = {
118
+ map_id: map_meta["map_id"],
119
+ step_name: map_meta["step_name"],
120
+ strict_ordering: map_meta["strict_ordering"],
121
+ parent_context_id: map_meta["parent_context_id"],
122
+ parent_reactor_class_name: parent_class_name,
123
+ fail_fast: map_meta["fail_fast"],
124
+ batch_size: map_meta["batch_size"]
125
+ }
126
+
127
+ source = resolve_source(arguments, parent_context)
128
+ element = element_at(source, index)
129
+
130
+ queue_element_job(element, index, {
131
+ map_id: map_meta["map_id"],
132
+ arguments: arguments,
133
+ context: parent_context,
134
+ reactor_class_info: map_meta["reactor_class_info"],
135
+ step_name: map_meta["step_name"]
136
+ })
137
+ end
138
+
139
+ def self.element_at(source, index)
140
+ if source.is_a?(Array)
141
+ source[index]
142
+ elsif source.respond_to?(:offset) && source.respond_to?(:limit)
143
+ source.offset(index).limit(1).to_a.first
144
+ else
145
+ source.drop(index).first
146
+ end
147
+ end
148
+
107
149
  def self.queue_element_job(element, index, options)
108
150
  arguments = options[:arguments]
109
151
  context = options[:context]
@@ -8,6 +8,45 @@ module RubyReactor
8
8
  def self.perform(arguments)
9
9
  arguments = arguments.transform_keys(&:to_sym)
10
10
 
11
+ # Per-element liveness lock (Phase 5b): its presence is the map sweeper's
12
+ # "element alive" signal, and it serializes duplicate deliveries so a
13
+ # re-run can't double-decrement the counter (M3). A duplicate of a live
14
+ # element is dropped — the live original stores the result and finalizes.
15
+ lock = acquire_element_lock(arguments)
16
+ return if lock == :contended
17
+
18
+ begin
19
+ perform_element(arguments)
20
+ ensure
21
+ lock.release if lock.respond_to?(:release)
22
+ end
23
+ end
24
+
25
+ def self.acquire_element_lock(arguments)
26
+ # In Sidekiq::Testing.inline! an element's async-retry perform_map_element_in
27
+ # re-enters synchronously inside this frame; the lock would self-contend.
28
+ # It only guards concurrent cross-process delivery, impossible inline.
29
+ return :inline if inline_testing_mode?
30
+
31
+ lock = RubyReactor::Lock.new(
32
+ "map_element:#{arguments[:map_id]}:#{arguments[:index]}",
33
+ owner: SecureRandom.uuid, ttl: RubyReactor.configuration.context_lock_ttl,
34
+ wait: 0, auto_extend: true
35
+ )
36
+ lock.acquire
37
+ lock
38
+ rescue RubyReactor::Lock::AcquisitionError
39
+ RubyReactor.configuration.logger.info(
40
+ "RubyReactor map element #{arguments[:map_id]}:#{arguments[:index]} already in flight; dropping duplicate"
41
+ )
42
+ :contended
43
+ end
44
+
45
+ def self.inline_testing_mode?
46
+ defined?(Sidekiq::Testing) && Sidekiq::Testing.respond_to?(:inline?) && Sidekiq::Testing.inline?
47
+ end
48
+
49
+ def self.perform_element(arguments)
11
50
  context = hydrate_or_create_context(arguments)
12
51
  # The element already runs inside its own background worker, so any async
13
52
  # steps (and async retries) must execute inline here rather than handing
@@ -108,10 +108,17 @@ module RubyReactor
108
108
  executor.resume_execution
109
109
  end
110
110
 
111
+ # Checkpoint the ROOT, not the sub (F9/C2). When the map is embedded in a
112
+ # composed sub-reactor, parent_context is the *sub*; storing only the sub
113
+ # would leave the root blob stale and a rehydrate-by-root-id resume would
114
+ # lose the map's completion. Resolve the root (which embeds the sub's
115
+ # post-map state via composed_contexts) and store that. For a top-level
116
+ # map parent_context IS the root, so this is unchanged.
117
+ root = parent_context.root_context || parent_context
111
118
  storage.store_context(
112
- parent_context.context_id,
113
- ContextSerializer.serialize(parent_context),
114
- parent_context.reactor_class.name
119
+ root.context_id,
120
+ ContextSerializer.serialize(root),
121
+ RubyReactor.reactor_storage_name(root.reactor_class)
115
122
  )
116
123
  end
117
124
  end
@@ -0,0 +1,110 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RubyReactor
4
+ module Map
5
+ # Recovers map fan-out from a hard kill (Phase 5d). Maps are the path most
6
+ # exposed to a lost job: one missing element result hangs the whole map and
7
+ # its parent forever. The unifying signal is the results hash — index-keyed
8
+ # and idempotent (HSET) — so completion is authoritative on `missing`, not on
9
+ # the fragile counter:
10
+ #
11
+ # missing = (0...count) - HKEYS(results)
12
+ #
13
+ # For each active map:
14
+ # * missing indices with NO live element lock are re-dispatched (M1/M4/M5).
15
+ # * if nothing is missing but the parent never resumed, the collector is
16
+ # re-triggered (M2) — gated so it never fires while a collector or the
17
+ # parent is alive, or after the parent already collected.
18
+ #
19
+ # `run_once` is a single idempotent pass; the host wires the cadence (same contract
20
+ # as RubyReactor::Sweeper).
21
+ class Sweeper
22
+ def self.run_once(limit: 1000)
23
+ new.run_once(limit: limit)
24
+ end
25
+
26
+ def initialize(storage: nil, async_router: nil, logger: nil)
27
+ @storage = storage || RubyReactor.configuration.storage_adapter
28
+ @async_router = async_router || RubyReactor.configuration.async_router
29
+ @logger = logger || RubyReactor.configuration.logger
30
+ end
31
+
32
+ # Returns { redispatched:, recollected: } counts.
33
+ def run_once(limit: 1000)
34
+ redispatched = 0
35
+ recollected = 0
36
+
37
+ @storage.scan_maps(count: limit).each do |meta|
38
+ missing = missing_indices(meta)
39
+ if missing.any?
40
+ redispatched += redispatch_missing(meta, missing)
41
+ elsif recollect?(meta)
42
+ retrigger_collector(meta)
43
+ recollected += 1
44
+ end
45
+ rescue StandardError => e
46
+ @logger.warn("RubyReactor::Map::Sweeper failed on map #{meta["map_id"]}: #{e.class}: #{e.message}")
47
+ end
48
+
49
+ { redispatched: redispatched, recollected: recollected }
50
+ end
51
+
52
+ private
53
+
54
+ def missing_indices(meta)
55
+ @storage.missing_map_indices(meta["map_id"], meta["count"].to_i, meta["parent_reactor_class_name"])
56
+ end
57
+
58
+ def redispatch_missing(meta, missing)
59
+ count = 0
60
+ missing.each do |index|
61
+ next if @storage.lock_held?("map_element:#{meta["map_id"]}:#{index}") # element alive
62
+
63
+ RubyReactor::Map::Dispatcher.requeue_index(meta, index)
64
+ count += 1
65
+ end
66
+ count
67
+ end
68
+
69
+ # All results are in. Re-trigger the collector only if no collector/parent is
70
+ # alive and the parent has not already collected this step.
71
+ def recollect?(meta)
72
+ return false if @storage.lock_held?("map_collect:#{meta["map_id"]}") # a collector is running
73
+ return false if parent_live_lock?(meta) # parent execution alive
74
+ return false if parent_already_collected?(meta)
75
+
76
+ true
77
+ end
78
+
79
+ # N1: a nested map's parent is a map element running under a `map_element:`
80
+ # lock, not an `async:` lock. Derive the right key from metadata.
81
+ def parent_live_lock?(meta)
82
+ if meta["parent_is_map_element"]
83
+ @storage.lock_held?("map_element:#{meta["outer_map_id"]}:#{meta["outer_index"]}")
84
+ else
85
+ @storage.lock_held?("async:#{meta["parent_context_id"]}")
86
+ end
87
+ end
88
+
89
+ def parent_already_collected?(meta)
90
+ data = @storage.retrieve_context(meta["parent_context_id"], meta["parent_reactor_class_name"])
91
+ return false unless data
92
+
93
+ results = data["intermediate_results"] || {}
94
+ status = data["status"].to_s
95
+ results.key?(meta["step_name"].to_s) || %w[completed failed skipped].include?(status)
96
+ end
97
+
98
+ def retrigger_collector(meta)
99
+ @async_router.perform_map_collection_async(
100
+ parent_context_id: meta["parent_context_id"],
101
+ map_id: meta["map_id"],
102
+ parent_reactor_class_name: meta["parent_reactor_class_name"],
103
+ step_name: meta["step_name"],
104
+ strict_ordering: meta["strict_ordering"],
105
+ timeout: 3600
106
+ )
107
+ end
108
+ end
109
+ end
110
+ end
@@ -111,10 +111,11 @@ module RubyReactor
111
111
  # For async reactors, queue a job for the whole reactor
112
112
  @context.status = :running
113
113
  Executor.middlewares_for(self.class).on(:before_async_enqueue, @context)
114
+ # Persist BEFORE enqueue — the job payload is identity-only (F2).
114
115
  save_context
115
116
 
116
- serialized_context = ContextSerializer.serialize(@context)
117
- @result = configuration.async_router.perform_async(serialized_context, self.class.name,
117
+ @result = configuration.async_router.perform_async(@context.context_id,
118
+ RubyReactor.reactor_storage_name(self.class),
118
119
  intermediate_results: @context.intermediate_results)
119
120
 
120
121
  # Even if it's an AsyncResult, it might have finished inline (e.g. Sidekiq::Testing.inline!)
@@ -312,10 +313,11 @@ module RubyReactor
312
313
 
313
314
  def perform_async_run
314
315
  @context.status = :running
316
+ # Persist BEFORE enqueue — the job payload is identity-only (F2).
315
317
  save_context
316
318
 
317
- serialized_context = ContextSerializer.serialize(@context)
318
- @result = configuration.async_router.perform_async(serialized_context, self.class.name,
319
+ @result = configuration.async_router.perform_async(@context.context_id,
320
+ RubyReactor.reactor_storage_name(self.class),
319
321
  intermediate_results: @context.intermediate_results)
320
322
 
321
323
  check_for_inline_completion
@@ -424,7 +426,7 @@ module RubyReactor
424
426
 
425
427
  def save_context
426
428
  storage = configuration.storage_adapter
427
- reactor_class_name = self.class.name || "AnonymousReactor-#{self.class.object_id}"
429
+ reactor_class_name = RubyReactor.reactor_storage_name(self.class)
428
430
  serialized_context = ContextSerializer.serialize(@context)
429
431
  storage.store_context(@context.context_id, serialized_context, reactor_class_name)
430
432
  end
@@ -2,18 +2,19 @@
2
2
 
3
3
  module RubyReactor
4
4
  class SidekiqAdapter
5
- def self.perform_async(serialized_context, reactor_class_name = nil, intermediate_results: {})
6
- job_id = SidekiqWorkers::Worker.perform_async(serialized_context, reactor_class_name)
7
- context = ContextSerializer.deserialize(serialized_context)
5
+ # Identity-only payload: the worker rehydrates the live context from storage
6
+ # by (context_id, reactor_class_name). The caller already holds context_id, so
7
+ # there is no blob to deserialize here.
8
+ def self.perform_async(context_id, reactor_class_name = nil, intermediate_results: {})
9
+ job_id = SidekiqWorkers::Worker.perform_async(context_id, reactor_class_name)
8
10
  RubyReactor::AsyncResult.new(job_id: job_id, intermediate_results: intermediate_results,
9
- execution_id: context.context_id)
11
+ execution_id: context_id)
10
12
  end
11
13
 
12
- def self.perform_in(delay, serialized_context, reactor_class_name = nil, intermediate_results: {})
13
- job_id = SidekiqWorkers::Worker.perform_in(delay, serialized_context, reactor_class_name)
14
- context = ContextSerializer.deserialize(serialized_context)
14
+ def self.perform_in(delay, context_id, reactor_class_name = nil, intermediate_results: {})
15
+ job_id = SidekiqWorkers::Worker.perform_in(delay, context_id, reactor_class_name)
15
16
  RubyReactor::AsyncResult.new(job_id: job_id, intermediate_results: intermediate_results,
16
- execution_id: context.context_id)
17
+ execution_id: context_id)
17
18
  end
18
19
 
19
20
  # rubocop:disable Metrics/ParameterLists
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sidekiq"
4
+ require "securerandom"
5
+
6
+ module RubyReactor
7
+ module SidekiqWorkers
8
+ # Self-rescheduling recovery tick. Each run sweeps both the top-level reactor
9
+ # sweeper and the map sweeper, then schedules the next tick — a perpetual
10
+ # chain the host kicks once via `RubyReactor.start_sweeper!`.
11
+ #
12
+ # super_fetch safety. Sidekiq Enterprise `super_fetch` reliably re-runs a job
13
+ # whose worker died mid-execution. For a self-rescheduling chain that is a
14
+ # hazard: a tick can crash AFTER enqueuing its successor but BEFORE acking, so
15
+ # super_fetch recovers the crashed tick *alongside* the successor it already
16
+ # scheduled — the chain forks and then doubles every interval. We therefore do
17
+ # NOT rely on "exactly one job exists". The next tick is claimed by a
18
+ # per-time-window lock: every duplicate computes the SAME target window and
19
+ # only one wins the claim, so recovered/duplicated ticks collapse back to a
20
+ # single chain. The claim lock is never released — it simply expires — so no
21
+ # delete can race two duplicates into both winning.
22
+ class SweeperWorker
23
+ include ::Sidekiq::Worker
24
+
25
+ # retry: false — the sweep is idempotent and self-rescheduling, so a failed
26
+ # tick must not pile up Sidekiq retries; the next tick (or a super_fetch
27
+ # recovery) re-runs it anyway.
28
+ sidekiq_options retry: false, queue: RubyReactor.configuration.sidekiq_queue
29
+
30
+ def perform
31
+ config = RubyReactor.configuration
32
+ return unless config.sweeper_enabled
33
+
34
+ run_sweeps(config)
35
+ ensure
36
+ # Always chain forward (unless disabled), even after an error above, so a
37
+ # single bad sweep can't kill recovery. The window lock keeps this from
38
+ # forking under super_fetch.
39
+ self.class.schedule_next if RubyReactor.configuration.sweeper_enabled
40
+ end
41
+
42
+ def run_sweeps(config)
43
+ RubyReactor::Sweeper.run_once(limit: config.sweeper_limit)
44
+ RubyReactor::Map::Sweeper.run_once(limit: config.sweeper_limit)
45
+ rescue StandardError => e
46
+ config.logger.error("RubyReactor::SweeperWorker sweep failed: #{e.class}: #{e.message}")
47
+ end
48
+
49
+ # Enqueue the next tick for the upcoming time window, claiming that window
50
+ # so concurrent/duplicate/recovered ticks produce exactly one successor.
51
+ # Idempotent: also safe to call from `start_sweeper!` on every process boot.
52
+ def self.schedule_next
53
+ interval = RubyReactor.configuration.sweeper_interval
54
+ window = (Time.now.to_i / interval) + 1
55
+
56
+ lock = RubyReactor::Lock.new(
57
+ "sweeper:window:#{window}",
58
+ owner: SecureRandom.uuid,
59
+ ttl: interval * 2, # outlive the window; expires on its own (never released)
60
+ wait: 0,
61
+ auto_extend: false
62
+ )
63
+ lock.acquire # raises AcquisitionError if this window is already claimed
64
+
65
+ delay = (window * interval) - Time.now.to_i
66
+ perform_in([delay, 1].max)
67
+ rescue RubyReactor::Lock::AcquisitionError
68
+ # Another tick already scheduled this window — collapse the duplicate.
69
+ nil
70
+ end
71
+ end
72
+ end
73
+ end
@@ -17,15 +17,26 @@ module RubyReactor
17
17
  # Handle infrastructure failures (network, Redis, etc.)
18
18
  end
19
19
 
20
- def perform(serialized_context, reactor_class_name = nil, snooze_count = 0)
20
+ # Identity-only payload: storage is the source of truth. Rehydrate the live
21
+ # context from storage by id, then resume. A nil read means the context was
22
+ # swept, expired, or already terminal-and-collected — nothing to resume.
23
+ def perform(context_id, reactor_class_name = nil, snooze_count = 0)
24
+ # Normalize so a nil/omitted name resolves to the same storage key the
25
+ # enqueue path wrote (always via reactor_storage_name). Without this a
26
+ # nil here builds "reactor::context:<id>" and misses the stored
27
+ # "reactor:AnonymousReactor:context:<id>", silently no-op'ing.
28
+ reactor_class_name ||= RubyReactor.reactor_storage_name(nil)
29
+ data = RubyReactor.configuration.storage_adapter.retrieve_context(context_id, reactor_class_name)
30
+ return if data.nil?
31
+
21
32
  begin
22
- context = ContextSerializer.deserialize(serialized_context)
33
+ context = ContextSerializer.deserialize_hash(data)
23
34
  rescue RubyReactor::Error::DeserializationError,
24
35
  RubyReactor::Error::SchemaVersionError => e
25
- # Permanent failures — retrying the same blob will keep failing.
26
- # Mark the context as failed (best-effort) and return so Sidekiq
27
- # does not burn its retry budget.
28
- handle_deserialization_failure(serialized_context, reactor_class_name, e)
36
+ # Permanent failures — re-reading the same stored blob will keep
37
+ # failing. Mark the context as failed (best-effort) and return so
38
+ # Sidekiq does not burn its retry budget.
39
+ handle_deserialization_failure(context_id, reactor_class_name, e)
29
40
  return
30
41
  end
31
42
 
@@ -48,11 +59,12 @@ module RubyReactor
48
59
  # Resume execution from the failed step
49
60
  executor = Executor.new(context.reactor_class, {}, context)
50
61
  executor.resume_execution
51
- # Skip the post-run save when the executor deliberately suppressed
52
- # persistence (stale-batch redelivery of an already-terminal context)
53
- # re-saving here would clobber the stored terminal record with this
54
- # run's stale in-memory status.
55
- executor.save_context unless executor.skip_context_persist?
62
+ # No explicit save here: resume_execution's ensure block already persists
63
+ # the final root state (`save_context unless skip_context_persist?`), and
64
+ # in the worker the executor's context IS the root, so an extra checkpoint!
65
+ # would just re-write the identical blob to the identical key. The
66
+ # skip_context_persist? guard (stale-batch redelivery of an already-terminal
67
+ # context) is likewise honored there.
56
68
 
57
69
  # Return the executor (which now has the result stored in it)
58
70
  executor
@@ -66,7 +78,7 @@ module RubyReactor
66
78
  # retry path so this doesn't burn the job's retry budget or appear
67
79
  # as an error in dashboards. After the configured cap is reached we
68
80
  # escalate by marking the reactor as failed.
69
- handle_snooze(serialized_context, reactor_class_name, context, snooze_count, e)
81
+ handle_snooze(context_id, reactor_class_name, context, snooze_count, e)
70
82
  rescue RubyReactor::RateLimitRegistry::UnknownLimitError => e
71
83
  # Permanent configuration error — snoozing or retrying the same job
72
84
  # will keep failing. Mark the context failed immediately.
@@ -76,7 +88,7 @@ module RubyReactor
76
88
 
77
89
  private
78
90
 
79
- def handle_snooze(serialized_context, reactor_class_name, context, snooze_count, error)
91
+ def handle_snooze(context_id, reactor_class_name, context, snooze_count, error)
80
92
  config = RubyReactor.configuration
81
93
  max = config.lock_snooze_max_attempts
82
94
 
@@ -86,7 +98,12 @@ module RubyReactor
86
98
  # prematurely or strand the nonce in `assigned_at` until poison_pill
87
99
  # eventually advances past it. Snooze until the gate passes (or poison
88
100
  # auto-advance moves the cursor past us).
89
- capped = !error.is_a?(RubyReactor::OrderedLock::WaitError)
101
+ # The per-context liveness lock (`async:<id>`) is also uncapped: a
102
+ # duplicate of the *same* execution may wait arbitrarily long for the
103
+ # live original to finish (e.g. a sweeper re-enqueue racing a slow but
104
+ # alive worker). Capping it would fail a legitimately-waiting duplicate.
105
+ capped = !(error.is_a?(RubyReactor::OrderedLock::WaitError) ||
106
+ error.is_a?(RubyReactor::Lock::ContextLockContention))
90
107
 
91
108
  if capped && max != :infinity && snooze_count >= max
92
109
  escalate_snooze(context, snooze_count, error)
@@ -94,7 +111,9 @@ module RubyReactor
94
111
  end
95
112
 
96
113
  delay = compute_snooze_delay(config, error)
97
- self.class.perform_in(delay, serialized_context, reactor_class_name, snooze_count + 1)
114
+ # Re-enqueue by id: the context is already persisted in storage, so the
115
+ # rescheduled job rehydrates fresh state (no stale blob).
116
+ self.class.perform_in(delay, context_id, reactor_class_name, snooze_count + 1)
98
117
  end
99
118
 
100
119
  # Use the error's `retry_after_seconds` hint when available
@@ -141,7 +160,7 @@ module RubyReactor
141
160
  }
142
161
 
143
162
  serialized = ContextSerializer.serialize(context)
144
- reactor_class_name = context.reactor_class&.name || "AnonymousReactor"
163
+ reactor_class_name = RubyReactor.reactor_storage_name(context.reactor_class)
145
164
  RubyReactor.configuration.storage_adapter.store_context(
146
165
  context.context_id,
147
166
  serialized,
@@ -162,23 +181,22 @@ module RubyReactor
162
181
  RubyReactor.configuration.logger.error("Job details: #{msg.inspect}")
163
182
  end
164
183
 
165
- def handle_deserialization_failure(serialized_context, reactor_class_name, error)
166
- metadata = extract_failure_metadata(serialized_context)
167
- context_id = metadata[:context_id]
168
- resolved_reactor_class_name = reactor_class_name || metadata[:reactor_class_name]
169
-
184
+ # The id-only payload already carries context_id and reactor_class_name, so
185
+ # there is no blob to parse for metadata — just mark the stored context
186
+ # failed (best-effort) so the job stops retrying a permanently-broken blob.
187
+ def handle_deserialization_failure(context_id, reactor_class_name, error)
170
188
  RubyReactor.configuration.logger.error(
171
189
  "RubyReactor deserialization failure for context " \
172
190
  "#{context_id || "unknown"}: #{error.class.name}: #{error.message}"
173
191
  )
174
192
 
175
- return unless context_id && resolved_reactor_class_name
193
+ return unless context_id && reactor_class_name
176
194
 
177
- payload = build_failed_context_payload(context_id, resolved_reactor_class_name, error)
195
+ payload = build_failed_context_payload(context_id, reactor_class_name, error)
178
196
  RubyReactor.configuration.storage_adapter.store_context(
179
197
  context_id,
180
198
  payload,
181
- resolved_reactor_class_name
199
+ reactor_class_name
182
200
  )
183
201
  rescue StandardError => e
184
202
  # Don't let a persistence failure mask the original deserialization error.
@@ -187,16 +205,6 @@ module RubyReactor
187
205
  )
188
206
  end
189
207
 
190
- def extract_failure_metadata(serialized_context)
191
- data = JSON.parse(serialized_context)
192
- {
193
- context_id: data["context_id"],
194
- reactor_class_name: data["reactor_class"]
195
- }
196
- rescue StandardError
197
- {}
198
- end
199
-
200
208
  def build_failed_context_payload(context_id, reactor_class_name, error)
201
209
  JSON.generate(
202
210
  "schema_version" => ContextSerializer::SCHEMA_VERSION,
@@ -179,10 +179,25 @@ module RubyReactor
179
179
  storage = RubyReactor.configuration.storage_adapter
180
180
  storage.initialize_map_operation(
181
181
  map_id, arguments[:source].size, context.reactor_class.name,
182
- strict_ordering: arguments[:strict_ordering], reactor_class_info: reactor_class_info
182
+ strict_ordering: arguments[:strict_ordering], reactor_class_info: reactor_class_info,
183
+ **map_recovery_metadata(context, arguments[:step_name] || context.current_step)
183
184
  )
184
185
  end
185
186
 
187
+ # Recovery metadata for the map sweeper. When this map runs inside a map
188
+ # element (context.map_metadata present), it is a NESTED map: its parent
189
+ # holds the element's `map_element:` lock, not an `async:` lock (N1).
190
+ def map_recovery_metadata(context, step_name)
191
+ outer = context.map_metadata
192
+ {
193
+ parent_context_id: context.context_id,
194
+ step_name: step_name.to_s,
195
+ parent_is_map_element: !outer.nil?,
196
+ outer_map_id: outer && (outer[:map_id] || outer["map_id"]),
197
+ outer_index: outer && (outer[:index] || outer["index"])
198
+ }
199
+ end
200
+
186
201
  def dispatch_async_map(map_id, arguments, context, _reactor_class_info, step_name)
187
202
  # Every async map runs through the per-element Dispatcher path. When no
188
203
  # batch_size is given we default to the full source size (one fan-out
@@ -231,7 +246,8 @@ module RubyReactor
231
246
  storage = RubyReactor.configuration.storage_adapter
232
247
  storage.initialize_map_operation(
233
248
  map_id, arguments[:source].size, context.reactor_class.name,
234
- strict_ordering: arguments[:strict_ordering], reactor_class_info: reactor_class_info
249
+ strict_ordering: arguments[:strict_ordering], reactor_class_info: reactor_class_info,
250
+ **map_recovery_metadata(context, step_name)
235
251
  )
236
252
 
237
253
  limit ||= arguments[:source].size