RubyGems - ruby_reactor - Versions diffs - 0.5.2 → 0.5.3 - Mend

ruby_reactor 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

checksums.yaml +4 -4
data/.release-please-manifest.json +1 -1
data/CHANGELOG.md +7 -0
data/README.md +147 -34
data/lib/ruby_reactor/configuration.rb +66 -2
data/lib/ruby_reactor/context_serializer.rb +9 -4
data/lib/ruby_reactor/executor/ordered_lock_support.rb +1 -1
data/lib/ruby_reactor/executor/retry_manager.rb +7 -2
data/lib/ruby_reactor/executor/step_executor.rb +25 -5
data/lib/ruby_reactor/executor.rb +85 -3
data/lib/ruby_reactor/lock.rb +13 -0
data/lib/ruby_reactor/map/collector.rb +41 -0
data/lib/ruby_reactor/map/dispatcher.rb +42 -0
data/lib/ruby_reactor/map/element_executor.rb +39 -0
data/lib/ruby_reactor/map/helpers.rb +10 -3
data/lib/ruby_reactor/map/sweeper.rb +110 -0
data/lib/ruby_reactor/reactor.rb +7 -5
data/lib/ruby_reactor/sidekiq_adapter.rb +9 -8
data/lib/ruby_reactor/sidekiq_workers/sweeper_worker.rb +73 -0
data/lib/ruby_reactor/sidekiq_workers/worker.rb +42 -34
data/lib/ruby_reactor/step/map_step.rb +18 -2
data/lib/ruby_reactor/storage/redis_adapter.rb +83 -60
data/lib/ruby_reactor/storage/redis_locking.rb +8 -0
data/lib/ruby_reactor/sweeper.rb +58 -0
data/lib/ruby_reactor/version.rb +1 -1
data/lib/ruby_reactor.rb +42 -0
metadata +4 -1

data/lib/ruby_reactor/map/collector.rb CHANGED

@@ -8,6 +8,42 @@ module RubyReactor
       def self.perform(arguments)
         arguments = arguments.transform_keys(&:to_sym)
         map_id = arguments[:map_id]
+        # Serialize concurrent collector deliveries for the SAME map (eager queue +
+        # counter-zero trigger + sweeper re-trigger could otherwise all resume the
+        # parent at once and both write its context). A dedicated map_collect lock
+        # is used rather than the parent's own lock so it never conflicts with the
+        # context lock the parent's resume_execution acquires for itself.
+        lock = acquire_collect_lock(map_id)
+        return if lock == :contended
+        begin
+          perform_collection(arguments)
+        ensure
+          lock.release if lock.respond_to?(:release)
+        end
+      end
+      def self.acquire_collect_lock(map_id)
+        return :inline if inline_testing_mode?
+        lock = RubyReactor::Lock.new(
+          "map_collect:#{map_id}",
+          owner: SecureRandom.uuid, ttl: RubyReactor.configuration.context_lock_ttl,
+          wait: 0, auto_extend: true
+        )
+        lock.acquire
+        lock
+      rescue RubyReactor::Lock::AcquisitionError
+        :contended
+      end
+      def self.inline_testing_mode?
+        defined?(Sidekiq::Testing) && Sidekiq::Testing.respond_to?(:inline?) && Sidekiq::Testing.inline?
+      end
+      def self.perform_collection(arguments)
+        map_id = arguments[:map_id]
         parent_context_id = arguments[:parent_context_id]
         parent_reactor_class_name = arguments[:parent_reactor_class_name]
         step_name = arguments[:step_name]
@@ -18,6 +54,11 @@ module RubyReactor
         parent_context_data = storage.retrieve_context(parent_context_id, parent_reactor_class_name)
         parent_context = RubyReactor::Context.deserialize_from_retry(parent_context_data)
+        # Idempotency: if the parent already recorded this map step's result, a
+        # prior collector already resumed it. Re-resuming would double-execute the
+        # steps after the map. Skip.
+        return if parent_context.intermediate_results.key?(step_name.to_sym)
         # Check if all tasks are completed
         metadata = storage.retrieve_map_metadata(map_id, parent_reactor_class_name)
         total_count = metadata ? metadata["count"].to_i : 0

data/lib/ruby_reactor/map/dispatcher.rb CHANGED

@@ -104,6 +104,48 @@ module RubyReactor
         end
       end
+      # Re-dispatch a SPECIFIC index whose result slot is missing (Phase 5c, used
+      # by the map sweeper). Index-driven rather than offset-driven: resolve the
+      # source from the stored parent context and pick source[index]. Idempotent
+      # because store_map_result HSETs by index — a re-run overwrites slot `index`,
+      # never duplicates.
+      def self.requeue_index(map_meta, index)
+        storage = RubyReactor.configuration.storage_adapter
+        parent_class_name = map_meta["parent_reactor_class_name"]
+        parent_context = load_parent_context_from_storage(map_meta["parent_context_id"], parent_class_name, storage)
+        arguments = {
+          map_id: map_meta["map_id"],
+          step_name: map_meta["step_name"],
+          strict_ordering: map_meta["strict_ordering"],
+          parent_context_id: map_meta["parent_context_id"],
+          parent_reactor_class_name: parent_class_name,
+          fail_fast: map_meta["fail_fast"],
+          batch_size: map_meta["batch_size"]
+        }
+        source = resolve_source(arguments, parent_context)
+        element = element_at(source, index)
+        queue_element_job(element, index, {
+                            map_id: map_meta["map_id"],
+                            arguments: arguments,
+                            context: parent_context,
+                            reactor_class_info: map_meta["reactor_class_info"],
+                            step_name: map_meta["step_name"]
+                          })
+      end
+      def self.element_at(source, index)
+        if source.is_a?(Array)
+          source[index]
+        elsif source.respond_to?(:offset) && source.respond_to?(:limit)
+          source.offset(index).limit(1).to_a.first
+        else
+          source.drop(index).first
+        end
+      end
       def self.queue_element_job(element, index, options)
         arguments = options[:arguments]
         context = options[:context]

data/lib/ruby_reactor/map/element_executor.rb CHANGED

@@ -8,6 +8,45 @@ module RubyReactor
       def self.perform(arguments)
         arguments = arguments.transform_keys(&:to_sym)
+        # Per-element liveness lock (Phase 5b): its presence is the map sweeper's
+        # "element alive" signal, and it serializes duplicate deliveries so a
+        # re-run can't double-decrement the counter (M3). A duplicate of a live
+        # element is dropped — the live original stores the result and finalizes.
+        lock = acquire_element_lock(arguments)
+        return if lock == :contended
+        begin
+          perform_element(arguments)
+        ensure
+          lock.release if lock.respond_to?(:release)
+        end
+      end
+      def self.acquire_element_lock(arguments)
+        # In Sidekiq::Testing.inline! an element's async-retry perform_map_element_in
+        # re-enters synchronously inside this frame; the lock would self-contend.
+        # It only guards concurrent cross-process delivery, impossible inline.
+        return :inline if inline_testing_mode?
+        lock = RubyReactor::Lock.new(
+          "map_element:#{arguments[:map_id]}:#{arguments[:index]}",
+          owner: SecureRandom.uuid, ttl: RubyReactor.configuration.context_lock_ttl,
+          wait: 0, auto_extend: true
+        )
+        lock.acquire
+        lock
+      rescue RubyReactor::Lock::AcquisitionError
+        RubyReactor.configuration.logger.info(
+          "RubyReactor map element #{arguments[:map_id]}:#{arguments[:index]} already in flight; dropping duplicate"
+        )
+        :contended
+      end
+      def self.inline_testing_mode?
+        defined?(Sidekiq::Testing) && Sidekiq::Testing.respond_to?(:inline?) && Sidekiq::Testing.inline?
+      end
+      def self.perform_element(arguments)
         context = hydrate_or_create_context(arguments)
         # The element already runs inside its own background worker, so any async
         # steps (and async retries) must execute inline here rather than handing

data/lib/ruby_reactor/map/helpers.rb CHANGED

@@ -108,10 +108,17 @@ module RubyReactor
           executor.resume_execution
         end
+        # Checkpoint the ROOT, not the sub (F9/C2). When the map is embedded in a
+        # composed sub-reactor, parent_context is the *sub*; storing only the sub
+        # would leave the root blob stale and a rehydrate-by-root-id resume would
+        # lose the map's completion. Resolve the root (which embeds the sub's
+        # post-map state via composed_contexts) and store that. For a top-level
+        # map parent_context IS the root, so this is unchanged.
+        root = parent_context.root_context || parent_context
         storage.store_context(
-          parent_context.context_id,
-          ContextSerializer.serialize(parent_context),
-          parent_context.reactor_class.name
+          root.context_id,
+          ContextSerializer.serialize(root),
+          RubyReactor.reactor_storage_name(root.reactor_class)
         )
       end
     end

data/lib/ruby_reactor/map/sweeper.rb ADDED

@@ -0,0 +1,110 @@
+# frozen_string_literal: true
+module RubyReactor
+  module Map
+    # Recovers map fan-out from a hard kill (Phase 5d). Maps are the path most
+    # exposed to a lost job: one missing element result hangs the whole map and
+    # its parent forever. The unifying signal is the results hash — index-keyed
+    # and idempotent (HSET) — so completion is authoritative on `missing`, not on
+    # the fragile counter:
+    #
+    #   missing = (0...count) - HKEYS(results)
+    #
+    # For each active map:
+    #   * missing indices with NO live element lock are re-dispatched (M1/M4/M5).
+    #   * if nothing is missing but the parent never resumed, the collector is
+    #     re-triggered (M2) — gated so it never fires while a collector or the
+    #     parent is alive, or after the parent already collected.
+    #
+    # `run_once` is pure and idempotent; the host wires the cadence (same contract
+    # as RubyReactor::Sweeper).
+    class Sweeper
+      def self.run_once(limit: 1000)
+        new.run_once(limit: limit)
+      end
+      def initialize(storage: nil, async_router: nil, logger: nil)
+        @storage = storage || RubyReactor.configuration.storage_adapter
+        @async_router = async_router || RubyReactor.configuration.async_router
+        @logger = logger || RubyReactor.configuration.logger
+      end
+      # Returns { redispatched:, recollected: } counts.
+      def run_once(limit: 1000)
+        redispatched = 0
+        recollected = 0
+        @storage.scan_maps(count: limit).each do |meta|
+          missing = missing_indices(meta)
+          if missing.any?
+            redispatched += redispatch_missing(meta, missing)
+          elsif recollect?(meta)
+            retrigger_collector(meta)
+            recollected += 1
+          end
+        rescue StandardError => e
+          @logger.warn("RubyReactor::Map::Sweeper failed on map #{meta["map_id"]}: #{e.class}: #{e.message}")
+        end
+        { redispatched: redispatched, recollected: recollected }
+      end
+      private
+      def missing_indices(meta)
+        @storage.missing_map_indices(meta["map_id"], meta["count"].to_i, meta["parent_reactor_class_name"])
+      end
+      def redispatch_missing(meta, missing)
+        count = 0
+        missing.each do |index|
+          next if @storage.lock_held?("map_element:#{meta["map_id"]}:#{index}") # element alive
+          RubyReactor::Map::Dispatcher.requeue_index(meta, index)
+          count += 1
+        end
+        count
+      end
+      # All results are in. Re-trigger the collector only if no collector/parent is
+      # alive and the parent has not already collected this step.
+      def recollect?(meta)
+        return false if @storage.lock_held?("map_collect:#{meta["map_id"]}")  # a collector is running
+        return false if parent_live_lock?(meta)                               # parent execution alive
+        return false if parent_already_collected?(meta)
+        true
+      end
+      # N1: a nested map's parent is a map element running under a `map_element:`
+      # lock, not an `async:` lock. Derive the right key from metadata.
+      def parent_live_lock?(meta)
+        if meta["parent_is_map_element"]
+          @storage.lock_held?("map_element:#{meta["outer_map_id"]}:#{meta["outer_index"]}")
+        else
+          @storage.lock_held?("async:#{meta["parent_context_id"]}")
+        end
+      end
+      def parent_already_collected?(meta)
+        data = @storage.retrieve_context(meta["parent_context_id"], meta["parent_reactor_class_name"])
+        return false unless data
+        results = data["intermediate_results"] || {}
+        status = data["status"].to_s
+        results.key?(meta["step_name"].to_s) || %w[completed failed skipped].include?(status)
+      end
+      def retrigger_collector(meta)
+        @async_router.perform_map_collection_async(
+          parent_context_id: meta["parent_context_id"],
+          map_id: meta["map_id"],
+          parent_reactor_class_name: meta["parent_reactor_class_name"],
+          step_name: meta["step_name"],
+          strict_ordering: meta["strict_ordering"],
+          timeout: 3600
+        )
+      end
+    end
+  end
+end

data/lib/ruby_reactor/reactor.rb CHANGED

@@ -111,10 +111,11 @@ module RubyReactor
         # For async reactors, queue a job for the whole reactor
         @context.status = :running
         Executor.middlewares_for(self.class).on(:before_async_enqueue, @context)
+        # Persist BEFORE enqueue — the job payload is identity-only (F2).
         save_context
-        serialized_context = ContextSerializer.serialize(@context)
-        @result = configuration.async_router.perform_async(serialized_context, self.class.name,
+        @result = configuration.async_router.perform_async(@context.context_id,
+                                                           RubyReactor.reactor_storage_name(self.class),
                                                            intermediate_results: @context.intermediate_results)
         # Even if it's an AsyncResult, it might have finished inline (e.g. Sidekiq::Testing.inline!)
@@ -312,10 +313,11 @@ module RubyReactor
     def perform_async_run
       @context.status = :running
+      # Persist BEFORE enqueue — the job payload is identity-only (F2).
       save_context
-      serialized_context = ContextSerializer.serialize(@context)
-      @result = configuration.async_router.perform_async(serialized_context, self.class.name,
+      @result = configuration.async_router.perform_async(@context.context_id,
+                                                         RubyReactor.reactor_storage_name(self.class),
                                                          intermediate_results: @context.intermediate_results)
       check_for_inline_completion
@@ -424,7 +426,7 @@ module RubyReactor
     def save_context
       storage = configuration.storage_adapter
-      reactor_class_name = self.class.name || "AnonymousReactor-#{self.class.object_id}"
+      reactor_class_name = RubyReactor.reactor_storage_name(self.class)
       serialized_context = ContextSerializer.serialize(@context)
       storage.store_context(@context.context_id, serialized_context, reactor_class_name)
     end

data/lib/ruby_reactor/sidekiq_adapter.rb CHANGED

@@ -2,18 +2,19 @@
 module RubyReactor
   class SidekiqAdapter
-    def self.perform_async(serialized_context, reactor_class_name = nil, intermediate_results: {})
-      job_id = SidekiqWorkers::Worker.perform_async(serialized_context, reactor_class_name)
-      context = ContextSerializer.deserialize(serialized_context)
+    # Identity-only payload: the worker rehydrates the live context from storage
+    # by (context_id, reactor_class_name). The caller already holds context_id, so
+    # there is no blob to deserialize here.
+    def self.perform_async(context_id, reactor_class_name = nil, intermediate_results: {})
+      job_id = SidekiqWorkers::Worker.perform_async(context_id, reactor_class_name)
       RubyReactor::AsyncResult.new(job_id: job_id, intermediate_results: intermediate_results,
-                                   execution_id: context.context_id)
+                                   execution_id: context_id)
     end
-    def self.perform_in(delay, serialized_context, reactor_class_name = nil, intermediate_results: {})
-      job_id = SidekiqWorkers::Worker.perform_in(delay, serialized_context, reactor_class_name)
-      context = ContextSerializer.deserialize(serialized_context)
+    def self.perform_in(delay, context_id, reactor_class_name = nil, intermediate_results: {})
+      job_id = SidekiqWorkers::Worker.perform_in(delay, context_id, reactor_class_name)
       RubyReactor::AsyncResult.new(job_id: job_id, intermediate_results: intermediate_results,
-                                   execution_id: context.context_id)
+                                   execution_id: context_id)
     end
     # rubocop:disable Metrics/ParameterLists

data/lib/ruby_reactor/sidekiq_workers/sweeper_worker.rb ADDED

@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+require "sidekiq"
+require "securerandom"
+module RubyReactor
+  module SidekiqWorkers
+    # Self-rescheduling recovery tick. Each run sweeps both the top-level reactor
+    # sweeper and the map sweeper, then schedules the next tick — a perpetual
+    # chain the host kicks once via `RubyReactor.start_sweeper!`.
+    #
+    # super_fetch safety. Sidekiq Enterprise `super_fetch` reliably re-runs a job
+    # whose worker died mid-execution. For a self-rescheduling chain that is a
+    # hazard: a tick can crash AFTER enqueuing its successor but BEFORE acking, so
+    # super_fetch recovers the crashed tick *alongside* the successor it already
+    # scheduled — the chain forks and then doubles every interval. We therefore do
+    # NOT rely on "exactly one job exists". The next tick is claimed by a
+    # per-time-window lock: every duplicate computes the SAME target window and
+    # only one wins the claim, so recovered/duplicated ticks collapse back to a
+    # single chain. The claim lock is never released — it simply expires — so no
+    # delete can race two duplicates into both winning.
+    class SweeperWorker
+      include ::Sidekiq::Worker
+      # retry: false — the sweep is idempotent and self-rescheduling, so a failed
+      # tick must not pile up Sidekiq retries; the next tick (or a super_fetch
+      # recovery) re-runs it anyway.
+      sidekiq_options retry: false, queue: RubyReactor.configuration.sidekiq_queue
+      def perform
+        config = RubyReactor.configuration
+        return unless config.sweeper_enabled
+        run_sweeps(config)
+      ensure
+        # Always chain forward (unless disabled), even after an error above, so a
+        # single bad sweep can't kill recovery. The window lock keeps this from
+        # forking under super_fetch.
+        self.class.schedule_next if RubyReactor.configuration.sweeper_enabled
+      end
+      def run_sweeps(config)
+        RubyReactor::Sweeper.run_once(limit: config.sweeper_limit)
+        RubyReactor::Map::Sweeper.run_once(limit: config.sweeper_limit)
+      rescue StandardError => e
+        config.logger.error("RubyReactor::SweeperWorker sweep failed: #{e.class}: #{e.message}")
+      end
+      # Enqueue the next tick for the upcoming time window, claiming that window
+      # so concurrent/duplicate/recovered ticks produce exactly one successor.
+      # Idempotent: also safe to call from `start_sweeper!` on every process boot.
+      def self.schedule_next
+        interval = RubyReactor.configuration.sweeper_interval
+        window = (Time.now.to_i / interval) + 1
+        lock = RubyReactor::Lock.new(
+          "sweeper:window:#{window}",
+          owner: SecureRandom.uuid,
+          ttl: interval * 2, # outlive the window; expires on its own (never released)
+          wait: 0,
+          auto_extend: false
+        )
+        lock.acquire # raises AcquisitionError if this window is already claimed
+        delay = (window * interval) - Time.now.to_i
+        perform_in([delay, 1].max)
+      rescue RubyReactor::Lock::AcquisitionError
+        # Another tick already scheduled this window — collapse the duplicate.
+        nil
+      end
+    end
+  end
+end

data/lib/ruby_reactor/sidekiq_workers/worker.rb CHANGED

@@ -17,15 +17,26 @@ module RubyReactor
         # Handle infrastructure failures (network, Redis, etc.)
       end
-      def perform(serialized_context, reactor_class_name = nil, snooze_count = 0)
+      # Identity-only payload: storage is the source of truth. Rehydrate the live
+      # context from storage by id, then resume. A nil read means the context was
+      # swept, expired, or already terminal-and-collected — nothing to resume.
+      def perform(context_id, reactor_class_name = nil, snooze_count = 0)
+        # Normalize so a nil/omitted name resolves to the same storage key the
+        # enqueue path wrote (always via reactor_storage_name). Without this a
+        # nil here builds "reactor::context:<id>" and misses the stored
+        # "reactor:AnonymousReactor:context:<id>", silently no-op'ing.
+        reactor_class_name ||= RubyReactor.reactor_storage_name(nil)
+        data = RubyReactor.configuration.storage_adapter.retrieve_context(context_id, reactor_class_name)
+        return if data.nil?
         begin
-          context = ContextSerializer.deserialize(serialized_context)
+          context = ContextSerializer.deserialize_hash(data)
         rescue RubyReactor::Error::DeserializationError,
                RubyReactor::Error::SchemaVersionError => e
-          # Permanent failures — retrying the same blob will keep failing.
-          # Mark the context as failed (best-effort) and return so Sidekiq
-          # does not burn its retry budget.
-          handle_deserialization_failure(serialized_context, reactor_class_name, e)
+          # Permanent failures — re-reading the same stored blob will keep
+          # failing. Mark the context as failed (best-effort) and return so
+          # Sidekiq does not burn its retry budget.
+          handle_deserialization_failure(context_id, reactor_class_name, e)
           return
         end
@@ -48,11 +59,12 @@ module RubyReactor
           # Resume execution from the failed step
           executor = Executor.new(context.reactor_class, {}, context)
           executor.resume_execution
-          # Skip the post-run save when the executor deliberately suppressed
-          # persistence (stale-batch redelivery of an already-terminal context)
-          # — re-saving here would clobber the stored terminal record with this
-          # run's stale in-memory status.
-          executor.save_context unless executor.skip_context_persist?
+          # No explicit save here: resume_execution's ensure block already persists
+          # the final root state (`save_context unless skip_context_persist?`), and
+          # in the worker the executor's context IS the root, so an extra checkpoint!
+          # would just re-write the identical blob to the identical key. The
+          # skip_context_persist? guard (stale-batch redelivery of an already-terminal
+          # context) is likewise honored there.
           # Return the executor (which now has the result stored in it)
           executor
@@ -66,7 +78,7 @@ module RubyReactor
           # retry path so this doesn't burn the job's retry budget or appear
           # as an error in dashboards. After the configured cap is reached we
           # escalate by marking the reactor as failed.
-          handle_snooze(serialized_context, reactor_class_name, context, snooze_count, e)
+          handle_snooze(context_id, reactor_class_name, context, snooze_count, e)
         rescue RubyReactor::RateLimitRegistry::UnknownLimitError => e
           # Permanent configuration error — snoozing or retrying the same job
           # will keep failing. Mark the context failed immediately.
@@ -76,7 +88,7 @@ module RubyReactor
       private
-      def handle_snooze(serialized_context, reactor_class_name, context, snooze_count, error)
+      def handle_snooze(context_id, reactor_class_name, context, snooze_count, error)
         config = RubyReactor.configuration
         max = config.lock_snooze_max_attempts
@@ -86,7 +98,12 @@ module RubyReactor
         # prematurely or strand the nonce in `assigned_at` until poison_pill
         # eventually advances past it. Snooze until the gate passes (or poison
         # auto-advance moves the cursor past us).
-        capped = !error.is_a?(RubyReactor::OrderedLock::WaitError)
+        # The per-context liveness lock (`async:<id>`) is also uncapped: a
+        # duplicate of the *same* execution may wait arbitrarily long for the
+        # live original to finish (e.g. a sweeper re-enqueue racing a slow but
+        # alive worker). Capping it would fail a legitimately-waiting duplicate.
+        capped = !(error.is_a?(RubyReactor::OrderedLock::WaitError) ||
+                   error.is_a?(RubyReactor::Lock::ContextLockContention))
         if capped && max != :infinity && snooze_count >= max
           escalate_snooze(context, snooze_count, error)
@@ -94,7 +111,9 @@ module RubyReactor
         end
         delay = compute_snooze_delay(config, error)
-        self.class.perform_in(delay, serialized_context, reactor_class_name, snooze_count + 1)
+        # Re-enqueue by id: the context is already persisted in storage, so the
+        # rescheduled job rehydrates fresh state (no stale blob).
+        self.class.perform_in(delay, context_id, reactor_class_name, snooze_count + 1)
       end
       # Use the error's `retry_after_seconds` hint when available
@@ -141,7 +160,7 @@ module RubyReactor
         }
         serialized = ContextSerializer.serialize(context)
-        reactor_class_name = context.reactor_class&.name || "AnonymousReactor"
+        reactor_class_name = RubyReactor.reactor_storage_name(context.reactor_class)
         RubyReactor.configuration.storage_adapter.store_context(
           context.context_id,
           serialized,
@@ -162,23 +181,22 @@ module RubyReactor
         RubyReactor.configuration.logger.error("Job details: #{msg.inspect}")
       end
-      def handle_deserialization_failure(serialized_context, reactor_class_name, error)
-        metadata = extract_failure_metadata(serialized_context)
-        context_id = metadata[:context_id]
-        resolved_reactor_class_name = reactor_class_name || metadata[:reactor_class_name]
+      # The id-only payload already carries context_id and reactor_class_name, so
+      # there is no blob to parse for metadata — just mark the stored context
+      # failed (best-effort) so the job stops retrying a permanently-broken blob.
+      def handle_deserialization_failure(context_id, reactor_class_name, error)
         RubyReactor.configuration.logger.error(
           "RubyReactor deserialization failure for context " \
           "#{context_id || "unknown"}: #{error.class.name}: #{error.message}"
         )
-        return unless context_id && resolved_reactor_class_name
+        return unless context_id && reactor_class_name
-        payload = build_failed_context_payload(context_id, resolved_reactor_class_name, error)
+        payload = build_failed_context_payload(context_id, reactor_class_name, error)
         RubyReactor.configuration.storage_adapter.store_context(
           context_id,
           payload,
-          resolved_reactor_class_name
+          reactor_class_name
         )
       rescue StandardError => e
         # Don't let a persistence failure mask the original deserialization error.
@@ -187,16 +205,6 @@ module RubyReactor
         )
       end
-      def extract_failure_metadata(serialized_context)
-        data = JSON.parse(serialized_context)
-        {
-          context_id: data["context_id"],
-          reactor_class_name: data["reactor_class"]
-        }
-      rescue StandardError
-        {}
-      end
       def build_failed_context_payload(context_id, reactor_class_name, error)
         JSON.generate(
           "schema_version" => ContextSerializer::SCHEMA_VERSION,

data/lib/ruby_reactor/step/map_step.rb CHANGED

@@ -179,10 +179,25 @@ module RubyReactor
           storage = RubyReactor.configuration.storage_adapter
           storage.initialize_map_operation(
             map_id, arguments[:source].size, context.reactor_class.name,
-            strict_ordering: arguments[:strict_ordering], reactor_class_info: reactor_class_info
+            strict_ordering: arguments[:strict_ordering], reactor_class_info: reactor_class_info,
+            **map_recovery_metadata(context, arguments[:step_name] || context.current_step)
           )
         end
+        # Recovery metadata for the map sweeper. When this map runs inside a map
+        # element (context.map_metadata present), it is a NESTED map: its parent
+        # holds the element's `map_element:` lock, not an `async:` lock (N1).
+        def map_recovery_metadata(context, step_name)
+          outer = context.map_metadata
+          {
+            parent_context_id: context.context_id,
+            step_name: step_name.to_s,
+            parent_is_map_element: !outer.nil?,
+            outer_map_id: outer && (outer[:map_id] || outer["map_id"]),
+            outer_index: outer && (outer[:index] || outer["index"])
+          }
+        end
         def dispatch_async_map(map_id, arguments, context, _reactor_class_info, step_name)
           # Every async map runs through the per-element Dispatcher path. When no
           # batch_size is given we default to the full source size (one fan-out
@@ -231,7 +246,8 @@ module RubyReactor
           storage = RubyReactor.configuration.storage_adapter
           storage.initialize_map_operation(
             map_id, arguments[:source].size, context.reactor_class.name,
-            strict_ordering: arguments[:strict_ordering], reactor_class_info: reactor_class_info
+            strict_ordering: arguments[:strict_ordering], reactor_class_info: reactor_class_info,
+            **map_recovery_metadata(context, step_name)
           )
           limit ||= arguments[:source].size