RubyGems - chrono_forge - Versions diffs - 0.9.1 → 0.10.0 - Mend

chrono_forge 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

data/lib/chrono_forge/executor/methods/wait_until.rb CHANGED Viewed

@@ -14,7 +14,11 @@ module ChronoForge
         #   The method should return a truthy value when the condition is met.
         # @param timeout [ActiveSupport::Duration] Maximum time to wait for condition (default: 1.hour)
         # @param check_interval [ActiveSupport::Duration] Time between condition checks (default: 15.minutes)
-        # @param retry_on [Array<Class>] Exception classes that should trigger retries instead of failures
+        # @param retry_policy [RetryPolicy, Array<RetryPolicy>, nil] Policy governing
+        #   errors raised *while evaluating the condition* (not the poll cadence). An
+        #   Array of policies is combined into a composite policy. When nil, uses
+        #   RetryPolicy.wait_default, which retries nothing — an error raised by the
+        #   condition fails fast. Pass a policy with `retry_on:` to opt specific errors
+        #   into retrying.
+        #   Note: unlike steps, wait_until does NOT inherit the class-level default.
         #
         # @return [true] When the condition is met
         #
@@ -31,7 +35,7 @@ module ChronoForge
         #   wait_until :database_migration_complete?,
         #     timeout: 2.hours,
         #     check_interval: 30.seconds,
-        #     retry_on: [ActiveRecord::ConnectionNotEstablished, Net::TimeoutError]
+        #     retry_policy: RetryPolicy.new(retry_on: [ActiveRecord::ConnectionNotEstablished, Net::TimeoutError])
         #
         # @example Waiting for external system
         #   def third_party_service_ready?
@@ -42,7 +46,7 @@ module ChronoForge
         #   wait_until :third_party_service_ready?,
         #     timeout: 1.hour,
         #     check_interval: 2.minutes,
-        #     retry_on: [Net::TimeoutError, Net::HTTPClientException]
+        #     retry_policy: RetryPolicy.new(retry_on: [Net::TimeoutError, Net::HTTPClientException])
         #
         # @example Waiting for file processing
         #   def file_processing_complete?
@@ -60,7 +64,7 @@ module ChronoForge
         # The condition method is called on each check interval:
         # - Should return truthy value when condition is met
         # - Should return falsy value when condition is not yet met
-        # - Can raise exceptions that will be handled based on retry_on parameter
+        # - Can raise exceptions that will be handled based on the retry_policy
         #
         # === Timeout Handling
         # - Timeout is calculated from the first execution start time
@@ -69,9 +73,10 @@ module ChronoForge
         #
         # === Error Handling
         # - Exceptions during condition evaluation are caught and logged
-        # - If exception class is in retry_on array, it triggers retry with exponential backoff
-        # - Other exceptions cause immediate failure with ExecutionFailedError
-        # - Retry backoff: 2^attempt seconds (capped at 2^5 = 32 seconds)
+        # - If the retry_policy deems the error retryable, it triggers a retry with the
+        #   policy's backoff
+        # - Otherwise the error causes immediate failure with ExecutionFailedError
+        # - Backoff is governed by the resolved RetryPolicy
         #
         # === Persistence and Resumability
         # - Wait state is persisted in execution logs with metadata
@@ -85,7 +90,8 @@ module ChronoForge
         # - Tracks attempt count and execution times
         # - Records final result (true for success, :timed_out for timeout)
         #
-        def wait_until(condition, timeout: 1.hour, check_interval: 15.minutes, retry_on: [])
+        def wait_until(condition, timeout: 1.hour, check_interval: 15.minutes, retry_policy: nil)
+          policy = wait_retry_policy(retry_policy)
           validate_step_name_segment!(condition)
           step_name = "wait_until$#{condition}"
           # Find or create execution log
@@ -117,16 +123,15 @@ module ChronoForge
             Rails.logger.error { "Error evaluating condition #{condition}: #{e.message}" }
             self.class::ExecutionTracker.track_error(workflow, e, execution_log: execution_log)
-            # Optional retry logic
-            if retry_on.include?(e.class)
-              # Reschedule with exponential backoff
-              backoff = (2**[execution_log.attempts, 5].min).seconds
-              self.class
-                .set(wait: backoff)
-                .perform_later(
-                  @workflow.key
-                )
+            # Optional retry logic for errors raised while evaluating the
+            # condition. The poll cadence (check_interval/timeout) below is
+            # separate and unaffected by the retry policy.
+            backoff = policy.retry_backoff(e, attempts: execution_log.attempts) do |policy_key|
+              bump_retry_count!(execution_log, policy_key)
+            end
+            if backoff
+              # Reschedule with the policy's backoff (published after lock release).
+              enqueue_continuation(wait: backoff)
               # Halt current execution
               halt_execution!
@@ -167,13 +172,8 @@ module ChronoForge
             raise error
           end
-          # Reschedule with delay
-          self.class
-            .set(wait: check_interval)
-            .perform_later(
-              @workflow.key,
-              wait_condition: condition
-            )
+          # Reschedule the poll (published after lock release).
+          enqueue_continuation(wait: check_interval, wait_condition: condition)
           # Halt current execution
           halt_execution!

data/lib/chrono_forge/executor/methods/workflow_states.rb CHANGED Viewed

@@ -48,6 +48,8 @@ module ChronoForge
         # - Safe to call multiple times without side effects
         #
         def complete_workflow!
+          enforce_branch_joins!
           # Create an execution log for workflow completion
           execution_log = find_or_create_execution_log!("$workflow_completion$") do |log|
             log.started_at = Time.current
@@ -80,6 +82,20 @@ module ChronoForge
           end
         end
+        # Every branch must be joined: automerge branches join inline at their
+        # block's close (removing themselves from @open_branches); explicitly
+        # awaited branches are removed by merge_branches. Anything still in
+        # @open_branches here was opened but never joined — fail fast.
+        def enforce_branch_joins!
+          leftover = (@open_branches || {}).keys
+          return if leftover.empty?
+          raise UnmergedBranchError,
+            "branch(es) #{leftover.join(", ")} were opened but never merged. " \
+            "Add `merge_branches #{leftover.map { |n| ":#{n}" }.join(", ")}` " \
+            "or open with `branch(..., automerge: true)`."
+        end
         # Marks a workflow as failed due to an unrecoverable error.
         #
         # This method provides durable workflow failure tracking with proper state

data/lib/chrono_forge/executor/methods.rb CHANGED Viewed

@@ -6,6 +6,8 @@ module ChronoForge
       include Methods::ContinueIf
       include Methods::DurablyExecute
       include Methods::DurablyRepeat
+      include Methods::Branch
+      include Methods::MergeBranches
       include Methods::WorkflowStates
     end
   end

data/lib/chrono_forge/executor/retry_policy.rb ADDED Viewed

@@ -0,0 +1,111 @@
+module ChronoForge
+  module Executor
+    # A single, unified description of retry behavior shared by every retry site
+    # (workflow-level uncaught errors, durably_execute, durably_repeat, and
+    # wait_until's condition errors).
+    #
+    # It answers the only two questions a retry site ever asks:
+    #   - retryable?(error, attempts) — should this failure be retried?
+    #   - backoff_for(attempts)       — how long until the next attempt?
+    #
+    # `attempts` is always the 1-based count of attempts made so far, *including*
+    # the one that just failed (matching ExecutionLog#attempts). So on the first
+    # failure `attempts == 1`.
+    class RetryPolicy
+      attr_reader :max_attempts, :base, :cap, :jitter, :retry_on
+      # @param max_attempts [Integer, nil] cap on total attempts; nil = no count
+      #   cap (bounded elsewhere, e.g. wait_until's timeout)
+      # @param base [Numeric, ActiveSupport::Duration] delay of the first retry
+      # @param cap [Numeric, ActiveSupport::Duration] ceiling for a single delay
+      # @param jitter [Boolean] apply equal jitter to spread retries
+      # @param retry_on [Array<Class>, nil] nil = retry any StandardError;
+      #   an array = retry only those classes (and subclasses); [] = retry nothing
+      def initialize(max_attempts: 3, base: 1, cap: 30, jitter: true, retry_on: nil)
+        @max_attempts = max_attempts
+        @base = base
+        @cap = cap
+        @jitter = jitter
+        @retry_on = retry_on
+      end
+      def retryable?(error, attempts)
+        within_attempt_cap?(attempts) && retryable_error?(error)
+      end
+      # Equal jitter: half the computed delay plus a random portion of the other
+      # half. Computed once at re-enqueue time and never persisted, so the
+      # randomness does not affect replay determinism.
+      def backoff_for(attempts)
+        exponent = [attempts - 1, 0].max
+        delay = [cap.to_f, base.to_f * (2**exponent)].min
+        delay = (delay / 2) + rand(0.0..(delay / 2)) if jitter
+        delay.seconds
+      end
+      # Public routing predicate: would this policy handle this error at all?
+      # (independent of the attempt cap). nil retry_on = any StandardError;
+      # [] = nothing; a list = those classes and their subclasses.
+      def matches?(error)
+        retryable_error?(error)
+      end
+      # Single-call decision used by every retry site: the backoff Duration to
+      # retry, or nil to stop. A plain policy uses `attempts` and ignores any
+      # block (the block exists only so a CompositeRetryPolicy can supply a
+      # per-error count — see CompositeRetryPolicy#retry_backoff).
+      def retry_backoff(error, attempts:)
+        retryable?(error, attempts) ? backoff_for(attempts) : nil
+      end
+      # Stable per-policy identifier derived from the errors this policy
+      # *declares* (its retry_on), not the error thrown. Inside a composite this
+      # keys the policy's attempt budget, so the budget is shared across every
+      # class the policy lists (and their subclasses) and is independent of the
+      # policy's position — reordering the composite does not reset counts. A
+      # catch-all (retry_on: nil) keys "*".
+      def budget_key
+        retry_on.nil? ? "*" : retry_on.map(&:name).sort.join(",")
+      end
+      def self.step_default
+        new(max_attempts: 3, base: 1, cap: 30, jitter: true, retry_on: nil)
+      end
+      # Workflow-level (uncaught) errors retry the whole workflow from the top
+      # (replaying completed steps). They cover two populations the default can't
+      # distinguish: transient infra blips — worth riding out — and deterministic
+      # bugs, where every replay is waste. 10 attempts means 9 retries with nominal
+      # delays 1 + 2 + ... + 256 = 511s, i.e. a tolerant window of up to ~8.5 min
+      # (≈6.4 min on average and never under ~4.3 min, since equal jitter puts each
+      # wait in [d/2, d]) — enough for a DB failover or deploy restart — without
+      # dragging out the bug case; cap (600s / 10 min) bounds any single backoff and
+      # only binds if a caller configures more attempts.
+      def self.workflow_default
+        new(max_attempts: 10, base: 1, cap: 600, jitter: true, retry_on: nil)
+      end
+      def self.wait_default
+        new(max_attempts: nil, base: 1, cap: 30, jitter: true, retry_on: [])
+      end
+      # Build a composite policy from an ordered list of RetryPolicy objects.
+      def self.compose(*policies)
+        CompositeRetryPolicy.new(policies)
+      end
+      private
+      def within_attempt_cap?(attempts)
+        max_attempts.nil? || attempts < max_attempts
+      end
+      def retryable_error?(error)
+        if retry_on.nil?
+          error.is_a?(StandardError)
+        else
+          retry_on.any? { |klass| error.is_a?(klass) }
+        end
+      end
+    end
+  end
+end

data/lib/chrono_forge/executor.rb CHANGED Viewed

@@ -14,46 +14,105 @@ module ChronoForge
     class InvalidStepName < NotExecutableError; end
+    # spawn/spawn_each called outside a branch block. NotExecutableError so it
+    # propagates (fail-fast on a programming error) rather than being retried.
+    class NotInBranchError < NotExecutableError; end
+    # A branch was opened but neither merged via merge_branches nor declared
+    # automerge: true. Raised at the completion gate. Fail-fast (not retried).
+    class UnmergedBranchError < NotExecutableError; end
+    # merge_branches given a name that was never opened as a branch this pass.
+    # NotExecutableError so it propagates (fail-fast) instead of being retried.
+    class UnknownBranchError < NotExecutableError; end
     # "$" separates the segments of a step name (e.g. "durably_repeat$name$ts").
     # User-supplied names/methods must not contain it.
     STEP_NAME_DELIMITER = "$"
+    # Keyword args ChronoForge threads through job args internally. Users must
+    # not pass these to perform_now/perform_later; the framework injects them
+    # via `.set(...)` continuations, whose ConfiguredJob proxy bypasses the
+    # class-level guard in `prepended` below.
+    RESERVED_KWARGS = %i[attempt retry_counts retry_workflow].freeze
     include Methods
     # Add class methods
     def self.prepended(base)
+      # Class-wide default retry policy, inherited by subclasses. Set via the
+      # `retry_policy` DSL below; nil means "use the per-site built-in default".
+      base.class_attribute :default_retry_policy, instance_accessor: false, default: nil
       class << base
-        # Enforce expected signature for perform_now with key as first arg and keywords after
-        def perform_now(key, **kwargs)
-          if !key.is_a?(String)
-            raise ArgumentError, "Workflow key must be a string as the first argument"
-          end
-          super
+        # Public enqueue contract: exactly one positional (`key`) plus keywords.
+        # Reserved internal kwargs (RESERVED_KWARGS) are rejected here; the
+        # framework injects them only via `.set(...)` continuations, whose
+        # ActiveJob ConfiguredJob proxy bypasses these class-level overrides.
+        def perform_now(key, *extra, **kwargs)
+          __validate_enqueue!(key, extra, kwargs)
+          super(key, **kwargs)
         end
-        # Enforce expected signature for perform_later with key as first arg and keywords after
-        def perform_later(key, **kwargs)
-          if !key.is_a?(String)
-            raise ArgumentError, "Workflow key must be a string as the first argument"
-          end
-          super
+        def perform_later(key, *extra, **kwargs)
+          __validate_enqueue!(key, extra, kwargs)
+          super(key, **kwargs)
+        end
+        # Re-run a failed/stalled workflow. Routes through `.set(...)` so the
+        # reserved `retry_workflow: true` flag reaches the instance perform
+        # without tripping the public guard above.
+        def retry_now(key, **kwargs)
+          __validate_enqueue!(key, [], kwargs)
+          set.perform_now(key, retry_workflow: true, **kwargs)
         end
-        # Add retry_now class method that calls perform_now with retry_workflow: true
-        def retry_now(key, **)
-          perform_now(key, retry_workflow: true, **)
+        def retry_later(key, **kwargs)
+          __validate_enqueue!(key, [], kwargs)
+          set.perform_later(key, retry_workflow: true, **kwargs)
         end
-        # Add retry_later class method that calls perform_later with retry_workflow: true
-        def retry_later(key, **)
-          perform_later(key, retry_workflow: true, **)
+        # Class-level DSL to set this workflow's default retry policy. Applies to
+        # workflow-level retries and to steps without a per-call override.
+        # Positional RetryPolicy objects build a composite (per-error budgets);
+        # keyword options build a single RetryPolicy. The two forms are mutually
+        # exclusive.
+        def retry_policy(*policies, **opts)
+          if policies.any? && opts.any?
+            raise ArgumentError, "retry_policy takes either positional policies or keyword options, not both"
+          end
+          self.default_retry_policy =
+            policies.any? ? RetryPolicy.compose(*policies) : RetryPolicy.new(**opts)
+        end
+        private
+        def __validate_enqueue!(key, extra, kwargs)
+          unless key.is_a?(String)
+            raise ArgumentError, "Workflow key must be a string as the first argument"
+          end
+          unless extra.empty?
+            raise ArgumentError,
+              "ChronoForge workflows accept only `key` positionally; pass " \
+              "everything else as keywords (got #{extra.size} extra positional arg(s))"
+          end
+          reserved = kwargs.keys & RESERVED_KWARGS
+          if reserved.any?
+            raise ArgumentError,
+              "#{reserved.join(", ")} #{reserved.one? ? "is a reserved" : "are reserved"} " \
+              "ChronoForge #{reserved.one? ? "keyword" : "keywords"} and cannot be passed to perform_now/perform_later"
+          end
         end
       end
     end
-    def perform(key, attempt: 0, retry_workflow: false, options: {}, **kwargs)
-      # Prevent excessive retries
-      if attempt >= self.class::RetryStrategy.max_attempts
+    def perform(key, attempt: 0, retry_counts: {}, retry_workflow: false, options: {}, **kwargs)
+      # Safety net: prevent re-running a workflow whose attempts are exhausted
+      # (e.g. a stale job left in the queue). The normal exhaustion path fails the
+      # workflow from the rescue below before this is ever reached.
+      policy = workflow_retry_policy
+      if policy.max_attempts && attempt >= policy.max_attempts
         Rails.logger.error { "ChronoForge:#{self.class} max attempts reached for job workflow(#{key})" }
         return
       end
@@ -101,16 +160,39 @@ module ChronoForge
         Rails.logger.error { "ChronoForge:#{self.class}(#{key}) workflow execution failed" }
         error_log = self.class::ExecutionTracker.track_error(workflow, e, attempt: attempt)
-        # Retry if applicable
-        if should_retry?(e, attempt)
-          self.class::RetryStrategy.schedule_retry(workflow, attempt: attempt)
+        # Retry if applicable. `attempt` is a 0-based index, so the count of
+        # attempts made so far (including this one) is attempt + 1. For a
+        # composite policy the per-error budget lives in `retry_counts` (keyed by
+        # the matched policy's budget_key) and rides along the job args, mirroring
+        # how `attempt` is threaded — there is no execution log at this level.
+        attempts_made = attempt + 1
+        backoff = policy.retry_backoff(e, attempts: attempts_made) do |policy_key|
+          retry_counts[policy_key] = retry_counts[policy_key].to_i + 1
+          retry_counts[policy_key]
+        end
+        if backoff
+          enqueue_continuation(wait: backoff, attempt: attempts_made, retry_counts: retry_counts)
         else
           fail_workflow! error_log
         end
       ensure
         if lock_acquired # Only release lock if we acquired it
-          context.save!
-          self.class::LockStrategy.release_lock(job_id, workflow)
+          # Release the lock and publish the continuation even if context.save!
+          # raises — otherwise a transient save failure would leave the lock held
+          # (until it goes stale) AND drop the continuation, stranding the workflow
+          # with nothing scheduled to resume it. On a save failure the continuation
+          # resumes from the last persisted context, which is exactly crash
+          # semantics (durable steps replay).
+          begin
+            context.save!
+          ensure
+            self.class::LockStrategy.release_lock(job_id, workflow)
+            # Publish the continuation only now — after the lock is released — so a
+            # zero-delay, same-key continuation can't lose the acquire race against
+            # this still-locked job. If release_lock raised (this job overran and
+            # lost the lock), we never reach here and another job owns continuation.
+            flush_continuation!
+          end
         end
       end
     end
@@ -128,6 +210,15 @@ module ChronoForge
           workflow.kwargs = kwargs
           workflow.started_at = Time.current
         end
+      # Branch children are pre-inserted by their parent (dispatch_children's
+      # insert_all), so the creation block above never runs for them and their
+      # started_at stays nil. Stamp it the first time the child actually executes
+      # so started_at reliably means "has been picked up and run" — the
+      # BranchMergeJob rekick poller treats a nil started_at as a never-executed
+      # (dropped) child, and must not mistake a child that ran and is now parked
+      # on a wait (also :idle) for one that was never picked up.
+      @workflow.update_column(:started_at, Time.current) if @workflow.started_at.nil?
     end
     def setup_context!
@@ -148,11 +239,50 @@ module ChronoForge
     # which accumulate unbounded repetition logs: we touch only the rows we need,
     # never the whole set. create_or_find_by! is used only on a miss, keeping
     # creation safe if a lock takeover ever lets two executors race.
+    #
+    # Completed steps are short-circuited up front from a single bulk read (see
+    # #completed_step_cache) so that replaying N already-done steps costs one
+    # query for the whole batch rather than one SELECT each — without that, a
+    # workflow with hundreds of steps pays hundreds of SELECTs on every resume.
+    # The cached value is a readonly, unsaved stand-in: completed steps are only
+    # ever read (.completed? and metadata["result"]), never written, so it needs
+    # no database row.
     def find_or_create_execution_log!(step_name, &)
+      if completed_step_cache.key?(step_name)
+        return ExecutionLog.new(
+          workflow: @workflow, step_name: step_name, state: :completed,
+          metadata: completed_step_cache[step_name]
+        ).tap(&:readonly!)
+      end
       ExecutionLog.find_by(workflow: @workflow, step_name: step_name) ||
         ExecutionLog.create_or_find_by!(workflow: @workflow, step_name: step_name, &)
     end
+    # One bulk read of this workflow's completed steps, mapping step_name to its
+    # metadata, memoized for the duration of a single replay pass.
+    #
+    # Only completed rows are loaded: they are the ones replayed steps short-
+    # circuit on, and once completed a step never changes, so the snapshot stays
+    # valid for the whole pass. Plucking (step_name, metadata) avoids
+    # instantiating AR objects and keeps the read portable — Rails type-casts the
+    # JSON metadata column to a Hash on SQLite, PostgreSQL and MySQL alike, with
+    # no database-specific JSON extraction.
+    #
+    # durably_repeat repetition logs (durably_repeat$<name>$<timestamp>) are
+    # deliberately excluded: they accumulate without bound yet are never replayed
+    # (durably_repeat only ever looks up its coordination log plus the single
+    # current repetition), so pulling them into memory would be all cost and no
+    # benefit. Their coordination log (durably_repeat$<name>, only two segments)
+    # is not matched by the pattern and is still cached.
+    def completed_step_cache
+      @completed_step_cache ||= ExecutionLog
+        .where(workflow: @workflow, state: ExecutionLog.states[:completed])
+        .where.not("step_name LIKE ?", "durably_repeat#{STEP_NAME_DELIMITER}%#{STEP_NAME_DELIMITER}%")
+        .pluck(:step_name, :metadata)
+        .to_h
+    end
     # Guards the user-supplied portion of a step name (a custom name, method, or
     # condition). The "$" separator is reserved for the framework's own segment
     # structure, so a user value containing it would make step names ambiguous
@@ -164,8 +294,66 @@ module ChronoForge
         "ChronoForge step name may not contain '#{STEP_NAME_DELIMITER}' (reserved separator): #{segment.inspect}"
     end
-    def should_retry?(error, attempt_count)
-      attempt_count < 3
+    # Retry policy for workflow-level (uncaught) errors: the class default if one
+    # was declared, else the workflow built-in (10 attempts, up to ~8.5 min).
+    # Each retry replays the whole workflow from the top.
+    def workflow_retry_policy
+      self.class.default_retry_policy || RetryPolicy.workflow_default
+    end
+    # Retry policy for a durable step: an explicit per-call override, else the
+    # class default, else the step built-in (short, snappy fast-fail).
+    def step_retry_policy(override)
+      coerce_policy(override) || self.class.default_retry_policy || RetryPolicy.step_default
+    end
+    # Retry policy for a wait_until condition error. Deliberately does NOT inherit
+    # the class default, so a class-wide "retry everything" can't silently turn
+    # condition-evaluation bugs into retried errors. Built-in retries nothing.
+    def wait_retry_policy(override)
+      coerce_policy(override) || RetryPolicy.wait_default
+    end
+    # Normalize a retry-policy value: an Array becomes a composite; a RetryPolicy
+    # or CompositeRetryPolicy passes through; nil stays nil.
+    def coerce_policy(value)
+      value.is_a?(Array) ? RetryPolicy.compose(*value) : value
+    end
+    # JSON metadata key holding the per-error attempt counts of a composite
+    # policy, keyed by the matched policy's declared errors (RetryPolicy#budget_key).
+    RETRY_COUNTS_KEY = "retry_counts"
+    # Increment the matched policy's slot in the log's retry-count map and return
+    # the new count. Reassigns `metadata` so the JSON column is marked dirty.
+    def bump_retry_count!(log, policy_key)
+      meta = log.metadata || {}
+      counts = meta[RETRY_COUNTS_KEY] || {}
+      counts[policy_key] = counts[policy_key].to_i + 1
+      meta[RETRY_COUNTS_KEY] = counts
+      log.update!(metadata: meta)
+      counts[policy_key]
+    end
+    # Record the continuation this job intends to enqueue. It is NOT published
+    # here: publishing while the lock is still held would let another worker pick
+    # the continuation up and lose the lock-acquisition race against this
+    # still-locked job. The executor flushes it in `ensure`, after release_lock
+    # (see #flush_continuation!). At most one continuation is recorded per job run
+    # (every primitive records one then halts, or falls through the
+    # workflow-retry rescue); recording again overwrites the previous one.
+    def enqueue_continuation(wait:, **kwargs)
+      @continuation = {wait: wait, kwargs: kwargs}
+    end
+    # Publish the recorded continuation, if any. Called from `ensure` only after
+    # the lock row has been updated to released, so even a zero-delay continuation
+    # finds the lock free.
+    def flush_continuation!
+      return unless @continuation
+      self.class
+        .set(wait: @continuation[:wait])
+        .perform_later(@workflow.key, **@continuation[:kwargs])
     end
     def halt_execution!

data/lib/chrono_forge/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module ChronoForge
-  VERSION = "0.9.1"
+  VERSION = "0.10.0"
 end

data/lib/chrono_forge/workflow.rb CHANGED Viewed

@@ -12,6 +12,7 @@
 #  kwargs       :json             not null
 #  options      :json             not null
 #  locked_at    :datetime
+#  parent_execution_log_id :integer
 #  started_at   :datetime
 #  state        :integer          default("idle"), not null
 #  created_at   :datetime         not null
@@ -19,7 +20,10 @@
 #
 # Indexes
 #
-#  index_chrono_forge_workflows_on_key  (key) UNIQUE
+#  index_chrono_forge_workflows_on_key                         (key)
+#  index_chrono_forge_workflows_on_job_class_and_key           (job_class,key) UNIQUE
+#  index_chrono_forge_workflows_on_parent_execution_log_and_st (parent_execution_log_id,state)
+#  index_chrono_forge_workflows_on_state_and_completed_at      (state,completed_at)
 #
 module ChronoForge
   class Workflow < ApplicationRecord()
@@ -28,6 +32,11 @@ module ChronoForge
     has_many :execution_logs, dependent: :destroy
     has_many :error_logs, dependent: :destroy
+    belongs_to :parent_execution_log,
+      class_name: "ChronoForge::ExecutionLog",
+      inverse_of: :spawned_workflows,
+      optional: true
     enum :state, %i[
       idle
       running

data/lib/generators/chrono_forge/migration_actions.rb CHANGED Viewed

@@ -18,6 +18,7 @@ module ChronoForge
         install_chrono_forge
         add_chrono_forge_workflow_state_index
         add_chrono_forge_error_log_step_context
+        add_chrono_forge_parent_execution_log
       ].freeze
       def copy_chrono_forge_migrations

data/lib/generators/chrono_forge/templates/add_chrono_forge_parent_execution_log.rb ADDED Viewed

@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+# Adds chrono_forge_workflows.parent_execution_log_id: the execution log that
+# spawned a workflow (for branches, the branch$<name> log). Deliberately generic
+# so any future step that spawns sub-workflows can reuse it. The composite
+# [parent_execution_log_id, state] index makes the merge completion probe and the
+# dropped-job re-kick index-only at hundreds of thousands of children.
+#
+# Shipped standalone (matching add_chrono_forge_workflow_state_index) so existing
+# installs pick it up via `rails generate chrono_forge:upgrade`.
+class AddChronoForgeParentExecutionLog < ActiveRecord::Migration[7.1]
+  disable_ddl_transaction!
+  def change
+    add_column :chrono_forge_workflows, :parent_execution_log_id, parent_log_fk_type,
+      null: true, if_not_exists: true
+    add_index :chrono_forge_workflows, %i[parent_execution_log_id state],
+      if_not_exists: true, **chrono_forge_index_algorithm
+  end
+  private
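+  # Mirror the primary-key type of chrono_forge_workflows.id (:uuid, else :bigint).
+  # The new column points at an execution log, so this assumes every ChronoForge
+  # table in an install shares one primary-key type — TODO confirm.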
+  def parent_log_fk_type
+    id_col = connection.columns(:chrono_forge_workflows).find { |c| c.name == "id" }
+    (id_col&.type == :uuid) ? :uuid : :bigint
+  end
+  def chrono_forge_index_algorithm
+    if connection.adapter_name.to_s.downcase.include?("postgresql")
+      {algorithm: :concurrently}
+    else
+      {}
+    end
+  end
+end