chrono_forge 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +22 -0
  3. data/README.md +305 -44
  4. data/docs/superpowers/plans/2026-06-25-chrono_forge-dashboard.md +1748 -0
  5. data/docs/superpowers/plans/2026-06-25-chrono_forge-dashboard.md.tasks.json +17 -0
  6. data/docs/superpowers/plans/2026-06-25-composite-retry-policies.md +930 -0
  7. data/docs/superpowers/plans/2026-06-25-composite-retry-policies.md.tasks.json +54 -0
  8. data/docs/superpowers/plans/2026-06-25-reserved-kwarg-guard.md +241 -0
  9. data/docs/superpowers/plans/2026-06-25-reserved-kwarg-guard.md.tasks.json +12 -0
  10. data/docs/superpowers/plans/2026-06-26-branches-spawn-merge.md +1378 -0
  11. data/docs/superpowers/plans/2026-06-26-branches-spawn-merge.md.tasks.json +67 -0
  12. data/docs/superpowers/plans/2026-06-26-deferral-continuation-race-and-catchup.md +709 -0
  13. data/docs/superpowers/plans/2026-06-26-deferral-continuation-race-and-catchup.md.tasks.json +19 -0
  14. data/docs/superpowers/specs/2026-06-03-unified-retry-policy-design.md +226 -0
  15. data/docs/superpowers/specs/2026-06-25-chrono_forge-dashboard-design.md +190 -0
  16. data/docs/superpowers/specs/2026-06-25-composite-retry-policies-design.md +228 -0
  17. data/docs/superpowers/specs/2026-06-25-reserved-kwarg-guard-design.md +169 -0
  18. data/docs/superpowers/specs/2026-06-25-spawn-merge-branches-design.md +468 -0
  19. data/docs/superpowers/specs/2026-06-26-dashboard-branch-view-design.md +142 -0
  20. data/docs/superpowers/specs/2026-06-26-deferral-continuation-race-and-catchup-design.md +265 -0
  21. data/lib/chrono_forge/branch_merge_job.rb +138 -0
  22. data/lib/chrono_forge/branch_probe.rb +26 -0
  23. data/lib/chrono_forge/cleanup.rb +6 -0
  24. data/lib/chrono_forge/execution_log.rb +6 -0
  25. data/lib/chrono_forge/executor/composite_retry_policy.rb +47 -0
  26. data/lib/chrono_forge/executor/methods/branch.rb +185 -0
  27. data/lib/chrono_forge/executor/methods/durably_execute.rb +21 -19
  28. data/lib/chrono_forge/executor/methods/durably_repeat.rb +118 -25
  29. data/lib/chrono_forge/executor/methods/merge_branches.rb +83 -0
  30. data/lib/chrono_forge/executor/methods/wait.rb +2 -4
  31. data/lib/chrono_forge/executor/methods/wait_until.rb +25 -25
  32. data/lib/chrono_forge/executor/methods/workflow_states.rb +16 -0
  33. data/lib/chrono_forge/executor/methods.rb +2 -0
  34. data/lib/chrono_forge/executor/retry_policy.rb +111 -0
  35. data/lib/chrono_forge/executor.rb +216 -28
  36. data/lib/chrono_forge/version.rb +1 -1
  37. data/lib/chrono_forge/workflow.rb +10 -1
  38. data/lib/generators/chrono_forge/migration_actions.rb +1 -0
  39. data/lib/generators/chrono_forge/templates/add_chrono_forge_parent_execution_log.rb +38 -0
  40. metadata +42 -5
  41. data/lib/chrono_forge/executor/retry_strategy.rb +0 -29
@@ -14,7 +14,11 @@ module ChronoForge
14
14
  # The method should return a truthy value when the condition is met.
15
15
  # @param timeout [ActiveSupport::Duration] Maximum time to wait for condition (default: 1.hour)
16
16
  # @param check_interval [ActiveSupport::Duration] Time between condition checks (default: 15.minutes)
17
- # @param retry_on [Array<Class>] Exception classes that should trigger retries instead of failures
17
+ # @param retry_policy [RetryPolicy, nil] Policy governing errors raised *while
18
+ # evaluating the condition* (not the poll cadence). When nil, uses
19
+ # RetryPolicy.wait_default, which retries nothing — a raised condition fails
20
+ # fast. Pass a policy with `retry_on:` to opt specific errors into retrying.
21
+ # Note: unlike steps, wait_until does NOT inherit the class-level default.
18
22
  #
19
23
  # @return [true] When the condition is met
20
24
  #
@@ -31,7 +35,7 @@ module ChronoForge
31
35
  # wait_until :database_migration_complete?,
32
36
  # timeout: 2.hours,
33
37
  # check_interval: 30.seconds,
34
- # retry_on: [ActiveRecord::ConnectionNotEstablished, Net::TimeoutError]
38
+ # retry_policy: RetryPolicy.new(retry_on: [ActiveRecord::ConnectionNotEstablished, Net::ReadTimeout])
35
39
  #
36
40
  # @example Waiting for external system
37
41
  # def third_party_service_ready?
@@ -42,7 +46,7 @@ module ChronoForge
42
46
  # wait_until :third_party_service_ready?,
43
47
  # timeout: 1.hour,
44
48
  # check_interval: 2.minutes,
45
- # retry_on: [Net::TimeoutError, Net::HTTPClientException]
49
+ # retry_policy: RetryPolicy.new(retry_on: [Net::ReadTimeout, Net::HTTPClientException])
46
50
  #
47
51
  # @example Waiting for file processing
48
52
  # def file_processing_complete?
@@ -60,7 +64,7 @@ module ChronoForge
60
64
  # The condition method is called on each check interval:
61
65
  # - Should return truthy value when condition is met
62
66
  # - Should return falsy value when condition is not yet met
63
- # - Can raise exceptions that will be handled based on retry_on parameter
67
+ # - Can raise exceptions that will be handled based on the retry_policy
64
68
  #
65
69
  # === Timeout Handling
66
70
  # - Timeout is calculated from the first execution start time
@@ -69,9 +73,10 @@ module ChronoForge
69
73
  #
70
74
  # === Error Handling
71
75
  # - Exceptions during condition evaluation are caught and logged
72
- # - If exception class is in retry_on array, it triggers retry with exponential backoff
73
- # - Other exceptions cause immediate failure with ExecutionFailedError
74
- # - Retry backoff: 2^attempt seconds (capped at 2^5 = 32 seconds)
76
+ # - If the retry_policy deems the error retryable, it triggers a retry with the
77
+ # policy's backoff
78
+ # - Otherwise the error causes immediate failure with ExecutionFailedError
79
+ # - Backoff is governed by the resolved RetryPolicy
75
80
  #
76
81
  # === Persistence and Resumability
77
82
  # - Wait state is persisted in execution logs with metadata
@@ -85,7 +90,8 @@ module ChronoForge
85
90
  # - Tracks attempt count and execution times
86
91
  # - Records final result (true for success, :timed_out for timeout)
87
92
  #
88
- def wait_until(condition, timeout: 1.hour, check_interval: 15.minutes, retry_on: [])
93
+ def wait_until(condition, timeout: 1.hour, check_interval: 15.minutes, retry_policy: nil)
94
+ policy = wait_retry_policy(retry_policy)
89
95
  validate_step_name_segment!(condition)
90
96
  step_name = "wait_until$#{condition}"
91
97
  # Find or create execution log
@@ -117,16 +123,15 @@ module ChronoForge
117
123
  Rails.logger.error { "Error evaluating condition #{condition}: #{e.message}" }
118
124
  self.class::ExecutionTracker.track_error(workflow, e, execution_log: execution_log)
119
125
 
120
- # Optional retry logic
121
- if retry_on.include?(e.class)
122
- # Reschedule with exponential backoff
123
- backoff = (2**[execution_log.attempts, 5].min).seconds
124
-
125
- self.class
126
- .set(wait: backoff)
127
- .perform_later(
128
- @workflow.key
129
- )
126
+ # Optional retry logic for errors raised while evaluating the
127
+ # condition. The poll cadence (check_interval/timeout) below is
128
+ # separate and unaffected by the retry policy.
129
+ backoff = policy.retry_backoff(e, attempts: execution_log.attempts) do |policy_key|
130
+ bump_retry_count!(execution_log, policy_key)
131
+ end
132
+ if backoff
133
+ # Reschedule with the policy's backoff (published after lock release).
134
+ enqueue_continuation(wait: backoff)
130
135
 
131
136
  # Halt current execution
132
137
  halt_execution!
@@ -167,13 +172,8 @@ module ChronoForge
167
172
  raise error
168
173
  end
169
174
 
170
- # Reschedule with delay
171
- self.class
172
- .set(wait: check_interval)
173
- .perform_later(
174
- @workflow.key,
175
- wait_condition: condition
176
- )
175
+ # Reschedule the poll (published after lock release).
176
+ enqueue_continuation(wait: check_interval, wait_condition: condition)
177
177
 
178
178
  # Halt current execution
179
179
  halt_execution!
@@ -48,6 +48,8 @@ module ChronoForge
48
48
  # - Safe to call multiple times without side effects
49
49
  #
50
50
  def complete_workflow!
51
+ enforce_branch_joins!
52
+
51
53
  # Create an execution log for workflow completion
52
54
  execution_log = find_or_create_execution_log!("$workflow_completion$") do |log|
53
55
  log.started_at = Time.current
@@ -80,6 +82,20 @@ module ChronoForge
80
82
  end
81
83
  end
82
84
 
85
+ # Every branch must be joined: automerge branches join inline at their
86
+ # block's close (removing themselves from @open_branches); explicitly
87
+ # awaited branches are removed by merge_branches. Anything still in
88
+ # @open_branches here was opened but never joined — fail fast.
89
+ def enforce_branch_joins!
90
+ leftover = (@open_branches || {}).keys
91
+ return if leftover.empty?
92
+
93
+ raise UnmergedBranchError,
94
+ "branch(es) #{leftover.join(", ")} were opened but never merged. " \
95
+ "Add `merge_branches #{leftover.map { |n| ":#{n}" }.join(", ")}` " \
96
+ "or open with `branch(..., automerge: true)`."
97
+ end
98
+
83
99
  # Marks a workflow as failed due to an unrecoverable error.
84
100
  #
85
101
  # This method provides durable workflow failure tracking with proper state
@@ -6,6 +6,8 @@ module ChronoForge
6
6
  include Methods::ContinueIf
7
7
  include Methods::DurablyExecute
8
8
  include Methods::DurablyRepeat
9
+ include Methods::Branch
10
+ include Methods::MergeBranches
9
11
  include Methods::WorkflowStates
10
12
  end
11
13
  end
@@ -0,0 +1,111 @@
1
+ module ChronoForge
2
+ module Executor
3
+ # A single, unified description of retry behavior shared by every retry site
4
+ # (workflow-level uncaught errors, durably_execute, durably_repeat, and
5
+ # wait_until's condition errors).
6
+ #
7
+ # It answers the only two questions a retry site ever asks:
8
+ # - retryable?(error, attempts) — should this failure be retried?
9
+ # - backoff_for(attempts) — how long until the next attempt?
10
+ #
11
+ # `attempts` is always the 1-based count of attempts made so far, *including*
12
+ # the one that just failed (matching ExecutionLog#attempts). So on the first
13
+ # failure `attempts == 1`.
14
+ class RetryPolicy
15
+ attr_reader :max_attempts, :base, :cap, :jitter, :retry_on
16
+
17
+ # @param max_attempts [Integer, nil] cap on total attempts; nil = no count
18
+ # cap (bounded elsewhere, e.g. wait_until's timeout)
19
+ # @param base [Numeric, ActiveSupport::Duration] delay of the first retry
20
+ # @param cap [Numeric, ActiveSupport::Duration] ceiling for a single delay
21
+ # @param jitter [Boolean] apply equal jitter to spread retries
22
+ # @param retry_on [Array<Class>, nil] nil = retry any StandardError;
23
+ # an array = retry only those classes (and subclasses); [] = retry nothing
24
+ def initialize(max_attempts: 3, base: 1, cap: 30, jitter: true, retry_on: nil)
25
+ @max_attempts = max_attempts
26
+ @base = base
27
+ @cap = cap
28
+ @jitter = jitter
29
+ @retry_on = retry_on
30
+ end
31
+
32
+ def retryable?(error, attempts)
33
+ within_attempt_cap?(attempts) && retryable_error?(error)
34
+ end
35
+
36
+ # Equal jitter: half the computed delay plus a random portion of the other
37
+ # half. Computed once at re-enqueue time and never persisted, so the
38
+ # randomness does not affect replay determinism.
39
+ def backoff_for(attempts)
40
+ exponent = [attempts - 1, 0].max
41
+ delay = [cap.to_f, base.to_f * (2**exponent)].min
42
+ delay = (delay / 2) + rand(0.0..(delay / 2)) if jitter
43
+ delay.seconds
44
+ end
45
+
46
+ # Public routing predicate: would this policy handle this error at all?
47
+ # (independent of the attempt cap). nil retry_on = any StandardError;
48
+ # [] = nothing; a list = those classes and their subclasses.
49
+ def matches?(error)
50
+ retryable_error?(error)
51
+ end
52
+
53
+ # Single-call decision used by every retry site: the backoff Duration to
54
+ # retry, or nil to stop. A plain policy uses `attempts` and ignores any
55
+ # block (the block exists only so a CompositeRetryPolicy can supply a
56
+ # per-error count — see CompositeRetryPolicy#retry_backoff).
57
+ def retry_backoff(error, attempts:)
58
+ retryable?(error, attempts) ? backoff_for(attempts) : nil
59
+ end
60
+
61
+ # Stable per-policy identifier derived from the errors this policy
62
+ # *declares* (its retry_on), not the error thrown. Inside a composite this
63
+ # keys the policy's attempt budget, so the budget is shared across every
64
+ # class the policy lists (and their subclasses) and is independent of the
65
+ # policy's position — reordering the composite does not reset counts. A
66
+ # catch-all (retry_on: nil) keys "*".
67
+ def budget_key
68
+ retry_on.nil? ? "*" : retry_on.map(&:name).sort.join(",")
69
+ end
70
+
71
+ def self.step_default
72
+ new(max_attempts: 3, base: 1, cap: 30, jitter: true, retry_on: nil)
73
+ end
74
+
75
+ # Workflow-level (uncaught) errors retry the whole workflow from the top
76
+ # (replaying completed steps). They cover two populations the default can't
77
+ # distinguish: transient infra blips — worth riding out — and deterministic
78
+ # bugs, where every replay is waste. 10 attempts gives a tolerant window of
79
+ # up to ~8.5 min (≈4 min typical, since equal jitter puts each wait in
80
+ # [d/2, d]) — enough for a DB failover or deploy restart — without dragging
81
+ # out the bug case; cap (600s / 10 min) bounds any single backoff and only
82
+ # binds if a caller configures more attempts.
83
+ def self.workflow_default
84
+ new(max_attempts: 10, base: 1, cap: 600, jitter: true, retry_on: nil)
85
+ end
86
+
87
+ def self.wait_default
88
+ new(max_attempts: nil, base: 1, cap: 30, jitter: true, retry_on: [])
89
+ end
90
+
91
+ # Build a composite policy from an ordered list of RetryPolicy objects.
92
+ def self.compose(*policies)
93
+ CompositeRetryPolicy.new(policies)
94
+ end
95
+
96
+ private
97
+
98
+ def within_attempt_cap?(attempts)
99
+ max_attempts.nil? || attempts < max_attempts
100
+ end
101
+
102
+ def retryable_error?(error)
103
+ if retry_on.nil?
104
+ error.is_a?(StandardError)
105
+ else
106
+ retry_on.any? { |klass| error.is_a?(klass) }
107
+ end
108
+ end
109
+ end
110
+ end
111
+ end
@@ -14,46 +14,105 @@ module ChronoForge
14
14
 
15
15
  class InvalidStepName < NotExecutableError; end
16
16
 
17
+ # spawn/spawn_each called outside a branch block. NotExecutableError so it
18
+ # propagates (fail-fast on a programming error) rather than being retried.
19
+ class NotInBranchError < NotExecutableError; end
20
+
21
+ # A branch was opened but neither merged via merge_branches nor declared
22
+ # automerge: true. Raised at the completion gate. Fail-fast (not retried).
23
+ class UnmergedBranchError < NotExecutableError; end
24
+
25
+ # merge_branches given a name that was never opened as a branch this pass.
26
+ # NotExecutableError so it propagates (fail-fast) instead of being retried.
27
+ class UnknownBranchError < NotExecutableError; end
28
+
17
29
  # "$" separates the segments of a step name (e.g. "durably_repeat$name$ts").
18
30
  # User-supplied names/methods must not contain it.
19
31
  STEP_NAME_DELIMITER = "$"
20
32
 
33
+ # Keyword args ChronoForge threads through job args internally. Users must
34
+ # not pass these to perform_now/perform_later; the framework injects them
35
+ # via `.set(...)` continuations, whose ConfiguredJob proxy bypasses the
36
+ # class-level guard in `prepended` below.
37
+ RESERVED_KWARGS = %i[attempt retry_counts retry_workflow].freeze
38
+
21
39
  include Methods
22
40
 
23
41
  # Add class methods
24
42
  def self.prepended(base)
43
+ # Class-wide default retry policy, inherited by subclasses. Set via the
44
+ # `retry_policy` DSL below; nil means "use the per-site built-in default".
45
+ base.class_attribute :default_retry_policy, instance_accessor: false, default: nil
46
+
25
47
  class << base
26
- # Enforce expected signature for perform_now with key as first arg and keywords after
27
- def perform_now(key, **kwargs)
28
- if !key.is_a?(String)
29
- raise ArgumentError, "Workflow key must be a string as the first argument"
30
- end
31
- super
48
+ # Public enqueue contract: exactly one positional (`key`) plus keywords.
49
+ # Reserved internal kwargs (RESERVED_KWARGS) are rejected here; the
50
+ # framework injects them only via `.set(...)` continuations, whose
51
+ # ActiveJob ConfiguredJob proxy bypasses these class-level overrides.
52
+ def perform_now(key, *extra, **kwargs)
53
+ __validate_enqueue!(key, extra, kwargs)
54
+ super(key, **kwargs)
32
55
  end
33
56
 
34
- # Enforce expected signature for perform_later with key as first arg and keywords after
35
- def perform_later(key, **kwargs)
36
- if !key.is_a?(String)
37
- raise ArgumentError, "Workflow key must be a string as the first argument"
38
- end
39
- super
57
+ def perform_later(key, *extra, **kwargs)
58
+ __validate_enqueue!(key, extra, kwargs)
59
+ super(key, **kwargs)
60
+ end
61
+
62
+ # Re-run a failed/stalled workflow. Routes through `.set(...)` so the
63
+ # reserved `retry_workflow: true` flag reaches the instance perform
64
+ # without tripping the public guard above.
65
+ def retry_now(key, **kwargs)
66
+ __validate_enqueue!(key, [], kwargs)
67
+ set.perform_now(key, retry_workflow: true, **kwargs)
40
68
  end
41
69
 
42
- # Add retry_now class method that calls perform_now with retry_workflow: true
43
- def retry_now(key, **)
44
- perform_now(key, retry_workflow: true, **)
70
+ def retry_later(key, **kwargs)
71
+ __validate_enqueue!(key, [], kwargs)
72
+ set.perform_later(key, retry_workflow: true, **kwargs)
45
73
  end
46
74
 
47
- # Add retry_later class method that calls perform_later with retry_workflow: true
48
- def retry_later(key, **)
49
- perform_later(key, retry_workflow: true, **)
75
+ # Class-level DSL to set this workflow's default retry policy. Applies to
76
+ # workflow-level retries and to steps without a per-call override.
77
+ # Positional RetryPolicy objects build a composite (per-error budgets);
78
+ # keyword options build a single RetryPolicy. The two forms are mutually
79
+ # exclusive.
80
+ def retry_policy(*policies, **opts)
81
+ if policies.any? && opts.any?
82
+ raise ArgumentError, "retry_policy takes either positional policies or keyword options, not both"
83
+ end
84
+
85
+ self.default_retry_policy =
86
+ policies.any? ? RetryPolicy.compose(*policies) : RetryPolicy.new(**opts)
87
+ end
88
+
89
+ private
90
+
91
+ def __validate_enqueue!(key, extra, kwargs)
92
+ unless key.is_a?(String)
93
+ raise ArgumentError, "Workflow key must be a string as the first argument"
94
+ end
95
+ unless extra.empty?
96
+ raise ArgumentError,
97
+ "ChronoForge workflows accept only `key` positionally; pass " \
98
+ "everything else as keywords (got #{extra.size} extra positional arg(s))"
99
+ end
100
+ reserved = kwargs.keys & RESERVED_KWARGS
101
+ if reserved.any?
102
+ raise ArgumentError,
103
+ "#{reserved.join(", ")} #{reserved.one? ? "is a reserved" : "are reserved"} " \
104
+ "ChronoForge #{reserved.one? ? "keyword" : "keywords"} and cannot be passed to perform_now/perform_later"
105
+ end
50
106
  end
51
107
  end
52
108
  end
53
109
 
54
- def perform(key, attempt: 0, retry_workflow: false, options: {}, **kwargs)
55
- # Prevent excessive retries
56
- if attempt >= self.class::RetryStrategy.max_attempts
110
+ def perform(key, attempt: 0, retry_counts: {}, retry_workflow: false, options: {}, **kwargs)
111
+ # Safety net: prevent re-running a workflow whose attempts are exhausted
112
+ # (e.g. a stale job left in the queue). The normal exhaustion path fails the
113
+ # workflow from the rescue below before this is ever reached.
114
+ policy = workflow_retry_policy
115
+ if policy.max_attempts && attempt >= policy.max_attempts
57
116
  Rails.logger.error { "ChronoForge:#{self.class} max attempts reached for job workflow(#{key})" }
58
117
  return
59
118
  end
@@ -101,16 +160,39 @@ module ChronoForge
101
160
  Rails.logger.error { "ChronoForge:#{self.class}(#{key}) workflow execution failed" }
102
161
  error_log = self.class::ExecutionTracker.track_error(workflow, e, attempt: attempt)
103
162
 
104
- # Retry if applicable
105
- if should_retry?(e, attempt)
106
- self.class::RetryStrategy.schedule_retry(workflow, attempt: attempt)
163
+ # Retry if applicable. `attempt` is a 0-based index, so the count of
164
+ # attempts made so far (including this one) is attempt + 1. For a
165
+ # composite policy the per-error budget lives in `retry_counts` (keyed by
166
+ # the matched policy's budget_key) and rides along the job args, mirroring
167
+ # how `attempt` is threaded — there is no execution log at this level.
168
+ attempts_made = attempt + 1
169
+ backoff = policy.retry_backoff(e, attempts: attempts_made) do |policy_key|
170
+ retry_counts[policy_key] = retry_counts[policy_key].to_i + 1
171
+ retry_counts[policy_key]
172
+ end
173
+ if backoff
174
+ enqueue_continuation(wait: backoff, attempt: attempts_made, retry_counts: retry_counts)
107
175
  else
108
176
  fail_workflow! error_log
109
177
  end
110
178
  ensure
111
179
  if lock_acquired # Only release lock if we acquired it
112
- context.save!
113
- self.class::LockStrategy.release_lock(job_id, workflow)
180
+ # Release the lock and publish the continuation even if context.save!
181
+ # raises — otherwise a transient save failure would leave the lock held
182
+ # (until it goes stale) AND drop the continuation, stranding the workflow
183
+ # with nothing scheduled to resume it. On a save failure the continuation
184
+ # resumes from the last persisted context, which is exactly crash
185
+ # semantics (durable steps replay).
186
+ begin
187
+ context.save!
188
+ ensure
189
+ self.class::LockStrategy.release_lock(job_id, workflow)
190
+ # Publish the continuation only now — after the lock is released — so a
191
+ # zero-delay, same-key continuation can't lose the acquire race against
192
+ # this still-locked job. If release_lock raised (this job overran and
193
+ # lost the lock), we never reach here and another job owns continuation.
194
+ flush_continuation!
195
+ end
114
196
  end
115
197
  end
116
198
  end
@@ -128,6 +210,15 @@ module ChronoForge
128
210
  workflow.kwargs = kwargs
129
211
  workflow.started_at = Time.current
130
212
  end
213
+
214
+ # Branch children are pre-inserted by their parent (dispatch_children's
215
+ # insert_all), so the creation block above never runs for them and their
216
+ # started_at stays nil. Stamp it the first time the child actually executes
217
+ # so started_at reliably means "has been picked up and run" — the
218
+ # BranchMergeJob rekick poller treats a nil started_at as a never-executed
219
+ # (dropped) child, and must not mistake a child that ran and is now parked
220
+ # on a wait (also :idle) for one that was never picked up.
221
+ @workflow.update_column(:started_at, Time.current) if @workflow.started_at.nil?
131
222
  end
132
223
 
133
224
  def setup_context!
@@ -148,11 +239,50 @@ module ChronoForge
148
239
  # which accumulate unbounded repetition logs: we touch only the rows we need,
149
240
  # never the whole set. create_or_find_by! is used only on a miss, keeping
150
241
  # creation safe if a lock takeover ever lets two executors race.
242
+ #
243
+ # Completed steps are short-circuited up front from a single bulk read (see
244
+ # #completed_step_cache) so that replaying N already-done steps costs one
245
+ # query for the whole batch rather than one SELECT each — without that, a
246
+ # workflow with hundreds of steps pays hundreds of SELECTs on every resume.
247
+ # The cached value is a readonly, unsaved stand-in: completed steps are only
248
+ # ever read (.completed? and metadata["result"]), never written, so it needs
249
+ # no database row.
151
250
  def find_or_create_execution_log!(step_name, &)
251
+ if completed_step_cache.key?(step_name)
252
+ return ExecutionLog.new(
253
+ workflow: @workflow, step_name: step_name, state: :completed,
254
+ metadata: completed_step_cache[step_name]
255
+ ).tap(&:readonly!)
256
+ end
257
+
152
258
  ExecutionLog.find_by(workflow: @workflow, step_name: step_name) ||
153
259
  ExecutionLog.create_or_find_by!(workflow: @workflow, step_name: step_name, &)
154
260
  end
155
261
 
262
+ # One bulk read of this workflow's completed steps, mapping step_name to its
263
+ # metadata, memoized for the duration of a single replay pass.
264
+ #
265
+ # Only completed rows are loaded: they are the ones replayed steps short-
266
+ # circuit on, and once completed a step never changes, so the snapshot stays
267
+ # valid for the whole pass. Plucking (step_name, metadata) avoids
268
+ # instantiating AR objects and keeps the read portable — Rails type-casts the
269
+ # JSON metadata column to a Hash on SQLite, PostgreSQL and MySQL alike, with
270
+ # no database-specific JSON extraction.
271
+ #
272
+ # durably_repeat repetition logs (durably_repeat$<name>$<timestamp>) are
273
+ # deliberately excluded: they accumulate without bound yet are never replayed
274
+ # (durably_repeat only ever looks up its coordination log plus the single
275
+ # current repetition), so pulling them into memory would be all cost and no
276
+ # benefit. Their coordination log (durably_repeat$<name>, only two segments)
277
+ # is not matched by the pattern and is still cached.
278
+ def completed_step_cache
279
+ @completed_step_cache ||= ExecutionLog
280
+ .where(workflow: @workflow, state: ExecutionLog.states[:completed])
281
+ .where.not("step_name LIKE ?", "durably_repeat#{STEP_NAME_DELIMITER}%#{STEP_NAME_DELIMITER}%")
282
+ .pluck(:step_name, :metadata)
283
+ .to_h
284
+ end
285
+
156
286
  # Guards the user-supplied portion of a step name (a custom name, method, or
157
287
  # condition). The "$" separator is reserved for the framework's own segment
158
288
  # structure, so a user value containing it would make step names ambiguous
@@ -164,8 +294,66 @@ module ChronoForge
164
294
  "ChronoForge step name may not contain '#{STEP_NAME_DELIMITER}' (reserved separator): #{segment.inspect}"
165
295
  end
166
296
 
167
- def should_retry?(error, attempt_count)
168
- attempt_count < 3
297
+ # Retry policy for workflow-level (uncaught) errors: the class default if one
298
+ # was declared, else the workflow built-in (10 attempts, up to ~8.5 min).
299
+ # Each retry replays the whole workflow from the top.
300
+ def workflow_retry_policy
301
+ self.class.default_retry_policy || RetryPolicy.workflow_default
302
+ end
303
+
304
+ # Retry policy for a durable step: an explicit per-call override, else the
305
+ # class default, else the step built-in (short, snappy fast-fail).
306
+ def step_retry_policy(override)
307
+ coerce_policy(override) || self.class.default_retry_policy || RetryPolicy.step_default
308
+ end
309
+
310
+ # Retry policy for a wait_until condition error. Deliberately does NOT inherit
311
+ # the class default, so a class-wide "retry everything" can't silently turn
312
+ # condition-evaluation bugs into retried errors. Built-in retries nothing.
313
+ def wait_retry_policy(override)
314
+ coerce_policy(override) || RetryPolicy.wait_default
315
+ end
316
+
317
+ # Normalize a retry-policy value: an Array becomes a composite; a RetryPolicy
318
+ # or CompositeRetryPolicy passes through; nil stays nil.
319
+ def coerce_policy(value)
320
+ value.is_a?(Array) ? RetryPolicy.compose(*value) : value
321
+ end
322
+
323
+ # JSON metadata key holding the per-error attempt counts of a composite
324
+ # policy, keyed by the matched policy's declared errors (RetryPolicy#budget_key).
325
+ RETRY_COUNTS_KEY = "retry_counts"
326
+
327
+ # Increment the matched policy's slot in the log's retry-count map and return
328
+ # the new count. Reassigns `metadata` so the JSON column is marked dirty.
329
+ def bump_retry_count!(log, policy_key)
330
+ meta = log.metadata || {}
331
+ counts = meta[RETRY_COUNTS_KEY] || {}
332
+ counts[policy_key] = counts[policy_key].to_i + 1
333
+ meta[RETRY_COUNTS_KEY] = counts
334
+ log.update!(metadata: meta)
335
+ counts[policy_key]
336
+ end
337
+
338
+ # Record the continuation this job intends to enqueue. It is NOT published
339
+ # here: publishing while the lock is still held lets another worker claim it
340
+ # and lose the lock-acquisition race. The executor flushes it in `ensure`,
341
+ # after release_lock (see #flush_continuation!). At most one continuation is
342
+ # recorded per job run (every primitive records one then halts, or falls
343
+ # through the workflow-retry rescue).
344
+ def enqueue_continuation(wait:, **kwargs)
345
+ @continuation = {wait: wait, kwargs: kwargs}
346
+ end
347
+
348
+ # Publish the recorded continuation, if any. Called from `ensure` only after
349
+ # the lock row has been updated to released, so even a zero-delay continuation
350
+ # finds the lock free.
351
+ def flush_continuation!
352
+ return unless @continuation
353
+
354
+ self.class
355
+ .set(wait: @continuation[:wait])
356
+ .perform_later(@workflow.key, **@continuation[:kwargs])
169
357
  end
170
358
 
171
359
  def halt_execution!
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ChronoForge
4
- VERSION = "0.9.1"
4
+ VERSION = "0.10.0"
5
5
  end
@@ -12,6 +12,7 @@
12
12
  # kwargs :json not null
13
13
  # options :json not null
14
14
  # locked_at :datetime
15
+ # parent_execution_log_id :integer
15
16
  # started_at :datetime
16
17
  # state :integer default("idle"), not null
17
18
  # created_at :datetime not null
@@ -19,7 +20,10 @@
19
20
  #
20
21
  # Indexes
21
22
  #
22
- # index_chrono_forge_workflows_on_key (key) UNIQUE
23
+ # index_chrono_forge_workflows_on_key (key)
24
+ # index_chrono_forge_workflows_on_job_class_and_key (job_class,key) UNIQUE
25
+ # index_chrono_forge_workflows_on_parent_execution_log_and_st (parent_execution_log_id,state)
26
+ # index_chrono_forge_workflows_on_state_and_completed_at (state,completed_at)
23
27
  #
24
28
  module ChronoForge
25
29
  class Workflow < ApplicationRecord()
@@ -28,6 +32,11 @@ module ChronoForge
28
32
  has_many :execution_logs, dependent: :destroy
29
33
  has_many :error_logs, dependent: :destroy
30
34
 
35
+ belongs_to :parent_execution_log,
36
+ class_name: "ChronoForge::ExecutionLog",
37
+ inverse_of: :spawned_workflows,
38
+ optional: true
39
+
31
40
  enum :state, %i[
32
41
  idle
33
42
  running
@@ -18,6 +18,7 @@ module ChronoForge
18
18
  install_chrono_forge
19
19
  add_chrono_forge_workflow_state_index
20
20
  add_chrono_forge_error_log_step_context
21
+ add_chrono_forge_parent_execution_log
21
22
  ].freeze
22
23
 
23
24
  def copy_chrono_forge_migrations
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Adds chrono_forge_workflows.parent_execution_log_id: the execution log that
4
+ # spawned a workflow (for branches, the branch$<name> log). Deliberately generic
5
+ # so any future step that spawns sub-workflows can reuse it. The composite
6
+ # [parent_execution_log_id, state] index makes the merge completion probe and the
7
+ # dropped-job re-kick index-only at hundreds of thousands of children.
8
+ #
9
+ # Shipped standalone (matching add_chrono_forge_workflow_state_index) so existing
10
+ # installs pick it up via `rails generate chrono_forge:upgrade`.
11
+ class AddChronoForgeParentExecutionLog < ActiveRecord::Migration[7.1]
12
+ disable_ddl_transaction!
13
+
14
+ def change
15
+ add_column :chrono_forge_workflows, :parent_execution_log_id, parent_log_fk_type,
16
+ null: true, if_not_exists: true
17
+
18
+ add_index :chrono_forge_workflows, %i[parent_execution_log_id state],
19
+ if_not_exists: true, **chrono_forge_index_algorithm
20
+ end
21
+
22
+ private
23
+
24
+ # Match the type of chrono_forge_workflows.id so the FK lines up on both bigint
25
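+ # and uuid installs. NOTE: the column references chrono_forge_execution_logs.id, so this assumes both tables share the same id type.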
26
+ def parent_log_fk_type
27
+ id_col = connection.columns(:chrono_forge_workflows).find { |c| c.name == "id" }
28
+ (id_col&.type == :uuid) ? :uuid : :bigint
29
+ end
30
+
31
+ def chrono_forge_index_algorithm
32
+ if connection.adapter_name.to_s.downcase.include?("postgresql")
33
+ {algorithm: :concurrently}
34
+ else
35
+ {}
36
+ end
37
+ end
38
+ end