chrono_forge 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +22 -0
  3. data/README.md +305 -44
  4. data/docs/superpowers/plans/2026-06-25-chrono_forge-dashboard.md +1748 -0
  5. data/docs/superpowers/plans/2026-06-25-chrono_forge-dashboard.md.tasks.json +17 -0
  6. data/docs/superpowers/plans/2026-06-25-composite-retry-policies.md +930 -0
  7. data/docs/superpowers/plans/2026-06-25-composite-retry-policies.md.tasks.json +54 -0
  8. data/docs/superpowers/plans/2026-06-25-reserved-kwarg-guard.md +241 -0
  9. data/docs/superpowers/plans/2026-06-25-reserved-kwarg-guard.md.tasks.json +12 -0
  10. data/docs/superpowers/plans/2026-06-26-branches-spawn-merge.md +1378 -0
  11. data/docs/superpowers/plans/2026-06-26-branches-spawn-merge.md.tasks.json +67 -0
  12. data/docs/superpowers/plans/2026-06-26-deferral-continuation-race-and-catchup.md +709 -0
  13. data/docs/superpowers/plans/2026-06-26-deferral-continuation-race-and-catchup.md.tasks.json +19 -0
  14. data/docs/superpowers/specs/2026-06-03-unified-retry-policy-design.md +226 -0
  15. data/docs/superpowers/specs/2026-06-25-chrono_forge-dashboard-design.md +190 -0
  16. data/docs/superpowers/specs/2026-06-25-composite-retry-policies-design.md +228 -0
  17. data/docs/superpowers/specs/2026-06-25-reserved-kwarg-guard-design.md +169 -0
  18. data/docs/superpowers/specs/2026-06-25-spawn-merge-branches-design.md +468 -0
  19. data/docs/superpowers/specs/2026-06-26-dashboard-branch-view-design.md +142 -0
  20. data/docs/superpowers/specs/2026-06-26-deferral-continuation-race-and-catchup-design.md +265 -0
  21. data/lib/chrono_forge/branch_merge_job.rb +138 -0
  22. data/lib/chrono_forge/branch_probe.rb +26 -0
  23. data/lib/chrono_forge/cleanup.rb +6 -0
  24. data/lib/chrono_forge/execution_log.rb +6 -0
  25. data/lib/chrono_forge/executor/composite_retry_policy.rb +47 -0
  26. data/lib/chrono_forge/executor/methods/branch.rb +185 -0
  27. data/lib/chrono_forge/executor/methods/durably_execute.rb +21 -19
  28. data/lib/chrono_forge/executor/methods/durably_repeat.rb +118 -25
  29. data/lib/chrono_forge/executor/methods/merge_branches.rb +83 -0
  30. data/lib/chrono_forge/executor/methods/wait.rb +2 -4
  31. data/lib/chrono_forge/executor/methods/wait_until.rb +25 -25
  32. data/lib/chrono_forge/executor/methods/workflow_states.rb +16 -0
  33. data/lib/chrono_forge/executor/methods.rb +2 -0
  34. data/lib/chrono_forge/executor/retry_policy.rb +111 -0
  35. data/lib/chrono_forge/executor.rb +216 -28
  36. data/lib/chrono_forge/version.rb +1 -1
  37. data/lib/chrono_forge/workflow.rb +10 -1
  38. data/lib/generators/chrono_forge/migration_actions.rb +1 -0
  39. data/lib/generators/chrono_forge/templates/add_chrono_forge_parent_execution_log.rb +38 -0
  40. metadata +42 -5
  41. data/lib/chrono_forge/executor/retry_strategy.rb +0 -29
@@ -0,0 +1,185 @@
1
+ module ChronoForge
2
+ module Executor
3
+ module Methods
4
+ module Branch
5
# Opens a named branch: a durable fan-out step tracked by the
# `branch$<name>` execution log. `spawn` / `spawn_each` calls made inside
# the block dispatch child workflows right away; once the block returns the
# branch log is marked completed (the branch is "sealed"). The method does
# not wait for the children — joining is done by merge_branches, either
# explicitly later or inline when `automerge` is set.
#
# @param name [#to_s] branch name (validated as a step-name segment)
# @param automerge [Boolean] when true, call merge_branches(name) right
#   after the block closes (and on every later pass)
# @yield the block that spawns children; skipped when the branch log is
#   already completed
# @return the `name` argument, unchanged
# @raise [ArgumentError] when no block is given or a branch is already open
def branch(name, automerge: false)
  raise ArgumentError, "branch requires a block" unless block_given?
  raise ArgumentError, "branch blocks cannot be nested" if @current_branch
  validate_step_name_segment!(name)

  branch_key = name.to_s
  step = "branch$#{branch_key}"
  branch_log = find_or_create_execution_log!(step) { |entry| entry.started_at = Time.current }

  # A sealed branch log can come back as a readonly stand-in without an id;
  # look the real id up so the registry/merge can scope children to it.
  persisted_id = branch_log.id
  persisted_id ||= ExecutionLog.where(workflow: @workflow, step_name: step).pick(:id)
  @open_branches ||= {}
  @open_branches[branch_key] = {automerge: automerge, log_id: persisted_id}

  # ---- THE single most important correctness/performance property ----
  # A SEALED branch never runs its block again, so the (potentially
  # expensive) source enumeration in spawn_each is not repeated on replay.
  # Keep all dispatch behind this guard.
  unless branch_log.completed?
    begin
      @current_branch = {name: branch_key, log: branch_log}
      yield
    ensure
      # Always leave "inside a branch" mode, even when the block raises.
      @current_branch = nil
    end
    branch_log.update!(state: :completed, completed_at: Time.current)
  end

  # Inline join. This runs on every pass (sealed or not); merge_branches
  # relies on its own merge$<name> log to be idempotent and drops the
  # branch from @open_branches once the merge is complete.
  merge_branches(name) if automerge

  name
end
45
+
46
# Dispatches exactly one child workflow into the branch that is currently
# open. The child's key is "<parent key>$<branch name>$<name>".
#
# @param name [#to_s] child name (validated as a step-name segment)
# @param workflow_class [Class] workflow class to enqueue for the child
# @param kwargs [Hash] keyword arguments handed to the child workflow
# @return the `name` argument, unchanged
# @raise [NotInBranchError] when called outside a branch block
def spawn(name, workflow_class, **kwargs)
  open_branch = current_branch!
  validate_step_name_segment!(name)

  key_for_child = [@workflow.key, open_branch[:name], name].join("$")
  dispatch_children(open_branch, [[key_for_child, workflow_class, kwargs]])

  name
end
54
+
55
# Dispatch one child per item of `source`, in chunks of `of`.
#
# - ActiveRecord relations are walked with find_in_batches, resuming from
#   the primary key stored in the branch log's cursor; children are keyed by
#   record id ("<parent key>$<branch>$<name>_<id>").
# - Any other enumerable is walked with an offset cursor (`drop(n)`) and
#   children are keyed by position ("<parent key>$<branch>$<name>_<index>").
#
# Either way the source must re-enumerate identically across replays. For AR
# sources that additionally means STABLE MEMBERSHIP: dispatch resumes from
# the last primary key on crash-recovery, so a row entering the relation
# below the cursor after it passed (e.g. a mutating `where(state:)` scope)
# never gets a child — point spawn_each at a set fixed for the branch's
# lifetime.
#
# @param name [#to_s] spawn name (validated as a step-name segment)
# @param source [ActiveRecord::Relation, Enumerable] items to fan out over
# @param of [Integer] chunk size for dispatch and cursor persistence
# @yield [item] must return [WorkflowClass, kwargs] or a bare WorkflowClass
# @return the `name` argument, unchanged
# @raise [ArgumentError] when no block is given
# @raise [NotInBranchError] when called outside a branch block
# @raise [NotExecutableError] when an AR source carries an explicit order
def spawn_each(name, source, of: 1000)
  # Fail at the call site instead of with a LocalJumpError from the first
  # `yield` deep inside the iteration (or not at all for an empty source).
  raise ArgumentError, "spawn_each requires a block" unless block_given?

  cb = current_branch!
  validate_step_name_segment!(name)
  cursor = cb[:log].metadata&.dig("cursors", name.to_s) || {}
  n = cursor["n"] || 0

  if source.is_a?(ActiveRecord::Relation)
    # Iteration is by primary key so the stream re-enumerates identically
    # across replays. Reject an explicit order up front with a clear error
    # rather than letting find_in_batches complain deep in the loop.
    if source.order_values.present?
      raise NotExecutableError,
        "spawn_each iterates #{source.model_name} by primary key; remove the " \
        "explicit .order(...) (or default-scope order) from the source relation"
    end

    source.find_in_batches(batch_size: of, start: cursor["pk"]) do |records|
      entries = records.map do |record|
        klass, kw = normalize_spawn(yield(record))
        # Stable per-record key: `start:` is inclusive, so the boundary
        # record is yielded again on crash-resume. Keying by id gives it the
        # SAME key, which the on-conflict-ignore insert in dispatch_children
        # drops. Sequential indexing would duplicate it.
        ck = "#{@workflow.key}$#{cb[:name]}$#{name}_#{record.id}"
        [ck, klass, kw]
      end
      dispatch_children(cb, entries)
      advance_cursor!(cb, name, pk: records.last.id)
    end
  else
    # Skip the items already dispatched (per the persisted index cursor).
    source.drop(n).each_slice(of) do |slice|
      entries = slice.map do |item|
        klass, kw = normalize_spawn(yield(item))
        ck = "#{@workflow.key}$#{cb[:name]}$#{name}_#{n}"
        n += 1
        [ck, klass, kw]
      end
      dispatch_children(cb, entries)
      advance_cursor!(cb, name, n: n)
    end
  end

  name
end
107
+
108
+ private
109
+
110
# Returns the branch currently being built (the hash stored in
# @current_branch while a branch block runs).
#
# @raise [NotInBranchError] when no branch block is active
def current_branch!
  return @current_branch if @current_branch

  raise NotInBranchError, "spawn/spawn_each may only be called inside a branch block"
end
113
+
114
# Bulk-create child workflow rows, then bulk-enqueue their jobs.
# perform_all_later bypasses the class-level perform_later guard, so each
# entry is validated here (validate_child_enqueue!) before anything is
# written.
#
# @param cb [Hash] the open branch ({name:, log:}); its log id becomes each
#   child's parent_execution_log_id
# @param entries [Array<Array>] triples of [child_key, workflow_class, kwargs]
# @return [void]
# @raise [ArgumentError] when an entry has a non-String key or reserved kwargs
def dispatch_children(cb, entries)
  return if entries.empty?

  now = Time.current
  rows = entries.map do |child_key, klass, kwargs|
    validate_child_enqueue!(child_key, kwargs)
    {
      key: child_key, job_class: klass.to_s,
      kwargs: kwargs, options: {}, context: {},
      state: Workflow.states[:idle],
      parent_execution_log_id: cb[:log].id,
      created_at: now, updated_at: now
    }
  end
  # On-conflict-ignore makes re-dispatch (crash recovery) idempotent.
  Workflow.insert_all(rows, unique_by: [:job_class, :key])

  # Enqueue only children still :idle. On a crash-resume the boundary chunk
  # is re-dispatched; its rows already exist (insert_all ignored them) and
  # may already have run — re-enqueuing a completed/running child would only
  # raise NotExecutableError and dead-letter. Freshly inserted rows are
  # :idle, so first-time dispatch enqueues the whole batch.
  #
  # Rows are unique on (job_class, key), so idleness is matched on that same
  # pair: an idle row of a different job class that happens to share the key
  # must not make an already-run child look idle.
  idle = Workflow
    .where(
      key: rows.map { |row| row[:key] },
      job_class: rows.map { |row| row[:job_class] }.uniq,
      state: Workflow.states[:idle]
    )
    .pluck(:job_class, :key)
    .to_set
  jobs = entries.filter_map do |child_key, klass, kwargs|
    klass.new(child_key, **kwargs) if idle.include?([klass.to_s, child_key])
  end
  ActiveJob.perform_all_later(jobs) if jobs.any?
end
146
+
147
# Pre-enqueue validation for a spawned child. It mirrors the class-level
# __validate_enqueue! (executor.rb), which perform_all_later does not go
# through — keep the two in sync.
#
# @param child_key [Object] must be a String
# @param kwargs [Hash] child kwargs; keys (as symbols) must not intersect
#   RESERVED_KWARGS
# @return [nil]
# @raise [ArgumentError] on a non-String key or any reserved keyword
def validate_child_enqueue!(child_key, kwargs)
  raise ArgumentError, "child key must be a String (got #{child_key.inspect})" unless child_key.is_a?(String)

  # Keep the caller's key order (deduplicated) so the message is stable.
  clashes = kwargs.keys.map(&:to_sym).uniq.select { |kw| RESERVED_KWARGS.include?(kw) }
  return if clashes.empty?

  raise ArgumentError, "#{clashes.join(", ")} are reserved ChronoForge keywords"
end
158
+
159
# Moves a spawn_each cursor forward and persists it in the branch log's
# metadata under metadata["cursors"][<spawn name>]. spawn_each calls this
# after each dispatched chunk.
#
# @param cb [Hash] the open branch; cb[:log] holds the metadata
# @param spawn_name [#to_s] name of the spawn_each stream
# @param n [Integer, nil] running item index (plain enumerables); left
#   untouched when nil
# @param pk [Object, nil] last primary key seen (AR keyset position); left
#   untouched when nil
def advance_cursor!(cb, spawn_name, n: nil, pk: nil)
  branch_log = cb[:log]
  meta = branch_log.metadata || {}

  # Create the nested containers on demand, then update only the supplied
  # fields so an existing "n" or "pk" survives a call that sets the other.
  position = ((meta["cursors"] ||= {})[spawn_name.to_s] ||= {})
  position["n"] = n unless n.nil?
  position["pk"] = pk unless pk.nil?

  branch_log.update!(metadata: meta)
end
172
+
173
# Normalize a spawn_each block return value into [klass, kwargs].
#
# Accepted shapes are a bare workflow class or [class, kwargs]; a missing
# or nil kwargs becomes {}. Anything else is rejected here, so a malformed
# return fails with a clear message instead of a NoMethodError later when
# the kwargs are validated and splatted.
#
# @param result [Module, Array] the block's return value
# @return [Array(Module, Hash)]
# @raise [ArgumentError] when the class is not a Module or kwargs is not a Hash
def normalize_spawn(result)
  klass, kwargs = Array(result)
  kwargs = {} if kwargs.nil?

  unless klass.is_a?(Module) && kwargs.is_a?(Hash)
    raise ArgumentError,
      "spawn_each block must return a workflow class or [class, kwargs] (got #{result.inspect})"
  end

  [klass, kwargs]
end
182
+ end
183
+ end
184
+ end
185
+ end
@@ -9,19 +9,22 @@ module ChronoForge
9
9
  # execution log, ensuring idempotent behavior during workflow replays.
10
10
  #
11
11
  # @param method [Symbol] The name of the instance method to execute
12
- # @param max_attempts [Integer] Maximum retry attempts before failing (default: 3)
12
+ # @param retry_policy [RetryPolicy, nil] Per-call retry policy. When nil,
13
+ # uses the class-level `retry_policy` default, then the step built-in
14
+ # (RetryPolicy.step_default: 3 attempts, exponential backoff capped at 30s).
13
15
  # @param name [String, nil] Custom name for the execution step. Defaults to method name.
14
16
  # Used to create unique step names for execution logs.
15
17
  #
16
18
  # @return [nil]
17
19
  #
18
- # @raise [ExecutionFailedError] When the method fails after max_attempts
20
+ # @raise [ExecutionFailedError] When the method fails after the policy's max_attempts
19
21
  #
20
22
  # @example Basic usage
21
23
  # durably_execute :send_welcome_email
22
24
  #
23
- # @example With custom retry attempts
24
- # durably_execute :critical_payment_processing, max_attempts: 5
25
+ # @example With a custom retry policy
26
+ # durably_execute :critical_payment_processing,
27
+ # retry_policy: RetryPolicy.new(max_attempts: 5)
25
28
  #
26
29
  # @example With custom name for tracking
27
30
  # durably_execute :complex_calculation, name: "phase_1_calculation"
@@ -33,7 +36,7 @@ module ChronoForge
33
36
  # Rails.logger.info "Successfully uploaded file to S3"
34
37
  # end
35
38
  #
36
- # durably_execute :upload_to_s3, max_attempts: 5
39
+ # durably_execute :upload_to_s3, retry_policy: RetryPolicy.new(max_attempts: 5)
37
40
  #
38
41
  # == Behavior
39
42
  #
@@ -43,9 +46,9 @@ module ChronoForge
43
46
  # already completed, it will be skipped.
44
47
  #
45
48
  # === Retry Logic
46
- # - Failed executions are automatically retried with exponential backoff
47
- # - Backoff calculation: 2^attempt seconds (capped at 2^5 = 32 seconds)
48
- # - After max_attempts, ExecutionFailedError is raised
49
+ # - Failed executions are retried per the resolved RetryPolicy
50
+ # - Backoff and attempt cap come from that policy (see RetryPolicy)
51
+ # - After the policy's max_attempts, ExecutionFailedError is raised
49
52
  #
50
53
  # === Error Handling
51
54
  # - All exceptions except HaltExecutionFlow are caught and handled
@@ -59,7 +62,8 @@ module ChronoForge
59
62
  # - Stores error details when failures occur
60
63
  # - Enables monitoring and debugging of execution history
61
64
  #
62
- def durably_execute(method, max_attempts: 3, name: nil)
65
+ def durably_execute(method, retry_policy: nil, name: nil)
66
+ policy = step_retry_policy(retry_policy)
63
67
  validate_step_name_segment!(name || method)
64
68
  step_name = "durably_execute$#{name || method}"
65
69
  # Find or create execution log
@@ -97,16 +101,14 @@ module ChronoForge
97
101
  self.class::ExecutionTracker.track_error(workflow, e, execution_log: execution_log)
98
102
 
99
103
  # Optional retry logic
100
- if execution_log.attempts < max_attempts
101
- # Reschedule with exponential backoff
102
- backoff = (2**[execution_log.attempts, 5].min).seconds
103
-
104
- self.class
105
- .set(wait: backoff)
106
- .perform_later(
107
- @workflow.key,
108
- retry_method: method
109
- )
104
+ backoff = policy.retry_backoff(e, attempts: execution_log.attempts) do |policy_key|
105
+ bump_retry_count!(execution_log, policy_key)
106
+ end
107
+ if backoff
108
+ # Reschedule with the policy's backoff (published after lock release).
109
+ # The workflow replays on resume and skips completed steps, so the
110
+ # rescheduled run picks this step up again by its execution log.
111
+ enqueue_continuation(wait: backoff)
110
112
 
111
113
  # Halt current execution
112
114
  halt_execution!
@@ -14,10 +14,12 @@ module ChronoForge
14
14
  # @param till [Symbol, Proc] The condition to check for stopping repetition. Should return
15
15
  # true when repetition should stop. Can be a symbol for instance methods or a callable.
16
16
  # @param start_at [Time, nil] When to start the periodic task. Defaults to coordination_log.created_at + every
17
- # @param max_attempts [Integer] Maximum retry attempts per individual execution (default: 3)
17
+ # @param retry_policy [RetryPolicy, nil] Per-call retry policy for an individual
18
+ # execution. When nil, uses the class-level `retry_policy` default, then the
19
+ # step built-in (RetryPolicy.step_default: 3 attempts, backoff capped at 30s).
18
20
  # @param timeout [ActiveSupport::Duration] How long after scheduled time an execution is
19
21
  # considered stale and skipped (default: 1.hour). This enables catch-up behavior.
20
- # @param on_error [Symbol] How to handle repetition failures after max_attempts. Options:
22
+ # @param on_error [Symbol] How to handle repetition failures after the policy's max_attempts. Options:
21
23
  # - :continue (default): Log failure and continue with next scheduled execution
22
24
  # - :fail_workflow: Raise ExecutionFailedError to fail the entire workflow
23
25
  # @param name [String, nil] Custom name for the periodic task. Defaults to method name.
@@ -60,7 +62,7 @@ module ChronoForge
60
62
  # every: 1.day,
61
63
  # till: :reports_complete?,
62
64
  # start_at: Date.tomorrow.beginning_of_day,
63
- # max_attempts: 5,
65
+ # retry_policy: RetryPolicy.new(max_attempts: 5),
64
66
  # timeout: 2.hours,
65
67
  # on_error: :fail_workflow,
66
68
  # name: "daily_reports"
@@ -89,7 +91,7 @@ module ChronoForge
89
91
  # - Eventually reaches current/future execution times
90
92
  #
91
93
  # === Error Handling
92
- # - Individual execution failures are retried up to `max_attempts` with exponential backoff
94
+ # - Individual execution failures are retried per the resolved RetryPolicy
93
95
  # - After max attempts, behavior depends on `on_error` parameter:
94
96
  # - `:continue`: Failed execution is logged, next execution is scheduled
95
97
  # - `:fail_workflow`: ExecutionFailedError is raised, failing the entire workflow
@@ -100,7 +102,8 @@ module ChronoForge
100
102
  # - Coordination log: `durably_repeat$#{name}` - tracks overall periodic task state
101
103
  # - Repetition logs: `durably_repeat$#{name}$#{timestamp}` - tracks individual executions
102
104
  #
103
- def durably_repeat(method, every:, till:, start_at: nil, max_attempts: 3, timeout: 1.hour, on_error: :continue, name: nil)
105
+ def durably_repeat(method, every:, till:, start_at: nil, retry_policy: nil, timeout: 1.hour, on_error: :continue, name: nil)
106
+ policy = step_retry_policy(retry_policy)
104
107
  validate_step_name_segment!(name || method)
105
108
  step_name = "durably_repeat$#{name || method}"
106
109
 
@@ -145,13 +148,108 @@ module ChronoForge
145
148
  coordination_log.created_at + every
146
149
  end
147
150
 
148
- execute_or_schedule_repetition(method, coordination_log, next_execution_at, every, max_attempts, timeout, on_error)
151
+ next_execution_at = fast_forward_expired_prefix(coordination_log, next_execution_at, every, timeout)
152
+
153
+ execute_or_schedule_repetition(method, coordination_log, next_execution_at, every, policy, timeout, on_error)
149
154
  nil
150
155
  end
151
156
 
152
157
  private
153
158
 
154
- def execute_or_schedule_repetition(method, coordination_log, next_execution_at, every, max_attempts, timeout, on_error)
159
# Catch-up fast-forward. A tick `t` is expired (its work is skipped) iff
# `Time.current > t + timeout`, i.e. `t < now - timeout`. Rather than
# walking one zero-delay job per expired tick, jump straight to the first
# non-expired tick on the same grid (see #advance_to_first_valid_tick).
#
# Returns `next_execution_at` unchanged when it is not expired. Otherwise:
# - writes ONE failed "TimeoutError" summary ExecutionLog for the whole
#   skipped prefix, on the last skipped tick (no per-tick timeout rows);
# - sets the coordination log's "last_execution_at" to that last skipped
#   tick, so a replay computing `last + every` lands on the same first
#   valid tick;
# - returns the first valid tick.
#
# @param coordination_log [ExecutionLog] the durably_repeat coordination log
# @param next_execution_at [Time] next scheduled tick on the grid
# @param every [ActiveSupport::Duration] repetition interval
# @param timeout [ActiveSupport::Duration] staleness window per tick
# @return [Time] the tick to execute or schedule next
def fast_forward_expired_prefix(coordination_log, next_execution_at, every, timeout)
  cutoff = Time.current - timeout
  return next_execution_at if next_execution_at >= cutoff

  first_valid, n = advance_to_first_valid_tick(next_execution_at, every, cutoff)
  last_skipped = first_valid - every

  Rails.logger.info {
    "ChronoForge:#{self.class}(#{@workflow.key}) durably_repeat fast-forwarded " \
    "#{n} expired tick(s) to #{first_valid.iso8601}"
  }

  # Single summary row for the skipped prefix, on the last skipped grid
  # tick. The step name can match a repetition log left by an earlier cycle
  # at that tick, in which case find_or_create returns that row; writing
  # the metadata in the update! keeps the fast_forward fields present
  # whether the row is new or reused.
  summary_step = "#{coordination_log.step_name}$#{last_skipped.to_i}"
  summary_log = find_or_create_execution_log!(summary_step) do |log|
    log.started_at = Time.current
  end
  summary_log.update!(
    state: :failed,
    error_class: "TimeoutError",
    error_message: "Fast-forwarded #{n} expired tick(s)",
    completed_at: Time.current,
    metadata: (summary_log.metadata || {}).merge(
      "fast_forwarded" => n,
      "from" => next_execution_at.iso8601,
      "to" => last_skipped.iso8601,
      "scheduled_for" => last_skipped.iso8601,
      "timeout_at" => (last_skipped + timeout).iso8601,
      "parent_id" => coordination_log.id
    )
  )

  # Record progress with second-precision .iso8601. The metadata is guarded
  # against nil the same way as the summary log's above.
  coordination_log.update!(
    metadata: (coordination_log.metadata || {}).merge("last_execution_at" => last_skipped.iso8601)
  )

  first_valid
end
220
+
221
# Finds the first grid tick at or after `cutoff`, starting from `from` and
# stepping by `every`.
#
# Two strategies, split at one day:
#
# - Intervals shorter than a day are treated as plain seconds, so the tick
#   count is computed in closed form (`ceil((cutoff - from) / every)`). This
#   matters because short intervals are the ones whose missed-tick count can
#   be huge (1.second dormant for a year is about 31M ticks).
#
# - Intervals of a day or more are stepped one `every` at a time. Calendar
#   arithmetic does not compose (Jan 31 + 3.months = Apr 30, but adding
#   1.month three times gives Apr 28), so multiplying would leave the grid;
#   the step count stays small for realistic dormancy (daily over a decade
#   is about 3650 steps).
#
# @param from [Time] a tick on the grid
# @param every [ActiveSupport::Duration] grid interval
# @param cutoff [Time] earliest acceptable tick
# @return [Array(Time, Integer)] [first_valid_tick, ticks_skipped]
def advance_to_first_valid_tick(from, every, cutoff)
  unless every < 1.day
    # Calendar-sized interval: walk the grid exactly.
    cursor = from
    skipped = 0
    until cursor >= cutoff
      cursor += every
      skipped += 1
    end
    return [cursor, skipped]
  end

  # Sub-day interval: jump in one step.
  skipped = ((cutoff - from) / every.to_f).ceil
  [from + (skipped * every), skipped]
end
251
+
252
+ def execute_or_schedule_repetition(method, coordination_log, next_execution_at, every, policy, timeout, on_error)
155
253
  step_name = "#{coordination_log.step_name}$#{next_execution_at.to_i}"
156
254
 
157
255
  # Create execution log for this specific repetition
@@ -175,7 +273,7 @@ module ChronoForge
175
273
 
176
274
  # Check if it's time to execute this repetition
177
275
  if next_execution_at <= Time.current
178
- execute_repetition_now(method, repetition_log, coordination_log, next_execution_at, every, max_attempts, timeout, on_error)
276
+ execute_repetition_now(method, repetition_log, coordination_log, next_execution_at, every, policy, timeout, on_error)
179
277
  else
180
278
  schedule_repetition_for_later(repetition_log, next_execution_at)
181
279
  end
@@ -185,16 +283,14 @@ module ChronoForge
185
283
  # Calculate delay until execution time
186
284
  delay = [next_execution_at - Time.current, 0].max.seconds
187
285
 
188
- # Schedule the workflow to run at the specified time
189
- self.class
190
- .set(wait: delay)
191
- .perform_later(@workflow.key)
286
+ # Schedule the workflow to run at the specified time (published after release).
287
+ enqueue_continuation(wait: delay)
192
288
 
193
289
  # Halt current execution until scheduled time
194
290
  halt_execution!
195
291
  end
196
292
 
197
- def execute_repetition_now(method, repetition_log, coordination_log, execution_time, every, max_attempts, timeout, on_error)
293
+ def execute_repetition_now(method, repetition_log, coordination_log, execution_time, every, policy, timeout, on_error)
198
294
  # Check for timeout
199
295
  if Time.current > repetition_log.metadata["timeout_at"]
200
296
  repetition_log.update!(
@@ -223,13 +319,12 @@ module ChronoForge
223
319
  self.class::ExecutionTracker.track_error(@workflow, e, execution_log: repetition_log)
224
320
 
225
321
  # Handle retry logic for this specific repetition
226
- if repetition_log.attempts < max_attempts
227
- # Reschedule this same repetition with exponential backoff
228
- backoff = (2**[repetition_log.attempts, 5].min).seconds
229
-
230
- self.class
231
- .set(wait: backoff)
232
- .perform_later(@workflow.key)
322
+ backoff = policy.retry_backoff(e, attempts: repetition_log.attempts) do |policy_key|
323
+ bump_retry_count!(repetition_log, policy_key)
324
+ end
325
+ if backoff
326
+ # Reschedule this same repetition with the policy's backoff (after release).
327
+ enqueue_continuation(wait: backoff)
233
328
 
234
329
  # Halt current execution
235
330
  halt_execution!
@@ -243,7 +338,7 @@ module ChronoForge
243
338
 
244
339
  # Handle failure based on on_error setting
245
340
  if on_error == :fail_workflow
246
- raise ExecutionFailedError, "Periodic task #{method} failed after #{max_attempts} attempts: #{e.message}"
341
+ raise ExecutionFailedError, "Periodic task #{method} failed after #{repetition_log.attempts} attempts: #{e.message}"
247
342
  else
248
343
  # Continue with next execution despite this failure
249
344
  schedule_next_execution_after_completion(coordination_log, execution_time, every)
@@ -279,10 +374,8 @@ module ChronoForge
279
374
  # Calculate delay until next execution
280
375
  delay = [next_execution_time - Time.current, 0].max.seconds
281
376
 
282
- # Schedule the workflow to run for the next periodic execution
283
- self.class
284
- .set(wait: delay)
285
- .perform_later(@workflow.key)
377
+ # Schedule the next periodic execution (published after lock release).
378
+ enqueue_continuation(wait: delay)
286
379
 
287
380
  # Halt current execution
288
381
  halt_execution!
@@ -0,0 +1,83 @@
1
+ module ChronoForge
2
+ module Executor
3
+ module Methods
4
+ module MergeBranches
5
# Joins one or more named branches. Kept separate from dispatch so branches
# can run concurrently. The method performs a single immediate check; when
# the branches are not all done it hands polling to BranchMergeJob and
# halts this execution.
#
# @param names [Array<#to_s>] branch names; "$" is rejected by the
#   step-name validation and "," is rejected here (it separates names in
#   the merge step name)
# @param min_interval [ActiveSupport::Duration] lower poll-interval bound
#   passed to BranchMergeJob
# @param max_interval [ActiveSupport::Duration] upper poll-interval bound
#   passed to BranchMergeJob
# @return [nil] when the merge is (or already was) complete
# @raise [InvalidStepName] when a name contains ","
# @raise [ArgumentError] when min_interval > max_interval
# @raise [UnknownBranchError] when a name is not an open branch
def merge_branches(*names, min_interval: 5.seconds, max_interval: 5.minutes)
  names.each do |candidate|
    validate_step_name_segment!(candidate) # rejects "$"
    next unless candidate.to_s.include?(",")

    raise InvalidStepName,
      "branch name may not contain ',' (reserved merge separator): #{candidate.inspect}"
  end

  # Check the cadence in the parent so a bad configuration fails at the
  # call site rather than inside the poller job.
  if min_interval > max_interval
    raise ArgumentError,
      "min_interval (#{min_interval}) must be <= max_interval (#{max_interval})"
  end

  branch_names = names.map(&:to_s).uniq
  merge_step = "merge$#{branch_names.sort.join(",")}"
  merge_log = find_or_create_execution_log!(merge_step) { |entry| entry.started_at = Time.current }

  # Drops the merged branches from the registry of open branches.
  forget_branches = -> { branch_names.each { |key| @open_branches&.delete(key) } }

  if merge_log.completed?
    # Already merged on an earlier pass: just clear the registry and skip.
    forget_branches.call
    return
  end

  log_ids = branch_names.map { |key| open_branch!(key)[:log_id] }

  if branches_done?(log_ids)
    forget_branches.call
    merge_log.update!(state: :completed, completed_at: Time.current)
    return
  end

  enqueue_branch_merge_job(log_ids, min_interval, max_interval)
  halt_execution!
end
49
+ alias_method :merge_branch, :merge_branches
50
+
51
+ private
52
+
53
# Looks up a branch in the @open_branches registry.
#
# @param name [#to_s] branch name
# @return the registry entry stored for that branch
# @raise [UnknownBranchError] when no branch with that name is registered
def open_branch!(name)
  registry = @open_branches || {}
  branch_key = name.to_s
  return registry[branch_key] if registry.key?(branch_key)

  raise UnknownBranchError, "no open branch named #{name.inspect} (open it with `branch #{name.inspect} do … end` first)"
end
58
+
59
# True when BranchProbe reports every given branch log as done. Stops at
# the first branch that is not done; an empty list counts as done.
#
# @param branch_log_ids [Array] ids of the branch execution logs
# @return [Boolean]
def branches_done?(branch_log_ids)
  branch_log_ids.none? { |log_id| !BranchProbe.done?(log_id) }
end
62
+
63
# Starts a BranchMergeJob poller chain for the given branch logs.
#
# A fresh token is generated and written to metadata["poll_token"] of every
# branch log before the job is enqueued with that same token. Each write
# happens under a row lock so the read-modify-write does not clobber a
# concurrent metadata write. Rotating the token is what retires any earlier
# poller chain (see BranchMergeJob#superseded?).
#
# @param branch_log_ids [Array] ids of the branch execution logs to join
# @param min_interval [#to_i] lower poll-interval bound, sent as seconds
# @param max_interval [#to_i] upper poll-interval bound, sent as seconds
def enqueue_branch_merge_job(branch_log_ids, min_interval, max_interval)
  fencing_token = SecureRandom.uuid

  ExecutionLog.where(id: branch_log_ids).find_each do |branch_log|
    branch_log.with_lock do
      stamped = (branch_log.metadata || {}).merge("poll_token" => fencing_token)
      branch_log.update!(metadata: stamped)
    end
  end

  job_args = [
    @workflow.key, self.class.to_s, branch_log_ids,
    min_interval.to_i, max_interval.to_i, fencing_token
  ]
  BranchMergeJob.perform_later(*job_args)
end
80
+ end
81
+ end
82
+ end
83
+ end
@@ -102,10 +102,8 @@ module ChronoForge
102
102
  last_executed_at: Time.current
103
103
  )
104
104
 
105
- # Reschedule the job
106
- self.class
107
- .set(wait: duration)
108
- .perform_later(@workflow.key)
105
+ # Record the reschedule; the executor publishes it after lock release.
106
+ enqueue_continuation(wait: duration)
109
107
 
110
108
  # Halt current execution
111
109
  halt_execution!