chrono_forge 0.9.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/README.md +305 -44
- data/docs/superpowers/plans/2026-06-25-chrono_forge-dashboard.md +1748 -0
- data/docs/superpowers/plans/2026-06-25-chrono_forge-dashboard.md.tasks.json +17 -0
- data/docs/superpowers/plans/2026-06-25-composite-retry-policies.md +930 -0
- data/docs/superpowers/plans/2026-06-25-composite-retry-policies.md.tasks.json +54 -0
- data/docs/superpowers/plans/2026-06-25-reserved-kwarg-guard.md +241 -0
- data/docs/superpowers/plans/2026-06-25-reserved-kwarg-guard.md.tasks.json +12 -0
- data/docs/superpowers/plans/2026-06-26-branches-spawn-merge.md +1378 -0
- data/docs/superpowers/plans/2026-06-26-branches-spawn-merge.md.tasks.json +67 -0
- data/docs/superpowers/plans/2026-06-26-deferral-continuation-race-and-catchup.md +709 -0
- data/docs/superpowers/plans/2026-06-26-deferral-continuation-race-and-catchup.md.tasks.json +19 -0
- data/docs/superpowers/specs/2026-06-03-unified-retry-policy-design.md +226 -0
- data/docs/superpowers/specs/2026-06-25-chrono_forge-dashboard-design.md +190 -0
- data/docs/superpowers/specs/2026-06-25-composite-retry-policies-design.md +228 -0
- data/docs/superpowers/specs/2026-06-25-reserved-kwarg-guard-design.md +169 -0
- data/docs/superpowers/specs/2026-06-25-spawn-merge-branches-design.md +468 -0
- data/docs/superpowers/specs/2026-06-26-dashboard-branch-view-design.md +142 -0
- data/docs/superpowers/specs/2026-06-26-deferral-continuation-race-and-catchup-design.md +265 -0
- data/lib/chrono_forge/branch_merge_job.rb +138 -0
- data/lib/chrono_forge/branch_probe.rb +26 -0
- data/lib/chrono_forge/cleanup.rb +6 -0
- data/lib/chrono_forge/execution_log.rb +6 -0
- data/lib/chrono_forge/executor/composite_retry_policy.rb +47 -0
- data/lib/chrono_forge/executor/methods/branch.rb +185 -0
- data/lib/chrono_forge/executor/methods/durably_execute.rb +21 -19
- data/lib/chrono_forge/executor/methods/durably_repeat.rb +118 -25
- data/lib/chrono_forge/executor/methods/merge_branches.rb +83 -0
- data/lib/chrono_forge/executor/methods/wait.rb +2 -4
- data/lib/chrono_forge/executor/methods/wait_until.rb +25 -25
- data/lib/chrono_forge/executor/methods/workflow_states.rb +16 -0
- data/lib/chrono_forge/executor/methods.rb +2 -0
- data/lib/chrono_forge/executor/retry_policy.rb +111 -0
- data/lib/chrono_forge/executor.rb +216 -28
- data/lib/chrono_forge/version.rb +1 -1
- data/lib/chrono_forge/workflow.rb +10 -1
- data/lib/generators/chrono_forge/migration_actions.rb +1 -0
- data/lib/generators/chrono_forge/templates/add_chrono_forge_parent_execution_log.rb +38 -0
- metadata +42 -5
- data/lib/chrono_forge/executor/retry_strategy.rb +0 -29
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
module ChronoForge
|
|
2
|
+
module Executor
|
|
3
|
+
module Methods
|
|
4
|
+
module Branch
|
|
5
|
+
# Declares a named branch: a durable fan-out step of the workflow.
#
# While the block runs, spawn / spawn_each eagerly create and enqueue child
# workflows. When the block returns normally the branch log is marked
# completed ("sealed"); on later passes a sealed branch skips its block.
# The call never waits for the children — joining is done separately by
# merge_branches, either explicitly or inline when `automerge` is true.
#
# @param name [String, Symbol] branch name (checked by validate_step_name_segment!)
# @param automerge [Boolean] when true, merge_branches(name) is called right
#   after the block closes, on every pass
# @yield the dispatch block containing spawn / spawn_each calls
# @return [String, Symbol] `name`, unchanged
# @raise [ArgumentError] when no block is given, or a branch block is already open
def branch(name, automerge: false)
  raise ArgumentError, "branch requires a block" unless block_given?
  raise ArgumentError, "branch blocks cannot be nested" if @current_branch
  validate_step_name_segment!(name)

  branch_key = name.to_s
  step_name = "branch$#{name}"
  log = find_or_create_execution_log!(step_name) { |l| l.started_at = Time.current }

  # A sealed branch log can be a readonly cache stand-in without an id, so
  # look up the real id; the registry and the merge scope children by it.
  log_id = log.id || ExecutionLog.where(workflow: @workflow, step_name: step_name).pick(:id)
  @open_branches ||= {}
  @open_branches[branch_key] = {automerge: automerge, log_id: log_id}

  # Keep dispatch behind this guard: a sealed branch must never re-run its
  # block, so an expensive spawn_each source is not enumerated again.
  unless log.completed?
    @current_branch = {name: branch_key, log: log}
    begin
      yield
    ensure
      # Always close the branch scope, even when the block raises.
      @current_branch = nil
    end
    # Only reached when the block finished without raising: seal the branch.
    log.update!(state: :completed, completed_at: Time.current)
  end

  # automerge joins inline as soon as the block closes. It runs on every pass
  # and relies on the merge$<name> log for idempotency; a finished merge
  # removes the branch from @open_branches.
  merge_branches(name) if automerge

  name
end
|
|
45
|
+
|
|
46
|
+
# Dispatches exactly one child workflow into the branch that is currently open.
# The child key is "<parent workflow key>$<branch name>$<name>".
#
# @param name [String, Symbol] child name (checked by validate_step_name_segment!)
# @param workflow_class [Class] workflow class to instantiate for the child
# @param kwargs [Hash] keyword arguments handed to the child workflow
# @return [String, Symbol] `name`, unchanged
# @raise [NotInBranchError] when called outside a branch block (via current_branch!)
def spawn(name, workflow_class, **kwargs)
  open_branch = current_branch!
  validate_step_name_segment!(name)

  single_entry = ["#{@workflow.key}$#{open_branch[:name]}$#{name}", workflow_class, kwargs]
  dispatch_children(open_branch, [single_entry])
  name
end
|
|
54
|
+
|
|
55
|
+
# Streams `source` and dispatches one child workflow per item into the
# current branch, persisting a resume cursor on the branch log after every
# chunk.
#
# - ActiveRecord relations are walked with find_in_batches, resuming from the
#   stored primary key; children are keyed "<name>_<record id>". The resume
#   re-yields the boundary record, which produces the same key, so the
#   on-conflict-ignore insert in dispatch_children dedups it.
# - Any other enumerable is walked with an offset cursor; children are keyed
#   "<name>_<index>" by position.
#
# Either way the source must re-enumerate identically across replays. For
# relations that also means stable membership: a row that enters the relation
# below the cursor after it has passed never gets a child, so point
# spawn_each at a set that is fixed for the branch's lifetime.
#
# @param name [String, Symbol] spawn name (checked by validate_step_name_segment!)
# @param source [ActiveRecord::Relation, Enumerable] items to fan out over
# @param of [Integer] chunk size (batch_size / each_slice size)
# @yield [item] must return a workflow class or [workflow class, kwargs]
# @return [String, Symbol] `name`, unchanged
# @raise [NotInBranchError] when called outside a branch block
# @raise [NotExecutableError] when an ActiveRecord source carries an order
def spawn_each(name, source, of: 1000)
  open_branch = current_branch!
  validate_step_name_segment!(name)

  key_prefix = "#{@workflow.key}$#{open_branch[:name]}$#{name}_"
  saved = open_branch[:log].metadata&.dig("cursors", name.to_s) || {}

  if source.is_a?(ActiveRecord::Relation)
    # Iteration is by primary key so replays see the same stream; an ordered
    # relation is rejected here with a clear message instead of failing
    # inside find_in_batches.
    if source.order_values.present?
      raise NotExecutableError,
        "spawn_each iterates #{source.model_name} by primary key; remove the " \
        "explicit .order(...) (or default-scope order) from the source relation"
    end

    source.find_in_batches(batch_size: of, start: saved["pk"]) do |batch|
      children = batch.map do |record|
        workflow_class, child_kwargs = normalize_spawn(yield(record))
        # Keyed by record id (not by position) so a re-yielded boundary
        # record maps onto the row that already exists.
        ["#{key_prefix}#{record.id}", workflow_class, child_kwargs]
      end
      dispatch_children(open_branch, children)
      advance_cursor!(open_branch, name, pk: batch.last.id)
    end
  else
    index = saved["n"] || 0
    source.drop(index).each_slice(of) do |slice|
      children = slice.map do |item|
        workflow_class, child_kwargs = normalize_spawn(yield(item))
        child = ["#{key_prefix}#{index}", workflow_class, child_kwargs]
        index += 1
        child
      end
      dispatch_children(open_branch, children)
      advance_cursor!(open_branch, name, n: index)
    end
  end

  name
end
|
|
107
|
+
|
|
108
|
+
private
|
|
109
|
+
|
|
110
|
+
# Returns the entry for the branch block that is currently executing.
#
# @return [Hash] the @current_branch value set by `branch`
# @raise [NotInBranchError] when no branch block is open
def current_branch!
  return @current_branch if @current_branch

  raise NotInBranchError, "spawn/spawn_each may only be called inside a branch block"
end
|
|
113
|
+
|
|
114
|
+
# Bulk-inserts the child workflow rows for `entries`, then bulk-enqueues
# their jobs. Arguments are validated here because perform_all_later does
# not go through the class-level perform_later guard.
#
# @param cb [Hash] current branch entry; cb[:log] is the branch execution log
# @param entries [Array<Array>] [child_key, workflow_class, kwargs] triples
# @return [nil] immediately when `entries` is empty
# @raise [ArgumentError] from validate_child_enqueue! on a bad key or reserved kwarg
def dispatch_children(cb, entries)
  return if entries.empty?

  timestamp = Time.current
  rows = entries.map do |child_key, klass, kwargs|
    validate_child_enqueue!(child_key, kwargs)
    {
      key: child_key, job_class: klass.to_s,
      kwargs: kwargs, options: {}, context: {},
      state: Workflow.states[:idle],
      parent_execution_log_id: cb[:log].id,
      created_at: timestamp, updated_at: timestamp
    }
  end

  # Conflicting rows are ignored, which makes a crash-recovery re-dispatch
  # of the same chunk idempotent.
  Workflow.insert_all(rows, unique_by: [:job_class, :key])

  # Only enqueue children that are still :idle. A re-dispatched boundary
  # chunk contains rows that already exist and may already have run;
  # re-enqueuing those would only raise NotExecutableError and dead-letter.
  # Rows inserted just above are :idle because enqueueing happens after the
  # insert, so a first-time dispatch enqueues the whole batch.
  child_keys = entries.map { |child_key, _klass, _kwargs| child_key }
  idle_keys = Workflow.where(key: child_keys, state: Workflow.states[:idle]).pluck(:key).to_set
  jobs = entries.filter_map do |child_key, klass, kwargs|
    next unless idle_keys.include?(child_key)

    klass.new(child_key, **kwargs)
  end
  ActiveJob.perform_all_later(jobs) unless jobs.empty?
end
|
|
146
|
+
|
|
147
|
+
# Guards a child enqueue: the key must be a String and the kwargs must not
# use any reserved ChronoForge keyword. This duplicates the class-level
# __validate_enqueue! (executor.rb), which perform_all_later bypasses, so
# the two checks have to be kept in sync.
#
# @param child_key [Object] key for the child workflow
# @param kwargs [Hash] keyword arguments destined for the child
# @return [nil]
# @raise [ArgumentError] on a non-String key or a reserved kwarg
def validate_child_enqueue!(child_key, kwargs)
  raise ArgumentError, "child key must be a String (got #{child_key.inspect})" unless child_key.is_a?(String)

  clashes = kwargs.keys.map(&:to_sym) & RESERVED_KWARGS
  return if clashes.empty?

  raise ArgumentError, "#{clashes.join(", ")} are reserved ChronoForge keywords"
end
|
|
158
|
+
|
|
159
|
+
# Moves a spawn_each cursor forward and persists it on the branch log under
# metadata["cursors"][spawn_name]. Only the fields that are passed are
# written; an existing value for the other field is left as it is.
#
# @param cb [Hash] current branch entry; cb[:log] is the branch execution log
# @param spawn_name [String, Symbol] the spawn_each name owning the cursor
# @param n [Integer, nil] running item index (offset cursor for plain enumerables)
# @param pk [Object, nil] last dispatched primary key (keyset cursor for relations)
def advance_cursor!(cb, spawn_name, n: nil, pk: nil)
  branch_log = cb[:log]
  cursor_key = spawn_name.to_s

  meta = branch_log.metadata || {}
  cursors = (meta["cursors"] ||= {})
  entry = (cursors[cursor_key] ||= {})
  {"n" => n, "pk" => pk}.each do |field, value|
    entry[field] = value unless value.nil?
  end

  branch_log.update!(metadata: meta)
end
|
|
172
|
+
|
|
173
|
+
# Normalizes what a spawn_each block returned into a [workflow_class, kwargs]
# pair. Accepts a bare class, or an array whose first element is the class
# and whose optional second element is the kwargs Hash.
#
# The kwargs are checked here so that a malformed block result fails at the
# spawn_each call site with a clear ArgumentError, rather than later when the
# kwargs are inspected or splatted during dispatch.
#
# @param result [Module, Array] the spawn_each block's return value
# @return [Array(Module, Hash)] the class and its kwargs ({} when none were given)
# @raise [ArgumentError] when the first element is not a class/module, or the
#   kwargs are present but not a Hash
def normalize_spawn(result)
  klass, kwargs = Array(result)
  unless klass.is_a?(Module)
    raise ArgumentError,
      "spawn_each block must return a workflow class or [class, kwargs] (got #{result.inspect})"
  end

  kwargs ||= {}
  unless kwargs.is_a?(Hash)
    raise ArgumentError,
      "spawn_each kwargs must be a Hash (got #{kwargs.inspect})"
  end

  [klass, kwargs]
end
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
end
|
|
185
|
+
end
|
|
@@ -9,19 +9,22 @@ module ChronoForge
|
|
|
9
9
|
# execution log, ensuring idempotent behavior during workflow replays.
|
|
10
10
|
#
|
|
11
11
|
# @param method [Symbol] The name of the instance method to execute
|
|
12
|
-
# @param
|
|
12
|
+
# @param retry_policy [RetryPolicy, nil] Per-call retry policy. When nil,
|
|
13
|
+
# uses the class-level `retry_policy` default, then the step built-in
|
|
14
|
+
# (RetryPolicy.step_default: 3 attempts, exponential backoff capped at 30s).
|
|
13
15
|
# @param name [String, nil] Custom name for the execution step. Defaults to method name.
|
|
14
16
|
# Used to create unique step names for execution logs.
|
|
15
17
|
#
|
|
16
18
|
# @return [nil]
|
|
17
19
|
#
|
|
18
|
-
# @raise [ExecutionFailedError] When the method fails after max_attempts
|
|
20
|
+
# @raise [ExecutionFailedError] When the method fails after the policy's max_attempts
|
|
19
21
|
#
|
|
20
22
|
# @example Basic usage
|
|
21
23
|
# durably_execute :send_welcome_email
|
|
22
24
|
#
|
|
23
|
-
# @example With custom retry
|
|
24
|
-
# durably_execute :critical_payment_processing,
|
|
25
|
+
# @example With a custom retry policy
|
|
26
|
+
# durably_execute :critical_payment_processing,
|
|
27
|
+
# retry_policy: RetryPolicy.new(max_attempts: 5)
|
|
25
28
|
#
|
|
26
29
|
# @example With custom name for tracking
|
|
27
30
|
# durably_execute :complex_calculation, name: "phase_1_calculation"
|
|
@@ -33,7 +36,7 @@ module ChronoForge
|
|
|
33
36
|
# Rails.logger.info "Successfully uploaded file to S3"
|
|
34
37
|
# end
|
|
35
38
|
#
|
|
36
|
-
# durably_execute :upload_to_s3, max_attempts: 5
|
|
39
|
+
# durably_execute :upload_to_s3, retry_policy: RetryPolicy.new(max_attempts: 5)
|
|
37
40
|
#
|
|
38
41
|
# == Behavior
|
|
39
42
|
#
|
|
@@ -43,9 +46,9 @@ module ChronoForge
|
|
|
43
46
|
# already completed, it will be skipped.
|
|
44
47
|
#
|
|
45
48
|
# === Retry Logic
|
|
46
|
-
# - Failed executions are
|
|
47
|
-
# - Backoff
|
|
48
|
-
# - After max_attempts, ExecutionFailedError is raised
|
|
49
|
+
# - Failed executions are retried per the resolved RetryPolicy
|
|
50
|
+
# - Backoff and attempt cap come from that policy (see RetryPolicy)
|
|
51
|
+
# - After the policy's max_attempts, ExecutionFailedError is raised
|
|
49
52
|
#
|
|
50
53
|
# === Error Handling
|
|
51
54
|
# - All exceptions except HaltExecutionFlow are caught and handled
|
|
@@ -59,7 +62,8 @@ module ChronoForge
|
|
|
59
62
|
# - Stores error details when failures occur
|
|
60
63
|
# - Enables monitoring and debugging of execution history
|
|
61
64
|
#
|
|
62
|
-
def durably_execute(method,
|
|
65
|
+
def durably_execute(method, retry_policy: nil, name: nil)
|
|
66
|
+
policy = step_retry_policy(retry_policy)
|
|
63
67
|
validate_step_name_segment!(name || method)
|
|
64
68
|
step_name = "durably_execute$#{name || method}"
|
|
65
69
|
# Find or create execution log
|
|
@@ -97,16 +101,14 @@ module ChronoForge
|
|
|
97
101
|
self.class::ExecutionTracker.track_error(workflow, e, execution_log: execution_log)
|
|
98
102
|
|
|
99
103
|
# Optional retry logic
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
retry_method: method
|
|
109
|
-
)
|
|
104
|
+
backoff = policy.retry_backoff(e, attempts: execution_log.attempts) do |policy_key|
|
|
105
|
+
bump_retry_count!(execution_log, policy_key)
|
|
106
|
+
end
|
|
107
|
+
if backoff
|
|
108
|
+
# Reschedule with the policy's backoff (published after lock release).
|
|
109
|
+
# The workflow replays on resume and skips completed steps, so the
|
|
110
|
+
# rescheduled run picks this step up again by its execution log.
|
|
111
|
+
enqueue_continuation(wait: backoff)
|
|
110
112
|
|
|
111
113
|
# Halt current execution
|
|
112
114
|
halt_execution!
|
|
@@ -14,10 +14,12 @@ module ChronoForge
|
|
|
14
14
|
# @param till [Symbol, Proc] The condition to check for stopping repetition. Should return
|
|
15
15
|
# true when repetition should stop. Can be a symbol for instance methods or a callable.
|
|
16
16
|
# @param start_at [Time, nil] When to start the periodic task. Defaults to coordination_log.created_at + every
|
|
17
|
-
# @param
|
|
17
|
+
# @param retry_policy [RetryPolicy, nil] Per-call retry policy for an individual
|
|
18
|
+
# execution. When nil, uses the class-level `retry_policy` default, then the
|
|
19
|
+
# step built-in (RetryPolicy.step_default: 3 attempts, backoff capped at 30s).
|
|
18
20
|
# @param timeout [ActiveSupport::Duration] How long after scheduled time an execution is
|
|
19
21
|
# considered stale and skipped (default: 1.hour). This enables catch-up behavior.
|
|
20
|
-
# @param on_error [Symbol] How to handle repetition failures after max_attempts. Options:
|
|
22
|
+
# @param on_error [Symbol] How to handle repetition failures after the policy's max_attempts. Options:
|
|
21
23
|
# - :continue (default): Log failure and continue with next scheduled execution
|
|
22
24
|
# - :fail_workflow: Raise ExecutionFailedError to fail the entire workflow
|
|
23
25
|
# @param name [String, nil] Custom name for the periodic task. Defaults to method name.
|
|
@@ -60,7 +62,7 @@ module ChronoForge
|
|
|
60
62
|
# every: 1.day,
|
|
61
63
|
# till: :reports_complete?,
|
|
62
64
|
# start_at: Date.tomorrow.beginning_of_day,
|
|
63
|
-
# max_attempts: 5,
|
|
65
|
+
# retry_policy: RetryPolicy.new(max_attempts: 5),
|
|
64
66
|
# timeout: 2.hours,
|
|
65
67
|
# on_error: :fail_workflow,
|
|
66
68
|
# name: "daily_reports"
|
|
@@ -89,7 +91,7 @@ module ChronoForge
|
|
|
89
91
|
# - Eventually reaches current/future execution times
|
|
90
92
|
#
|
|
91
93
|
# === Error Handling
|
|
92
|
-
# - Individual execution failures are retried
|
|
94
|
+
# - Individual execution failures are retried per the resolved RetryPolicy
|
|
93
95
|
# - After max attempts, behavior depends on `on_error` parameter:
|
|
94
96
|
# - `:continue`: Failed execution is logged, next execution is scheduled
|
|
95
97
|
# - `:fail_workflow`: ExecutionFailedError is raised, failing the entire workflow
|
|
@@ -100,7 +102,8 @@ module ChronoForge
|
|
|
100
102
|
# - Coordination log: `durably_repeat$#{name}` - tracks overall periodic task state
|
|
101
103
|
# - Repetition logs: `durably_repeat$#{name}$#{timestamp}` - tracks individual executions
|
|
102
104
|
#
|
|
103
|
-
def durably_repeat(method, every:, till:, start_at: nil,
|
|
105
|
+
def durably_repeat(method, every:, till:, start_at: nil, retry_policy: nil, timeout: 1.hour, on_error: :continue, name: nil)
|
|
106
|
+
policy = step_retry_policy(retry_policy)
|
|
104
107
|
validate_step_name_segment!(name || method)
|
|
105
108
|
step_name = "durably_repeat$#{name || method}"
|
|
106
109
|
|
|
@@ -145,13 +148,108 @@ module ChronoForge
|
|
|
145
148
|
coordination_log.created_at + every
|
|
146
149
|
end
|
|
147
150
|
|
|
148
|
-
|
|
151
|
+
next_execution_at = fast_forward_expired_prefix(coordination_log, next_execution_at, every, timeout)
|
|
152
|
+
|
|
153
|
+
execute_or_schedule_repetition(method, coordination_log, next_execution_at, every, policy, timeout, on_error)
|
|
149
154
|
nil
|
|
150
155
|
end
|
|
151
156
|
|
|
152
157
|
private
|
|
153
158
|
|
|
154
|
-
|
|
159
|
+
# Catch-up fast-forward for durably_repeat.
#
# A tick is treated as expired when it lies before `Time.current - timeout`.
# Instead of processing every expired tick one by one, this jumps to the
# first non-expired tick on the same grid (see #advance_to_first_valid_tick),
# anchoring the arithmetic on `next_execution_at`.
#
# When something was skipped it:
#   1. logs the jump,
#   2. writes ONE summary execution log (state :failed, "TimeoutError") for
#      the whole skipped prefix, keyed on `first_valid - every`,
#   3. stores that same time as the coordination log's "last_execution_at",
#      so a replay recomputes the same first tick.
#
# @param coordination_log [ExecutionLog] the durably_repeat coordination log
# @param next_execution_at [Time] the next scheduled tick
# @param every [ActiveSupport::Duration] the repeat interval
# @param timeout [ActiveSupport::Duration] staleness window for a tick
# @return [Time] `next_execution_at` unchanged when it is not expired,
#   otherwise the first non-expired tick
def fast_forward_expired_prefix(coordination_log, next_execution_at, every, timeout)
  expiry_cutoff = Time.current - timeout
  return next_execution_at unless next_execution_at < expiry_cutoff

  first_valid, skipped = advance_to_first_valid_tick(next_execution_at, every, expiry_cutoff)
  last_skipped = first_valid - every

  Rails.logger.info {
    "ChronoForge:#{self.class}(#{@workflow.key}) durably_repeat fast-forwarded " \
      "#{skipped} expired tick(s) to #{first_valid.iso8601}"
  }

  # One summary row for the skipped prefix. Its step name differs from the
  # first_valid repetition row, but it can match a pending repetition log
  # left by an earlier cycle at the same tick. The summary fields are
  # therefore written through update!, so they are present whether the row
  # was just created or reused.
  summary_log = find_or_create_execution_log!("#{coordination_log.step_name}$#{last_skipped.to_i}") do |log|
    log.started_at = Time.current
  end
  summary_fields = {
    "fast_forwarded" => skipped,
    "from" => next_execution_at.iso8601,
    "to" => last_skipped.iso8601,
    "scheduled_for" => last_skipped.iso8601,
    "timeout_at" => (last_skipped + timeout).iso8601,
    "parent_id" => coordination_log.id
  }
  summary_log.update!(
    state: :failed,
    error_class: "TimeoutError",
    error_message: "Fast-forwarded #{skipped} expired tick(s)",
    completed_at: Time.current,
    metadata: (summary_log.metadata || {}).merge(summary_fields)
  )

  # Record progress so a replay computes last_execution_at + every == first_valid.
  # iso8601 (second precision) matches the format already stored in
  # last_execution_at.
  progress = coordination_log.metadata.merge("last_execution_at" => last_skipped.iso8601)
  coordination_log.update!(metadata: progress)

  first_valid
end
|
|
220
|
+
|
|
221
|
+
# Finds the first grid tick at or after `cutoff`, starting from `from` and
# moving in steps of `every`.
#
# Two strategies, split at one day:
#
# - Intervals shorter than a day are plain second counts, so the number of
#   steps is computed in closed form. These are the intervals whose missed
#   tick count can become huge, which is why no loop is used for them.
# - Intervals of a day or more go through calendar arithmetic (DST-length
#   days, end-of-month clamping), where `from + n * every` can differ from
#   stepping `every` n times. The grid is therefore walked one step at a
#   time; the number of steps stays small for such intervals.
#
# @param from [Time] a tick on the grid
# @param every [ActiveSupport::Duration] the grid interval
# @param cutoff [Time] earliest acceptable tick
# @return [Array(Time, Integer)] [first_valid_tick, ticks_skipped]
def advance_to_first_valid_tick(from, every, cutoff)
  if every < 1.day
    skipped = ((cutoff - from) / every.to_f).ceil
    return [from + (skipped * every), skipped]
  end

  tick = from
  skipped = 0
  until tick >= cutoff
    tick += every
    skipped += 1
  end
  [tick, skipped]
end
|
|
251
|
+
|
|
252
|
+
def execute_or_schedule_repetition(method, coordination_log, next_execution_at, every, policy, timeout, on_error)
|
|
155
253
|
step_name = "#{coordination_log.step_name}$#{next_execution_at.to_i}"
|
|
156
254
|
|
|
157
255
|
# Create execution log for this specific repetition
|
|
@@ -175,7 +273,7 @@ module ChronoForge
|
|
|
175
273
|
|
|
176
274
|
# Check if it's time to execute this repetition
|
|
177
275
|
if next_execution_at <= Time.current
|
|
178
|
-
execute_repetition_now(method, repetition_log, coordination_log, next_execution_at, every,
|
|
276
|
+
execute_repetition_now(method, repetition_log, coordination_log, next_execution_at, every, policy, timeout, on_error)
|
|
179
277
|
else
|
|
180
278
|
schedule_repetition_for_later(repetition_log, next_execution_at)
|
|
181
279
|
end
|
|
@@ -185,16 +283,14 @@ module ChronoForge
|
|
|
185
283
|
# Calculate delay until execution time
|
|
186
284
|
delay = [next_execution_at - Time.current, 0].max.seconds
|
|
187
285
|
|
|
188
|
-
# Schedule the workflow to run at the specified time
|
|
189
|
-
|
|
190
|
-
.set(wait: delay)
|
|
191
|
-
.perform_later(@workflow.key)
|
|
286
|
+
# Schedule the workflow to run at the specified time (published after release).
|
|
287
|
+
enqueue_continuation(wait: delay)
|
|
192
288
|
|
|
193
289
|
# Halt current execution until scheduled time
|
|
194
290
|
halt_execution!
|
|
195
291
|
end
|
|
196
292
|
|
|
197
|
-
def execute_repetition_now(method, repetition_log, coordination_log, execution_time, every,
|
|
293
|
+
def execute_repetition_now(method, repetition_log, coordination_log, execution_time, every, policy, timeout, on_error)
|
|
198
294
|
# Check for timeout
|
|
199
295
|
if Time.current > repetition_log.metadata["timeout_at"]
|
|
200
296
|
repetition_log.update!(
|
|
@@ -223,13 +319,12 @@ module ChronoForge
|
|
|
223
319
|
self.class::ExecutionTracker.track_error(@workflow, e, execution_log: repetition_log)
|
|
224
320
|
|
|
225
321
|
# Handle retry logic for this specific repetition
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
.perform_later(@workflow.key)
|
|
322
|
+
backoff = policy.retry_backoff(e, attempts: repetition_log.attempts) do |policy_key|
|
|
323
|
+
bump_retry_count!(repetition_log, policy_key)
|
|
324
|
+
end
|
|
325
|
+
if backoff
|
|
326
|
+
# Reschedule this same repetition with the policy's backoff (after release).
|
|
327
|
+
enqueue_continuation(wait: backoff)
|
|
233
328
|
|
|
234
329
|
# Halt current execution
|
|
235
330
|
halt_execution!
|
|
@@ -243,7 +338,7 @@ module ChronoForge
|
|
|
243
338
|
|
|
244
339
|
# Handle failure based on on_error setting
|
|
245
340
|
if on_error == :fail_workflow
|
|
246
|
-
raise ExecutionFailedError, "Periodic task #{method} failed after #{
|
|
341
|
+
raise ExecutionFailedError, "Periodic task #{method} failed after #{repetition_log.attempts} attempts: #{e.message}"
|
|
247
342
|
else
|
|
248
343
|
# Continue with next execution despite this failure
|
|
249
344
|
schedule_next_execution_after_completion(coordination_log, execution_time, every)
|
|
@@ -279,10 +374,8 @@ module ChronoForge
|
|
|
279
374
|
# Calculate delay until next execution
|
|
280
375
|
delay = [next_execution_time - Time.current, 0].max.seconds
|
|
281
376
|
|
|
282
|
-
# Schedule the
|
|
283
|
-
|
|
284
|
-
.set(wait: delay)
|
|
285
|
-
.perform_later(@workflow.key)
|
|
377
|
+
# Schedule the next periodic execution (published after lock release).
|
|
378
|
+
enqueue_continuation(wait: delay)
|
|
286
379
|
|
|
287
380
|
# Halt current execution
|
|
288
381
|
halt_execution!
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
module ChronoForge
|
|
2
|
+
module Executor
|
|
3
|
+
module Methods
|
|
4
|
+
module MergeBranches
|
|
5
|
+
# Joins one or more named branches. Joining is separate from dispatch so
# that branches can run concurrently.
#
# Performs a single immediate check. If every branch is done, the merge log
# is completed and the call returns. Otherwise it hands polling over to
# BranchMergeJob and halts the workflow, so the parent is not replayed for
# each poll. The poll cadence is bounded by min_interval / max_interval.
#
# @param names [Array<String, Symbol>] branch names to join
# @param min_interval [ActiveSupport::Duration, Numeric] lower bound of the poll interval
# @param max_interval [ActiveSupport::Duration, Numeric] upper bound of the poll interval
# @return [nil] when the merge is already complete or completes on this call
# @raise [InvalidStepName] when a name contains ","
# @raise [ArgumentError] when min_interval is greater than max_interval
# @raise [UnknownBranchError] when a name has no open branch (via open_branch!)
def merge_branches(*names, min_interval: 5.seconds, max_interval: 5.minutes)
  names.each do |nm|
    validate_step_name_segment!(nm) # rejects "$"
    next unless nm.to_s.include?(",")

    raise InvalidStepName,
      "branch name may not contain ',' (reserved merge separator): #{nm.inspect}"
  end

  # Checked here, in the parent, so a bad cadence fails at the call site.
  # Inside the poller the clamp(min, max) would raise ArgumentError, which
  # dead-letters BranchMergeJob and orphans the parent.
  if min_interval > max_interval
    raise ArgumentError,
      "min_interval (#{min_interval}) must be <= max_interval (#{max_interval})"
  end

  branch_names = names.map(&:to_s).uniq
  merge_step = "merge$#{branch_names.sort.join(",")}"
  merge_log = find_or_create_execution_log!(merge_step) { |l| l.started_at = Time.current }

  # Removing the names from the registry keeps the completion gate from
  # treating these branches as unmerged.
  forget_branches = -> { branch_names.each { |nm| @open_branches&.delete(nm) } }

  if merge_log.completed?
    forget_branches.call
    return
  end

  branch_log_ids = branch_names.map { |nm| open_branch!(nm)[:log_id] }

  if branches_done?(branch_log_ids)
    forget_branches.call
    merge_log.update!(state: :completed, completed_at: Time.current)
    return
  end

  enqueue_branch_merge_job(branch_log_ids, min_interval, max_interval)
  halt_execution!
end
|
|
49
|
+
alias_method :merge_branch, :merge_branches
|
|
50
|
+
|
|
51
|
+
private
|
|
52
|
+
|
|
53
|
+
# Looks up the registry entry of an open branch by name.
#
# @param name [String, Symbol] branch name
# @return [Hash] the @open_branches entry stored for that name
# @raise [UnknownBranchError] when no branch with that name is open
def open_branch!(name)
  registry = @open_branches || {}
  branch_key = name.to_s
  return registry[branch_key] if registry.key?(branch_key)

  raise UnknownBranchError, "no open branch named #{name.inspect} (open it with `branch #{name.inspect} do … end` first)"
end
|
|
58
|
+
|
|
59
|
+
# Tells whether every given branch is finished according to BranchProbe.
#
# @param branch_log_ids [Array] ids of the branch execution logs
# @return [Boolean] true when BranchProbe.done? holds for all ids (and for an empty list)
def branches_done?(branch_log_ids)
  branch_log_ids.none? { |log_id| !BranchProbe.done?(log_id) }
end
|
|
62
|
+
|
|
63
|
+
# Starts a BranchMergeJob poller chain for the given branch logs.
#
# A fresh fencing token is generated and written to each branch log's
# metadata ("poll_token") under a row lock, so the read-modify-write does
# not overwrite a concurrent poll-state write from an in-flight poller.
# Rotating the token orphans any earlier poller chain, because its token no
# longer matches; only the chain enqueued here drives the merge
# (see BranchMergeJob#superseded?).
#
# @param branch_log_ids [Array] ids of the branch execution logs to poll
# @param min_interval [#to_i] lower bound of the poll interval
# @param max_interval [#to_i] upper bound of the poll interval
def enqueue_branch_merge_job(branch_log_ids, min_interval, max_interval)
  fencing_token = SecureRandom.uuid

  ExecutionLog.where(id: branch_log_ids).find_each do |branch_log|
    branch_log.with_lock do
      stamped = (branch_log.metadata || {}).merge("poll_token" => fencing_token)
      branch_log.update!(metadata: stamped)
    end
  end

  job_args = [
    @workflow.key, self.class.to_s, branch_log_ids,
    min_interval.to_i, max_interval.to_i, fencing_token
  ]
  BranchMergeJob.perform_later(*job_args)
end
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
end
|
|
@@ -102,10 +102,8 @@ module ChronoForge
|
|
|
102
102
|
last_executed_at: Time.current
|
|
103
103
|
)
|
|
104
104
|
|
|
105
|
-
#
|
|
106
|
-
|
|
107
|
-
.set(wait: duration)
|
|
108
|
-
.perform_later(@workflow.key)
|
|
105
|
+
# Record the reschedule; the executor publishes it after lock release.
|
|
106
|
+
enqueue_continuation(wait: duration)
|
|
109
107
|
|
|
110
108
|
# Halt current execution
|
|
111
109
|
halt_execution!
|