chrono_forge 0.9.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/README.md +305 -44
- data/docs/superpowers/plans/2026-06-25-chrono_forge-dashboard.md +1748 -0
- data/docs/superpowers/plans/2026-06-25-chrono_forge-dashboard.md.tasks.json +17 -0
- data/docs/superpowers/plans/2026-06-25-composite-retry-policies.md +930 -0
- data/docs/superpowers/plans/2026-06-25-composite-retry-policies.md.tasks.json +54 -0
- data/docs/superpowers/plans/2026-06-25-reserved-kwarg-guard.md +241 -0
- data/docs/superpowers/plans/2026-06-25-reserved-kwarg-guard.md.tasks.json +12 -0
- data/docs/superpowers/plans/2026-06-26-branches-spawn-merge.md +1378 -0
- data/docs/superpowers/plans/2026-06-26-branches-spawn-merge.md.tasks.json +67 -0
- data/docs/superpowers/plans/2026-06-26-deferral-continuation-race-and-catchup.md +709 -0
- data/docs/superpowers/plans/2026-06-26-deferral-continuation-race-and-catchup.md.tasks.json +19 -0
- data/docs/superpowers/specs/2026-06-03-unified-retry-policy-design.md +226 -0
- data/docs/superpowers/specs/2026-06-25-chrono_forge-dashboard-design.md +190 -0
- data/docs/superpowers/specs/2026-06-25-composite-retry-policies-design.md +228 -0
- data/docs/superpowers/specs/2026-06-25-reserved-kwarg-guard-design.md +169 -0
- data/docs/superpowers/specs/2026-06-25-spawn-merge-branches-design.md +468 -0
- data/docs/superpowers/specs/2026-06-26-dashboard-branch-view-design.md +142 -0
- data/docs/superpowers/specs/2026-06-26-deferral-continuation-race-and-catchup-design.md +265 -0
- data/lib/chrono_forge/branch_merge_job.rb +138 -0
- data/lib/chrono_forge/branch_probe.rb +26 -0
- data/lib/chrono_forge/cleanup.rb +6 -0
- data/lib/chrono_forge/execution_log.rb +6 -0
- data/lib/chrono_forge/executor/composite_retry_policy.rb +47 -0
- data/lib/chrono_forge/executor/methods/branch.rb +185 -0
- data/lib/chrono_forge/executor/methods/durably_execute.rb +21 -19
- data/lib/chrono_forge/executor/methods/durably_repeat.rb +118 -25
- data/lib/chrono_forge/executor/methods/merge_branches.rb +83 -0
- data/lib/chrono_forge/executor/methods/wait.rb +2 -4
- data/lib/chrono_forge/executor/methods/wait_until.rb +25 -25
- data/lib/chrono_forge/executor/methods/workflow_states.rb +16 -0
- data/lib/chrono_forge/executor/methods.rb +2 -0
- data/lib/chrono_forge/executor/retry_policy.rb +111 -0
- data/lib/chrono_forge/executor.rb +216 -28
- data/lib/chrono_forge/version.rb +1 -1
- data/lib/chrono_forge/workflow.rb +10 -1
- data/lib/generators/chrono_forge/migration_actions.rb +1 -0
- data/lib/generators/chrono_forge/templates/add_chrono_forge_parent_execution_log.rb +38 -0
- metadata +42 -5
- data/lib/chrono_forge/executor/retry_strategy.rb +0 -29
|
@@ -14,7 +14,11 @@ module ChronoForge
|
|
|
14
14
|
# The method should return a truthy value when the condition is met.
|
|
15
15
|
# @param timeout [ActiveSupport::Duration] Maximum time to wait for condition (default: 1.hour)
|
|
16
16
|
# @param check_interval [ActiveSupport::Duration] Time between condition checks (default: 15.minutes)
|
|
17
|
-
# @param
|
|
17
|
+
# @param retry_policy [RetryPolicy, nil] Policy governing errors raised *while
|
|
18
|
+
# evaluating the condition* (not the poll cadence). When nil, uses
|
|
19
|
+
# RetryPolicy.wait_default, which retries nothing — a raised condition fails
|
|
20
|
+
# fast. Pass a policy with `retry_on:` to opt specific errors into retrying.
|
|
21
|
+
# Note: unlike steps, wait_until does NOT inherit the class-level default.
|
|
18
22
|
#
|
|
19
23
|
# @return [true] When the condition is met
|
|
20
24
|
#
|
|
@@ -31,7 +35,7 @@ module ChronoForge
|
|
|
31
35
|
# wait_until :database_migration_complete?,
|
|
32
36
|
# timeout: 2.hours,
|
|
33
37
|
# check_interval: 30.seconds,
|
|
34
|
-
# retry_on: [ActiveRecord::ConnectionNotEstablished, Net::TimeoutError]
|
|
38
|
+
# retry_policy: RetryPolicy.new(retry_on: [ActiveRecord::ConnectionNotEstablished, Net::TimeoutError])
|
|
35
39
|
#
|
|
36
40
|
# @example Waiting for external system
|
|
37
41
|
# def third_party_service_ready?
|
|
@@ -42,7 +46,7 @@ module ChronoForge
|
|
|
42
46
|
# wait_until :third_party_service_ready?,
|
|
43
47
|
# timeout: 1.hour,
|
|
44
48
|
# check_interval: 2.minutes,
|
|
45
|
-
# retry_on: [Net::TimeoutError, Net::HTTPClientException]
|
|
49
|
+
# retry_policy: RetryPolicy.new(retry_on: [Net::TimeoutError, Net::HTTPClientException])
|
|
46
50
|
#
|
|
47
51
|
# @example Waiting for file processing
|
|
48
52
|
# def file_processing_complete?
|
|
@@ -60,7 +64,7 @@ module ChronoForge
|
|
|
60
64
|
# The condition method is called on each check interval:
|
|
61
65
|
# - Should return truthy value when condition is met
|
|
62
66
|
# - Should return falsy value when condition is not yet met
|
|
63
|
-
# - Can raise exceptions that will be handled based on
|
|
67
|
+
# - Can raise exceptions that will be handled based on the retry_policy
|
|
64
68
|
#
|
|
65
69
|
# === Timeout Handling
|
|
66
70
|
# - Timeout is calculated from the first execution start time
|
|
@@ -69,9 +73,10 @@ module ChronoForge
|
|
|
69
73
|
#
|
|
70
74
|
# === Error Handling
|
|
71
75
|
# - Exceptions during condition evaluation are caught and logged
|
|
72
|
-
# - If
|
|
73
|
-
#
|
|
74
|
-
# -
|
|
76
|
+
# - If the retry_policy deems the error retryable, it triggers a retry with the
|
|
77
|
+
# policy's backoff
|
|
78
|
+
# - Otherwise the error causes immediate failure with ExecutionFailedError
|
|
79
|
+
# - Backoff is governed by the resolved RetryPolicy
|
|
75
80
|
#
|
|
76
81
|
# === Persistence and Resumability
|
|
77
82
|
# - Wait state is persisted in execution logs with metadata
|
|
@@ -85,7 +90,8 @@ module ChronoForge
|
|
|
85
90
|
# - Tracks attempt count and execution times
|
|
86
91
|
# - Records final result (true for success, :timed_out for timeout)
|
|
87
92
|
#
|
|
88
|
-
def wait_until(condition, timeout: 1.hour, check_interval: 15.minutes,
|
|
93
|
+
def wait_until(condition, timeout: 1.hour, check_interval: 15.minutes, retry_policy: nil)
|
|
94
|
+
policy = wait_retry_policy(retry_policy)
|
|
89
95
|
validate_step_name_segment!(condition)
|
|
90
96
|
step_name = "wait_until$#{condition}"
|
|
91
97
|
# Find or create execution log
|
|
@@ -117,16 +123,15 @@ module ChronoForge
|
|
|
117
123
|
Rails.logger.error { "Error evaluating condition #{condition}: #{e.message}" }
|
|
118
124
|
self.class::ExecutionTracker.track_error(workflow, e, execution_log: execution_log)
|
|
119
125
|
|
|
120
|
-
# Optional retry logic
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
)
|
|
126
|
+
# Optional retry logic for errors raised while evaluating the
|
|
127
|
+
# condition. The poll cadence (check_interval/timeout) below is
|
|
128
|
+
# separate and unaffected by the retry policy.
|
|
129
|
+
backoff = policy.retry_backoff(e, attempts: execution_log.attempts) do |policy_key|
|
|
130
|
+
bump_retry_count!(execution_log, policy_key)
|
|
131
|
+
end
|
|
132
|
+
if backoff
|
|
133
|
+
# Reschedule with the policy's backoff (published after lock release).
|
|
134
|
+
enqueue_continuation(wait: backoff)
|
|
130
135
|
|
|
131
136
|
# Halt current execution
|
|
132
137
|
halt_execution!
|
|
@@ -167,13 +172,8 @@ module ChronoForge
|
|
|
167
172
|
raise error
|
|
168
173
|
end
|
|
169
174
|
|
|
170
|
-
# Reschedule
|
|
171
|
-
|
|
172
|
-
.set(wait: check_interval)
|
|
173
|
-
.perform_later(
|
|
174
|
-
@workflow.key,
|
|
175
|
-
wait_condition: condition
|
|
176
|
-
)
|
|
175
|
+
# Reschedule the poll (published after lock release).
|
|
176
|
+
enqueue_continuation(wait: check_interval, wait_condition: condition)
|
|
177
177
|
|
|
178
178
|
# Halt current execution
|
|
179
179
|
halt_execution!
|
|
@@ -48,6 +48,8 @@ module ChronoForge
|
|
|
48
48
|
# - Safe to call multiple times without side effects
|
|
49
49
|
#
|
|
50
50
|
def complete_workflow!
|
|
51
|
+
enforce_branch_joins!
|
|
52
|
+
|
|
51
53
|
# Create an execution log for workflow completion
|
|
52
54
|
execution_log = find_or_create_execution_log!("$workflow_completion$") do |log|
|
|
53
55
|
log.started_at = Time.current
|
|
@@ -80,6 +82,20 @@ module ChronoForge
|
|
|
80
82
|
end
|
|
81
83
|
end
|
|
82
84
|
|
|
85
|
+
# Every branch must be joined: automerge branches join inline at their
|
|
86
|
+
# block's close (removing themselves from @open_branches); explicitly
|
|
87
|
+
# awaited branches are removed by merge_branches. Anything still in
|
|
88
|
+
# @open_branches here was opened but never joined — fail fast.
|
|
89
|
+
def enforce_branch_joins!
|
|
90
|
+
leftover = (@open_branches || {}).keys
|
|
91
|
+
return if leftover.empty?
|
|
92
|
+
|
|
93
|
+
raise UnmergedBranchError,
|
|
94
|
+
"branch(es) #{leftover.join(", ")} were opened but never merged. " \
|
|
95
|
+
"Add `merge_branches #{leftover.map { |n| ":#{n}" }.join(", ")}` " \
|
|
96
|
+
"or open with `branch(..., automerge: true)`."
|
|
97
|
+
end
|
|
98
|
+
|
|
83
99
|
# Marks a workflow as failed due to an unrecoverable error.
|
|
84
100
|
#
|
|
85
101
|
# This method provides durable workflow failure tracking with proper state
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
module ChronoForge
|
|
2
|
+
module Executor
|
|
3
|
+
# A single, unified description of retry behavior shared by every retry site
|
|
4
|
+
# (workflow-level uncaught errors, durably_execute, durably_repeat, and
|
|
5
|
+
# wait_until's condition errors).
|
|
6
|
+
#
|
|
7
|
+
# It answers the only two questions a retry site ever asks:
|
|
8
|
+
# - retryable?(error, attempts) — should this failure be retried?
|
|
9
|
+
# - backoff_for(attempts) — how long until the next attempt?
|
|
10
|
+
#
|
|
11
|
+
# `attempts` is always the 1-based count of attempts made so far, *including*
|
|
12
|
+
# the one that just failed (matching ExecutionLog#attempts). So on the first
|
|
13
|
+
# failure `attempts == 1`.
|
|
14
|
+
class RetryPolicy
|
|
15
|
+
attr_reader :max_attempts, :base, :cap, :jitter, :retry_on
|
|
16
|
+
|
|
17
|
+
# @param max_attempts [Integer, nil] cap on total attempts; nil = no count
|
|
18
|
+
# cap (bounded elsewhere, e.g. wait_until's timeout)
|
|
19
|
+
# @param base [Numeric, ActiveSupport::Duration] delay of the first retry
|
|
20
|
+
# @param cap [Numeric, ActiveSupport::Duration] ceiling for a single delay
|
|
21
|
+
# @param jitter [Boolean] apply equal jitter to spread retries
|
|
22
|
+
# @param retry_on [Array<Class>, nil] nil = retry any StandardError;
|
|
23
|
+
# an array = retry only those classes (and subclasses); [] = retry nothing
|
|
24
|
+
def initialize(max_attempts: 3, base: 1, cap: 30, jitter: true, retry_on: nil)
|
|
25
|
+
@max_attempts = max_attempts
|
|
26
|
+
@base = base
|
|
27
|
+
@cap = cap
|
|
28
|
+
@jitter = jitter
|
|
29
|
+
@retry_on = retry_on
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def retryable?(error, attempts)
|
|
33
|
+
within_attempt_cap?(attempts) && retryable_error?(error)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# Equal jitter: half the computed delay plus a random portion of the other
|
|
37
|
+
# half. Computed once at re-enqueue time and never persisted, so the
|
|
38
|
+
# randomness does not affect replay determinism.
|
|
39
|
+
def backoff_for(attempts)
|
|
40
|
+
exponent = [attempts - 1, 0].max
|
|
41
|
+
delay = [cap.to_f, base.to_f * (2**exponent)].min
|
|
42
|
+
delay = (delay / 2) + rand(0.0..(delay / 2)) if jitter
|
|
43
|
+
delay.seconds
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
# Public routing predicate: would this policy handle this error at all?
|
|
47
|
+
# (independent of the attempt cap). nil retry_on = any StandardError;
|
|
48
|
+
# [] = nothing; a list = those classes and their subclasses.
|
|
49
|
+
def matches?(error)
|
|
50
|
+
retryable_error?(error)
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Single-call decision used by every retry site: the backoff Duration to
|
|
54
|
+
# retry, or nil to stop. A plain policy uses `attempts` and ignores any
|
|
55
|
+
# block (the block exists only so a CompositeRetryPolicy can supply a
|
|
56
|
+
# per-error count — see CompositeRetryPolicy#retry_backoff).
|
|
57
|
+
def retry_backoff(error, attempts:)
|
|
58
|
+
retryable?(error, attempts) ? backoff_for(attempts) : nil
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# Stable per-policy identifier derived from the errors this policy
|
|
62
|
+
# *declares* (its retry_on), not the error thrown. Inside a composite this
|
|
63
|
+
# keys the policy's attempt budget, so the budget is shared across every
|
|
64
|
+
# class the policy lists (and their subclasses) and is independent of the
|
|
65
|
+
# policy's position — reordering the composite does not reset counts. A
|
|
66
|
+
# catch-all (retry_on: nil) keys "*".
|
|
67
|
+
def budget_key
|
|
68
|
+
retry_on.nil? ? "*" : retry_on.map(&:name).sort.join(",")
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
def self.step_default
|
|
72
|
+
new(max_attempts: 3, base: 1, cap: 30, jitter: true, retry_on: nil)
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
# Workflow-level (uncaught) errors retry the whole workflow from the top
|
|
76
|
+
# (replaying completed steps). They cover two populations the default can't
|
|
77
|
+
# distinguish: transient infra blips — worth riding out — and deterministic
|
|
78
|
+
# bugs, where every replay is waste. 10 attempts gives a tolerant window of
|
|
79
|
+
# up to ~8.5 min (≈4 min typical, since equal jitter puts each wait in
|
|
80
|
+
# [d/2, d]) — enough for a DB failover or deploy restart — without dragging
|
|
81
|
+
# out the bug case; cap (600s / 10 min) bounds any single backoff and only
|
|
82
|
+
# binds if a caller configures more attempts.
|
|
83
|
+
def self.workflow_default
|
|
84
|
+
new(max_attempts: 10, base: 1, cap: 600, jitter: true, retry_on: nil)
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def self.wait_default
|
|
88
|
+
new(max_attempts: nil, base: 1, cap: 30, jitter: true, retry_on: [])
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Build a composite policy from an ordered list of RetryPolicy objects.
|
|
92
|
+
def self.compose(*policies)
|
|
93
|
+
CompositeRetryPolicy.new(policies)
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
private
|
|
97
|
+
|
|
98
|
+
def within_attempt_cap?(attempts)
|
|
99
|
+
max_attempts.nil? || attempts < max_attempts
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
def retryable_error?(error)
|
|
103
|
+
if retry_on.nil?
|
|
104
|
+
error.is_a?(StandardError)
|
|
105
|
+
else
|
|
106
|
+
retry_on.any? { |klass| error.is_a?(klass) }
|
|
107
|
+
end
|
|
108
|
+
end
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
@@ -14,46 +14,105 @@ module ChronoForge
|
|
|
14
14
|
|
|
15
15
|
class InvalidStepName < NotExecutableError; end
|
|
16
16
|
|
|
17
|
+
# spawn/spawn_each called outside a branch block. NotExecutableError so it
|
|
18
|
+
# propagates (fail-fast on a programming error) rather than being retried.
|
|
19
|
+
class NotInBranchError < NotExecutableError; end
|
|
20
|
+
|
|
21
|
+
# A branch was opened but neither merged via merge_branches nor declared
|
|
22
|
+
# automerge: true. Raised at the completion gate. Fail-fast (not retried).
|
|
23
|
+
class UnmergedBranchError < NotExecutableError; end
|
|
24
|
+
|
|
25
|
+
# merge_branches given a name that was never opened as a branch this pass.
|
|
26
|
+
# NotExecutableError so it propagates (fail-fast) instead of being retried.
|
|
27
|
+
class UnknownBranchError < NotExecutableError; end
|
|
28
|
+
|
|
17
29
|
# "$" separates the segments of a step name (e.g. "durably_repeat$name$ts").
|
|
18
30
|
# User-supplied names/methods must not contain it.
|
|
19
31
|
STEP_NAME_DELIMITER = "$"
|
|
20
32
|
|
|
33
|
+
# Keyword args ChronoForge threads through job args internally. Users must
|
|
34
|
+
# not pass these to perform_now/perform_later; the framework injects them
|
|
35
|
+
# via `.set(...)` continuations, whose ConfiguredJob proxy bypasses the
|
|
36
|
+
# class-level guard in `prepended` below.
|
|
37
|
+
RESERVED_KWARGS = %i[attempt retry_counts retry_workflow].freeze
|
|
38
|
+
|
|
21
39
|
include Methods
|
|
22
40
|
|
|
23
41
|
# Add class methods
|
|
24
42
|
def self.prepended(base)
|
|
43
|
+
# Class-wide default retry policy, inherited by subclasses. Set via the
|
|
44
|
+
# `retry_policy` DSL below; nil means "use the per-site built-in default".
|
|
45
|
+
base.class_attribute :default_retry_policy, instance_accessor: false, default: nil
|
|
46
|
+
|
|
25
47
|
class << base
|
|
26
|
-
#
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
48
|
+
# Public enqueue contract: exactly one positional (`key`) plus keywords.
|
|
49
|
+
# Reserved internal kwargs (RESERVED_KWARGS) are rejected here; the
|
|
50
|
+
# framework injects them only via `.set(...)` continuations, whose
|
|
51
|
+
# ActiveJob ConfiguredJob proxy bypasses these class-level overrides.
|
|
52
|
+
def perform_now(key, *extra, **kwargs)
|
|
53
|
+
__validate_enqueue!(key, extra, kwargs)
|
|
54
|
+
super(key, **kwargs)
|
|
32
55
|
end
|
|
33
56
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
57
|
+
def perform_later(key, *extra, **kwargs)
|
|
58
|
+
__validate_enqueue!(key, extra, kwargs)
|
|
59
|
+
super(key, **kwargs)
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
# Re-run a failed/stalled workflow. Routes through `.set(...)` so the
|
|
63
|
+
# reserved `retry_workflow: true` flag reaches the instance perform
|
|
64
|
+
# without tripping the public guard above.
|
|
65
|
+
def retry_now(key, **kwargs)
|
|
66
|
+
__validate_enqueue!(key, [], kwargs)
|
|
67
|
+
set.perform_now(key, retry_workflow: true, **kwargs)
|
|
40
68
|
end
|
|
41
69
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
70
|
+
def retry_later(key, **kwargs)
|
|
71
|
+
__validate_enqueue!(key, [], kwargs)
|
|
72
|
+
set.perform_later(key, retry_workflow: true, **kwargs)
|
|
45
73
|
end
|
|
46
74
|
|
|
47
|
-
#
|
|
48
|
-
|
|
49
|
-
|
|
75
|
+
# Class-level DSL to set this workflow's default retry policy. Applies to
|
|
76
|
+
# workflow-level retries and to steps without a per-call override.
|
|
77
|
+
# Positional RetryPolicy objects build a composite (per-error budgets);
|
|
78
|
+
# keyword options build a single RetryPolicy. The two forms are mutually
|
|
79
|
+
# exclusive.
|
|
80
|
+
def retry_policy(*policies, **opts)
|
|
81
|
+
if policies.any? && opts.any?
|
|
82
|
+
raise ArgumentError, "retry_policy takes either positional policies or keyword options, not both"
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
self.default_retry_policy =
|
|
86
|
+
policies.any? ? RetryPolicy.compose(*policies) : RetryPolicy.new(**opts)
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
private
|
|
90
|
+
|
|
91
|
+
def __validate_enqueue!(key, extra, kwargs)
|
|
92
|
+
unless key.is_a?(String)
|
|
93
|
+
raise ArgumentError, "Workflow key must be a string as the first argument"
|
|
94
|
+
end
|
|
95
|
+
unless extra.empty?
|
|
96
|
+
raise ArgumentError,
|
|
97
|
+
"ChronoForge workflows accept only `key` positionally; pass " \
|
|
98
|
+
"everything else as keywords (got #{extra.size} extra positional arg(s))"
|
|
99
|
+
end
|
|
100
|
+
reserved = kwargs.keys & RESERVED_KWARGS
|
|
101
|
+
if reserved.any?
|
|
102
|
+
raise ArgumentError,
|
|
103
|
+
"#{reserved.join(", ")} #{reserved.one? ? "is a reserved" : "are reserved"} " \
|
|
104
|
+
"ChronoForge #{reserved.one? ? "keyword" : "keywords"} and cannot be passed to perform_now/perform_later"
|
|
105
|
+
end
|
|
50
106
|
end
|
|
51
107
|
end
|
|
52
108
|
end
|
|
53
109
|
|
|
54
|
-
def perform(key, attempt: 0, retry_workflow: false, options: {}, **kwargs)
|
|
55
|
-
#
|
|
56
|
-
|
|
110
|
+
def perform(key, attempt: 0, retry_counts: {}, retry_workflow: false, options: {}, **kwargs)
|
|
111
|
+
# Safety net: prevent re-running a workflow whose attempts are exhausted
|
|
112
|
+
# (e.g. a stale job left in the queue). The normal exhaustion path fails the
|
|
113
|
+
# workflow from the rescue below before this is ever reached.
|
|
114
|
+
policy = workflow_retry_policy
|
|
115
|
+
if policy.max_attempts && attempt >= policy.max_attempts
|
|
57
116
|
Rails.logger.error { "ChronoForge:#{self.class} max attempts reached for job workflow(#{key})" }
|
|
58
117
|
return
|
|
59
118
|
end
|
|
@@ -101,16 +160,39 @@ module ChronoForge
|
|
|
101
160
|
Rails.logger.error { "ChronoForge:#{self.class}(#{key}) workflow execution failed" }
|
|
102
161
|
error_log = self.class::ExecutionTracker.track_error(workflow, e, attempt: attempt)
|
|
103
162
|
|
|
104
|
-
# Retry if applicable
|
|
105
|
-
|
|
106
|
-
|
|
163
|
+
# Retry if applicable. `attempt` is a 0-based index, so the count of
|
|
164
|
+
# attempts made so far (including this one) is attempt + 1. For a
|
|
165
|
+
# composite policy the per-error budget lives in `retry_counts` (keyed by
|
|
166
|
+
# the matched policy's budget_key) and rides along the job args, mirroring
|
|
167
|
+
# how `attempt` is threaded — there is no execution log at this level.
|
|
168
|
+
attempts_made = attempt + 1
|
|
169
|
+
backoff = policy.retry_backoff(e, attempts: attempts_made) do |policy_key|
|
|
170
|
+
retry_counts[policy_key] = retry_counts[policy_key].to_i + 1
|
|
171
|
+
retry_counts[policy_key]
|
|
172
|
+
end
|
|
173
|
+
if backoff
|
|
174
|
+
enqueue_continuation(wait: backoff, attempt: attempts_made, retry_counts: retry_counts)
|
|
107
175
|
else
|
|
108
176
|
fail_workflow! error_log
|
|
109
177
|
end
|
|
110
178
|
ensure
|
|
111
179
|
if lock_acquired # Only release lock if we acquired it
|
|
112
|
-
context.save!
|
|
113
|
-
|
|
180
|
+
# Release the lock and publish the continuation even if context.save!
|
|
181
|
+
# raises — otherwise a transient save failure would leave the lock held
|
|
182
|
+
# (until it goes stale) AND drop the continuation, stranding the workflow
|
|
183
|
+
# with nothing scheduled to resume it. On a save failure the continuation
|
|
184
|
+
# resumes from the last persisted context, which is exactly crash
|
|
185
|
+
# semantics (durable steps replay).
|
|
186
|
+
begin
|
|
187
|
+
context.save!
|
|
188
|
+
ensure
|
|
189
|
+
self.class::LockStrategy.release_lock(job_id, workflow)
|
|
190
|
+
# Publish the continuation only now — after the lock is released — so a
|
|
191
|
+
# zero-delay, same-key continuation can't lose the acquire race against
|
|
192
|
+
# this still-locked job. If release_lock raised (this job overran and
|
|
193
|
+
# lost the lock), we never reach here and another job owns continuation.
|
|
194
|
+
flush_continuation!
|
|
195
|
+
end
|
|
114
196
|
end
|
|
115
197
|
end
|
|
116
198
|
end
|
|
@@ -128,6 +210,15 @@ module ChronoForge
|
|
|
128
210
|
workflow.kwargs = kwargs
|
|
129
211
|
workflow.started_at = Time.current
|
|
130
212
|
end
|
|
213
|
+
|
|
214
|
+
# Branch children are pre-inserted by their parent (dispatch_children's
|
|
215
|
+
# insert_all), so the creation block above never runs for them and their
|
|
216
|
+
# started_at stays nil. Stamp it the first time the child actually executes
|
|
217
|
+
# so started_at reliably means "has been picked up and run" — the
|
|
218
|
+
# BranchMergeJob rekick poller treats a nil started_at as a never-executed
|
|
219
|
+
# (dropped) child, and must not mistake a child that ran and is now parked
|
|
220
|
+
# on a wait (also :idle) for one that was never picked up.
|
|
221
|
+
@workflow.update_column(:started_at, Time.current) if @workflow.started_at.nil?
|
|
131
222
|
end
|
|
132
223
|
|
|
133
224
|
def setup_context!
|
|
@@ -148,11 +239,50 @@ module ChronoForge
|
|
|
148
239
|
# which accumulate unbounded repetition logs: we touch only the rows we need,
|
|
149
240
|
# never the whole set. create_or_find_by! is used only on a miss, keeping
|
|
150
241
|
# creation safe if a lock takeover ever lets two executors race.
|
|
242
|
+
#
|
|
243
|
+
# Completed steps are short-circuited up front from a single bulk read (see
|
|
244
|
+
# #completed_step_cache) so that replaying N already-done steps costs one
|
|
245
|
+
# query for the whole batch rather than one SELECT each — without that, a
|
|
246
|
+
# workflow with hundreds of steps pays hundreds of SELECTs on every resume.
|
|
247
|
+
# The cached value is a readonly, unsaved stand-in: completed steps are only
|
|
248
|
+
# ever read (.completed? and metadata["result"]), never written, so it needs
|
|
249
|
+
# no database row.
|
|
151
250
|
def find_or_create_execution_log!(step_name, &)
|
|
251
|
+
if completed_step_cache.key?(step_name)
|
|
252
|
+
return ExecutionLog.new(
|
|
253
|
+
workflow: @workflow, step_name: step_name, state: :completed,
|
|
254
|
+
metadata: completed_step_cache[step_name]
|
|
255
|
+
).tap(&:readonly!)
|
|
256
|
+
end
|
|
257
|
+
|
|
152
258
|
ExecutionLog.find_by(workflow: @workflow, step_name: step_name) ||
|
|
153
259
|
ExecutionLog.create_or_find_by!(workflow: @workflow, step_name: step_name, &)
|
|
154
260
|
end
|
|
155
261
|
|
|
262
|
+
# One bulk read of this workflow's completed steps, mapping step_name to its
|
|
263
|
+
# metadata, memoized for the duration of a single replay pass.
|
|
264
|
+
#
|
|
265
|
+
# Only completed rows are loaded: they are the ones replayed steps short-
|
|
266
|
+
# circuit on, and once completed a step never changes, so the snapshot stays
|
|
267
|
+
# valid for the whole pass. Plucking (step_name, metadata) avoids
|
|
268
|
+
# instantiating AR objects and keeps the read portable — Rails type-casts the
|
|
269
|
+
# JSON metadata column to a Hash on SQLite, PostgreSQL and MySQL alike, with
|
|
270
|
+
# no database-specific JSON extraction.
|
|
271
|
+
#
|
|
272
|
+
# durably_repeat repetition logs (durably_repeat$<name>$<timestamp>) are
|
|
273
|
+
# deliberately excluded: they accumulate without bound yet are never replayed
|
|
274
|
+
# (durably_repeat only ever looks up its coordination log plus the single
|
|
275
|
+
# current repetition), so pulling them into memory would be all cost and no
|
|
276
|
+
# benefit. Their coordination log (durably_repeat$<name>, only two segments)
|
|
277
|
+
# is not matched by the pattern and is still cached.
|
|
278
|
+
def completed_step_cache
|
|
279
|
+
@completed_step_cache ||= ExecutionLog
|
|
280
|
+
.where(workflow: @workflow, state: ExecutionLog.states[:completed])
|
|
281
|
+
.where.not("step_name LIKE ?", "durably_repeat#{STEP_NAME_DELIMITER}%#{STEP_NAME_DELIMITER}%")
|
|
282
|
+
.pluck(:step_name, :metadata)
|
|
283
|
+
.to_h
|
|
284
|
+
end
|
|
285
|
+
|
|
156
286
|
# Guards the user-supplied portion of a step name (a custom name, method, or
|
|
157
287
|
# condition). The "$" separator is reserved for the framework's own segment
|
|
158
288
|
# structure, so a user value containing it would make step names ambiguous
|
|
@@ -164,8 +294,66 @@ module ChronoForge
|
|
|
164
294
|
"ChronoForge step name may not contain '#{STEP_NAME_DELIMITER}' (reserved separator): #{segment.inspect}"
|
|
165
295
|
end
|
|
166
296
|
|
|
167
|
-
|
|
168
|
-
|
|
297
|
+
# Retry policy for workflow-level (uncaught) errors: the class default if one
|
|
298
|
+
# was declared, else the workflow built-in (10 attempts, up to ~8.5 min).
|
|
299
|
+
# Each retry replays the whole workflow from the top.
|
|
300
|
+
def workflow_retry_policy
|
|
301
|
+
self.class.default_retry_policy || RetryPolicy.workflow_default
|
|
302
|
+
end
|
|
303
|
+
|
|
304
|
+
# Retry policy for a durable step: an explicit per-call override, else the
|
|
305
|
+
# class default, else the step built-in (short, snappy fast-fail).
|
|
306
|
+
def step_retry_policy(override)
|
|
307
|
+
coerce_policy(override) || self.class.default_retry_policy || RetryPolicy.step_default
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
# Retry policy for a wait_until condition error. Deliberately does NOT inherit
|
|
311
|
+
# the class default, so a class-wide "retry everything" can't silently turn
|
|
312
|
+
# condition-evaluation bugs into retried errors. Built-in retries nothing.
|
|
313
|
+
def wait_retry_policy(override)
|
|
314
|
+
coerce_policy(override) || RetryPolicy.wait_default
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
# Normalize a retry-policy value: an Array becomes a composite; a RetryPolicy
|
|
318
|
+
# or CompositeRetryPolicy passes through; nil stays nil.
|
|
319
|
+
def coerce_policy(value)
|
|
320
|
+
value.is_a?(Array) ? RetryPolicy.compose(*value) : value
|
|
321
|
+
end
|
|
322
|
+
|
|
323
|
+
# JSON metadata key holding the per-error attempt counts of a composite
|
|
324
|
+
# policy, keyed by the matched policy's declared errors (RetryPolicy#budget_key).
|
|
325
|
+
RETRY_COUNTS_KEY = "retry_counts"
|
|
326
|
+
|
|
327
|
+
# Increment the matched policy's slot in the log's retry-count map and return
|
|
328
|
+
# the new count. Reassigns `metadata` so the JSON column is marked dirty.
|
|
329
|
+
def bump_retry_count!(log, policy_key)
|
|
330
|
+
meta = log.metadata || {}
|
|
331
|
+
counts = meta[RETRY_COUNTS_KEY] || {}
|
|
332
|
+
counts[policy_key] = counts[policy_key].to_i + 1
|
|
333
|
+
meta[RETRY_COUNTS_KEY] = counts
|
|
334
|
+
log.update!(metadata: meta)
|
|
335
|
+
counts[policy_key]
|
|
336
|
+
end
|
|
337
|
+
|
|
338
|
+
# Record the continuation this job intends to enqueue. It is NOT published
|
|
339
|
+
# here: publishing while the lock is still held lets another worker claim it
|
|
340
|
+
# and lose the lock-acquisition race. The executor flushes it in `ensure`,
|
|
341
|
+
# after release_lock (see #flush_continuation!). At most one continuation is
|
|
342
|
+
# recorded per job run (every primitive records one then halts, or falls
|
|
343
|
+
# through the workflow-retry rescue).
|
|
344
|
+
def enqueue_continuation(wait:, **kwargs)
|
|
345
|
+
@continuation = {wait: wait, kwargs: kwargs}
|
|
346
|
+
end
|
|
347
|
+
|
|
348
|
+
# Publish the recorded continuation, if any. Called from `ensure` only after
|
|
349
|
+
# the lock row has been updated to released, so even a zero-delay continuation
|
|
350
|
+
# finds the lock free.
|
|
351
|
+
def flush_continuation!
|
|
352
|
+
return unless @continuation
|
|
353
|
+
|
|
354
|
+
self.class
|
|
355
|
+
.set(wait: @continuation[:wait])
|
|
356
|
+
.perform_later(@workflow.key, **@continuation[:kwargs])
|
|
169
357
|
end
|
|
170
358
|
|
|
171
359
|
def halt_execution!
|
data/lib/chrono_forge/version.rb
CHANGED
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
# kwargs :json not null
|
|
13
13
|
# options :json not null
|
|
14
14
|
# locked_at :datetime
|
|
15
|
+
# parent_execution_log_id :integer
|
|
15
16
|
# started_at :datetime
|
|
16
17
|
# state :integer default("idle"), not null
|
|
17
18
|
# created_at :datetime not null
|
|
@@ -19,7 +20,10 @@
|
|
|
19
20
|
#
|
|
20
21
|
# Indexes
|
|
21
22
|
#
|
|
22
|
-
# index_chrono_forge_workflows_on_key
|
|
23
|
+
# index_chrono_forge_workflows_on_key (key)
|
|
24
|
+
# index_chrono_forge_workflows_on_job_class_and_key (job_class,key) UNIQUE
|
|
25
|
+
# index_chrono_forge_workflows_on_parent_execution_log_and_st (parent_execution_log_id,state)
|
|
26
|
+
# index_chrono_forge_workflows_on_state_and_completed_at (state,completed_at)
|
|
23
27
|
#
|
|
24
28
|
module ChronoForge
|
|
25
29
|
class Workflow < ApplicationRecord()
|
|
@@ -28,6 +32,11 @@ module ChronoForge
|
|
|
28
32
|
has_many :execution_logs, dependent: :destroy
|
|
29
33
|
has_many :error_logs, dependent: :destroy
|
|
30
34
|
|
|
35
|
+
belongs_to :parent_execution_log,
|
|
36
|
+
class_name: "ChronoForge::ExecutionLog",
|
|
37
|
+
inverse_of: :spawned_workflows,
|
|
38
|
+
optional: true
|
|
39
|
+
|
|
31
40
|
enum :state, %i[
|
|
32
41
|
idle
|
|
33
42
|
running
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Adds chrono_forge_workflows.parent_execution_log_id: the execution log that
|
|
4
|
+
# spawned a workflow (for branches, the branch$<name> log). Deliberately generic
|
|
5
|
+
# so any future step that spawns sub-workflows can reuse it. The composite
|
|
6
|
+
# [parent_execution_log_id, state] index makes the merge completion probe and the
|
|
7
|
+
# dropped-job re-kick index-only at hundreds of thousands of children.
|
|
8
|
+
#
|
|
9
|
+
# Shipped standalone (matching add_chrono_forge_workflow_state_index) so existing
|
|
10
|
+
# installs pick it up via `rails generate chrono_forge:upgrade`.
|
|
11
|
+
class AddChronoForgeParentExecutionLog < ActiveRecord::Migration[7.1]
|
|
12
|
+
disable_ddl_transaction!
|
|
13
|
+
|
|
14
|
+
def change
|
|
15
|
+
add_column :chrono_forge_workflows, :parent_execution_log_id, parent_log_fk_type,
|
|
16
|
+
null: true, if_not_exists: true
|
|
17
|
+
|
|
18
|
+
add_index :chrono_forge_workflows, %i[parent_execution_log_id state],
|
|
19
|
+
if_not_exists: true, **chrono_forge_index_algorithm
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
private
|
|
23
|
+
|
|
24
|
+
# Match the type of chrono_forge_workflows.id so the FK lines up on both bigint
|
|
25
|
+
# and uuid installs.
|
|
26
|
+
def parent_log_fk_type
|
|
27
|
+
id_col = connection.columns(:chrono_forge_workflows).find { |c| c.name == "id" }
|
|
28
|
+
(id_col&.type == :uuid) ? :uuid : :bigint
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def chrono_forge_index_algorithm
|
|
32
|
+
if connection.adapter_name.to_s.downcase.include?("postgresql")
|
|
33
|
+
{algorithm: :concurrently}
|
|
34
|
+
else
|
|
35
|
+
{}
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|