ruby_reactor 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.release-please-manifest.json +1 -1
- data/CHANGELOG.md +7 -0
- data/README.md +147 -34
- data/lib/ruby_reactor/configuration.rb +66 -2
- data/lib/ruby_reactor/context_serializer.rb +9 -4
- data/lib/ruby_reactor/executor/ordered_lock_support.rb +1 -1
- data/lib/ruby_reactor/executor/retry_manager.rb +7 -2
- data/lib/ruby_reactor/executor/step_executor.rb +25 -5
- data/lib/ruby_reactor/executor.rb +85 -3
- data/lib/ruby_reactor/lock.rb +13 -0
- data/lib/ruby_reactor/map/collector.rb +41 -0
- data/lib/ruby_reactor/map/dispatcher.rb +42 -0
- data/lib/ruby_reactor/map/element_executor.rb +39 -0
- data/lib/ruby_reactor/map/helpers.rb +10 -3
- data/lib/ruby_reactor/map/sweeper.rb +110 -0
- data/lib/ruby_reactor/reactor.rb +7 -5
- data/lib/ruby_reactor/sidekiq_adapter.rb +9 -8
- data/lib/ruby_reactor/sidekiq_workers/sweeper_worker.rb +73 -0
- data/lib/ruby_reactor/sidekiq_workers/worker.rb +42 -34
- data/lib/ruby_reactor/step/map_step.rb +18 -2
- data/lib/ruby_reactor/storage/redis_adapter.rb +83 -60
- data/lib/ruby_reactor/storage/redis_locking.rb +8 -0
- data/lib/ruby_reactor/sweeper.rb +58 -0
- data/lib/ruby_reactor/version.rb +1 -1
- data/lib/ruby_reactor.rb +42 -0
- metadata +4 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 778bc5305c6d1f20833819afd9ccd46f5a5b4c2c135d8e63344d6530f9a733f1
|
|
4
|
+
data.tar.gz: 32e769816eba846f419e3f31e8290b94e8ff04fe6ea71fef125bb128b3085b82
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 592db1d0ef94153a4ea028aa3cdeb59f7e4c73929ebec5afd5a9795a93d27a0cfb0c4bd3e135ccf1b14c0329123be6cf0905ad4a141018993832d21212754db3
|
|
7
|
+
data.tar.gz: 2fd90e47af8e26cf2d58468a1f0629fd1dc04890df0a6e1704e8d6cba5c65b1e050f10f56628c27c7c90b0545d081e33169e3bfd62f416affd761b62edeb24a0
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.5.3](https://github.com/arturictus/ruby_reactor/compare/v0.5.2...v0.5.3) (2026-06-17)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Features
|
|
7
|
+
|
|
8
|
+
* Durability & Recovery ([#39](https://github.com/arturictus/ruby_reactor/issues/39)) ([103e583](https://github.com/arturictus/ruby_reactor/commit/103e5835b413eec2302fa63f3e998d487cfd9eaf))
|
|
9
|
+
|
|
3
10
|
## [0.5.2](https://github.com/arturictus/ruby_reactor/compare/v0.5.1...v0.5.2) (2026-06-14)
|
|
4
11
|
|
|
5
12
|
|
data/README.md
CHANGED
|
@@ -36,6 +36,7 @@ The key value is **Reliability**: if any part of your workflow fails, Ruby React
|
|
|
36
36
|
| Locks / sem / rate / per | Yes | No | No | Manual |
|
|
37
37
|
| Built-in web dashboard | Yes | No | No | No |
|
|
38
38
|
| Async with Sidekiq | Yes | No | Limited | Yes |
|
|
39
|
+
| Durable crash recovery | Yes | No | No | Manual |
|
|
39
40
|
|
|
40
41
|
## Real-World Use Cases
|
|
41
42
|
|
|
@@ -44,6 +45,7 @@ The key value is **Reliability**: if any part of your workflow fails, Ruby React
|
|
|
44
45
|
- **Subscription Billing**: Coordinate Stripe charges, invoice email generation, and internal entitlement updates. Use interrupts to pause the workflow while it waits on a third-party API or on explicit customer approval.
|
|
45
46
|
|
|
46
47
|
## Table of Contents
|
|
48
|
+
|
|
47
49
|
- [Features](#features)
|
|
48
50
|
- [Comparison](#comparison)
|
|
49
51
|
- [Real-World Use Cases](#real-world-use-cases)
|
|
@@ -57,6 +59,7 @@ The key value is **Reliability**: if any part of your workflow fails, Ruby React
|
|
|
57
59
|
- [Async Execution](#async-execution)
|
|
58
60
|
- [Full Reactor Async](#full-reactor-async)
|
|
59
61
|
- [Step-Level Async](#step-level-async)
|
|
62
|
+
- [Durability & Recovery](#durability--recovery)
|
|
60
63
|
- [Interrupts (Pause & Resume)](#interrupts-pause--resume)
|
|
61
64
|
- [Locks, Semaphores & Ordered Locks](#locks-semaphores--ordered-locks)
|
|
62
65
|
- [Map & Parallel Execution](#map--parallel-execution)
|
|
@@ -90,53 +93,96 @@ Or install it yourself as:
|
|
|
90
93
|
|
|
91
94
|
## Configuration
|
|
92
95
|
|
|
93
|
-
|
|
96
|
+
Every setting is **optional** — RubyReactor ships with the defaults shown. Drop
|
|
97
|
+
this into an initializer (e.g. `config/initializers/ruby_reactor.rb`); pasted as-is
|
|
98
|
+
it changes nothing (the one live line, `redis_url`, falls back to its default when `REDIS_URL` is unset), so it doubles as a reference of every knob.
|
|
94
99
|
|
|
95
|
-
|
|
100
|
+
> **Reading the block:** lines starting with `##` are documentation. Lines starting
|
|
101
|
+
> with a single `#` (a `config.…` call) are real settings commented at their
|
|
102
|
+
> default — uncomment one to enable it.
|
|
96
103
|
|
|
97
104
|
```ruby
|
|
98
105
|
RubyReactor.configure do |config|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
106
|
+
## === Storage (Redis) ===
|
|
107
|
+
|
|
108
|
+
## Storage adapter. Default: :redis (the only adapter shipped today).
|
|
109
|
+
# config.storage.adapter = :redis
|
|
110
|
+
|
|
111
|
+
## Redis URL. Default: "redis://localhost:6379/0".
|
|
102
112
|
config.storage.redis_url = ENV.fetch("REDIS_URL", "redis://localhost:6379/0")
|
|
103
|
-
# Extra options passed to Redis.new. Default: {}.
|
|
104
|
-
config.storage.redis_options = { timeout: 1 }
|
|
105
|
-
|
|
106
|
-
# Sidekiq queue used by RubyReactor's async worker. Default: :default.
|
|
107
|
-
config.sidekiq_queue = :default
|
|
108
|
-
# Sidekiq retry count for infrastructure failures only (deserialization,
|
|
109
|
-
# Redis, network). Step retries are managed separately. Default: 3.
|
|
110
|
-
config.sidekiq_retry_count = 3
|
|
111
|
-
|
|
112
|
-
# Lock/semaphore/rate-limit/ordered-lock contention snooze behavior for
|
|
113
|
-
# async reactors. When a Sidekiq worker cannot acquire a primitive it
|
|
114
|
-
# re-enqueues itself with `lock_snooze_base_delay + rand(0..lock_snooze_jitter)`
|
|
115
|
-
# seconds (rate-limit uses a precise `retry_after_seconds` hint from the error;
|
|
116
|
-
# ordered-lock waits re-poll at the base delay so a successor catches its
|
|
117
|
-
# blocker finishing fast), up to `lock_snooze_max_attempts` times before
|
|
118
|
-
# marking the context :failed. Defaults: 5 / 5 / 20. Set max_attempts to
|
|
119
|
-
# :infinity to never give up.
|
|
120
|
-
config.lock_snooze_base_delay = 5
|
|
121
|
-
config.lock_snooze_jitter = 5
|
|
122
|
-
config.lock_snooze_max_attempts = 20
|
|
123
|
-
|
|
124
|
-
# Named rate limits shared across reactors. Reference them with
|
|
125
|
-
# `with_rate_limit(:stripe)`. See Locks, Semaphores, Rate Limits & Periods.
|
|
126
|
-
config.rate_limits.register(:stripe, limits: { second: 3, minute: 100 })
|
|
127
113
|
|
|
128
|
-
|
|
129
|
-
config.
|
|
114
|
+
## Extra options passed to Redis.new. Default: {}.
|
|
115
|
+
# config.storage.redis_options = { timeout: 1 }
|
|
116
|
+
|
|
117
|
+
## === Sidekiq ===
|
|
118
|
+
|
|
119
|
+
## Sidekiq queue used by RubyReactor's async worker. Default: :default.
|
|
120
|
+
# config.sidekiq_queue = :default
|
|
121
|
+
|
|
122
|
+
## Sidekiq retry count for infrastructure failures only (deserialization,
|
|
123
|
+
## Redis, network). Step retries are managed separately. Default: 3.
|
|
124
|
+
# config.sidekiq_retry_count = 3
|
|
125
|
+
|
|
126
|
+
## === Contention snooze (locks / semaphores / rate limits / ordered locks) ===
|
|
127
|
+
|
|
128
|
+
## When a Sidekiq worker cannot acquire a primitive it re-enqueues itself with
|
|
129
|
+
## `lock_snooze_base_delay + rand(0..lock_snooze_jitter)` seconds (rate-limit
|
|
130
|
+
## uses a precise `retry_after_seconds` hint from the error; ordered-lock waits
|
|
131
|
+
## re-poll at the base delay so a successor catches its blocker finishing fast),
|
|
132
|
+
## up to `lock_snooze_max_attempts` times before marking the context :failed.
|
|
133
|
+
## Set max_attempts to :infinity to never give up.
|
|
134
|
+
# config.lock_snooze_base_delay = 5
|
|
135
|
+
# config.lock_snooze_jitter = 5
|
|
136
|
+
# config.lock_snooze_max_attempts = 20
|
|
137
|
+
|
|
138
|
+
## === Durability & crash recovery (see "Durability & Recovery" below) ===
|
|
139
|
+
|
|
140
|
+
## Retention TTL (seconds) for stored reactor/map state. Must exceed your
|
|
141
|
+
## worst-case snooze/retry window; re-stamped on every write. Default: 86_400.
|
|
142
|
+
# config.context_ttl = 86_400
|
|
130
143
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
144
|
+
## TTL (seconds) for the per-context liveness lock. A live worker auto-extends
|
|
145
|
+
## it; its absence is the sweeper's "worker died" signal. Must exceed the
|
|
146
|
+
## longest a single step can run without yielding the GIL. Default: 60.
|
|
147
|
+
# config.context_lock_ttl = 60
|
|
148
|
+
|
|
149
|
+
## Minimum seconds between per-step checkpoints within one run. 0 = checkpoint
|
|
150
|
+
## after every step (strongest guarantee). Raise to coalesce mid-run writes for
|
|
151
|
+
## long reactors — only safe when steps are idempotent. Default: 0.
|
|
152
|
+
# config.checkpoint_min_interval = 0
|
|
153
|
+
|
|
154
|
+
## Recovery sweeper (the chain is kicked once by `RubyReactor.start_sweeper!`).
|
|
155
|
+
# config.sweeper_enabled = true # run recovery by default
|
|
156
|
+
# config.sweeper_interval = 30 # seconds between sweeps = recovery-latency bound
|
|
157
|
+
# config.sweeper_limit = 1000 # max contexts/maps inspected per sweep
|
|
158
|
+
|
|
159
|
+
## === Misc ===
|
|
160
|
+
|
|
161
|
+
## Logger. Default: Logger.new($stdout).
|
|
162
|
+
# config.logger = Logger.new($stdout)
|
|
163
|
+
|
|
164
|
+
## Async router. Default: RubyReactor::SidekiqAdapter. Swap for a custom adapter
|
|
165
|
+
## if you don't use Sidekiq — it only needs to respond to
|
|
166
|
+
## `perform_async(context_id, reactor_class_name, **)`.
|
|
134
167
|
# config.async_router = MyCustomAdapter
|
|
168
|
+
|
|
169
|
+
## === Examples (no default — set these to use the feature) ===
|
|
170
|
+
|
|
171
|
+
## Named rate limits shared across reactors. Reference with `with_rate_limit(:stripe)`.
|
|
172
|
+
# config.rate_limits.register(:stripe, limits: { second: 3, minute: 100 })
|
|
173
|
+
|
|
174
|
+
## OpenTelemetry / custom middlewares. Default: [].
|
|
175
|
+
# config.middlewares = [RubyReactor::OpenTelemetry]
|
|
135
176
|
end
|
|
136
177
|
```
|
|
137
178
|
|
|
138
179
|
You can also leave out the `configure` block entirely — defaults work for local development against a Redis on `localhost:6379`.
|
|
139
180
|
|
|
181
|
+
> **Crash recovery needs a kick.** The `sweeper_*` settings above only configure
|
|
182
|
+
> the recovery sweeper — they do not start it. Call `RubyReactor.start_sweeper!`
|
|
183
|
+
> once at boot (ideally from a Sidekiq `on(:startup)` hook) or no crashed reactor
|
|
184
|
+
> will ever resume. See [Durability & Recovery](#durability--recovery).
|
|
185
|
+
|
|
140
186
|
|
|
141
187
|
## Quick Start
|
|
142
188
|
|
|
@@ -341,6 +387,73 @@ def create(params)
|
|
|
341
387
|
end
|
|
342
388
|
```
|
|
343
389
|
|
|
390
|
+
### Durability & Recovery
|
|
391
|
+
|
|
392
|
+
Async reactors are durable: state lives in Redis, not in the job payload. Before
|
|
393
|
+
any background job is enqueued the root context is persisted, and after every
|
|
394
|
+
completed step a checkpoint advances the stored blob — so a crash re-runs at most
|
|
395
|
+
one step, never the whole reactor. Each running reactor also holds a short
|
|
396
|
+
**liveness lock** that a live worker auto-extends; its absence is how a dead
|
|
397
|
+
worker is detected.
|
|
398
|
+
|
|
399
|
+
**Recovery is not automatic until you start the sweeper.** A crashed worker's
|
|
400
|
+
reactor only resumes when the recovery sweeper notices the lapsed liveness lock
|
|
401
|
+
and re-enqueues it. The sweeper is a self-rescheduling chain — **kick it once per
|
|
402
|
+
process boot:**
|
|
403
|
+
|
|
404
|
+
The recommended spot is a Sidekiq server startup hook, so only the worker
|
|
405
|
+
process runs recovery (not your web/console/client processes):
|
|
406
|
+
|
|
407
|
+
```ruby
|
|
408
|
+
# config/initializers/sidekiq.rb
|
|
409
|
+
Sidekiq.configure_server do |config|
|
|
410
|
+
config.on(:startup) { RubyReactor.start_sweeper! }
|
|
411
|
+
end
|
|
412
|
+
```
|
|
413
|
+
|
|
414
|
+
Anywhere that runs once at boot works too — e.g. a Rails initializer:
|
|
415
|
+
|
|
416
|
+
```ruby
|
|
417
|
+
# config/initializers/ruby_reactor.rb
|
|
418
|
+
RubyReactor.start_sweeper!
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
That's all that's required: `start_sweeper!` is idempotent (safe to call on every
|
|
422
|
+
boot — duplicate kicks collapse to one chain), runs both the top-level reactor
|
|
423
|
+
sweeper and the map sweeper every `config.sweeper_interval` seconds, and stops if
|
|
424
|
+
you set `config.sweeper_enabled = false`. The interval is your recovery-latency
|
|
425
|
+
bound.
|
|
426
|
+
|
|
427
|
+
> **Sidekiq Enterprise `super_fetch` compatibility:** the chain is safe under
|
|
428
|
+
> reliable fetch. `super_fetch` re-runs a job whose worker died mid-execution, so
|
|
429
|
+
> a tick that crashes *after* enqueuing its successor but *before* acking would,
|
|
430
|
+
> with naive single-flight, be recovered alongside that successor and fork the
|
|
431
|
+
> chain (doubling every interval). RubyReactor avoids this: it never relies on
|
|
432
|
+
> "one job in the chain" — each next tick is claimed by a per-time-window lock, so
|
|
433
|
+
> a `super_fetch`-recovered tick computes the same window, loses the claim, and
|
|
434
|
+
> collapses back to a single successor. The startup hook above is likewise
|
|
435
|
+
> idempotent across multiple `super_fetch` server processes.
|
|
436
|
+
|
|
437
|
+
**Prefer your own scheduler?** Set `config.sweeper_enabled = false` (which makes
|
|
438
|
+
`start_sweeper!` a no-op) and drive recovery from cron, a Kubernetes `CronJob`,
|
|
439
|
+
`sidekiq-cron`, `sidekiq-scheduler`, or Rails recurring tasks. Each tick is one
|
|
440
|
+
call:
|
|
441
|
+
|
|
442
|
+
```ruby
|
|
443
|
+
RubyReactor.sweep_once # => { reactors: <n re-enqueued>, maps: <n recovered> }
|
|
444
|
+
```
|
|
445
|
+
|
|
446
|
+
For example, a rake task that system cron or a Kubernetes `CronJob` can invoke:
|
|
447
|
+
|
|
448
|
+
```ruby
|
|
449
|
+
# lib/tasks/ruby_reactor.rake
|
|
450
|
+
namespace :ruby_reactor do
|
|
451
|
+
task sweep: :environment do
|
|
452
|
+
RubyReactor.sweep_once
|
|
453
|
+
end
|
|
454
|
+
end
|
|
455
|
+
```
|
|
456
|
+
|
|
344
457
|
### Interrupts (Pause & Resume)
|
|
345
458
|
|
|
346
459
|
Pause execution to wait for external events like webhooks or user approvals.
|
|
@@ -9,12 +9,76 @@ module RubyReactor
|
|
|
9
9
|
|
|
10
10
|
attr_writer :sidekiq_queue, :sidekiq_retry_count, :logger, :async_router,
|
|
11
11
|
:lock_snooze_base_delay, :lock_snooze_jitter, :lock_snooze_max_attempts,
|
|
12
|
-
:middlewares
|
|
12
|
+
:middlewares, :context_ttl, :context_lock_ttl, :checkpoint_min_interval,
|
|
13
|
+
:sweeper_enabled, :sweeper_interval, :sweeper_limit
|
|
13
14
|
|
|
14
15
|
def sidekiq_queue
|
|
15
16
|
@sidekiq_queue ||= :default
|
|
16
17
|
end
|
|
17
18
|
|
|
19
|
+
# Retention TTL (seconds) for a stored reactor context. Storage is
|
|
20
|
+
# load-bearing for resume, so this must comfortably exceed the worst-case
|
|
21
|
+
# snooze/retry window. Refreshed on every checkpoint write.
|
|
22
|
+
def context_ttl
|
|
23
|
+
@context_ttl ||= 86_400
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# Minimum wall-clock seconds between two PER-STEP durable checkpoints within a
|
|
27
|
+
# single worker run. The save-per-step checkpoint (`on_step_complete`) bounds
|
|
28
|
+
# crash re-execution to one step, but re-serializes and re-writes the WHOLE
|
|
29
|
+
# root blob after every Success — O(steps × context_size) writes for a long,
|
|
30
|
+
# large reactor. This throttle coalesces the mid-run intermediate checkpoints:
|
|
31
|
+
# a checkpoint is written only if at least this many seconds have elapsed since
|
|
32
|
+
# the last one. The final terminal/handoff state is ALWAYS persisted (by the
|
|
33
|
+
# run's ensure-save and the pre-enqueue checkpoint), so throttling only affects
|
|
34
|
+
# mid-run granularity. Tradeoff: with interval > 0, a crash may re-run every
|
|
35
|
+
# step completed inside the last interval — safe only when those steps are
|
|
36
|
+
# idempotent or side-effect-free.
|
|
37
|
+
#
|
|
38
|
+
# Default 0 -> checkpoint after EVERY step (strongest guarantee, no coalescing).
|
|
39
|
+
def checkpoint_min_interval
|
|
40
|
+
@checkpoint_min_interval ||= 0
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# Whether the recovery sweepers run. The host kicks the self-rescheduling
|
|
44
|
+
# chain once (`RubyReactor.start_sweeper!`, e.g. from an initializer); each
|
|
45
|
+
# tick re-checks this flag, so flipping it to false stops the chain at the
|
|
46
|
+
# next tick. Default on: durability is inert without a running sweeper, so
|
|
47
|
+
# recovery must work out of the box.
|
|
48
|
+
def sweeper_enabled
|
|
49
|
+
@sweeper_enabled = true if @sweeper_enabled.nil?
|
|
50
|
+
@sweeper_enabled
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Seconds between sweeps. This is the upper bound on recovery latency for a
|
|
54
|
+
# dead worker — lower it for faster recovery, raise it to cut scan load.
|
|
55
|
+
def sweeper_interval
|
|
56
|
+
@sweeper_interval ||= 30
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# Max contexts/maps inspected per sweep (passed to each sweeper's run_once).
|
|
60
|
+
def sweeper_limit
|
|
61
|
+
@sweeper_limit ||= 1000
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# TTL (seconds) for the per-context liveness lock (`async:<id>`). Short by
|
|
65
|
+
# design — it is a liveness signal, not retention. A live worker auto-extends
|
|
66
|
+
# it (every ttl/3 s, from a background thread); its absence is the sweeper's
|
|
67
|
+
# "worker died" signal.
|
|
68
|
+
#
|
|
69
|
+
# SAFETY CONSTRAINT: this MUST exceed the longest a single step can run
|
|
70
|
+
# WITHOUT letting the auto-extend thread make progress. Under MRI the
|
|
71
|
+
# extender shares the GIL, so a step that holds the GIL continuously for
|
|
72
|
+
# longer than this TTL (a long CPU-bound pure-Ruby loop, a C extension that
|
|
73
|
+
# never releases the GIL, or a stop-the-world GC pause) lets the lock lapse.
|
|
74
|
+
# A lapsed lock looks "dead" to the sweeper, which may re-enqueue a duplicate
|
|
75
|
+
# that runs CONCURRENTLY with the still-live original — a double-run. I/O-bound
|
|
76
|
+
# steps release the GIL and keep the lock fresh, so the default 60s suits
|
|
77
|
+
# typical workloads; raise it if you run long synchronous CPU-bound steps.
|
|
78
|
+
def context_lock_ttl
|
|
79
|
+
@context_lock_ttl ||= 60
|
|
80
|
+
end
|
|
81
|
+
|
|
18
82
|
def sidekiq_retry_count
|
|
19
83
|
@sidekiq_retry_count ||= 3
|
|
20
84
|
end
|
|
@@ -36,7 +100,7 @@ module RubyReactor
|
|
|
36
100
|
end
|
|
37
101
|
|
|
38
102
|
def logger
|
|
39
|
-
@logger ||= Logger.new($
|
|
103
|
+
@logger ||= Logger.new($stdout)
|
|
40
104
|
end
|
|
41
105
|
|
|
42
106
|
def async_router
|
|
@@ -19,13 +19,18 @@ module RubyReactor
|
|
|
19
19
|
|
|
20
20
|
def deserialize(serialized_data)
|
|
21
21
|
decompressed = decompress_if_needed(serialized_data)
|
|
22
|
-
|
|
22
|
+
deserialize_hash(JSON.parse(decompressed, symbolize_names: false))
|
|
23
|
+
rescue JSON::ParserError => e
|
|
24
|
+
raise RubyReactor::Error::DeserializationError, "Failed to parse serialized context: #{e.message}"
|
|
25
|
+
end
|
|
23
26
|
|
|
27
|
+
# Deserialize from an already-parsed Hash (e.g. what the storage adapter's
|
|
28
|
+
# `retrieve_context` returns). Lets the rehydrate-by-id worker path avoid a
|
|
29
|
+
# second JSON parse while still schema-validating. Schema validation lives
|
|
30
|
+
# here so both the string and Hash entry points enforce it.
|
|
31
|
+
def deserialize_hash(data)
|
|
24
32
|
validate_schema_version(data)
|
|
25
|
-
|
|
26
33
|
Context.deserialize_from_retry(data)
|
|
27
|
-
rescue JSON::ParserError => e
|
|
28
|
-
raise RubyReactor::Error::DeserializationError, "Failed to parse serialized context: #{e.message}"
|
|
29
34
|
end
|
|
30
35
|
|
|
31
36
|
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength
|
|
@@ -180,7 +180,7 @@ module RubyReactor
|
|
|
180
180
|
end
|
|
181
181
|
|
|
182
182
|
def stored_context_status
|
|
183
|
-
reactor_class_name =
|
|
183
|
+
reactor_class_name = RubyReactor.reactor_storage_name(@reactor_class)
|
|
184
184
|
data = RubyReactor.configuration.storage_adapter.retrieve_context(@context.context_id, reactor_class_name)
|
|
185
185
|
return nil unless data
|
|
186
186
|
|
|
@@ -48,7 +48,7 @@ module RubyReactor
|
|
|
48
48
|
@context.root_context || @context
|
|
49
49
|
end
|
|
50
50
|
|
|
51
|
-
reactor_class_name = context_to_serialize.reactor_class
|
|
51
|
+
reactor_class_name = RubyReactor.reactor_storage_name(context_to_serialize.reactor_class)
|
|
52
52
|
|
|
53
53
|
@middlewares.on(:before_async_enqueue, context_to_serialize)
|
|
54
54
|
|
|
@@ -72,7 +72,12 @@ module RubyReactor
|
|
|
72
72
|
fail_fast: map_args[:fail_fast]
|
|
73
73
|
)
|
|
74
74
|
else
|
|
75
|
-
|
|
75
|
+
# Persist BEFORE enqueue — the job payload is identity-only (F2). The
|
|
76
|
+
# rescheduled job rehydrates the root by id from storage.
|
|
77
|
+
configuration.storage_adapter.store_context(
|
|
78
|
+
context_to_serialize.context_id, serialized_context, reactor_class_name
|
|
79
|
+
)
|
|
80
|
+
configuration.async_router.perform_in(delay, context_to_serialize.context_id, reactor_class_name)
|
|
76
81
|
end
|
|
77
82
|
end
|
|
78
83
|
|
|
@@ -11,6 +11,7 @@ module RubyReactor
|
|
|
11
11
|
@result_handler = managers[:result_handler]
|
|
12
12
|
@compensation_manager = managers[:compensation_manager]
|
|
13
13
|
@middlewares = managers[:middlewares] || context.middlewares || Executor.middlewares_for(reactor_class)
|
|
14
|
+
@on_step_complete = managers[:on_step_complete]
|
|
14
15
|
end
|
|
15
16
|
|
|
16
17
|
def execute_all_steps
|
|
@@ -45,8 +46,14 @@ module RubyReactor
|
|
|
45
46
|
# If a step returns InterruptResult, we need to stop execution and return it
|
|
46
47
|
return result if result.is_a?(RubyReactor::InterruptResult)
|
|
47
48
|
|
|
48
|
-
#
|
|
49
|
-
|
|
49
|
+
# Only a continue-Success reaches here (Async/Retry/Skipped/Failure/
|
|
50
|
+
# Interrupt all returned above; nil is inline-async test mode). It is
|
|
51
|
+
# the one outcome where the loop proceeds to more steps with no other
|
|
52
|
+
# save in between — every terminal/handoff result persists via its own
|
|
53
|
+
# path. Write a durable checkpoint so a crash re-runs at most this one
|
|
54
|
+
# step. Ordering: side-effect -> record result (inside execute_step) ->
|
|
55
|
+
# checkpoint here.
|
|
56
|
+
@on_step_complete&.call if result.is_a?(RubyReactor::Success)
|
|
50
57
|
end
|
|
51
58
|
end
|
|
52
59
|
|
|
@@ -198,20 +205,33 @@ module RubyReactor
|
|
|
198
205
|
|
|
199
206
|
# Use root context if available to ensure we serialize the full tree
|
|
200
207
|
context_to_serialize = @context.root_context || @context
|
|
201
|
-
reactor_class_name = context_to_serialize.reactor_class
|
|
208
|
+
reactor_class_name = RubyReactor.reactor_storage_name(context_to_serialize.reactor_class)
|
|
202
209
|
|
|
203
210
|
# Inject OTel context before serialization
|
|
204
211
|
@middlewares.on(:before_async_enqueue, context_to_serialize)
|
|
205
212
|
|
|
206
|
-
|
|
213
|
+
# Storage is load-bearing: the job payload is identity-only, so the root
|
|
214
|
+
# context MUST be persisted BEFORE the job is enqueued (F2). The reactor
|
|
215
|
+
# class name used for the storage key must match the one handed to the
|
|
216
|
+
# worker, so compute it once and reuse it for both.
|
|
217
|
+
checkpoint_root!(context_to_serialize, reactor_class_name)
|
|
207
218
|
|
|
208
219
|
configuration.async_router.perform_async(
|
|
209
|
-
|
|
220
|
+
context_to_serialize.context_id,
|
|
210
221
|
reactor_class_name,
|
|
211
222
|
intermediate_results: @context.intermediate_results
|
|
212
223
|
)
|
|
213
224
|
end
|
|
214
225
|
|
|
226
|
+
# Persist the root context under its storage key. Mirrors Executor#checkpoint!
|
|
227
|
+
# but lives here because handle_async_step runs inside the StepExecutor and
|
|
228
|
+
# must serialize AFTER the before_async_enqueue middleware has injected its
|
|
229
|
+
# OTel context.
|
|
230
|
+
def checkpoint_root!(root, reactor_class_name)
|
|
231
|
+
storage = RubyReactor::Configuration.instance.storage_adapter
|
|
232
|
+
storage.store_context(root.context_id, ContextSerializer.serialize(root), reactor_class_name)
|
|
233
|
+
end
|
|
234
|
+
|
|
215
235
|
def handle_interrupt_step(step_config)
|
|
216
236
|
# Check if we have a result for this step (resuming)
|
|
217
237
|
if @context.intermediate_results.key?(step_config.name)
|
|
@@ -38,14 +38,24 @@ module RubyReactor
|
|
|
38
38
|
retry_manager: @retry_manager,
|
|
39
39
|
result_handler: @result_handler,
|
|
40
40
|
compensation_manager: @compensation_manager,
|
|
41
|
-
middlewares: @middlewares
|
|
41
|
+
middlewares: @middlewares,
|
|
42
|
+
# Save-per-step durable checkpoint. checkpoint! resolves the ROOT
|
|
43
|
+
# context, so this same callback — wired into every executor including
|
|
44
|
+
# the nested ones ComposeStep builds — always advances the root blob
|
|
45
|
+
# (F8): a mid-child crash re-runs one sub-step, not the whole child.
|
|
46
|
+
# `throttle: true` lets checkpoint_min_interval coalesce these mid-run
|
|
47
|
+
# writes (default 0 = write every step); the terminal save still runs.
|
|
48
|
+
on_step_complete: -> { checkpoint!(throttle: true) }
|
|
42
49
|
}
|
|
43
50
|
)
|
|
44
51
|
@result = nil
|
|
45
52
|
@acquired_lock = nil
|
|
46
53
|
@acquired_semaphore = nil
|
|
54
|
+
@acquired_context_lock = nil
|
|
55
|
+
@context_lock_owner = nil
|
|
47
56
|
@contention_snooze = false
|
|
48
57
|
@skip_context_persist = false
|
|
58
|
+
@last_checkpoint_at = nil
|
|
49
59
|
end
|
|
50
60
|
|
|
51
61
|
def self.resolve_middlewares(reactor_class)
|
|
@@ -150,7 +160,7 @@ module RubyReactor
|
|
|
150
160
|
end
|
|
151
161
|
end
|
|
152
162
|
|
|
153
|
-
def resume_execution # rubocop:disable Metrics/MethodLength,Metrics/PerceivedComplexity
|
|
163
|
+
def resume_execution # rubocop:disable Metrics/MethodLength,Metrics/PerceivedComplexity,Metrics/CyclomaticComplexity
|
|
154
164
|
middlewares.on(:start_reactor, reactor_class.name, context.inputs, @context)
|
|
155
165
|
completed = false
|
|
156
166
|
|
|
@@ -175,6 +185,13 @@ module RubyReactor
|
|
|
175
185
|
@context.status = :running
|
|
176
186
|
check_rate_limit if first_run
|
|
177
187
|
|
|
188
|
+
# Per-context liveness lock: serializes duplicate deliveries of the same
|
|
189
|
+
# root context (e.g. a sweeper re-enqueue racing a still-live worker) and
|
|
190
|
+
# doubles as the sweeper's "worker alive" signal. Only the ROOT executor
|
|
191
|
+
# holds it — composed/nested children resume inline under the root worker
|
|
192
|
+
# and must not contend on the root's own key.
|
|
193
|
+
acquire_context_lock
|
|
194
|
+
|
|
178
195
|
# Resumes intentionally skip check_rate_limit (a paused run must not
|
|
179
196
|
# block itself on resume), so acquire lock/semaphore directly rather
|
|
180
197
|
# than via acquire_locks.
|
|
@@ -217,6 +234,8 @@ module RubyReactor
|
|
|
217
234
|
@result
|
|
218
235
|
ensure
|
|
219
236
|
release_locks
|
|
237
|
+
@acquired_context_lock&.release
|
|
238
|
+
@acquired_context_lock = nil
|
|
220
239
|
leave_ordered_lock_scope
|
|
221
240
|
save_context unless skip_context_persist?
|
|
222
241
|
|
|
@@ -241,13 +260,40 @@ module RubyReactor
|
|
|
241
260
|
|
|
242
261
|
def save_context
|
|
243
262
|
storage = RubyReactor::Configuration.instance.storage_adapter
|
|
244
|
-
reactor_class_name =
|
|
263
|
+
reactor_class_name = RubyReactor.reactor_storage_name(@reactor_class)
|
|
245
264
|
|
|
246
265
|
# Serialize context
|
|
247
266
|
serialized_context = ContextSerializer.serialize(@context)
|
|
248
267
|
storage.store_context(@context.context_id, serialized_context, reactor_class_name)
|
|
249
268
|
end
|
|
250
269
|
|
|
270
|
+
# Durable per-step checkpoint. Unlike save_context (which serializes THIS
|
|
271
|
+
# executor's @context — the observability path, F1), checkpoint! always
|
|
272
|
+
# serializes and stores the ROOT context under the root's key — the unit the
|
|
273
|
+
# async worker rehydrates by id. For a top-level reactor root == @context; for
|
|
274
|
+
# a composed/nested child it stores the root with the child's live state
|
|
275
|
+
# embedded via composed_contexts. TTL is re-stamped on every write (Phase 4).
|
|
276
|
+
def checkpoint!(throttle: false)
|
|
277
|
+
return if throttle && !checkpoint_due?
|
|
278
|
+
|
|
279
|
+
root = @context.root_context || @context
|
|
280
|
+
storage = RubyReactor::Configuration.instance.storage_adapter
|
|
281
|
+
reactor_class_name = RubyReactor.reactor_storage_name(root.reactor_class)
|
|
282
|
+
storage.store_context(root.context_id, ContextSerializer.serialize(root), reactor_class_name)
|
|
283
|
+
@last_checkpoint_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
# Whether a throttled (per-step) checkpoint is due. With checkpoint_min_interval
|
|
287
|
+
# <= 0 (default) every step checkpoints; otherwise mid-run checkpoints are
|
|
288
|
+
# coalesced to at most one per interval. The first step of a run always writes
|
|
289
|
+
# (@last_checkpoint_at is nil), and the run's terminal save is never throttled.
|
|
290
|
+
def checkpoint_due?
|
|
291
|
+
interval = RubyReactor.configuration.checkpoint_min_interval.to_f
|
|
292
|
+
return true if interval <= 0 || @last_checkpoint_at.nil?
|
|
293
|
+
|
|
294
|
+
(Process.clock_gettime(Process::CLOCK_MONOTONIC) - @last_checkpoint_at) >= interval
|
|
295
|
+
end
|
|
296
|
+
|
|
251
297
|
def persist_context?
|
|
252
298
|
@context.status.to_s != "pending" ||
|
|
253
299
|
@context.execution_trace.any? ||
|
|
@@ -340,6 +386,42 @@ module RubyReactor
|
|
|
340
386
|
RubyReactor::Period.key(base, config[:every])
|
|
341
387
|
end
|
|
342
388
|
|
|
389
|
+
# Per-execution liveness lock on the root context id. Owner is a fresh UUID
|
|
390
|
+
# per execution (NOT the context_id): a duplicate delivery of the *same*
|
|
391
|
+
# context from a different worker must be blocked, so reentrancy by id would
|
|
392
|
+
# defeat the guard. Only the root executor acquires — a composed/nested child
|
|
393
|
+
# resumes inline under the root worker and shares the root's lock, so it must
|
|
394
|
+
# not try to re-acquire the same key with a different owner (self-deadlock).
|
|
395
|
+
def acquire_context_lock
|
|
396
|
+
root = @context.root_context || @context
|
|
397
|
+
return unless root.equal?(@context) # only the root executor holds it
|
|
398
|
+
# In Sidekiq::Testing.inline! the retry/snooze `perform_in` re-enters the
|
|
399
|
+
# worker synchronously, nested inside this still-running frame that holds
|
|
400
|
+
# the lock — it would self-contend forever. The lock guards concurrent
|
|
401
|
+
# cross-process delivery, which cannot happen under inline testing, so skip.
|
|
402
|
+
return if inline_testing_mode?
|
|
403
|
+
|
|
404
|
+
lock = RubyReactor::Lock.new(
|
|
405
|
+
"async:#{root.context_id}",
|
|
406
|
+
owner: @context_lock_owner ||= SecureRandom.uuid,
|
|
407
|
+
ttl: RubyReactor.configuration.context_lock_ttl,
|
|
408
|
+
wait: 0, # fail fast -> snooze; never block the worker thread
|
|
409
|
+
auto_extend: true # keep the liveness signal fresh while we run
|
|
410
|
+
)
|
|
411
|
+
lock.acquire
|
|
412
|
+
@acquired_context_lock = lock
|
|
413
|
+
rescue RubyReactor::Lock::AcquisitionError => e
|
|
414
|
+
# We lost the race to a live original holding this context's lock. We did
|
|
415
|
+
# no work, so we must NOT persist on the way out — saving our (older)
|
|
416
|
+
# rehydrated snapshot would clobber the original's newer checkpoint.
|
|
417
|
+
@skip_context_persist = true
|
|
418
|
+
raise RubyReactor::Lock::ContextLockContention.new(e.message, context_lock_key: "async:#{root.context_id}")
|
|
419
|
+
end
|
|
420
|
+
|
|
421
|
+
def inline_testing_mode?
|
|
422
|
+
defined?(Sidekiq::Testing) && Sidekiq::Testing.respond_to?(:inline?) && Sidekiq::Testing.inline?
|
|
423
|
+
end
|
|
424
|
+
|
|
343
425
|
def acquire_exclusive_lock
|
|
344
426
|
config = @reactor_class.lock_config
|
|
345
427
|
key = config[:key_proc].call(@context.inputs)
|
data/lib/ruby_reactor/lock.rb
CHANGED
|
@@ -4,6 +4,19 @@ module RubyReactor
|
|
|
4
4
|
class Lock
|
|
5
5
|
class AcquisitionError < StandardError; end
|
|
6
6
|
|
|
7
|
+
# Raised specifically for the per-context liveness lock (`async:<id>`).
|
|
8
|
+
# Carries the bare key so the worker can exempt it from the snooze cap:
|
|
9
|
+
# a duplicate of the *same* execution may legitimately wait arbitrarily
|
|
10
|
+
# long for the live original to finish.
|
|
11
|
+
class ContextLockContention < AcquisitionError
|
|
12
|
+
attr_reader :context_lock_key
|
|
13
|
+
|
|
14
|
+
def initialize(message, context_lock_key:)
|
|
15
|
+
super(message)
|
|
16
|
+
@context_lock_key = context_lock_key
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
|
|
7
20
|
# Minimum interval between auto-extend pings; protects very small TTLs.
|
|
8
21
|
MIN_EXTEND_INTERVAL = 1.0
|
|
9
22
|
|