shikibu 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +487 -0
- data/lib/shikibu/activity.rb +135 -0
- data/lib/shikibu/app.rb +299 -0
- data/lib/shikibu/channels.rb +360 -0
- data/lib/shikibu/constants.rb +70 -0
- data/lib/shikibu/context.rb +208 -0
- data/lib/shikibu/errors.rb +137 -0
- data/lib/shikibu/integrations/active_job.rb +95 -0
- data/lib/shikibu/integrations/sidekiq.rb +104 -0
- data/lib/shikibu/locking.rb +110 -0
- data/lib/shikibu/middleware/rack_app.rb +197 -0
- data/lib/shikibu/notify/notify_base.rb +67 -0
- data/lib/shikibu/notify/pg_notify.rb +217 -0
- data/lib/shikibu/notify/wake_event.rb +56 -0
- data/lib/shikibu/outbox/relayer.rb +227 -0
- data/lib/shikibu/replay.rb +361 -0
- data/lib/shikibu/retry_policy.rb +81 -0
- data/lib/shikibu/storage/migrations.rb +179 -0
- data/lib/shikibu/storage/sequel_storage.rb +883 -0
- data/lib/shikibu/version.rb +5 -0
- data/lib/shikibu/worker.rb +389 -0
- data/lib/shikibu/workflow.rb +398 -0
- data/lib/shikibu.rb +152 -0
- data/schema/LICENSE +21 -0
- data/schema/README.md +57 -0
- data/schema/db/migrations/mysql/20251217000000_initial_schema.sql +284 -0
- data/schema/db/migrations/postgresql/20251217000000_initial_schema.sql +284 -0
- data/schema/db/migrations/sqlite/20251217000000_initial_schema.sql +284 -0
- data/schema/docs/column-values.md +91 -0
- metadata +231 -0

data/lib/shikibu/worker.rb
@@ -0,0 +1,389 @@
# frozen_string_literal: true

module Shikibu
  # Background worker for workflow processing
  class Worker
    WORKFLOW_POLL_INTERVAL = 1
    TIMER_CHECK_INTERVAL = 10
    STALE_LOCK_INTERVAL = 60
    MESSAGE_CHECK_INTERVAL = 5
    MESSAGE_CLEANUP_INTERVAL = 3600 # 1 hour
    DEFAULT_MESSAGE_RETENTION_DAYS = 7

    attr_reader :app, :storage, :worker_id, :resume_wake_event, :message_wake_event, :outbox_wake_event

    def initialize(app)
      @app = app
      @storage = app.storage
      @worker_id = app.worker_id
      @running = false
      @threads = []
      @is_leader = false
      @leader_tasks = []

      # Wake events for NOTIFY integration
      @resume_wake_event = Notify::WakeEvent.new
      @message_wake_event = Notify::WakeEvent.new
      @outbox_wake_event = Notify::WakeEvent.new
    end

    def start
      return if @running

      @running = true

      # All workers run these
      @threads << Thread.new { run_workflow_resumption }
      @threads << Thread.new { run_message_delivery }

      # Leader election and leader-only tasks
      @threads << Thread.new { run_leader_election }
    end

    def stop
      @running = false
      stop_leader_tasks
      @threads.each { |t| t.join(5) }
      @threads.clear
    end

    def running?
      @running
    end

    def leader?
      @is_leader
    end

    private

    # Resume workflows that are in 'running' status but not locked
    def run_workflow_resumption
      consecutive_empty = 0

      while @running
        begin
          workflows = storage.find_resumable_workflows(limit: 10)

          if workflows.empty?
            consecutive_empty += 1
            backoff = calculate_backoff(consecutive_empty)

            # Wait with wake event support (reset backoff on NOTIFY wake)
            consecutive_empty = 0 if @resume_wake_event.wait(backoff)
          else
            consecutive_empty = 0
            workflows.each do |wf|
              break unless @running

              resume_workflow_safe(wf[:instance_id])
            end
          end
        rescue StandardError => e
          log_error('workflow_resumption', e)
          interruptible_sleep(5)
        end
      end
    end

    # Deliver messages to waiting workflows
    def run_message_delivery
      while @running
        begin
          # Find subscriptions with pending messages
          messages_delivered = process_pending_messages

          # Wait with wake event support
          @message_wake_event.wait(MESSAGE_CHECK_INTERVAL) if messages_delivered.zero?
        rescue StandardError => e
          log_error('message_delivery', e)
          interruptible_sleep(5)
        end
      end
    end

    # Leader election using system locks
    def run_leader_election
      while @running
        begin
          acquired = storage.try_acquire_system_lock('shikibu_leader', worker_id, timeout: 30)

          if acquired && !@is_leader
            @is_leader = true
            start_leader_tasks
          elsif !acquired && @is_leader
            @is_leader = false
            stop_leader_tasks
          elsif acquired
            # Refresh lock
            storage.refresh_system_lock('shikibu_leader', worker_id, timeout: 30)
          end

          interruptible_sleep(10 + (rand * 3))
        rescue StandardError => e
          log_error('leader_election', e)
          interruptible_sleep(5)
        end
      end

      # Release lock on shutdown
      storage.release_system_lock('shikibu_leader', worker_id) if @is_leader
    end

    def start_leader_tasks
      @leader_tasks << Thread.new { run_timer_check }
      @leader_tasks << Thread.new { run_stale_lock_cleanup }
      @leader_tasks << Thread.new { run_timeout_check }
      @leader_tasks << Thread.new { run_message_cleanup }
      @leader_tasks << Thread.new { run_outbox_relay } if app.outbox_enabled?
    end

    def stop_leader_tasks
      @is_leader = false # Signal threads to stop (they check @running && @is_leader)
      @leader_tasks.each { |t| t.join(2) } # Wait up to 2 seconds for graceful shutdown
      @leader_tasks.clear
    end

    # Check for expired timers (leader only)
    def run_timer_check
      while @running && @is_leader
        begin
          expired = storage.find_expired_timers(limit: 100)

          expired.each do |timer|
            break unless @running

            handle_timer_expired(timer)
          end

          interruptible_sleep(TIMER_CHECK_INTERVAL)
        rescue StandardError => e
          log_error('timer_check', e)
          interruptible_sleep(5)
        end
      end
    end

    # Cleanup stale locks and resume workflows (leader only)
    def run_stale_lock_cleanup
      while @running && @is_leader
        begin
          stale = storage.find_stale_locked_workflows(stale_threshold_seconds: 300)

          stale.each do |wf|
            break unless @running

            instance_id = wf[:instance_id]

            # Force release lock
            storage.db[:workflow_instances]
              .where(instance_id: instance_id)
              .update(locked_by: nil, locked_at: nil, lock_expires_at: nil)

            # Resume the workflow (like Edda's auto_resume_stale_workflows_periodically)
            resume_workflow_safe(instance_id)
          end

          interruptible_sleep(STALE_LOCK_INTERVAL)
        rescue StandardError => e
          log_error('stale_lock_cleanup', e)
          interruptible_sleep(10)
        end
      end
    end

    # Check for timed out subscriptions (leader only)
    def run_timeout_check
      while @running && @is_leader
        begin
          timed_out = storage.find_timed_out_subscriptions(limit: 100)

          timed_out.each do |sub|
            break unless @running

            handle_subscription_timeout(sub)
          end

          interruptible_sleep(TIMER_CHECK_INTERVAL)
        rescue StandardError => e
          log_error('timeout_check', e)
          interruptible_sleep(5)
        end
      end
    end

    # Cleanup old channel messages (leader only)
    # Messages older than retention_days are deleted to prevent database growth
    def run_message_cleanup
      retention_days = app.message_retention_days || DEFAULT_MESSAGE_RETENTION_DAYS

      while @running && @is_leader
        begin
          deleted_count = storage.cleanup_old_channel_messages(retention_days: retention_days)

          warn "[Shikibu::Worker] Cleaned up #{deleted_count} old channel messages" if deleted_count.positive?

          interruptible_sleep(MESSAGE_CLEANUP_INTERVAL)
        rescue StandardError => e
          log_error('message_cleanup', e)
          interruptible_sleep(60)
        end
      end
    end

    # Relay outbox events to external message broker (leader only)
    def run_outbox_relay
      relayer = Outbox::Relayer.new(
        storage: storage,
        broker_url: app.broker_url,
        wake_event: @outbox_wake_event,
        poll_interval: app.outbox_poll_interval,
        max_retries: app.outbox_max_retries,
        max_age_hours: app.outbox_max_age_hours
      )

      relayer.start

      # Wait while still leader
      interruptible_sleep(1) while @running && @is_leader

      relayer.stop
    rescue StandardError => e
      log_error('outbox_relay', e)
    end

    def resume_workflow_safe(instance_id)
      app.resume_workflow(instance_id)
    rescue LockNotAcquiredError
      # Another worker got it, that's fine
    rescue StandardError => e
      log_error("resume_workflow(#{instance_id})", e)
    end

    def handle_timer_expired(timer)
      instance_id = timer[:instance_id]
      timer_id = timer[:timer_id]
      activity_id = timer[:activity_id]

      # Record timer expiration in history
      storage.append_history(
        instance_id: instance_id,
        activity_id: activity_id || timer_id,
        event_type: EventType::TIMER_EXPIRED,
        event_data: { timer_id: timer_id, expired_at: Time.now.iso8601 }
      )

      # Remove timer subscription
      storage.remove_timer(instance_id: instance_id, timer_id: timer_id)

      # Update status to running for resumption
      storage.update_instance_status(instance_id, Status::RUNNING)
    rescue StandardError => e
      log_error("handle_timer_expired(#{instance_id})", e)
    end

    def handle_subscription_timeout(sub)
      instance_id = sub[:instance_id]
      channel = sub[:channel]
      activity_id = sub[:activity_id]

      # Record timeout in history
      storage.append_history(
        instance_id: instance_id,
        activity_id: activity_id,
        event_type: EventType::MESSAGE_TIMEOUT,
        event_data: { channel: channel, timed_out_at: Time.now.iso8601 }
      )

      # Remove subscription
      storage.unsubscribe_from_channel(instance_id: instance_id, channel: channel)

      # Update status to running for resumption
      storage.update_instance_status(instance_id, Status::RUNNING)
    rescue StandardError => e
      log_error("handle_subscription_timeout(#{instance_id})", e)
    end

    def process_pending_messages
      # Find channels with waiting subscriptions
      waiting = storage.db[:channel_subscriptions]
        .where { Sequel.~(activity_id: nil) }
        .select(:channel, :instance_id, :mode, :activity_id, :cursor_message_id)
        .limit(100)
        .all

      delivered_count = 0

      waiting.each do |sub|
        break unless @running

        message = storage.get_next_message(
          channel: sub[:channel],
          mode: sub[:mode],
          instance_id: sub[:instance_id],
          cursor_id: sub[:cursor_message_id]
        )

        next unless message

        deliver_message(sub, message)
        delivered_count += 1
      end

      delivered_count
    end

    def deliver_message(sub, message)
      instance_id = sub[:instance_id]
      activity_id = sub[:activity_id]
      mode = sub[:mode]

      # For competing mode, try to claim
      if (mode == ChannelMode::COMPETING) && !storage.claim_message(message_id: message[:message_id],
                                                                    instance_id: instance_id)
        return
      end

      # Record message received in history
      storage.append_history(
        instance_id: instance_id,
        activity_id: activity_id,
        event_type: EventType::CHANNEL_MESSAGE_RECEIVED,
        event_data: {
          channel: sub[:channel],
          message_id: message[:message_id],
          data: message[:data],
          metadata: message[:metadata]
        }
      )

      # Clear activity_id from subscription (no longer waiting)
      storage.db[:channel_subscriptions]
        .where(instance_id: instance_id, channel: sub[:channel])
        .update(activity_id: nil, timeout_at: nil)

      # Update status to running for resumption
      storage.update_instance_status(instance_id, Status::RUNNING)
    rescue StandardError => e
      log_error("deliver_message(#{instance_id})", e)
    end

    def calculate_backoff(consecutive_empty)
      # Exponential backoff: 2, 4, 8, 16, 32, max 60 seconds + jitter
      base = WORKFLOW_POLL_INTERVAL
      exp = [consecutive_empty, 5].min
      backoff = base * (2**exp)
      [backoff, 60].min + (rand * 3)
    end

    def interruptible_sleep(seconds)
      deadline = Time.now + seconds
      sleep([0.5, deadline - Time.now].min) while @running && Time.now < deadline
    end

    def log_error(context, error)
      warn "[Shikibu::Worker] Error in #{context}: #{error.class}: #{error.message}"
      warn error.backtrace.first(5).join("\n") if error.backtrace
    end
  end
end
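
Taken as a whole, Worker assumes the app object passed to Worker.new exposes storage, worker_id, resume_workflow, outbox_enabled?, and the outbox/retention settings read above; in this gem those appear to come from Shikibu::App (data/lib/shikibu/app.rb), whose constructor is not shown in this hunk. A minimal usage sketch follows, with the App setup arguments treated as assumptions rather than the gem's documented API (see data/README.md for that):

# Hypothetical wiring -- the App construction details below are assumed, not taken from this diff.
require 'shikibu'

app = Shikibu::App.new(database_url: ENV['DATABASE_URL']) # assumed constructor signature
worker = Shikibu::Worker.new(app)

worker.start            # spawns resumption, message-delivery, and leader-election threads
worker.running?         # => true; worker.leader? turns true only if this process wins the system lock

at_exit { worker.stop } # flips @running, stops leader tasks, then joins worker threads (5s each)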