shikibu 0.1.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+
+module Shikibu
+  VERSION = '0.1.0'
+end
@@ -0,0 +1,389 @@
+# frozen_string_literal: true
+
+module Shikibu
+  # Background worker for workflow processing
+  class Worker
+    WORKFLOW_POLL_INTERVAL = 1
+    TIMER_CHECK_INTERVAL = 10
+    STALE_LOCK_INTERVAL = 60
+    MESSAGE_CHECK_INTERVAL = 5
+    MESSAGE_CLEANUP_INTERVAL = 3600 # 1 hour
+    DEFAULT_MESSAGE_RETENTION_DAYS = 7
+
+    attr_reader :app, :storage, :worker_id, :resume_wake_event, :message_wake_event, :outbox_wake_event
+
+    def initialize(app)
+      @app = app
+      @storage = app.storage
+      @worker_id = app.worker_id
+      @running = false
+      @threads = []
+      @is_leader = false
+      @leader_tasks = []
+
+      # Wake events for NOTIFY integration
+      @resume_wake_event = Notify::WakeEvent.new
+      @message_wake_event = Notify::WakeEvent.new
+      @outbox_wake_event = Notify::WakeEvent.new
+    end
+
+    def start
+      return if @running
+
+      @running = true
+
+      # All workers run these
+      @threads << Thread.new { run_workflow_resumption }
+      @threads << Thread.new { run_message_delivery }
+
+      # Leader election and leader-only tasks
+      @threads << Thread.new { run_leader_election }
+    end
+
+    def stop
+      @running = false
+      stop_leader_tasks
+      @threads.each { |t| t.join(5) }
+      @threads.clear
+    end
+
+    def running?
+      @running
+    end
+
+    def leader?
+      @is_leader
+    end
+
+    private
+
+    # Resume workflows that are in 'running' status but not locked
+    def run_workflow_resumption
+      consecutive_empty = 0
+
+      while @running
+        begin
+          workflows = storage.find_resumable_workflows(limit: 10)
+
+          if workflows.empty?
+            consecutive_empty += 1
+            backoff = calculate_backoff(consecutive_empty)
+
+            # Wait with wake event support (reset backoff on NOTIFY wake)
+            consecutive_empty = 0 if @resume_wake_event.wait(backoff)
+          else
+            consecutive_empty = 0
+            workflows.each do |wf|
+              break unless @running
+
+              resume_workflow_safe(wf[:instance_id])
+            end
+          end
+        rescue StandardError => e
+          log_error('workflow_resumption', e)
+          interruptible_sleep(5)
+        end
+      end
+    end
+
+    # Deliver messages to waiting workflows
+    def run_message_delivery
+      while @running
+        begin
+          # Find subscriptions with pending messages
+          messages_delivered = process_pending_messages
+
+          # Wait with wake event support
+          @message_wake_event.wait(MESSAGE_CHECK_INTERVAL) if messages_delivered.zero?
+        rescue StandardError => e
+          log_error('message_delivery', e)
+          interruptible_sleep(5)
+        end
+      end
+    end
+
+    # Leader election using system locks
+    def run_leader_election
+      while @running
+        begin
+          acquired = storage.try_acquire_system_lock('shikibu_leader', worker_id, timeout: 30)
+
+          if acquired && !@is_leader
+            @is_leader = true
+            start_leader_tasks
+          elsif !acquired && @is_leader
+            @is_leader = false
+            stop_leader_tasks
+          elsif acquired
+            # Refresh lock
+            storage.refresh_system_lock('shikibu_leader', worker_id, timeout: 30)
+          end
+
+          interruptible_sleep(10 + (rand * 3))
+        rescue StandardError => e
+          log_error('leader_election', e)
+          interruptible_sleep(5)
+        end
+      end
+
+      # Release lock on shutdown
+      storage.release_system_lock('shikibu_leader', worker_id) if @is_leader
+    end
+
+    def start_leader_tasks
+      @leader_tasks << Thread.new { run_timer_check }
+      @leader_tasks << Thread.new { run_stale_lock_cleanup }
+      @leader_tasks << Thread.new { run_timeout_check }
+      @leader_tasks << Thread.new { run_message_cleanup }
+      @leader_tasks << Thread.new { run_outbox_relay } if app.outbox_enabled?
+    end
+
+    def stop_leader_tasks
+      @is_leader = false # Signal threads to stop (they check @running && @is_leader)
+      @leader_tasks.each { |t| t.join(2) } # Wait up to 2 seconds for graceful shutdown
+      @leader_tasks.clear
+    end
+
+    # Check for expired timers (leader only)
+    def run_timer_check
+      while @running && @is_leader
+        begin
+          expired = storage.find_expired_timers(limit: 100)
+
+          expired.each do |timer|
+            break unless @running
+
+            handle_timer_expired(timer)
+          end
+
+          interruptible_sleep(TIMER_CHECK_INTERVAL)
+        rescue StandardError => e
+          log_error('timer_check', e)
+          interruptible_sleep(5)
+        end
+      end
+    end
+
+    # Cleanup stale locks and resume workflows (leader only)
+    def run_stale_lock_cleanup
+      while @running && @is_leader
+        begin
+          stale = storage.find_stale_locked_workflows(stale_threshold_seconds: 300)
+
+          stale.each do |wf|
+            break unless @running
+
+            instance_id = wf[:instance_id]
+
+            # Force release lock
+            storage.db[:workflow_instances]
+              .where(instance_id: instance_id)
+              .update(locked_by: nil, locked_at: nil, lock_expires_at: nil)
+
+            # Resume the workflow (like Edda's auto_resume_stale_workflows_periodically)
+            resume_workflow_safe(instance_id)
+          end
+
+          interruptible_sleep(STALE_LOCK_INTERVAL)
+        rescue StandardError => e
+          log_error('stale_lock_cleanup', e)
+          interruptible_sleep(10)
+        end
+      end
+    end
+
+    # Check for timed out subscriptions (leader only)
+    def run_timeout_check
+      while @running && @is_leader
+        begin
+          timed_out = storage.find_timed_out_subscriptions(limit: 100)
+
+          timed_out.each do |sub|
+            break unless @running
+
+            handle_subscription_timeout(sub)
+          end
+
+          interruptible_sleep(TIMER_CHECK_INTERVAL)
+        rescue StandardError => e
+          log_error('timeout_check', e)
+          interruptible_sleep(5)
+        end
+      end
+    end
+
+    # Cleanup old channel messages (leader only)
+    # Messages older than retention_days are deleted to prevent database growth
+    def run_message_cleanup
+      retention_days = app.message_retention_days || DEFAULT_MESSAGE_RETENTION_DAYS
+
+      while @running && @is_leader
+        begin
+          deleted_count = storage.cleanup_old_channel_messages(retention_days: retention_days)
+
+          warn "[Shikibu::Worker] Cleaned up #{deleted_count} old channel messages" if deleted_count.positive?
+
+          interruptible_sleep(MESSAGE_CLEANUP_INTERVAL)
+        rescue StandardError => e
+          log_error('message_cleanup', e)
+          interruptible_sleep(60)
+        end
+      end
+    end
+
+    # Relay outbox events to external message broker (leader only)
+    def run_outbox_relay
+      relayer = Outbox::Relayer.new(
+        storage: storage,
+        broker_url: app.broker_url,
+        wake_event: @outbox_wake_event,
+        poll_interval: app.outbox_poll_interval,
+        max_retries: app.outbox_max_retries,
+        max_age_hours: app.outbox_max_age_hours
+      )
+
+      relayer.start
+
+      # Wait while still leader
+      interruptible_sleep(1) while @running && @is_leader
+
+      relayer.stop
+    rescue StandardError => e
+      log_error('outbox_relay', e)
+    end
+
+    def resume_workflow_safe(instance_id)
+      app.resume_workflow(instance_id)
+    rescue LockNotAcquiredError
+      # Another worker got it, that's fine
+    rescue StandardError => e
+      log_error("resume_workflow(#{instance_id})", e)
+    end
+
+    def handle_timer_expired(timer)
+      instance_id = timer[:instance_id]
+      timer_id = timer[:timer_id]
+      activity_id = timer[:activity_id]
+
+      # Record timer expiration in history
+      storage.append_history(
+        instance_id: instance_id,
+        activity_id: activity_id || timer_id,
+        event_type: EventType::TIMER_EXPIRED,
+        event_data: { timer_id: timer_id, expired_at: Time.now.iso8601 }
+      )
+
+      # Remove timer subscription
+      storage.remove_timer(instance_id: instance_id, timer_id: timer_id)
+
+      # Update status to running for resumption
+      storage.update_instance_status(instance_id, Status::RUNNING)
+    rescue StandardError => e
+      log_error("handle_timer_expired(#{instance_id})", e)
+    end
+
+    def handle_subscription_timeout(sub)
+      instance_id = sub[:instance_id]
+      channel = sub[:channel]
+      activity_id = sub[:activity_id]
+
+      # Record timeout in history
+      storage.append_history(
+        instance_id: instance_id,
+        activity_id: activity_id,
+        event_type: EventType::MESSAGE_TIMEOUT,
+        event_data: { channel: channel, timed_out_at: Time.now.iso8601 }
+      )
+
+      # Remove subscription
+      storage.unsubscribe_from_channel(instance_id: instance_id, channel: channel)
+
+      # Update status to running for resumption
+      storage.update_instance_status(instance_id, Status::RUNNING)
+    rescue StandardError => e
+      log_error("handle_subscription_timeout(#{instance_id})", e)
+    end
+
+    def process_pending_messages
+      # Find channels with waiting subscriptions
+      waiting = storage.db[:channel_subscriptions]
+        .where { Sequel.~(activity_id: nil) }
+        .select(:channel, :instance_id, :mode, :activity_id, :cursor_message_id)
+        .limit(100)
+        .all
+
+      delivered_count = 0
+
+      waiting.each do |sub|
+        break unless @running
+
+        message = storage.get_next_message(
+          channel: sub[:channel],
+          mode: sub[:mode],
+          instance_id: sub[:instance_id],
+          cursor_id: sub[:cursor_message_id]
+        )
+
+        next unless message
+
+        deliver_message(sub, message)
+        delivered_count += 1
+      end
+
+      delivered_count
+    end
+
+    def deliver_message(sub, message)
+      instance_id = sub[:instance_id]
+      activity_id = sub[:activity_id]
+      mode = sub[:mode]
+
+      # For competing mode, try to claim
+      if mode == ChannelMode::COMPETING &&
+         !storage.claim_message(message_id: message[:message_id], instance_id: instance_id)
+        return
+      end
+
+      # Record message received in history
+      storage.append_history(
+        instance_id: instance_id,
+        activity_id: activity_id,
+        event_type: EventType::CHANNEL_MESSAGE_RECEIVED,
+        event_data: {
+          channel: sub[:channel],
+          message_id: message[:message_id],
+          data: message[:data],
+          metadata: message[:metadata]
+        }
+      )
+
+      # Clear activity_id from subscription (no longer waiting)
+      storage.db[:channel_subscriptions]
+        .where(instance_id: instance_id, channel: sub[:channel])
+        .update(activity_id: nil, timeout_at: nil)
+
+      # Update status to running for resumption
+      storage.update_instance_status(instance_id, Status::RUNNING)
+    rescue StandardError => e
+      log_error("deliver_message(#{instance_id})", e)
+    end
+
+    def calculate_backoff(consecutive_empty)
+      # Exponential backoff: 2, 4, 8, 16, 32, capped at 60 seconds, plus jitter
+      base = WORKFLOW_POLL_INTERVAL
+      exp = [consecutive_empty, 5].min
+      backoff = base * (2**exp)
+      [backoff, 60].min + (rand * 3)
+    end
+
+    def interruptible_sleep(seconds)
+      deadline = Time.now + seconds
+      sleep([0.5, deadline - Time.now].min) while @running && Time.now < deadline
+    end
+
+    def log_error(context, error)
+      warn "[Shikibu::Worker] Error in #{context}: #{error.class}: #{error.message}"
+      warn error.backtrace.first(5).join("\n") if error.backtrace
+    end
+  end
+end