patient_http-sidekiq 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +7 -0
  2. data/ARCHITECTURE.md +496 -0
  3. data/CHANGELOG.md +16 -0
  4. data/MIT-LICENSE +20 -0
  5. data/README.md +620 -0
  6. data/VERSION +1 -0
  7. data/lib/patient_http/sidekiq/callback_worker.rb +96 -0
  8. data/lib/patient_http/sidekiq/configuration.rb +175 -0
  9. data/lib/patient_http/sidekiq/context.rb +61 -0
  10. data/lib/patient_http/sidekiq/lifecycle_hooks.rb +42 -0
  11. data/lib/patient_http/sidekiq/processor_observer.rb +49 -0
  12. data/lib/patient_http/sidekiq/request_executor.rb +104 -0
  13. data/lib/patient_http/sidekiq/request_worker.rb +57 -0
  14. data/lib/patient_http/sidekiq/stats.rb +119 -0
  15. data/lib/patient_http/sidekiq/task_handler.rb +81 -0
  16. data/lib/patient_http/sidekiq/task_monitor.rb +542 -0
  17. data/lib/patient_http/sidekiq/task_monitor_thread.rb +154 -0
  18. data/lib/patient_http/sidekiq/web_ui/assets/patient-http/css/patient_http.css +249 -0
  19. data/lib/patient_http/sidekiq/web_ui/locales/ar.yml +26 -0
  20. data/lib/patient_http/sidekiq/web_ui/locales/cs.yml +26 -0
  21. data/lib/patient_http/sidekiq/web_ui/locales/da.yml +26 -0
  22. data/lib/patient_http/sidekiq/web_ui/locales/de.yml +26 -0
  23. data/lib/patient_http/sidekiq/web_ui/locales/el.yml +26 -0
  24. data/lib/patient_http/sidekiq/web_ui/locales/en.yml +26 -0
  25. data/lib/patient_http/sidekiq/web_ui/locales/es.yml +26 -0
  26. data/lib/patient_http/sidekiq/web_ui/locales/fa.yml +26 -0
  27. data/lib/patient_http/sidekiq/web_ui/locales/fr.yml +26 -0
  28. data/lib/patient_http/sidekiq/web_ui/locales/gd.yml +26 -0
  29. data/lib/patient_http/sidekiq/web_ui/locales/he.yml +26 -0
  30. data/lib/patient_http/sidekiq/web_ui/locales/hi.yml +26 -0
  31. data/lib/patient_http/sidekiq/web_ui/locales/it.yml +26 -0
  32. data/lib/patient_http/sidekiq/web_ui/locales/ja.yml +26 -0
  33. data/lib/patient_http/sidekiq/web_ui/locales/ko.yml +26 -0
  34. data/lib/patient_http/sidekiq/web_ui/locales/lt.yml +26 -0
  35. data/lib/patient_http/sidekiq/web_ui/locales/nb.yml +26 -0
  36. data/lib/patient_http/sidekiq/web_ui/locales/nl.yml +26 -0
  37. data/lib/patient_http/sidekiq/web_ui/locales/pl.yml +26 -0
  38. data/lib/patient_http/sidekiq/web_ui/locales/pt-BR.yml +26 -0
  39. data/lib/patient_http/sidekiq/web_ui/locales/pt.yml +26 -0
  40. data/lib/patient_http/sidekiq/web_ui/locales/ru.yml +26 -0
  41. data/lib/patient_http/sidekiq/web_ui/locales/sv.yml +26 -0
  42. data/lib/patient_http/sidekiq/web_ui/locales/ta.yml +26 -0
  43. data/lib/patient_http/sidekiq/web_ui/locales/tr.yml +26 -0
  44. data/lib/patient_http/sidekiq/web_ui/locales/uk.yml +26 -0
  45. data/lib/patient_http/sidekiq/web_ui/locales/ur.yml +26 -0
  46. data/lib/patient_http/sidekiq/web_ui/locales/vi.yml +26 -0
  47. data/lib/patient_http/sidekiq/web_ui/locales/zh-CN.yml +26 -0
  48. data/lib/patient_http/sidekiq/web_ui/locales/zh-TW.yml +26 -0
  49. data/lib/patient_http/sidekiq/web_ui/views/patient_http.html.erb +142 -0
  50. data/lib/patient_http/sidekiq/web_ui.rb +69 -0
  51. data/lib/patient_http/sidekiq.rb +328 -0
  52. data/lib/patient_http-sidekiq.rb +3 -0
  53. data/patient_http-sidekiq.gemspec +46 -0
  54. metadata +140 -0
@@ -0,0 +1,542 @@
1
+ # frozen_string_literal: true
2
+
3
+ module PatientHttp
4
+ module Sidekiq
5
+ # Manages inflight request tracking in Redis for crash recovery.
6
+ #
7
+ # This class maintains a sorted set of request IDs indexed by timestamp
8
+ # and a hash of request payloads. It provides distributed locking for
9
+ # orphan detection and automatic re-enqueuing of requests that were
10
+ # interrupted by process crashes.
11
+ #
12
+ # Task ID format: "hostname:pid:hex/request-uuid"
13
+ # - hostname: sanitized hostname (colons and slashes replaced with dashes)
14
+ # - pid: process ID
15
+ # - hex: 16-character random hex string (8 random bytes) for uniqueness
16
+ # - request-uuid: unique identifier for the request
17
+ class TaskMonitor
18
+ # Redis keys (PROCESS_SET_KEY also prefixes the per-process max_connections keys)
19
+ INFLIGHT_INDEX_KEY = "sidekiq:patient_http:inflight_index"
20
+ INFLIGHT_JOBS_KEY = "sidekiq:patient_http:inflight_jobs"
21
+ PROCESS_SET_KEY = "sidekiq:patient_http:processes"
22
+ GC_LOCK_KEY = "sidekiq:patient_http:gc_lock"
23
+ GC_LAST_RUN_KEY = "sidekiq:patient_http:gc_last_run"
24
+
25
+ # Lua script for atomic orphan removal.
26
+ # Checks if the task is still orphaned (timestamp < threshold) and removes it atomically.
27
+ # This prevents race conditions where a heartbeat could update the timestamp between
28
+ # the check and the removal.
29
+ #
30
+ # KEYS[1] = index key (sorted set)
31
+ # KEYS[2] = jobs key (hash)
32
+ # ARGV[1] = request_id
33
+ # ARGV[2] = threshold_ms
34
+ #
35
+ # Returns: [1, job_payload or nil] when removed; [0] otherwise (Redis truncates the Lua table at the first nil)
36
+ REMOVE_IF_ORPHANED_SCRIPT = <<~LUA
37
+ local index_key = KEYS[1]
38
+ local jobs_key = KEYS[2]
39
+ local request_id = ARGV[1]
40
+ local threshold_ms = tonumber(ARGV[2])
41
+
42
+ local current_score = redis.call('ZSCORE', index_key, request_id)
43
+ if not current_score or tonumber(current_score) >= threshold_ms then
44
+ return {0, nil} -- Not orphaned or already removed
45
+ end
46
+
47
+ local job_payload = redis.call('HGET', jobs_key, request_id)
48
+ redis.call('ZREM', index_key, request_id)
49
+ redis.call('HDEL', jobs_key, request_id)
50
+ return {1, job_payload}
51
+ LUA
52
+
53
+ # @return [Configuration] the configuration object
54
+ attr_reader :config
55
+
56
+ class << self
57
# Count the requests currently tracked as inflight.
#
# @return [Integer] cardinality of the inflight index sorted set
def inflight_count
  ::Sidekiq.redis { |conn| conn.zcard(INFLIGHT_INDEX_KEY) }
end
65
+
66
# Summarize inflight requests and connection capacity per "hostname:pid".
#
# Registered process identifiers whose max_connections key is missing are
# treated as gone: they are left out of the result and removed from the
# process set.
#
# @return [Hash] hash of "hostname:pid" => { inflight: Integer, max_capacity: Integer }
def inflight_counts_by_process
  registered = capacities = task_ids = nil

  ::Sidekiq.redis do |conn|
    registered = conn.smembers(PROCESS_SET_KEY)
    return {} if registered.empty?

    capacities = conn.mget(*registered.map { |id| max_connections_key_for(id) })
    task_ids = conn.zrange(INFLIGHT_INDEX_KEY, 0, -1)
  end

  # The part of a task ID before the first "/" is the owning process identifier.
  tasks_per_process = task_ids.group_by { |task_id| task_id.split("/", 2).first }

  live, stale = registered.zip(capacities).partition { |_id, capacity| !capacity.nil? }

  totals = live.each_with_object({}) do |(id, capacity), acc|
    # Several monitor instances of one OS process share a "hostname:pid" bucket.
    host_pid = id.split(":", 3).first(2).join(":")
    entry = (acc[host_pid] ||= {inflight: 0, max_capacity: 0})
    entry[:inflight] += tasks_per_process.fetch(id, []).size
    entry[:max_capacity] += capacity.to_i
  end

  stale_ids = stale.map(&:first)
  ::Sidekiq.redis { |conn| conn.srem(PROCESS_SET_KEY, stale_ids) } unless stale_ids.empty?

  totals
end
116
+
117
# Sum the max connections reported by every active process.
#
# @return [Integer] sum of max connections from all active processes
def total_max_connections
  inflight_counts_by_process.each_value.sum { |counts| counts[:max_capacity] }
end
123
+
124
# List every process identifier currently stored in the process set.
#
# @return [Array<String>] list of process identifiers
def registered_process_ids
  ::Sidekiq.redis { |conn| conn.smembers(PROCESS_SET_KEY) }
end
132
+
133
# Delete the inflight index, job payloads, process set, GC lock and GC
# timestamp keys. Refuses to run unless PatientHttp.testing? is true.
#
# @raise [RuntimeError] if called outside of test environment
# @return [void]
# @api private
def clear_all!
  raise "clear_all! is only allowed in test environment" unless PatientHttp.testing?

  keys = [INFLIGHT_INDEX_KEY, INFLIGHT_JOBS_KEY, PROCESS_SET_KEY, GC_LOCK_KEY, GC_LAST_RUN_KEY]
  ::Sidekiq.redis { |conn| conn.del(*keys) }
end
147
+
148
+ private
149
+
150
# Compose the Redis key that stores max connections for one process.
#
# @param process_id [String] the process identifier
#
# @return [String] the Redis key for max connections
def max_connections_key_for(process_id)
  [PROCESS_SET_KEY, process_id, "max_connections"].join(":")
end
158
+ end
159
+
160
# Store the configuration and build this instance's process identifier,
# "hostname:pid:hex", where colons and slashes in the hostname are replaced
# with dashes and hex is 16 random hex characters.
#
# @param config [Configuration] the configuration object
def initialize(config)
  @config = config
  safe_host = ::Socket.gethostname.force_encoding("UTF-8").tr(":/", "-")
  @lock_identifier = [safe_host, ::Process.pid, SecureRandom.hex(8)].join(":").freeze
end
167
+
168
# Record a request as inflight: add its task ID to the index with the current
# time (milliseconds) as score, store its Sidekiq job payload as JSON, and
# refresh the TTL of both keys, all in one MULTI transaction.
#
# @param task [RequestTask] the request task to register
#
# @return [void]
def register(task)
  now_ms = (Time.now.to_f * 1000).round
  serialized_job = JSON.generate(task.task_handler.sidekiq_job)
  member = full_task_id(task.id)
  ttl = inflight_ttl

  ::Sidekiq.redis do |conn|
    conn.multi do |tx|
      tx.zadd(INFLIGHT_INDEX_KEY, now_ms, member)
      tx.hset(INFLIGHT_JOBS_KEY, member, serialized_job)
      tx.expire(INFLIGHT_INDEX_KEY, ttl)
      tx.expire(INFLIGHT_JOBS_KEY, ttl)
    end
  end
end
187
+
188
# Drop a request from the inflight index and the payload hash in one
# MULTI transaction (called when the request completes).
#
# @param task [RequestTask] the request task to unregister
#
# @return [void]
def unregister(task)
  member = full_task_id(task.id)

  ::Sidekiq.redis do |conn|
    conn.multi do |tx|
      tx.zrem(INFLIGHT_INDEX_KEY, member)
      tx.hdel(INFLIGHT_JOBS_KEY, member)
    end
  end
end
203
+
204
# Deregister this process: remove its identifier from the process set and
# delete its max_connections key.
#
# @return [void]
def remove_process
  capacity_key = max_connections_key

  ::Sidekiq.redis do |conn|
    conn.srem(PROCESS_SET_KEY, @lock_identifier)
    conn.del(capacity_key)
  end
end
213
+
214
# Refresh the heartbeat score of several requests in one pipelined round trip.
#
# ZADD is issued with XX, so only members already in the index are updated;
# a request that has since been unregistered is not re-added.
#
# @param task_ids [Array<String>] the request IDs to update
#
# @return [void]
def update_heartbeats(task_ids)
  return if task_ids.empty?

  now_ms = (Time.now.to_f * 1000).round
  members = task_ids.map { |id| full_task_id(id) }

  ::Sidekiq.redis do |conn|
    conn.pipelined do |pipe|
      members.each { |member| pipe.call("ZADD", INFLIGHT_INDEX_KEY, "XX", now_ms, member) }
    end
  end
end
232
+
233
# Tell whether a task currently has an entry in the inflight index.
#
# @param task [RequestTask] the request task
#
# @return [Boolean] true if registered, false otherwise
# @api private
def registered?(task)
  member = full_task_id(task.id)
  score = ::Sidekiq.redis { |conn| conn.zscore(INFLIGHT_INDEX_KEY, member) }
  !score.nil?
end
244
+
245
# Read the last heartbeat (the task's score in the inflight index).
#
# @param task [RequestTask] the request task
#
# @return [Integer, nil] timestamp in milliseconds, or nil if not registered
# @api private
def heartbeat_timestamp_for(task)
  member = full_task_id(task.id)
  raw = ::Sidekiq.redis { |conn| conn.zscore(INFLIGHT_INDEX_KEY, member) }
  raw.nil? ? nil : raw.to_i
end
257
+
258
# List the inflight task IDs that belong to this monitor instance, i.e.
# those prefixed with this instance's process identifier.
#
# @return [Array<String>] list of full task IDs
# @api private
def registered_task_ids
  own_prefix = "#{@lock_identifier}/"
  every_id = ::Sidekiq.redis { |conn| conn.zrange(INFLIGHT_INDEX_KEY, 0, -1) }
  every_id.select { |id| id.start_with?(own_prefix) }
end
267
+
268
# Qualify a request ID with this instance's process identifier, producing
# "hostname:pid:hex/request-id".
#
# @param task_id [String] the request ID
# @return [String] the unique task ID
def full_task_id(task_id)
  [@lock_identifier, task_id].join("/")
end
275
+
276
# Announce this process: add it to the process set, store its configured
# max connections, and refresh the TTL on both keys in one MULTI transaction.
#
# @return [void]
def ping_process
  capacity_key = max_connections_key

  ::Sidekiq.redis do |conn|
    conn.multi do |tx|
      tx.sadd(PROCESS_SET_KEY, @lock_identifier)
      tx.set(capacity_key, @config.max_connections)
      tx.expire(PROCESS_SET_KEY, inflight_ttl)
      tx.expire(capacity_key, process_ttl)
    end
  end
end
291
+
292
# Attempt to take the distributed garbage collection lock.
#
# Issues SET with NX (only if absent) and EX (expiry), storing this
# instance's identifier as the lock value.
#
# @return [Boolean] true if lock acquired, false otherwise
def acquire_gc_lock
  reply = ::Sidekiq.redis do |conn|
    conn.set(GC_LOCK_KEY, @lock_identifier, nx: true, ex: gc_lock_ttl)
  end
  # A nil reply means the key already existed.
  reply ? true : false
end
302
+
303
# Give up the garbage collection lock, but only if this instance holds it.
#
# The key is WATCHed before it is read, so the DEL inside MULTI is aborted if
# the lock value changed in between.
#
# @return [Boolean] true if the lock was released, false otherwise
def release_gc_lock
  ::Sidekiq.redis do |conn|
    conn.watch(GC_LOCK_KEY)

    unless conn.get(GC_LOCK_KEY) == @lock_identifier
      # Not ours (or already gone): drop the WATCH and report failure.
      conn.unwatch
      next false
    end

    # MULTI yields nil when the watched key changed and the transaction aborted.
    outcome = conn.multi { |tx| tx.del(GC_LOCK_KEY) }
    !outcome.nil?
  end
end
332
+
333
# Decide whether a garbage collection pass is due.
#
# A pass is due when no last-run timestamp is stored, or when at least
# config.heartbeat_interval seconds have passed since the stored timestamp
# (milliseconds since the epoch).
#
# @return [Boolean] true if GC should run, false otherwise
def gc_needed?
  stamp_ms = ::Sidekiq.redis { |conn| conn.get(GC_LAST_RUN_KEY) }
  return true if stamp_ms.nil?

  elapsed = Time.now - Time.at(stamp_ms.to_f / 1000.0)
  elapsed >= config.heartbeat_interval
end
349
+
350
# Store the current time (milliseconds since the epoch) as the last GC run.
#
# The key is written with an expiry of gc_last_run_ttl seconds, so a stale
# timestamp eventually disappears and gc_needed? reports true again.
#
# @return [void]
def record_gc_run
  now_ms = (Time.now.to_f * 1000).floor
  ::Sidekiq.redis { |conn| conn.set(GC_LAST_RUN_KEY, now_ms, ex: gc_last_run_ttl) }
end
361
+
362
# Locate requests whose heartbeat is older than the threshold and push their
# jobs back onto Sidekiq.
#
# @param orphan_threshold_seconds [Numeric] age threshold for considering a request orphaned
# @param logger [Logger] logger for output
#
# @return [Integer] number of orphaned requests re-enqueued
def cleanup_orphaned_requests(orphan_threshold_seconds, logger)
  cutoff_ms = calculate_threshold_timestamp(orphan_threshold_seconds)
  candidates = fetch_orphaned_requests(cutoff_ms)

  candidates.empty? ? 0 : reenqueue_orphaned_jobs(candidates, cutoff_ms, logger)
end
376
+
377
+ private
378
+
379
# Convert an age in seconds into an absolute cutoff: the wall-clock time
# that many seconds ago, in milliseconds since the epoch.
#
# @param orphan_threshold_seconds [Numeric] age threshold in seconds
#
# @return [Integer] threshold timestamp in milliseconds
def calculate_threshold_timestamp(orphan_threshold_seconds)
  cutoff_seconds = Time.now.to_f - orphan_threshold_seconds
  (cutoff_seconds * 1000).round
end
387
+
388
# Fetch orphaned request IDs and their job payloads.
#
# A request is a candidate when its heartbeat score is at or below the
# threshold. Candidates owned by a process that is still alive are skipped,
# so a live process with delayed heartbeats does not get its work duplicated.
#
# @param threshold_timestamp_ms [Integer] threshold timestamp in milliseconds
#
# @return [Array<Array(String, String)>] array of [request_id, job_payload] pairs
def fetch_orphaned_requests(threshold_timestamp_ms)
  stale_request_ids = ::Sidekiq.redis do |redis|
    redis.zrange(INFLIGHT_INDEX_KEY, "-inf", threshold_timestamp_ms, byscore: true)
  end
  return [] if stale_request_ids.empty?

  # The part of a request ID before the first "/" is the owning process identifier.
  request_ids_by_process = stale_request_ids.group_by { |request_id| request_id.split("/", 2).first }
  live_ids = live_process_ids(request_ids_by_process.keys)

  orphaned_request_ids = request_ids_by_process
    .reject { |process_id, _ids| live_ids.include?(process_id) }
    .values
    .flatten
  return [] if orphaned_request_ids.empty?

  job_payloads = ::Sidekiq.redis do |redis|
    redis.hmget(INFLIGHT_JOBS_KEY, *orphaned_request_ids)
  end

  # An ID without a payload cannot be re-enqueued, so it is dropped here.
  orphaned_request_ids.zip(job_payloads).reject { |_id, payload| payload.nil? }
end

# Select the process identifiers that are still alive.
#
# Membership in PROCESS_SET_KEY alone is not proof of liveness: a crashed
# process never removes itself from the set. A process therefore counts as
# live only when it is in the set AND its max_connections key (written with a
# TTL by ping_process) still exists.
#
# @param candidate_ids [Array<String>] process identifiers to check
#
# @return [Array<String>] the subset of candidate_ids that are live
def live_process_ids(candidate_ids)
  ::Sidekiq.redis do |redis|
    registered = redis.smembers(PROCESS_SET_KEY) & candidate_ids
    next [] if registered.empty?

    capacity_keys = registered.map { |id| "#{PROCESS_SET_KEY}:#{id}:max_connections" }
    registered.zip(redis.mget(*capacity_keys)).filter_map { |id, capacity| id unless capacity.nil? }
  end
end
418
+
419
# Try to re-enqueue every orphaned job and count the successes.
#
# @param orphaned_requests [Array<Array(String, String)>] array of [request_id, job_payload] pairs
# @param threshold_timestamp_ms [Integer] threshold timestamp in milliseconds
# @param logger [Logger] logger for output
#
# @return [Integer] number of jobs successfully re-enqueued
def reenqueue_orphaned_jobs(orphaned_requests, threshold_timestamp_ms, logger)
  orphaned_requests.count do |request_id, job_payload|
    reenqueue_orphaned_job(request_id, job_payload, threshold_timestamp_ms, logger)
  end
end
437
+
438
# Re-enqueue one orphaned job.
#
# The registry entry is removed first, via remove_if_orphaned, and the job is
# pushed only when that removal succeeded. Any error is logged and reported
# as false.
#
# @param request_id [String] the request ID
# @param job_payload [String] the JSON job payload (used as fallback)
# @param threshold_timestamp_ms [Integer] threshold timestamp in milliseconds
# @param logger [Logger] logger for output
#
# @return [Boolean] true if successfully re-enqueued, false otherwise
def reenqueue_orphaned_job(request_id, job_payload, threshold_timestamp_ms, logger)
  removed, stored_payload = remove_if_orphaned(request_id, threshold_timestamp_ms)
  return false if removed != 1

  # Prefer the payload read during removal; fall back to the one fetched earlier.
  serialized = stored_payload || job_payload
  return false if serialized.nil?

  job = JSON.parse(serialized)
  ::Sidekiq::Client.push(job)

  logger&.info(
    "[PatientHttp::Sidekiq] Re-enqueued orphaned request #{request_id} to #{job["class"]}"
  )

  true
rescue => e
  logger&.error(
    "[PatientHttp::Sidekiq] Failed to re-enqueue orphaned request #{request_id}: #{e.class} - #{e.message}"
  )
  false
end
475
+
476
# Run REMOVE_IF_ORPHANED_SCRIPT for one request.
#
# The script checks the request's score against the threshold and removes
# the index entry and payload in one EVAL call.
#
# @param request_id [String] the request ID
# @param threshold_timestamp_ms [Integer] threshold timestamp in milliseconds
#
# @return [Array(Integer, String)] [removed (0/1), job_payload or nil]
def remove_if_orphaned(request_id, threshold_timestamp_ms)
  script_keys = [INFLIGHT_INDEX_KEY, INFLIGHT_JOBS_KEY]
  script_args = [request_id, threshold_timestamp_ms.to_s]

  # EVAL script numkeys key1 key2 arg1 arg2
  ::Sidekiq.redis do |conn|
    conn.call("EVAL", REMOVE_IF_ORPHANED_SCRIPT, script_keys.size, *script_keys, *script_args)
  end
end
499
+
500
# Calculate the TTL for inflight data structures.
# Should be significantly longer than the orphan threshold.
#
# The value is rounded up to a whole number of seconds because it is passed
# to Redis EXPIRE, which rejects fractional values; a fractional
# orphan_threshold would otherwise yield a Float.
#
# @return [Integer] TTL in seconds
def inflight_ttl
  # 3x the orphan threshold, with a minimum of 1 hour
  [(config.orphan_threshold * 3).ceil, 3600].max
end
508
+
509
# Calculate the TTL for the garbage collection lock.
# Should be a bit longer than the heartbeat interval.
#
# The value is rounded up to a whole number of seconds because it is used as
# the EX argument of SET, which requires an integer; a fractional
# heartbeat_interval would otherwise yield a Float.
#
# @return [Integer] TTL in seconds
def gc_lock_ttl
  # 2x the heartbeat interval, with a minimum of 120 seconds
  [(config.heartbeat_interval * 2).ceil, 120].max
end
517
+
518
# Compute how long the last-GC-run timestamp is kept: 1.5x the heartbeat
# interval, rounded to the nearest whole second.
#
# @return [Integer] TTL in seconds
def gc_last_run_ttl
  scaled = config.heartbeat_interval * 1.5
  scaled.round
end
527
+
528
# Calculate the TTL for the process max_connections key.
# Must be longer than heartbeat_interval so the key survives between heartbeats.
#
# The value is rounded up to a whole number of seconds because it is passed
# to Redis EXPIRE, which rejects fractional values; a fractional
# heartbeat_interval would otherwise yield a Float.
#
# @return [Integer] TTL in seconds
def process_ttl
  # 2x the heartbeat interval so the key survives between heartbeats
  (config.heartbeat_interval * 2).ceil
end
536
+
537
# Redis key holding this instance's max connections value.
#
# @return [String] the Redis key for this process's max connections
def max_connections_key
  [PROCESS_SET_KEY, @lock_identifier, "max_connections"].join(":")
end
540
+ end
541
+ end
542
+ end
@@ -0,0 +1,154 @@
1
+ # frozen_string_literal: true
2
+
3
+ module PatientHttp
4
+ module Sidekiq
5
+ # Background thread that maintains heartbeats and performs garbage collection
6
+ # for in-flight HTTP requests.
7
+ class TaskMonitorThread
8
+ include PatientHttp::TimeHelper
9
+
10
+ # Maximum seconds to sleep between monitor thread checks
11
+ MAX_MONITOR_SLEEP = 5.0
12
+
13
+ # @return [Configuration] the configuration object
14
+ attr_reader :config
15
+
16
+ # @return [TaskMonitor] the inflight request registry
17
+ attr_reader :task_monitor
18
+
19
# Set up the monitor in a stopped state; no thread is created until #start.
#
# @param config [Configuration] the configuration object
# @param task_monitor [TaskMonitor] the inflight request registry
# @param inflight_ids_callback [Proc] callback to get current inflight request IDs
# @return [void]
def initialize(config, task_monitor, inflight_ids_callback)
  @thread = nil
  @running = Concurrent::AtomicBoolean.new(false)
  @stop_signal = Concurrent::Event.new

  @config = config
  @task_monitor = task_monitor
  @inflight_ids_callback = inflight_ids_callback
end
33
+
34
# Start the monitor thread.
#
# Does nothing when the monitor is already running. If start-up fails (for
# example because ping_process raises), the running flag is reset before the
# error is re-raised, so #running? stays accurate and #start can be retried.
#
# @return [void]
def start
  # make_true returns false when the flag was already true, so the check and
  # the flip happen as one atomic step and two callers cannot both proceed.
  return unless @running.make_true

  begin
    @stop_signal.reset
    @task_monitor.ping_process

    @thread = Thread.new do
      run
    rescue => e
      # Log error but don't crash
      @config.logger&.error("[PatientHttp::Sidekiq] Monitor error: #{e.message}\n#{e.backtrace.join("\n")}")
      raise if ::Sidekiq.testing?
    end
  rescue
    @running.make_false
    raise
  end

  @thread.name = "patient-http-monitor"
end
54
+
55
# Stop the monitor thread.
#
# Clears the running flag, sets the stop signal so a sleeping loop wakes up,
# waits up to one second for the thread to finish and kills it if it is
# still alive afterwards.
#
# @return [void]
def stop
  @running.make_false
  @stop_signal.set # Interrupt the sleep immediately

  worker = @thread
  if worker
    worker.join(1)
    worker.kill if worker.alive?
  end

  @thread = nil
end
65
+
66
# Report whether the monitor has been started and not yet stopped.
#
# This reflects the running flag only; it does not inspect the thread itself.
#
# @return [Boolean]
def running?
  @running.true?
end
72
+
73
+ private
74
+
75
# Monitor loop body executed inside the background thread.
#
# Until the running flag is cleared, each pass pings the process registry and
# refreshes heartbeats when a heartbeat interval has elapsed, attempts
# garbage collection on the same schedule, then waits on the stop signal.
#
# @return [void]
def run
  @config.logger&.info("[PatientHttp::Sidekiq] Monitor thread started")

  # Backdate both timers so the first pass does its work immediately.
  heartbeat_at = monotonic_time - @config.heartbeat_interval
  gc_at = monotonic_time - @config.heartbeat_interval

  loop do
    break unless @running.true?

    now = monotonic_time

    if now - heartbeat_at >= @config.heartbeat_interval
      @task_monitor.ping_process
      update_heartbeats
      heartbeat_at = now
    end

    if now - gc_at >= @config.heartbeat_interval
      attempt_garbage_collection
      gc_at = now
    end

    # Wait half an interval, capped at MAX_MONITOR_SLEEP; #stop interrupts the wait.
    pause = [@config.heartbeat_interval / 2.0, MAX_MONITOR_SLEEP].min
    @stop_signal.wait(pause)
  end

  @config.logger&.info("[PatientHttp::Sidekiq] Monitor thread stopped")
end
110
+
111
# Ask the callback for the current inflight request IDs and refresh their
# heartbeats through the task monitor.
#
# Errors are logged and swallowed, and re-raised only when
# ::Sidekiq.testing? is true.
#
# @return [void]
def update_heartbeats
  ids = @inflight_ids_callback.call

  unless ids.empty?
    @task_monitor.update_heartbeats(ids)
    @config.logger&.debug("[PatientHttp::Sidekiq] Updated heartbeats for #{ids.size} inflight requests")
  end
rescue => e
  @config.logger&.error("[PatientHttp::Sidekiq] Failed to update heartbeats: #{e.class} - #{e.message}")
  raise if ::Sidekiq.testing?
end
125
+
126
# Run one garbage collection pass if one is due and the distributed lock can
# be taken.
#
# The lock is always released once acquired, even when cleanup raises. Errors
# are logged and swallowed, and re-raised only when ::Sidekiq.testing? is true.
#
# @return [void]
def attempt_garbage_collection
  monitor = @task_monitor

  # Skip unless the shared last-run timestamp says a pass is due and this
  # process wins the lock.
  return unless monitor.gc_needed? && monitor.acquire_gc_lock

  begin
    reenqueued = monitor.cleanup_orphaned_requests(@config.orphan_threshold, @config.logger)

    if reenqueued.positive?
      @config.logger&.info("[PatientHttp::Sidekiq] Garbage collection: re-enqueued #{reenqueued} orphaned requests")
    end

    # Record this GC run to coordinate with other processes
    monitor.record_gc_run
  ensure
    monitor.release_gc_lock
  end
rescue => e
  @config.logger&.error("[PatientHttp::Sidekiq] Garbage collection failed: #{e.class} - #{e.message}")
  raise if ::Sidekiq.testing?
end
152
+ end
153
+ end
154
+ end