patient_http-sidekiq 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ARCHITECTURE.md +496 -0
- data/CHANGELOG.md +16 -0
- data/MIT-LICENSE +20 -0
- data/README.md +620 -0
- data/VERSION +1 -0
- data/lib/patient_http/sidekiq/callback_worker.rb +96 -0
- data/lib/patient_http/sidekiq/configuration.rb +175 -0
- data/lib/patient_http/sidekiq/context.rb +61 -0
- data/lib/patient_http/sidekiq/lifecycle_hooks.rb +42 -0
- data/lib/patient_http/sidekiq/processor_observer.rb +49 -0
- data/lib/patient_http/sidekiq/request_executor.rb +104 -0
- data/lib/patient_http/sidekiq/request_worker.rb +57 -0
- data/lib/patient_http/sidekiq/stats.rb +119 -0
- data/lib/patient_http/sidekiq/task_handler.rb +81 -0
- data/lib/patient_http/sidekiq/task_monitor.rb +542 -0
- data/lib/patient_http/sidekiq/task_monitor_thread.rb +154 -0
- data/lib/patient_http/sidekiq/web_ui/assets/patient-http/css/patient_http.css +249 -0
- data/lib/patient_http/sidekiq/web_ui/locales/ar.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/cs.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/da.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/de.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/el.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/en.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/es.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/fa.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/fr.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/gd.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/he.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/hi.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/it.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/ja.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/ko.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/lt.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/nb.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/nl.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/pl.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/pt-BR.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/pt.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/ru.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/sv.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/ta.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/tr.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/uk.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/ur.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/vi.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/zh-CN.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/locales/zh-TW.yml +26 -0
- data/lib/patient_http/sidekiq/web_ui/views/patient_http.html.erb +142 -0
- data/lib/patient_http/sidekiq/web_ui.rb +69 -0
- data/lib/patient_http/sidekiq.rb +328 -0
- data/lib/patient_http-sidekiq.rb +3 -0
- data/patient_http-sidekiq.gemspec +46 -0
- metadata +140 -0
|
@@ -0,0 +1,542 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module PatientHttp
|
|
4
|
+
module Sidekiq
|
|
5
|
+
# Manages inflight request tracking in Redis for crash recovery.
|
|
6
|
+
#
|
|
7
|
+
# This class maintains a sorted set of request IDs indexed by timestamp
|
|
8
|
+
# and a hash of request payloads. It provides distributed locking for
|
|
9
|
+
# orphan detection and automatic re-enqueuing of requests that were
|
|
10
|
+
# interrupted by process crashes.
|
|
11
|
+
#
|
|
12
|
+
# Task ID format: "hostname:pid:hex/request-uuid"
|
|
13
|
+
# - hostname: sanitized hostname (colons and slashes replaced with dashes)
|
|
14
|
+
# - pid: process ID
|
|
15
|
+
# - hex: 16-character random hex string (8 random bytes) for uniqueness
|
|
16
|
+
# - request-uuid: unique identifier for the request
|
|
17
|
+
class TaskMonitor
|
|
18
|
+
# Redis key prefixes
|
|
19
|
+
INFLIGHT_INDEX_KEY = "sidekiq:patient_http:inflight_index"
|
|
20
|
+
INFLIGHT_JOBS_KEY = "sidekiq:patient_http:inflight_jobs"
|
|
21
|
+
PROCESS_SET_KEY = "sidekiq:patient_http:processes"
|
|
22
|
+
GC_LOCK_KEY = "sidekiq:patient_http:gc_lock"
|
|
23
|
+
GC_LAST_RUN_KEY = "sidekiq:patient_http:gc_last_run"
|
|
24
|
+
|
|
25
|
+
# Lua script for atomic orphan removal.
|
|
26
|
+
# Checks if the task is still orphaned (timestamp < threshold) and removes it atomically.
|
|
27
|
+
# This prevents race conditions where a heartbeat could update the timestamp between
|
|
28
|
+
# the check and the removal.
|
|
29
|
+
#
|
|
30
|
+
# KEYS[1] = index key (sorted set)
|
|
31
|
+
# KEYS[2] = jobs key (hash)
|
|
32
|
+
# ARGV[1] = request_id
|
|
33
|
+
# ARGV[2] = threshold_ms
|
|
34
|
+
#
|
|
35
|
+
# Returns: [removed (0/1), job_payload or nil]
|
|
36
|
+
REMOVE_IF_ORPHANED_SCRIPT = <<~LUA
|
|
37
|
+
local index_key = KEYS[1]
|
|
38
|
+
local jobs_key = KEYS[2]
|
|
39
|
+
local request_id = ARGV[1]
|
|
40
|
+
local threshold_ms = tonumber(ARGV[2])
|
|
41
|
+
|
|
42
|
+
local current_score = redis.call('ZSCORE', index_key, request_id)
|
|
43
|
+
if not current_score or tonumber(current_score) >= threshold_ms then
|
|
44
|
+
return {0, nil} -- Not orphaned or already removed
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
local job_payload = redis.call('HGET', jobs_key, request_id)
|
|
48
|
+
redis.call('ZREM', index_key, request_id)
|
|
49
|
+
redis.call('HDEL', jobs_key, request_id)
|
|
50
|
+
return {1, job_payload}
|
|
51
|
+
LUA
|
|
52
|
+
|
|
53
|
+
# @return [Configuration] the configuration object
|
|
54
|
+
attr_reader :config
|
|
55
|
+
|
|
56
|
+
class << self
|
|
57
|
+
# Get the count of inflight requests in Redis.
|
|
58
|
+
#
|
|
59
|
+
# @return [Integer] number of inflight requests
|
|
60
|
+
def inflight_count
  # The cardinality of the inflight index is the number of tracked requests.
  ::Sidekiq.redis { |conn| conn.zcard(INFLIGHT_INDEX_KEY) }
end
|
|
65
|
+
|
|
66
|
+
# Get all inflight counts across all processes and the number of max connections.
|
|
67
|
+
#
|
|
68
|
+
# @return [Hash] hash of "hostname:pid" => { inflight: Integer, max_capacity: Integer }
|
|
69
|
+
def inflight_counts_by_process
  registered = max_values = task_ids = nil

  ::Sidekiq.redis do |conn|
    registered = conn.smembers(PROCESS_SET_KEY)
    return {} if registered.empty?

    max_values = conn.mget(*registered.map { |id| max_connections_key_for(id) })
    task_ids = conn.zrange(INFLIGHT_INDEX_KEY, 0, -1)
  end

  # Task IDs look like "<process id>/<request id>"; bucket them by process id.
  tasks_per_process = task_ids.group_by { |task_id| task_id.split("/", 2).first }

  # A registered process without a max_connections value is treated as gone.
  live, stale = registered.zip(max_values).partition { |_id, max| !max.nil? }

  totals = {}
  live.each do |process_id, max|
    # Collapse "host:pid:hex" to "host:pid" so entries are reported per OS process.
    host_pid = process_id.split(":", 3).first(2).join(":")
    entry = (totals[host_pid] ||= {inflight: 0, max_capacity: 0})
    entry[:inflight] += tasks_per_process.fetch(process_id, []).size
    entry[:max_capacity] += max.to_i
  end

  # Prune the registrations that were found to be stale.
  stale_ids = stale.map(&:first)
  ::Sidekiq.redis { |conn| conn.srem(PROCESS_SET_KEY, stale_ids) } unless stale_ids.empty?

  totals
end
|
|
116
|
+
|
|
117
|
+
# Get the total max connections across all processes
|
|
118
|
+
#
|
|
119
|
+
# @return [Integer] sum of max connections from all active processes
|
|
120
|
+
def total_max_connections
  # Reuse the per-process report and add up each entry's capacity.
  inflight_counts_by_process.each_value.sum { |entry| entry[:max_capacity] }
end
|
|
123
|
+
|
|
124
|
+
# Get all registered process IDs.
|
|
125
|
+
#
|
|
126
|
+
# @return [Array<String>] list of process identifiers
|
|
127
|
+
def registered_process_ids
  # Every member of the process set is one registered process identifier.
  ::Sidekiq.redis { |conn| conn.smembers(PROCESS_SET_KEY) }
end
|
|
132
|
+
|
|
133
|
+
# Clear all registry data. Only allowed in test environment.
|
|
134
|
+
#
|
|
135
|
+
# @raise [RuntimeError] if called outside of test environment
|
|
136
|
+
# @return [void]
|
|
137
|
+
# @api private
|
|
138
|
+
def clear_all!
  # This deletes every registry key, so it is restricted to the test environment.
  raise "clear_all! is only allowed in test environment" unless PatientHttp.testing?

  registry_keys = [INFLIGHT_INDEX_KEY, INFLIGHT_JOBS_KEY, PROCESS_SET_KEY, GC_LOCK_KEY, GC_LAST_RUN_KEY]
  ::Sidekiq.redis { |conn| conn.del(*registry_keys) }
end
|
|
147
|
+
|
|
148
|
+
private
|
|
149
|
+
|
|
150
|
+
# Build the max connections key for a given process identifier.
|
|
151
|
+
#
|
|
152
|
+
# @param process_id [String] the process identifier
|
|
153
|
+
#
|
|
154
|
+
# @return [String] the Redis key for max connections
|
|
155
|
+
def max_connections_key_for(process_id)
  # Key layout: "<process set key>:<process id>:max_connections".
  format("%s:%s:max_connections", PROCESS_SET_KEY, process_id)
end
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
# @param config [Configuration] the configuration object
|
|
161
|
+
def initialize(config)
  @config = config

  # Colons and slashes act as separators inside task IDs, so replace them in the hostname.
  safe_host = ::Socket.gethostname.force_encoding("UTF-8").tr(":/", "-")
  current_pid = ::Process.pid
  @lock_identifier = format("%s:%s:%s", safe_host, current_pid, SecureRandom.hex(8)).freeze
end
|
|
167
|
+
|
|
168
|
+
# Register a request as inflight in Redis.
|
|
169
|
+
#
|
|
170
|
+
# @param task [RequestTask] the request task to register
|
|
171
|
+
#
|
|
172
|
+
# @return [void]
|
|
173
|
+
def register(task)
  now_ms = (Time.now.to_f * 1000).round
  serialized_job = JSON.generate(task.task_handler.sidekiq_job)
  entry_id = full_task_id(task.id)

  ::Sidekiq.redis do |conn|
    # Index entry, payload and TTL refreshes are written in a single transaction.
    conn.multi do |tx|
      tx.zadd(INFLIGHT_INDEX_KEY, now_ms, entry_id)
      tx.hset(INFLIGHT_JOBS_KEY, entry_id, serialized_job)
      [INFLIGHT_INDEX_KEY, INFLIGHT_JOBS_KEY].each { |key| tx.expire(key, inflight_ttl) }
    end
  end
end
|
|
187
|
+
|
|
188
|
+
# Unregister a request from Redis (called when request completes).
|
|
189
|
+
#
|
|
190
|
+
# @param task [RequestTask] the request task to unregister
|
|
191
|
+
#
|
|
192
|
+
# @return [void]
|
|
193
|
+
def unregister(task)
  entry_id = full_task_id(task.id)

  ::Sidekiq.redis do |conn|
    # Drop the index entry and its stored payload together.
    conn.multi do |tx|
      tx.zrem(INFLIGHT_INDEX_KEY, entry_id)
      tx.hdel(INFLIGHT_JOBS_KEY, entry_id)
    end
  end
end
|
|
203
|
+
|
|
204
|
+
# Remove this process's entry from the process set.
|
|
205
|
+
#
|
|
206
|
+
# @return [void]
|
|
207
|
+
def remove_process
  ::Sidekiq.redis do |conn|
    # Deregister this process, then delete its advertised capacity.
    conn.srem(PROCESS_SET_KEY, @lock_identifier)
    capacity_key = max_connections_key
    conn.del(capacity_key)
  end
end
|
|
213
|
+
|
|
214
|
+
# Update heartbeat timestamps for multiple requests in a single operation.
|
|
215
|
+
#
|
|
216
|
+
# @param task_ids [Array<String>] the request IDs to update
|
|
217
|
+
#
|
|
218
|
+
# @return [void]
|
|
219
|
+
def update_heartbeats(task_ids)
  return if task_ids.empty?

  now_ms = (Time.now.to_f * 1000).round

  ::Sidekiq.redis do |conn|
    conn.pipelined do |batch|
      task_ids.each do |task_id|
        # ZADD with "XX" only updates members that already exist; it never adds new ones.
        entry_id = full_task_id(task_id)
        batch.call("ZADD", INFLIGHT_INDEX_KEY, "XX", now_ms, entry_id)
      end
    end
  end
end
|
|
232
|
+
|
|
233
|
+
# Check if a task is registered in the inflight registry.
|
|
234
|
+
#
|
|
235
|
+
# @param task [RequestTask] the request task
|
|
236
|
+
#
|
|
237
|
+
# @return [Boolean] true if registered, false otherwise
|
|
238
|
+
# @api private
|
|
239
|
+
def registered?(task)
  score = ::Sidekiq.redis do |conn|
    conn.zscore(INFLIGHT_INDEX_KEY, full_task_id(task.id))
  end

  # ZSCORE yields nil for a member that is not in the index.
  score.nil? ? false : true
end
|
|
244
|
+
|
|
245
|
+
# Get the heartbeat timestamp for a task.
|
|
246
|
+
#
|
|
247
|
+
# @param task [RequestTask] the request task
|
|
248
|
+
#
|
|
249
|
+
# @return [Integer, nil] timestamp in milliseconds, or nil if not registered
|
|
250
|
+
# @api private
|
|
251
|
+
def heartbeat_timestamp_for(task)
  raw_score = ::Sidekiq.redis do |conn|
    conn.zscore(INFLIGHT_INDEX_KEY, full_task_id(task.id))
  end
  return nil if raw_score.nil?

  # The score is the heartbeat time in milliseconds.
  raw_score.to_i
end
|
|
257
|
+
|
|
258
|
+
# Get all registered task IDs for this registry's process.
|
|
259
|
+
#
|
|
260
|
+
# @return [Array<String>] list of full task IDs
|
|
261
|
+
# @api private
|
|
262
|
+
def registered_task_ids
  all_ids = ::Sidekiq.redis { |conn| conn.zrange(INFLIGHT_INDEX_KEY, 0, -1) }

  # Keep only the entries whose ID starts with this instance's identifier.
  all_ids.select { |entry_id| entry_id.start_with?("#{@lock_identifier}/") }
end
|
|
267
|
+
|
|
268
|
+
# Build unique task ID for a request task that includes process identifier.
|
|
269
|
+
#
|
|
270
|
+
# @param task_id [String] the ID of the request task
|
|
271
|
+
# @return [String] the unique task ID
|
|
272
|
+
def full_task_id(task_id)
  # Layout: "<process identifier>/<request id>".
  format("%s/%s", @lock_identifier, task_id)
end
|
|
275
|
+
|
|
276
|
+
# Record the current process's max connections in Redis.
|
|
277
|
+
#
|
|
278
|
+
# This is used for monitoring purposes.
|
|
279
|
+
#
|
|
280
|
+
# @return [void]
|
|
281
|
+
def ping_process
  capacity_key = max_connections_key

  ::Sidekiq.redis do |conn|
    conn.multi do |tx|
      # Register (or re-register) this process and publish its connection capacity.
      tx.sadd(PROCESS_SET_KEY, @lock_identifier)
      tx.set(capacity_key, @config.max_connections)
      # Refresh both expirations: the shared set with the inflight TTL, this
      # process's capacity key with the shorter per-process TTL.
      tx.expire(PROCESS_SET_KEY, inflight_ttl)
      tx.expire(capacity_key, process_ttl)
    end
  end
end
|
|
291
|
+
|
|
292
|
+
# Try to acquire the distributed garbage collection lock.
|
|
293
|
+
#
|
|
294
|
+
# @return [Boolean] true if lock acquired, false otherwise
|
|
295
|
+
def acquire_gc_lock
  reply = ::Sidekiq.redis do |conn|
    # NX: set only when the key is absent. EX: the lock expires on its own
    # after gc_lock_ttl seconds.
    conn.set(GC_LOCK_KEY, @lock_identifier, nx: true, ex: gc_lock_ttl)
  end

  # Any truthy reply means the key was set; nil means the lock is already held.
  reply ? true : false
end
|
|
302
|
+
|
|
303
|
+
# Release the garbage collection lock if held by this process.
|
|
304
|
+
#
|
|
305
|
+
# Uses Redis WATCH/MULTI/EXEC for optimistic locking to ensure we only
|
|
306
|
+
# delete the lock if it's still held by this process.
|
|
307
|
+
#
|
|
308
|
+
# @return [Boolean] true if the lock was released, false otherwise
|
|
309
|
+
def release_gc_lock
  # Compare-and-delete in one server-side step: the lock is removed only when
  # it still carries this instance's identifier. Doing this in a script avoids
  # the WATCH/GET/MULTI sequence, which needs several round trips and leaves
  # the pooled connection in WATCH state if an error interrupts it part-way.
  #
  # KEYS[1] = lock key
  # ARGV[1] = expected lock owner
  compare_and_delete = <<~LUA
    if redis.call('GET', KEYS[1]) == ARGV[1] then
      return redis.call('DEL', KEYS[1])
    end
    return 0
  LUA

  deleted = ::Sidekiq.redis do |redis|
    redis.call("EVAL", compare_and_delete, 1, GC_LOCK_KEY, @lock_identifier)
  end

  # 1 => our lock was deleted; 0 => the lock is missing or owned by someone else.
  deleted.to_i == 1
end
|
|
332
|
+
|
|
333
|
+
# Check if garbage collection should run based on the last run timestamp.
|
|
334
|
+
#
|
|
335
|
+
# Returns true if the GC_LAST_RUN_KEY doesn't exist in Redis or if enough
|
|
336
|
+
# time has elapsed since the last GC run.
|
|
337
|
+
#
|
|
338
|
+
# @return [Boolean] true if GC should run, false otherwise
|
|
339
|
+
def gc_needed?
  stored_ms = ::Sidekiq.redis { |conn| conn.get(GC_LAST_RUN_KEY) }

  # No recorded run means garbage collection is due.
  return true if stored_ms.nil?

  # The stored value is a millisecond timestamp; compare its age to the interval.
  elapsed_seconds = Time.now - Time.at(stored_ms.to_f / 1000.0)
  elapsed_seconds >= config.heartbeat_interval
end
|
|
349
|
+
|
|
350
|
+
# Record the timestamp of the last GC run in Redis.
|
|
351
|
+
#
|
|
352
|
+
# The timestamp is stored with a TTL slightly longer than the heartbeat
|
|
353
|
+
# interval to coordinate GC execution across multiple processes.
|
|
354
|
+
#
|
|
355
|
+
# @return [void]
|
|
356
|
+
def record_gc_run
  ::Sidekiq.redis do |conn|
    # Store "now" in milliseconds; the key expires after gc_last_run_ttl seconds.
    now_ms = (Time.now.to_f * 1000).floor
    conn.set(GC_LAST_RUN_KEY, now_ms, ex: gc_last_run_ttl)
  end
end
|
|
361
|
+
|
|
362
|
+
# Find and re-enqueue orphaned requests.
|
|
363
|
+
#
|
|
364
|
+
# @param orphan_threshold_seconds [Numeric] age threshold for considering a request orphaned
|
|
365
|
+
# @param logger [Logger] logger for output
|
|
366
|
+
#
|
|
367
|
+
# @return [Integer] number of orphaned requests re-enqueued
|
|
368
|
+
def cleanup_orphaned_requests(orphan_threshold_seconds, logger)
  cutoff_ms = calculate_threshold_timestamp(orphan_threshold_seconds)
  candidates = fetch_orphaned_requests(cutoff_ms)

  if candidates.empty?
    # Nothing is old enough to be considered orphaned.
    0
  else
    reenqueue_orphaned_jobs(candidates, cutoff_ms, logger)
  end
end
|
|
376
|
+
|
|
377
|
+
private
|
|
378
|
+
|
|
379
|
+
# Calculate threshold timestamp in milliseconds for orphan detection.
|
|
380
|
+
#
|
|
381
|
+
# @param orphan_threshold_seconds [Numeric] age threshold in seconds
|
|
382
|
+
#
|
|
383
|
+
# @return [Integer] threshold timestamp in milliseconds
|
|
384
|
+
def calculate_threshold_timestamp(orphan_threshold_seconds)
  # Entries whose heartbeat is older than this instant count as orphan candidates.
  cutoff_seconds = Time.now.to_f - orphan_threshold_seconds
  (cutoff_seconds * 1000).round
end
|
|
387
|
+
|
|
388
|
+
# Fetch orphaned request IDs and their job payloads.
|
|
389
|
+
#
|
|
390
|
+
# @param threshold_timestamp_ms [Integer] threshold timestamp in milliseconds
|
|
391
|
+
#
|
|
392
|
+
# @return [Array<Array(String, String)>] array of [request_id, job_payload] pairs
|
|
393
|
+
def fetch_orphaned_requests(threshold_timestamp_ms)
  # Candidates: every inflight entry whose heartbeat is at or before the threshold.
  stale_request_ids = ::Sidekiq.redis do |redis|
    redis.zrange(INFLIGHT_INDEX_KEY, "-inf", threshold_timestamp_ms, byscore: true)
  end

  return [] if stale_request_ids.empty?

  # Request IDs look like "<process id>/<request id>"; bucket them by owning process.
  stale_ids_by_process = stale_request_ids.group_by do |request_id|
    request_id.split("/", 2).first
  end

  # A process counts as alive only while its max_connections key exists. That
  # key expires unless the process keeps pinging, whereas membership in the
  # process set does not: a process that crashed without deregistering stays in
  # the set. Judging liveness by set membership alone would therefore shield
  # the requests of a crashed process from recovery.
  live_process_ids = ::Sidekiq.redis do |redis|
    registered = redis.smembers(PROCESS_SET_KEY)
    if registered.empty?
      []
    else
      capacity_keys = registered.map { |process_id| "#{PROCESS_SET_KEY}:#{process_id}:max_connections" }
      registered.zip(redis.mget(*capacity_keys))
        .reject { |_process_id, max_conn| max_conn.nil? }
        .map(&:first)
    end
  end

  # Requests owned by a live process are left alone.
  orphaned_request_ids = stale_ids_by_process.except(*live_process_ids).values.flatten

  return [] if orphaned_request_ids.empty?

  # Retrieve job payloads for all orphaned requests
  job_payloads = ::Sidekiq.redis do |redis|
    redis.hmget(INFLIGHT_JOBS_KEY, *orphaned_request_ids)
  end

  # Entries without a stored payload cannot be re-enqueued, so drop them.
  orphaned_request_ids.zip(job_payloads).reject { |_id, payload| payload.nil? }
end
|
|
418
|
+
|
|
419
|
+
# Re-enqueue all orphaned jobs.
|
|
420
|
+
#
|
|
421
|
+
# @param orphaned_requests [Array<Array(String, String)>] array of [request_id, job_payload] pairs
|
|
422
|
+
# @param threshold_timestamp_ms [Integer] threshold timestamp in milliseconds
|
|
423
|
+
# @param logger [Logger] logger for output
|
|
424
|
+
#
|
|
425
|
+
# @return [Integer] number of jobs successfully re-enqueued
|
|
426
|
+
def reenqueue_orphaned_jobs(orphaned_requests, threshold_timestamp_ms, logger)
  # Attempt every pair in order and count the attempts that report success.
  orphaned_requests.count do |request_id, job_payload|
    reenqueue_orphaned_job(request_id, job_payload, threshold_timestamp_ms, logger)
  end
end
|
|
437
|
+
|
|
438
|
+
# Re-enqueue a single orphaned job using atomic Lua script.
|
|
439
|
+
#
|
|
440
|
+
# This method atomically checks if the task is still orphaned and removes it
|
|
441
|
+
# in a single Redis operation, preventing race conditions where a heartbeat
|
|
442
|
+
# could update the timestamp between checking and removal.
|
|
443
|
+
#
|
|
444
|
+
# @param request_id [String] the request ID
|
|
445
|
+
# @param job_payload [String] the JSON job payload (used as fallback)
|
|
446
|
+
# @param threshold_timestamp_ms [Integer] threshold timestamp in milliseconds
|
|
447
|
+
# @param logger [Logger] logger for output
|
|
448
|
+
#
|
|
449
|
+
# @return [Boolean] true if successfully re-enqueued, false otherwise
|
|
450
|
+
def reenqueue_orphaned_job(request_id, job_payload, threshold_timestamp_ms, logger)
  # The script re-checks the heartbeat and removes the entry in a single step.
  removed_flag, stored_payload = remove_if_orphaned(request_id, threshold_timestamp_ms)
  return false if removed_flag != 1

  # Prefer the payload returned by the script; use the caller's copy otherwise.
  serialized_job = stored_payload || job_payload
  return false if serialized_job.nil?

  job = JSON.parse(serialized_job)
  ::Sidekiq::Client.push(job)

  logger&.info(
    "[PatientHttp::Sidekiq] Re-enqueued orphaned request #{request_id} to #{job["class"]}"
  )

  true
rescue => error
  # Any failure above is logged and reported as "not re-enqueued".
  logger&.error(
    "[PatientHttp::Sidekiq] Failed to re-enqueue orphaned request #{request_id}: #{error.class} - #{error.message}"
  )
  false
end
|
|
475
|
+
|
|
476
|
+
# Atomically check if orphaned and remove from registry.
|
|
477
|
+
#
|
|
478
|
+
# Uses a Lua script to ensure the check and removal happen in a single
|
|
479
|
+
# atomic operation, preventing race conditions with heartbeat updates.
|
|
480
|
+
#
|
|
481
|
+
# @param request_id [String] the request ID
|
|
482
|
+
# @param threshold_timestamp_ms [Integer] threshold timestamp in milliseconds
|
|
483
|
+
#
|
|
484
|
+
# @return [Array(Integer, String)] [removed (0/1), job_payload or nil]
|
|
485
|
+
def remove_if_orphaned(request_id, threshold_timestamp_ms)
  script_keys = [INFLIGHT_INDEX_KEY, INFLIGHT_JOBS_KEY]
  script_args = [request_id, threshold_timestamp_ms.to_s]

  ::Sidekiq.redis do |conn|
    # EVAL <script> <numkeys> <keys...> <args...>
    conn.call("EVAL", REMOVE_IF_ORPHANED_SCRIPT, script_keys.size, *script_keys, *script_args)
  end
end
|
|
499
|
+
|
|
500
|
+
# Calculate the TTL for inflight data structures.
|
|
501
|
+
# Should be significantly longer than the orphan threshold.
|
|
502
|
+
#
|
|
503
|
+
# @return [Integer] TTL in seconds
|
|
504
|
+
def inflight_ttl
  # 3x the orphan threshold, with a minimum of 1 hour. Rounded up to whole
  # seconds because Redis EXPIRE only accepts an integer, and a fractional
  # orphan_threshold would otherwise yield a Float.
  [config.orphan_threshold * 3, 3600].max.ceil
end
|
|
508
|
+
|
|
509
|
+
# Calculate the TTL for the garbage collection lock.
|
|
510
|
+
# Should be a bit longer than the heartbeat interval.
|
|
511
|
+
#
|
|
512
|
+
# @return [Integer] TTL in seconds
|
|
513
|
+
def gc_lock_ttl
  # 2x the heartbeat interval, with a minimum of 120 seconds. Rounded up to
  # whole seconds because the EX option of SET only accepts an integer, and a
  # fractional heartbeat_interval would otherwise yield a Float.
  [config.heartbeat_interval * 2, 120].max.ceil
end
|
|
517
|
+
|
|
518
|
+
# Calculate the TTL for the last GC run timestamp.
|
|
519
|
+
# Should be a bit longer than the heartbeat interval to ensure
|
|
520
|
+
# proper coordination across processes.
|
|
521
|
+
#
|
|
522
|
+
# @return [Integer] TTL in seconds
|
|
523
|
+
def gc_last_run_ttl
  # 1.5x the heartbeat interval, rounded to whole seconds. Clamped to at least
  # 1 because a sub-second interval would round down to 0, and Redis rejects a
  # zero expiry on SET ... EX.
  [(config.heartbeat_interval * 1.5).round, 1].max
end
|
|
527
|
+
|
|
528
|
+
# Calculate the TTL for the process max_connections key.
|
|
529
|
+
# Must be longer than heartbeat_interval so the key survives between heartbeats.
|
|
530
|
+
#
|
|
531
|
+
# @return [Integer] TTL in seconds
|
|
532
|
+
def process_ttl
  # 2x the heartbeat interval so the key survives between heartbeats. Rounded
  # up to whole seconds because Redis EXPIRE only accepts an integer, and a
  # fractional heartbeat_interval would otherwise yield a Float.
  (config.heartbeat_interval * 2).ceil
end
|
|
536
|
+
|
|
537
|
+
def max_connections_key
  # Key layout: "<process set key>:<this instance's identifier>:max_connections".
  format("%s:%s:max_connections", PROCESS_SET_KEY, @lock_identifier)
end
|
|
540
|
+
end
|
|
541
|
+
end
|
|
542
|
+
end
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module PatientHttp
|
|
4
|
+
module Sidekiq
|
|
5
|
+
# Background thread that maintains heartbeats and performs garbage collection
|
|
6
|
+
# for in-flight HTTP requests.
|
|
7
|
+
class TaskMonitorThread
|
|
8
|
+
include PatientHttp::TimeHelper
|
|
9
|
+
|
|
10
|
+
# Maximum seconds to sleep between monitor thread checks
|
|
11
|
+
MAX_MONITOR_SLEEP = 5.0
|
|
12
|
+
|
|
13
|
+
# @return [Configuration] the configuration object
|
|
14
|
+
attr_reader :config
|
|
15
|
+
|
|
16
|
+
# @return [TaskMonitor] the inflight request registry
|
|
17
|
+
attr_reader :task_monitor
|
|
18
|
+
|
|
19
|
+
# Initialize the monitor thread.
|
|
20
|
+
#
|
|
21
|
+
# @param config [Configuration] the configuration object
|
|
22
|
+
# @param task_monitor [TaskMonitor] the inflight request registry
|
|
23
|
+
# @param inflight_ids_callback [Proc] callback to get current inflight request IDs
|
|
24
|
+
# @return [void]
|
|
25
|
+
def initialize(config, task_monitor, inflight_ids_callback)
  @config, @task_monitor, @inflight_ids_callback = config, task_monitor, inflight_ids_callback

  # No thread exists until #start creates one.
  @thread = nil
  @running = Concurrent::AtomicBoolean.new(false)
  @stop_signal = Concurrent::Event.new
end
|
|
33
|
+
|
|
34
|
+
# Start the monitor thread.
|
|
35
|
+
#
|
|
36
|
+
# @return [void]
|
|
37
|
+
def start
  # make_true reports whether the flag actually flipped, so the "already
  # running?" check and the set happen as one atomic step and two concurrent
  # callers cannot both proceed.
  return unless @running.make_true

  @stop_signal.reset

  begin
    @task_monitor.ping_process
  rescue
    # Nothing was started; clear the flag so a later #start can retry.
    @running.make_false
    raise
  end

  @thread = Thread.new do
    run
  rescue => e
    # Log error but don't crash. Array() guards against a nil backtrace.
    @config.logger&.error("[PatientHttp::Sidekiq] Monitor error: #{e.message}\n#{Array(e.backtrace).join("\n")}")
    # The loop has exited, so stop reporting it as running and allow a restart.
    @running.make_false
    raise if ::Sidekiq.testing?
  end

  @thread.name = "patient-http-monitor"
end
|
|
54
|
+
|
|
55
|
+
# Stop the monitor thread.
|
|
56
|
+
#
|
|
57
|
+
# @return [void]
|
|
58
|
+
def stop
  @running.make_false
  @stop_signal.set # wakes the loop out of its timed wait right away

  worker = @thread
  unless worker.nil?
    # Give the loop up to one second to finish on its own, then force it.
    worker.join(1)
    worker.kill if worker.alive?
  end

  @thread = nil
end
|
|
65
|
+
|
|
66
|
+
# Check if monitor thread is running.
|
|
67
|
+
#
|
|
68
|
+
# @return [Boolean]
|
|
69
|
+
def running?
  # Reads the atomic flag that #start sets and #stop clears.
  @running.true?
end
|
|
72
|
+
|
|
73
|
+
private
|
|
74
|
+
|
|
75
|
+
# Run the monitor loop.
|
|
76
|
+
#
|
|
77
|
+
# @return [void]
|
|
78
|
+
def run
  @config.logger&.info("[PatientHttp::Sidekiq] Monitor thread started")

  # Back-date both timers by one interval so the first pass runs both tasks.
  last_heartbeat_at = monotonic_time - @config.heartbeat_interval
  last_gc_at = monotonic_time - @config.heartbeat_interval

  loop do
    break unless @running.true?

    now = monotonic_time

    # Heartbeat pass: refresh this process's registration, then its requests.
    if now - last_heartbeat_at >= @config.heartbeat_interval
      @task_monitor.ping_process
      update_heartbeats
      last_heartbeat_at = now
    end

    # Garbage-collection pass, on the same cadence.
    if now - last_gc_at >= @config.heartbeat_interval
      attempt_garbage_collection
      last_gc_at = now
    end

    # Wait half an interval, capped at MAX_MONITOR_SLEEP; #stop sets the signal
    # to end the wait early.
    pause = [@config.heartbeat_interval / 2.0, MAX_MONITOR_SLEEP].min
    @stop_signal.wait(pause)
  end

  @config.logger&.info("[PatientHttp::Sidekiq] Monitor thread stopped")
end
|
|
110
|
+
|
|
111
|
+
# Update heartbeats for all inflight requests.
|
|
112
|
+
#
|
|
113
|
+
# @return [void]
|
|
114
|
+
def update_heartbeats
  inflight_ids = @inflight_ids_callback.call

  unless inflight_ids.empty?
    @task_monitor.update_heartbeats(inflight_ids)
    @config.logger&.debug("[PatientHttp::Sidekiq] Updated heartbeats for #{inflight_ids.size} inflight requests")
  end
rescue => error
  # Failures are logged and swallowed, except under Sidekiq testing where they propagate.
  @config.logger&.error("[PatientHttp::Sidekiq] Failed to update heartbeats: #{error.class} - #{error.message}")
  raise if ::Sidekiq.testing?
end
|
|
125
|
+
|
|
126
|
+
# Attempt to acquire GC lock and clean up orphaned requests.
|
|
127
|
+
#
|
|
128
|
+
# @return [void]
|
|
129
|
+
def attempt_garbage_collection
  monitor = @task_monitor

  # Skip unless a run is due and this process obtains the distributed lock.
  return unless monitor.gc_needed? && monitor.acquire_gc_lock

  begin
    reenqueued = monitor.cleanup_orphaned_requests(@config.orphan_threshold, @config.logger)

    if reenqueued.positive?
      @config.logger&.info("[PatientHttp::Sidekiq] Garbage collection: re-enqueued #{reenqueued} orphaned requests")
    end

    # Record the run so the timestamp check can coordinate other processes.
    monitor.record_gc_run
  ensure
    # Always hand the lock back, even when cleanup raised.
    monitor.release_gc_lock
  end
rescue => error
  @config.logger&.error("[PatientHttp::Sidekiq] Garbage collection failed: #{error.class} - #{error.message}")
  raise if ::Sidekiq.testing?
end
|
|
152
|
+
end
|
|
153
|
+
end
|
|
154
|
+
end
|