wurk 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +43 -0
  3. data/CONTRIBUTING.md +73 -0
  4. data/LICENSE +21 -0
  5. data/README.md +137 -0
  6. data/SECURITY.md +39 -0
  7. data/app/controllers/wurk/api/pagination.rb +67 -0
  8. data/app/controllers/wurk/api/serializers.rb +131 -0
  9. data/app/controllers/wurk/api_controller.rb +248 -0
  10. data/app/controllers/wurk/application_controller.rb +7 -0
  11. data/app/controllers/wurk/dashboard_controller.rb +48 -0
  12. data/config/locales/en.yml +15 -0
  13. data/config/routes.rb +34 -0
  14. data/exe/wurk +22 -0
  15. data/lib/active_job/queue_adapters/wurk_adapter.rb +96 -0
  16. data/lib/generators/wurk/install/install_generator.rb +22 -0
  17. data/lib/generators/wurk/install/templates/wurk.rb +16 -0
  18. data/lib/wurk/active_job/wrapper.rb +32 -0
  19. data/lib/wurk/api/fast.rb +78 -0
  20. data/lib/wurk/batch/buffer.rb +26 -0
  21. data/lib/wurk/batch/callback_job.rb +37 -0
  22. data/lib/wurk/batch/callbacks.rb +176 -0
  23. data/lib/wurk/batch/client_middleware.rb +27 -0
  24. data/lib/wurk/batch/death_handler.rb +39 -0
  25. data/lib/wurk/batch/empty.rb +21 -0
  26. data/lib/wurk/batch/server_middleware.rb +62 -0
  27. data/lib/wurk/batch/status.rb +140 -0
  28. data/lib/wurk/batch.rb +351 -0
  29. data/lib/wurk/batch_set.rb +67 -0
  30. data/lib/wurk/capsule.rb +176 -0
  31. data/lib/wurk/cli.rb +349 -0
  32. data/lib/wurk/client/buffered.rb +372 -0
  33. data/lib/wurk/client.rb +330 -0
  34. data/lib/wurk/compat.rb +136 -0
  35. data/lib/wurk/component.rb +136 -0
  36. data/lib/wurk/configuration.rb +373 -0
  37. data/lib/wurk/context.rb +35 -0
  38. data/lib/wurk/cron.rb +636 -0
  39. data/lib/wurk/dashboard_manifest.rb +39 -0
  40. data/lib/wurk/dead_set.rb +78 -0
  41. data/lib/wurk/deploy.rb +91 -0
  42. data/lib/wurk/embedded.rb +94 -0
  43. data/lib/wurk/encryption.rb +276 -0
  44. data/lib/wurk/engine.rb +81 -0
  45. data/lib/wurk/fetcher/reaper.rb +264 -0
  46. data/lib/wurk/fetcher/reliable.rb +138 -0
  47. data/lib/wurk/fetcher.rb +11 -0
  48. data/lib/wurk/health.rb +193 -0
  49. data/lib/wurk/heartbeat.rb +211 -0
  50. data/lib/wurk/iterable_job.rb +292 -0
  51. data/lib/wurk/job/options.rb +70 -0
  52. data/lib/wurk/job.rb +33 -0
  53. data/lib/wurk/job_logger.rb +68 -0
  54. data/lib/wurk/job_record.rb +156 -0
  55. data/lib/wurk/job_retry.rb +320 -0
  56. data/lib/wurk/job_set.rb +212 -0
  57. data/lib/wurk/job_util.rb +162 -0
  58. data/lib/wurk/keys.rb +52 -0
  59. data/lib/wurk/launcher.rb +289 -0
  60. data/lib/wurk/leader.rb +221 -0
  61. data/lib/wurk/limiter/base.rb +138 -0
  62. data/lib/wurk/limiter/bucket.rb +80 -0
  63. data/lib/wurk/limiter/concurrent.rb +132 -0
  64. data/lib/wurk/limiter/leaky.rb +91 -0
  65. data/lib/wurk/limiter/points.rb +89 -0
  66. data/lib/wurk/limiter/server_middleware.rb +77 -0
  67. data/lib/wurk/limiter/unlimited.rb +48 -0
  68. data/lib/wurk/limiter/window.rb +80 -0
  69. data/lib/wurk/limiter.rb +255 -0
  70. data/lib/wurk/logger.rb +81 -0
  71. data/lib/wurk/lua/loader.rb +53 -0
  72. data/lib/wurk/lua.rb +187 -0
  73. data/lib/wurk/manager.rb +132 -0
  74. data/lib/wurk/metrics/history.rb +151 -0
  75. data/lib/wurk/metrics/query.rb +173 -0
  76. data/lib/wurk/metrics/rollup.rb +169 -0
  77. data/lib/wurk/metrics/statsd.rb +197 -0
  78. data/lib/wurk/metrics.rb +7 -0
  79. data/lib/wurk/middleware/chain.rb +128 -0
  80. data/lib/wurk/middleware/current_attributes.rb +87 -0
  81. data/lib/wurk/middleware/expiry.rb +50 -0
  82. data/lib/wurk/middleware/i18n.rb +63 -0
  83. data/lib/wurk/middleware/interrupt_handler.rb +45 -0
  84. data/lib/wurk/middleware/poison_pill.rb +149 -0
  85. data/lib/wurk/middleware.rb +34 -0
  86. data/lib/wurk/process_set.rb +243 -0
  87. data/lib/wurk/processor.rb +247 -0
  88. data/lib/wurk/queue.rb +108 -0
  89. data/lib/wurk/queues.rb +80 -0
  90. data/lib/wurk/rails.rb +9 -0
  91. data/lib/wurk/railtie.rb +28 -0
  92. data/lib/wurk/redis_pool.rb +79 -0
  93. data/lib/wurk/retry_set.rb +17 -0
  94. data/lib/wurk/scheduled.rb +189 -0
  95. data/lib/wurk/scheduled_set.rb +18 -0
  96. data/lib/wurk/sorted_entry.rb +95 -0
  97. data/lib/wurk/stats.rb +190 -0
  98. data/lib/wurk/swarm/child_boot.rb +105 -0
  99. data/lib/wurk/swarm.rb +260 -0
  100. data/lib/wurk/testing.rb +102 -0
  101. data/lib/wurk/topology.rb +74 -0
  102. data/lib/wurk/unique.rb +240 -0
  103. data/lib/wurk/version.rb +5 -0
  104. data/lib/wurk/web/config.rb +180 -0
  105. data/lib/wurk/web/enterprise.rb +138 -0
  106. data/lib/wurk/web/search.rb +139 -0
  107. data/lib/wurk/web.rb +25 -0
  108. data/lib/wurk/work_set.rb +116 -0
  109. data/lib/wurk/worker/setter.rb +93 -0
  110. data/lib/wurk/worker.rb +216 -0
  111. data/lib/wurk.rb +238 -0
  112. data/vendor/assets/dashboard/assets/index-8P3N_m1X.js +152 -0
  113. data/vendor/assets/dashboard/assets/index-Bqz4_SOQ.css +1 -0
  114. data/vendor/assets/dashboard/index.html +13 -0
  115. data/vendor/assets/dashboard/wurk-manifest.json +4 -0
  116. metadata +232 -0
@@ -0,0 +1,264 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../component'
4
+ require_relative '../keys'
5
+ require_relative '../middleware/poison_pill'
6
+
7
module Wurk
  class Fetcher
    # Orphan reclamation for the reliable fetcher (Pro super_fetch §3.2).
    #
    # The Reliable fetcher moves each job from a public queue into a
    # per-process private list (`queue:<public>|<host>|<pid>|<idx>`) and
    # leaves it there until the Processor ACKs. A SIGKILLed or crashed
    # worker therefore strands its in-flight jobs in private lists that
    # nobody will ever ACK. The Reaper is the recovery half: it periodically
    # scans for private lists whose owning process is gone and moves their
    # jobs back to the public queue so a live worker re-runs them.
    #
    # Liveness is decided per owner:
    #   * same host — the OS is authoritative: `Process.kill(0, pid)`. This
    #     is instant and ignores a stale `processes` SET entry whose 60s TTL
    #     hasn't lapsed yet, so a `kill -9`ed sibling is reclaimed the moment
    #     the supervisor reaps it rather than 60s later. (Pid reuse by an
    #     unrelated local process is the one blind spot — the supervisor
    #     respawns with a fresh pid, so it does not arise in practice.)
    #   * other host — we cannot ping the pid, so we trust the heartbeat:
    #     the owner is alive iff some live `processes` member (one whose
    #     `info` hash still exists) shares its `host:pid`. Cross-host reclaim
    #     therefore waits out the 60s heartbeat TTL, exactly as the spec says.
    #
    # Re-pushed jobs run through Wurk::Middleware::PoisonPill, which caps a
    # job at RECOVERY_THRESHOLD recoveries within 72h: past the cap the job
    # is killed into the dead set instead of re-queued, so a job that crashes
    # its worker every time can't loop forever.
    #
    # SCANs are scoped to the public queues this process serves and gated by
    # a cluster-wide `SET NX EX` lock, so across a fleet only one process
    # sweeps per interval ("1/min within process group" in the spec) and the
    # keyspace touched is bounded to known queues.
    #
    # NOTE(review): `redis`, `hostname` and `handle_exception` are not defined
    # in this class; they are presumably provided by Component — confirm.
    #
    # Spec: docs/target/sidekiq-pro.md §3.2.
    class Reaper
      include Component

      # Sweep cadence in seconds; also the cluster-lock TTL so exactly one
      # process sweeps per interval. 60s matches the heartbeat TTL — the
      # floor below which cross-host orphans can't be detected anyway.
      DEFAULT_INTERVAL = 60

      LOCK_KEY = 'super_fetch:reaper'
      SCAN_COUNT = 100
      THREAD_NAME = 'wurk-reaper'

      # Characters that carry meaning in a Redis glob pattern. They must be
      # backslash-escaped before a queue key is embedded in a SCAN MATCH
      # argument, otherwise a queue named e.g. `jobs*` would match (and let
      # us drain) private lists that belong to other queues.
      GLOB_SPECIALS = /[\\*?\[\]]/

      attr_reader :interval

      # @param config   object exposing `capsules` (a Hash whose values
      #                 respond to `queues`)
      # @param interval [Numeric] seconds between sweeps; also the lock TTL
      # @param lock_key [String] Redis key used for the cluster-wide lock
      def initialize(config, interval: DEFAULT_INTERVAL, lock_key: LOCK_KEY)
        @config = config
        @interval = interval
        @lock_key = lock_key
        @thread = nil
        @done = false
        @mutex = ::Mutex.new
        @sleeper = ::ConditionVariable.new
      end

      # Spawns the sweep loop. Idempotent. The loop waits one interval before
      # its first sweep so booting processes don't dogpile Redis and so an
      # un-stopped launcher in a unit test never touches the keyspace.
      #
      # @return [Thread] the sweep thread
      def start
        @mutex.synchronize do
          return @thread if @thread

          @done = false
          @thread = spawn_loop_thread
        end
        @thread
      end

      # Flags the loop as done, wakes it if it is sleeping, and waits for the
      # thread to finish before forgetting it.
      def stop
        @mutex.synchronize do
          @done = true
          @sleeper.signal
        end
        @thread&.join
        @thread = nil
      end

      # @return [Boolean] true while the sweep thread exists and is alive
      def running?
        !@thread.nil? && @thread.alive?
      end

      # One cluster-gated sweep: a no-op (returns 0) unless this process wins
      # the interval's lock. Used by the loop.
      #
      # @return [Integer] number of jobs reclaimed
      def reap
        return 0 unless acquire_lock?

        reclaim!
      end

      # One unguarded sweep over every served queue. Returns the number of
      # jobs reclaimed (re-queued or killed). Public so boot paths and tests
      # can drive a deterministic pass without the cluster lock.
      #
      # @return [Integer]
      def reclaim!
        prefixes = live_process_prefixes
        served_queues.sum { |public_q| reclaim_queue(public_q, prefixes) }
      end

      private

      # Union of `queue:<name>` keys across every capsule this process serves.
      # Scoping the scan to these keeps the keyspace we touch bounded and lets
      # parallel test namespaces stay isolated.
      def served_queues
        @config.capsules.each_value
               .flat_map(&:queues)
               .uniq
               .map { |name| Keys.queue(name) }
      end

      # SCAN for this public queue's private lists, reclaim the orphaned ones.
      #
      # @return [Integer] jobs moved out of orphaned private lists
      def reclaim_queue(public_q, prefixes)
        reclaimed = 0
        each_private_list(public_q) do |key, host, pid|
          next if owner_alive?(host, pid, prefixes)

          reclaimed += drain(key, public_q)
        end
        reclaimed
      end

      # Yields [private_list_key, host, pid] for each private list of
      # `public_q`. The queue key is glob-escaped before being used as the
      # MATCH prefix so metacharacters in a queue name are matched literally;
      # keys that still don't parse as one of this queue's private lists are
      # skipped by parse_owner.
      def each_private_list(public_q)
        pattern = "#{glob_escape(public_q)}|*"
        cursor = '0'
        loop do
          cursor, keys = redis { |c| c.call('SCAN', cursor, 'MATCH', pattern, 'COUNT', SCAN_COUNT) }
          keys.each do |key|
            host, pid = parse_owner(public_q, key)
            yield key, host, pid if pid
          end
          break if cursor == '0'
        end
      end

      # Backslash-escapes Redis glob metacharacters so `str` matches itself.
      def glob_escape(str)
        str.gsub(GLOB_SPECIALS) { |ch| "\\#{ch}" }
      end

      # `<public_q>|<host>|<pid>|<idx>` → [host, pid] (pid as Integer), or
      # [nil, nil] when the suffix isn't exactly a `host|pid|idx` triple.
      #
      # The suffix is split off the known public-queue prefix, and anything
      # with more (or fewer) than three segments is rejected: a longer suffix
      # belongs to a different queue whose name merely starts with
      # `<public_q>|`, and draining it here could steal a live worker's jobs.
      def parse_owner(public_q, key)
        prefix = "#{public_q}|"
        return [nil, nil] unless key.start_with?(prefix)

        # limit -1 keeps trailing empty segments so `host|pid|` isn't
        # mistaken for a shorter, well-formed suffix.
        parts = key.delete_prefix(prefix).split('|', -1)
        return [nil, nil] unless parts.size == 3

        host, pid, idx = parts
        return [nil, nil] if host.empty? || !integer?(pid) || !integer?(idx)

        [host, pid.to_i]
      end

      # @return [Boolean] true when `str` is a String of decimal digits only
      def integer?(str)
        str.is_a?(String) && str.match?(/\A\d+\z/)
      end

      # Same-host owners are probed through the OS; owners on other hosts are
      # alive only if their `host:pid` appears in the live heartbeat set.
      def owner_alive?(host, pid, prefixes)
        return local_pid_alive?(pid) if host == hostname

        prefixes.include?("#{host}:#{pid}")
      end

      # Signal 0 performs the existence/permission check without delivering a
      # signal. EPERM means the pid exists but belongs to another user, so it
      # is treated as alive.
      def local_pid_alive?(pid)
        ::Process.kill(0, pid)
        true
      rescue Errno::ESRCH
        false
      rescue Errno::EPERM
        true
      end

      # `host:pid` of every live process — a member of `processes` whose
      # `info` hash still exists. A bare SET membership isn't enough: the
      # member lingers after its 60s hash TTL until ProcessSet#cleanup prunes
      # it, and we must treat that window as dead for cross-host reclaim.
      #
      # @return [Set<String>]
      def live_process_prefixes
        redis do |conn|
          members = conn.call('SMEMBERS', Keys::PROCESSES)
          next ::Set.new if members.empty?

          infos = conn.pipelined { |pipe| members.each { |m| pipe.call('HGET', m, 'info') } }
          members.zip(infos).each_with_object(::Set.new) do |(member, info), set|
            set << host_pid(member) if info
          end
        end
      end

      # identity is `<host>:<pid>:<nonce>`; the owner prefix is `<host>:<pid>`.
      def host_pid(identity)
        identity.split(':')[0..1].join(':')
      end

      # Drain one orphaned private list back to its public queue. Each job is
      # moved with an atomic LMOVE (private right end → public right end)
      # BEFORE the poison check, so a crash mid-drain leaves the job safely in
      # the public queue (at-least-once), never lost. Poison jobs are killed
      # to the dead set by PoisonPill.track! and then LREM'd out of the public
      # queue.
      #
      # Errors are reported and the number of jobs moved so far is returned,
      # so one bad list doesn't abort the whole sweep.
      #
      # @return [Integer] jobs moved out of `private_list`
      def drain(private_list, public_q)
        # Assigned first so the rescue below always has an Integer to return.
        count = 0
        queue_name = public_q.delete_prefix(Keys::QUEUE_PREFIX)
        loop do
          job = redis { |c| c.call('LMOVE', private_list, public_q, 'RIGHT', 'RIGHT') }
          break unless job

          count += 1
          poison_off(public_q, job, queue_name)
        end
        count
      rescue StandardError => e
        handle_exception(e, context: THREAD_NAME)
        count
      end

      # Removes a just-recovered job from the public queue again when
      # PoisonPill reports it as poison.
      def poison_off(public_q, job, queue_name)
        return unless Middleware::PoisonPill.track!(job, queue: queue_name) == :poison

        # track! already ZADDed the payload to the dead set; pull the copy we
        # just LMOVE'd onto the public queue so it isn't also re-run. A
        # negative count makes LREM search from the end we pushed to.
        redis { |c| c.call('LREM', public_q, -1, job) }
      end

      # `SET NX EX`: succeeds for at most one caller per `@interval` seconds.
      def acquire_lock?
        redis { |c| c.call('SET', @lock_key, '1', 'NX', 'EX', @interval) } == 'OK'
      end

      def spawn_loop_thread
        t = Thread.new { run_loop }
        t.name = THREAD_NAME
        # tick_once already routes StandardError to the exception handler.
        t.report_on_exception = false
        t
      end

      # Sleep one interval, then sweep; repeat until stopped. Sleeping first
      # is what gives `start` its "no sweep at boot" behaviour.
      def run_loop
        until done?
          wait_next
          break if done?

          tick_once
        end
      end

      # One guarded sweep: a failure is reported (when the config can take
      # it) instead of killing the loop thread.
      def tick_once
        reap
      rescue StandardError => e
        handle_exception(e, context: THREAD_NAME) if @config.respond_to?(:handle_exception)
      end

      # Interruptible sleep: returns after `@interval` seconds or as soon as
      # `stop` signals the condition variable.
      def wait_next
        @mutex.synchronize { @sleeper.wait(@mutex, @interval) unless @done }
      end

      def done?
        @mutex.synchronize { @done }
      end
    end
  end
end
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'socket'
4
+ require_relative '../component'
5
+ require_relative '../keys'
6
+ require_relative '../fetcher'
7
+
8
module Wurk
  class Fetcher
    # Default fetcher. Each public queue is paired with a per-process
    # private list (`queue:<name>|<host>|<pid>|<idx>`); a job is moved
    # atomically from the right end of the public queue (the end fetches
    # pop from) to the left end of the private list via LMOVE, and stays
    # there until the Processor explicitly ACKs (LREM). SIGKILL between
    # fetch and ack leaves the job in the private list, where
    # Wurk::Fetcher::Reaper later finds it orphaned and moves it back to
    # the public queue.
    #
    # Priority handling: iterate queues_cmd in order with non-blocking
    # LMOVE, then fall back to a 2s BLMOVE on the first queue so an
    # empty poll doesn't spin Redis. BLMOVE has no multi-key form, so
    # blocking on a single queue is the best Redis gives us.
    #
    # NOTE(review): `config` is not defined in this class; it is presumably
    # the Component reader for `@config` — confirm.
    #
    # Spec: docs/target/sidekiq-pro.md §3 (super_fetch),
    #       docs/target/sidekiq-free.md §15 (TIMEOUT=2).
    class Reliable < Fetcher
      include Component

      # Seconds BLMOVE blocks on an empty queue before giving up.
      TIMEOUT = 2

      # Carries the public queue key, the raw (still-JSON) job payload,
      # and the capsule we use to reach Redis.
      #
      # LREM count=1 is idempotent for our payloads since each job's JSON
      # contains a unique `jid`: removing a payload that is already gone is
      # a no-op.
      UnitOfWork = Struct.new(:queue, :job, :config, keyword_init: true) do
        # ACK: drop the payload from this process's private list.
        def acknowledge
          config.redis do |conn|
            conn.call('LREM', Reliable.private_queue_name(queue), 1, job)
          end
        end

        # Public queue name without the `queue:` key prefix.
        def queue_name
          queue.delete_prefix(Keys::QUEUE_PREFIX)
        end

        # Hands the job back: RPUSH onto the end of the public queue that
        # fetches pop from (so it is pulled next) and remove the copy held
        # in the private list. Without the LREM the payload would survive in
        # the private list and be delivered a second time once the Reaper
        # drains that list after this process exits.
        #
        # RPUSH is issued first so an interruption between the two commands
        # can only duplicate the job, never lose it.
        #
        # @return [Integer] the RPUSH reply (new length of the public queue)
        def requeue
          private_list = Reliable.private_queue_name(queue)
          pushed, _removed = config.redis do |conn|
            conn.pipelined do |pipe|
              pipe.call('RPUSH', queue, job)
              pipe.call('LREM', private_list, 1, job)
            end
          end
          pushed
        end
      end

      # Class-level so UnitOfWork can compute the private list without
      # carrying a back-reference to its parent fetcher. Index defaults to
      # 0 — we run one fetcher per capsule today. Multi-processor topology
      # (one private list per processor slot) is a future Manager concern.
      #
      # @param public_queue [String] prefixed public queue key
      # @param index [Integer] private-list slot
      # @return [String] `<public_queue>|<host>|<pid>|<index>`
      def self.private_queue_name(public_queue, index = 0)
        host = ENV['DYNO'] || Socket.gethostname
        "#{public_queue}|#{host}|#{::Process.pid}|#{index}"
      end

      # @param capsule the capsule whose queues this fetcher serves
      def initialize(capsule)
        super()
        @config = capsule
        @done = false
      end

      # Tries every queue once without blocking, in queues_cmd order, then
      # blocks for up to TIMEOUT seconds on the first queue.
      #
      # @return [UnitOfWork, nil] nil after terminate, when no queue is
      #   currently fetchable, or when nothing arrived in time
      def retrieve_work
        return nil if @done

        queues = queues_cmd
        return nil if queues.empty?

        queues.each do |public_q|
          uow = lmove(public_q)
          return uow if uow
        end
        blmove(queues.first)
      end

      # Called on shutdown for jobs the Processor couldn't finish in time.
      # One pipeline: per public queue, a single RPUSH of all its payloads
      # (onto the end fetches pop from, so they're picked again ahead of
      # fresh enqueues) followed by an LREM of each payload from this
      # process's private list. Removing the private copies keeps the Reaper
      # from re-delivering the same jobs a second time after this process
      # has exited.
      #
      # @param in_progress [Array<UnitOfWork>, nil]
      def bulk_requeue(in_progress)
        return if in_progress.nil? || in_progress.empty?

        grouped = in_progress.group_by(&:queue)
        config.redis do |conn|
          conn.pipelined do |pipe|
            grouped.each do |public_q, uows|
              jobs = uows.map(&:job)
              private_list = self.class.private_queue_name(public_q)
              # Push before removing: a failure in between duplicates a job
              # rather than dropping it.
              pipe.call('RPUSH', public_q, *jobs)
              jobs.each { |job| pipe.call('LREM', private_list, 1, job) }
            end
          end
        end
      end

      # Prefixed queue keys (`queue:<name>`) in fetch order. Strict mode
      # preserves declaration order. Random/weighted shuffle each call —
      # @queues is pre-expanded by weight in Capsule#queues=, so uniform
      # shuffle yields weighted fairness; .uniq trims duplicates. Paused
      # queues are filtered after shuffle so the membership test runs on
      # the smallest possible set.
      #
      # @return [Array<String>]
      def queues_cmd
        names = config.mode == :strict ? config.queues : config.queues.shuffle.uniq
        paused = paused_names
        names = names.reject { |q| paused.include?(q) } unless paused.empty?
        names.map { |q| "#{Keys::QUEUE_PREFIX}#{q}" }
      end

      # Makes every later retrieve_work return nil immediately.
      def terminate
        @done = true
      end

      private

      # SMEMBERS of the `paused` SET. One round-trip per fetch pass; the
      # set is tiny in practice (one entry per paused queue) so the cost
      # is dominated by the BLMOVE that follows. Returns a Set for O(1)
      # lookup against the (often weighted-expanded) queue list.
      def paused_names
        config.redis { |conn| conn.call('SMEMBERS', Keys::PAUSED_SET) }.to_set
      end

      # Non-blocking fetch from one public queue into its private list.
      def lmove(public_q)
        priv = self.class.private_queue_name(public_q)
        job = config.redis { |conn| conn.call('LMOVE', public_q, priv, 'RIGHT', 'LEFT') }
        job ? UnitOfWork.new(queue: public_q, job: job, config: config) : nil
      end

      # Blocking fetch from one public queue into its private list.
      def blmove(public_q)
        priv = self.class.private_queue_name(public_q)
        # Extend the socket read-timeout past BLMOVE's own timeout so the
        # default 1s pool timeout doesn't fire before BLMOVE returns.
        job = config.redis do |conn|
          conn.blocking_call(TIMEOUT + 1, 'BLMOVE', public_q, priv, 'RIGHT', 'LEFT', TIMEOUT)
        end
        job ? UnitOfWork.new(queue: public_q, job: job, config: config) : nil
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module Wurk
  # Base class for fetchers. Both hooks are deliberate no-ops that return
  # nil; a concrete fetcher overrides them. Wurk::Fetcher::Reliable — the
  # BLMOVE-based reliable fetch — is the only implementation we ship and the
  # only one we recommend. There is no "basic fetch" mode.
  class Fetcher
    # Hook: hand back the next unit of work. The base class has none.
    #
    # @return [nil]
    def retrieve_work
      nil
    end

    # Hook: give back work that was still in progress. The base class
    # ignores its argument.
    #
    # @param in_progress units of work that were not finished
    # @return [nil]
    def bulk_requeue(in_progress)
      nil
    end
  end
end
@@ -0,0 +1,193 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'socket'
4
+ require 'json'
5
+
6
module Wurk
  # Thin HTTP listener for k8s liveness/readiness probes. Optional, off by
  # default — opt in with `config.health_check(port: 7433)`.
  #
  # Endpoints:
  #   * GET /live  → 200 while the Launcher is running (not in quiet/stop).
  #   * GET /ready → 200 only when Redis is reachable AND the heartbeat has
  #     fired within `ready_window` seconds. 503 otherwise.
  # Non-GET requests return 405 JSON; any other path returns 404 JSON.
  #
  # The server uses a raw TCPServer and one accept thread. No Rack, no
  # dependencies — it lives inside every worker process where Rails may or
  # may not exist (standalone CLI, Embedded, swarm child). Bound to a
  # dedicated port so it does not collide with the host application's HTTP.
  #
  # Spec: docs/target/sidekiq-ent.md §7.1.2 (`config.health_check`).
  module Health
    DEFAULT_PORT = 7433
    DEFAULT_BIND = '0.0.0.0'
    DEFAULT_READY_WINDOW = 30

    # The HTTP listener. Owns one TCPServer + one accept thread. Idempotent
    # start/stop; safe to call from Launcher#run / Launcher#stop.
    class Server
      # Seconds the accept loop waits in IO.select before re-checking @done.
      ACCEPT_TIMEOUT = 0.2
      # Total seconds a client gets to deliver its request head. Requests
      # are served one at a time on the accept thread, so this also bounds
      # how long one slow client can hold up the next probe.
      READ_TIMEOUT = 1.0
      # Upper bound on buffered request bytes; probes are far smaller.
      MAX_REQUEST_BYTES = 8192
      READ_CHUNK = 1024
      # Blank line ending the header section (bare LF tolerated).
      HEAD_TERMINATOR = /\r?\n\r?\n/

      REASONS = {
        200 => 'OK',
        404 => 'Not Found',
        405 => 'Method Not Allowed',
        503 => 'Service Unavailable'
      }.freeze

      attr_reader :port, :bind

      # @param launcher object responding to `stopping?`; its `@config` and
      #   `@heartbeat` instance variables are read directly
      # @param port [Integer] TCP port; 0 lets the kernel pick one
      # @param bind [String] address to listen on
      # @param ready_window [Numeric] max heartbeat age, in seconds, for /ready
      def initialize(launcher, port: DEFAULT_PORT, bind: DEFAULT_BIND, ready_window: DEFAULT_READY_WINDOW)
        @launcher = launcher
        @config = launcher.instance_variable_get(:@config)
        @port = port
        @bind = bind
        @ready_window = ready_window
        @server = nil
        @thread = nil
        @done = false
      end

      # Binds the listener and spawns the accept thread. A no-op when the
      # server is already running: re-binding our own port would raise
      # EADDRINUSE and the rescue below would then drop the references to
      # the live listener.
      #
      # @return [self]
      def start
        return self if running?

        @server = ::TCPServer.new(@bind, @port)
        # Capture the OS-assigned port when caller passed 0 (test pattern,
        # also lets the kernel pick a free port at boot).
        @port = @server.addr[1]
        @done = false
        # Hand the socket to the thread explicitly so a concurrent #stop
        # (which nils @server) can't leave the loop selecting on nil.
        @thread = ::Thread.new(@server) { |listener| run(listener) }
        @thread.name = 'wurk-health'
        self
      rescue ::Errno::EADDRINUSE => e
        # Swarm children all try to bind the same port — only the first wins.
        # Don't crash the worker; just log and skip.
        logger&.warn { "Wurk::Health: port #{@port} in use; health server NOT started (#{e.message})" }
        @server = nil
        @thread = nil
        self
      end

      # Closes the listener and waits up to 2s for the accept thread.
      # Safe to call when the server was never started.
      def stop
        @done = true
        srv = @server
        @server = nil
        srv&.close
        @thread&.join(2)
        @thread = nil
      end

      # @return [Boolean] true while the accept thread is alive
      def running?
        @thread&.alive? == true
      end

      private

      # Accept loop. Wakes every ACCEPT_TIMEOUT seconds to notice @done.
      def run(server = @server)
        return if server.nil?

        until @done
          next unless ::IO.select([server], nil, nil, ACCEPT_TIMEOUT)

          accept_one(server)
        end
      rescue ::IOError, ::Errno::EBADF
        # Server was closed during shutdown — expected.
      end

      # Accepts at most one pending connection and serves it.
      def accept_one(server)
        client = server.accept_nonblock(exception: false)
        # With exception: false a vanished connection is reported as the
        # symbol :wait_readable rather than a socket.
        return if client.nil? || client == :wait_readable

        handle(client)
      rescue ::StandardError => e
        # A listener closed by #stop is expected; the loop exits on @done.
        return if @done

        logger&.error { "Wurk::Health accept: #{e.class}: #{e.message}" }
      end

      # Serves one request and always closes the connection. A client that
      # sends nothing before the deadline is dropped without a response.
      def handle(client)
        head = read_request_head(client)
        return if head.empty?

        # Only the request line matters; headers and any body are ignored
        # (probes don't send one). Bytes arrive as binary, so label them
        # UTF-8 and scrub invalid sequences before they reach JSON.
        request_line = head[/\A[^\r\n]*/].force_encoding(::Encoding::UTF_8).scrub
        method, path, = request_line.strip.split(' ', 3)

        body, status = response_for(method, path)
        write_response(client, status, body)
      rescue ::StandardError => e
        logger&.error { "Wurk::Health request: #{e.class}: #{e.message}" }
      ensure
        client&.close
      end

      # Reads until the blank line that ends the headers, EOF, the size cap,
      # or the READ_TIMEOUT deadline — whichever comes first. Uses only
      # non-blocking reads so a client that stalls mid-line cannot park the
      # single accept thread.
      #
      # @return [String] bytes received so far (possibly empty)
      def read_request_head(client)
        deadline = monotonic_now + READ_TIMEOUT
        head = ::String.new
        until head.match?(HEAD_TERMINATOR) || head.bytesize >= MAX_REQUEST_BYTES
          remaining = deadline - monotonic_now
          break unless remaining.positive? && ::IO.select([client], nil, nil, remaining)

          chunk = client.read_nonblock(READ_CHUNK, exception: false)
          break if chunk.nil? # EOF
          next if chunk == :wait_readable

          head << chunk
        end
        head
      end

      def monotonic_now
        ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
      end

      # @return [Array(String, Integer)] JSON body and HTTP status
      def response_for(method, path)
        return [json('error', message: 'method not allowed'), 405] unless method == 'GET'

        case path
        when '/live' then live_response
        when '/ready' then ready_response
        else [json('error', message: 'not found', path: path), 404]
        end
      end

      def live_response
        if @launcher.stopping?
          [json('down', check: 'live', reason: 'stopping'), 503]
        else
          [json('ok', check: 'live'), 200]
        end
      end

      # Both checks always run; an unreachable Redis takes precedence in the
      # reported reason.
      def ready_response
        redis_ok = ping_redis
        beat_fresh = heartbeat_fresh?
        return [json('ok', check: 'ready'), 200] if redis_ok && beat_fresh

        reason = redis_ok ? 'heartbeat stale' : 'redis unreachable'
        [json('down', check: 'ready', reason: reason), 503]
      end

      # Any error while pinging (including a missing config) counts as
      # "not reachable".
      def ping_redis
        @config.redis { |conn| conn.call('PING') } == 'PONG'
      rescue ::StandardError
        false
      end

      # True when the launcher's heartbeat reports a beat younger than
      # `@ready_window` seconds.
      # NOTE(review): assumes `last_beat_at` is epoch seconds as a Float —
      # confirm against the heartbeat implementation.
      def heartbeat_fresh?
        hb = @launcher.instance_variable_get(:@heartbeat)
        return false unless hb&.respond_to?(:last_beat_at)

        last = hb.last_beat_at
        return false if last.nil?

        (::Time.now.to_f - last) < @ready_window
      end

      def json(status, **extra)
        ::JSON.generate({ status: status }.merge(extra))
      end

      # Writes a complete HTTP/1.1 response with `Connection: close`.
      def write_response(client, status, body)
        reason = REASONS.fetch(status, 'Status')

        client.write(
          "HTTP/1.1 #{status} #{reason}\r\n" \
          "Content-Type: application/json\r\n" \
          "Content-Length: #{body.bytesize}\r\n" \
          "Connection: close\r\n\r\n" \
          "#{body}"
        )
      end

      def logger
        @config&.logger
      end
    end
  end
end