wurk 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +43 -0
- data/CONTRIBUTING.md +73 -0
- data/LICENSE +21 -0
- data/README.md +137 -0
- data/SECURITY.md +39 -0
- data/app/controllers/wurk/api/pagination.rb +67 -0
- data/app/controllers/wurk/api/serializers.rb +131 -0
- data/app/controllers/wurk/api_controller.rb +248 -0
- data/app/controllers/wurk/application_controller.rb +7 -0
- data/app/controllers/wurk/dashboard_controller.rb +48 -0
- data/config/locales/en.yml +15 -0
- data/config/routes.rb +34 -0
- data/exe/wurk +22 -0
- data/lib/active_job/queue_adapters/wurk_adapter.rb +96 -0
- data/lib/generators/wurk/install/install_generator.rb +22 -0
- data/lib/generators/wurk/install/templates/wurk.rb +16 -0
- data/lib/wurk/active_job/wrapper.rb +32 -0
- data/lib/wurk/api/fast.rb +78 -0
- data/lib/wurk/batch/buffer.rb +26 -0
- data/lib/wurk/batch/callback_job.rb +37 -0
- data/lib/wurk/batch/callbacks.rb +176 -0
- data/lib/wurk/batch/client_middleware.rb +27 -0
- data/lib/wurk/batch/death_handler.rb +39 -0
- data/lib/wurk/batch/empty.rb +21 -0
- data/lib/wurk/batch/server_middleware.rb +62 -0
- data/lib/wurk/batch/status.rb +140 -0
- data/lib/wurk/batch.rb +351 -0
- data/lib/wurk/batch_set.rb +67 -0
- data/lib/wurk/capsule.rb +176 -0
- data/lib/wurk/cli.rb +349 -0
- data/lib/wurk/client/buffered.rb +372 -0
- data/lib/wurk/client.rb +330 -0
- data/lib/wurk/compat.rb +136 -0
- data/lib/wurk/component.rb +136 -0
- data/lib/wurk/configuration.rb +373 -0
- data/lib/wurk/context.rb +35 -0
- data/lib/wurk/cron.rb +636 -0
- data/lib/wurk/dashboard_manifest.rb +39 -0
- data/lib/wurk/dead_set.rb +78 -0
- data/lib/wurk/deploy.rb +91 -0
- data/lib/wurk/embedded.rb +94 -0
- data/lib/wurk/encryption.rb +276 -0
- data/lib/wurk/engine.rb +81 -0
- data/lib/wurk/fetcher/reaper.rb +264 -0
- data/lib/wurk/fetcher/reliable.rb +138 -0
- data/lib/wurk/fetcher.rb +11 -0
- data/lib/wurk/health.rb +193 -0
- data/lib/wurk/heartbeat.rb +211 -0
- data/lib/wurk/iterable_job.rb +292 -0
- data/lib/wurk/job/options.rb +70 -0
- data/lib/wurk/job.rb +33 -0
- data/lib/wurk/job_logger.rb +68 -0
- data/lib/wurk/job_record.rb +156 -0
- data/lib/wurk/job_retry.rb +320 -0
- data/lib/wurk/job_set.rb +212 -0
- data/lib/wurk/job_util.rb +162 -0
- data/lib/wurk/keys.rb +52 -0
- data/lib/wurk/launcher.rb +289 -0
- data/lib/wurk/leader.rb +221 -0
- data/lib/wurk/limiter/base.rb +138 -0
- data/lib/wurk/limiter/bucket.rb +80 -0
- data/lib/wurk/limiter/concurrent.rb +132 -0
- data/lib/wurk/limiter/leaky.rb +91 -0
- data/lib/wurk/limiter/points.rb +89 -0
- data/lib/wurk/limiter/server_middleware.rb +77 -0
- data/lib/wurk/limiter/unlimited.rb +48 -0
- data/lib/wurk/limiter/window.rb +80 -0
- data/lib/wurk/limiter.rb +255 -0
- data/lib/wurk/logger.rb +81 -0
- data/lib/wurk/lua/loader.rb +53 -0
- data/lib/wurk/lua.rb +187 -0
- data/lib/wurk/manager.rb +132 -0
- data/lib/wurk/metrics/history.rb +151 -0
- data/lib/wurk/metrics/query.rb +173 -0
- data/lib/wurk/metrics/rollup.rb +169 -0
- data/lib/wurk/metrics/statsd.rb +197 -0
- data/lib/wurk/metrics.rb +7 -0
- data/lib/wurk/middleware/chain.rb +128 -0
- data/lib/wurk/middleware/current_attributes.rb +87 -0
- data/lib/wurk/middleware/expiry.rb +50 -0
- data/lib/wurk/middleware/i18n.rb +63 -0
- data/lib/wurk/middleware/interrupt_handler.rb +45 -0
- data/lib/wurk/middleware/poison_pill.rb +149 -0
- data/lib/wurk/middleware.rb +34 -0
- data/lib/wurk/process_set.rb +243 -0
- data/lib/wurk/processor.rb +247 -0
- data/lib/wurk/queue.rb +108 -0
- data/lib/wurk/queues.rb +80 -0
- data/lib/wurk/rails.rb +9 -0
- data/lib/wurk/railtie.rb +28 -0
- data/lib/wurk/redis_pool.rb +79 -0
- data/lib/wurk/retry_set.rb +17 -0
- data/lib/wurk/scheduled.rb +189 -0
- data/lib/wurk/scheduled_set.rb +18 -0
- data/lib/wurk/sorted_entry.rb +95 -0
- data/lib/wurk/stats.rb +190 -0
- data/lib/wurk/swarm/child_boot.rb +105 -0
- data/lib/wurk/swarm.rb +260 -0
- data/lib/wurk/testing.rb +102 -0
- data/lib/wurk/topology.rb +74 -0
- data/lib/wurk/unique.rb +240 -0
- data/lib/wurk/version.rb +5 -0
- data/lib/wurk/web/config.rb +180 -0
- data/lib/wurk/web/enterprise.rb +138 -0
- data/lib/wurk/web/search.rb +139 -0
- data/lib/wurk/web.rb +25 -0
- data/lib/wurk/work_set.rb +116 -0
- data/lib/wurk/worker/setter.rb +93 -0
- data/lib/wurk/worker.rb +216 -0
- data/lib/wurk.rb +238 -0
- data/vendor/assets/dashboard/assets/index-8P3N_m1X.js +152 -0
- data/vendor/assets/dashboard/assets/index-Bqz4_SOQ.css +1 -0
- data/vendor/assets/dashboard/index.html +13 -0
- data/vendor/assets/dashboard/wurk-manifest.json +4 -0
- metadata +232 -0
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../component'
|
|
4
|
+
require_relative '../keys'
|
|
5
|
+
require_relative '../middleware/poison_pill'
|
|
6
|
+
|
|
7
|
+
module Wurk
|
|
8
|
+
class Fetcher
|
|
9
|
+
# Orphan reclamation for the reliable fetcher (Pro super_fetch §3.2).
|
|
10
|
+
#
|
|
11
|
+
# The Reliable fetcher moves each job from a public queue into a
|
|
12
|
+
# per-process private list (`queue:<public>|<host>|<pid>|<idx>`) and
|
|
13
|
+
# leaves it there until the Processor ACKs. A SIGKILLed or crashed
|
|
14
|
+
# worker therefore strands its in-flight jobs in private lists that
|
|
15
|
+
# nobody will ever ACK. The Reaper is the recovery half: it periodically
|
|
16
|
+
# scans for private lists whose owning process is gone and atomically
|
|
17
|
+
# moves their jobs back to the public queue so a live worker re-runs them.
|
|
18
|
+
#
|
|
19
|
+
# Liveness is decided per owner:
|
|
20
|
+
# * same host — the OS is authoritative: `Process.kill(0, pid)`. This
|
|
21
|
+
# is instant and ignores a stale `processes` SET entry whose 60s TTL
|
|
22
|
+
# hasn't lapsed yet, so a `kill -9`ed sibling is reclaimed the moment
|
|
23
|
+
# the supervisor reaps it rather than 60s later. (Pid reuse by an
|
|
24
|
+
# unrelated local process is the one blind spot — the supervisor
|
|
25
|
+
# respawns with a fresh pid, so it does not arise in practice.)
|
|
26
|
+
# * other host — we cannot ping the pid, so we trust the heartbeat:
|
|
27
|
+
# the owner is alive iff some live `processes` member (one whose
|
|
28
|
+
# `info` hash still exists) shares its `host:pid`. Cross-host reclaim
|
|
29
|
+
# therefore waits out the 60s heartbeat TTL, exactly as the spec says.
|
|
30
|
+
#
|
|
31
|
+
# Re-pushed jobs run through Wurk::Middleware::PoisonPill, which caps a
|
|
32
|
+
# job at RECOVERY_THRESHOLD recoveries within 72h: past the cap the job
|
|
33
|
+
# is killed into the dead set instead of re-queued, so a job that crashes
|
|
34
|
+
# its worker every time can't loop forever.
|
|
35
|
+
#
|
|
36
|
+
# SCANs are scoped to the public queues this process serves and gated by
|
|
37
|
+
# a cluster-wide `SET NX EX` lock, so across a fleet only one process
|
|
38
|
+
# sweeps per interval ("1/min within process group" in the spec) and the
|
|
39
|
+
# keyspace touched is bounded to known queues.
|
|
40
|
+
#
|
|
41
|
+
# Spec: docs/target/sidekiq-pro.md §3.2.
|
|
42
|
+
class Reaper
  include Component

  # Sweep cadence in seconds; also the cluster-lock TTL so exactly one
  # process sweeps per interval. 60s matches the heartbeat TTL — the
  # floor below which cross-host orphans can't be detected anyway.
  DEFAULT_INTERVAL = 60

  # Redis key of the cluster-wide `SET NX EX` sweep lock.
  LOCK_KEY = 'super_fetch:reaper'
  # COUNT hint handed to every SCAN call.
  SCAN_COUNT = 100
  # Name given to the sweep thread and used as the error-report context.
  THREAD_NAME = 'wurk-reaper'

  # Characters Redis glob patterns treat specially. They are backslash-
  # escaped before a queue key is embedded in a SCAN MATCH pattern so the
  # key is matched verbatim.
  GLOB_SPECIAL = /[\\*?\[\]]/
  private_constant :GLOB_SPECIAL

  attr_reader :interval

  # @param config   object exposing `capsules` (each responding to `queues`)
  # @param interval [Integer] seconds between sweeps; also the lock TTL
  # @param lock_key [String] Redis key used for the cluster-wide lock
  def initialize(config, interval: DEFAULT_INTERVAL, lock_key: LOCK_KEY)
    @config = config
    @interval = interval
    @lock_key = lock_key
    @thread = nil
    @done = false
    @mutex = ::Mutex.new
    @sleeper = ::ConditionVariable.new
  end

  # Spawns the sweep loop. Idempotent while the loop thread is alive. The
  # loop waits one interval before its first sweep so booting processes
  # don't dogpile Redis and so an un-stopped launcher in a unit test never
  # touches the keyspace.
  #
  # A thread that has died (it hides its own failure: report_on_exception
  # is off) is replaced rather than returned, so `start` can always bring
  # the reaper back.
  #
  # @return [Thread] the sweep thread
  def start
    @mutex.synchronize do
      next @thread if @thread&.alive?

      @done = false
      @thread = spawn_loop_thread
    end
  end

  # Flags the loop as done, wakes it if it is sleeping, and waits for the
  # thread to exit. The join happens outside the mutex because the loop
  # needs that mutex to observe `@done`.
  def stop
    @mutex.synchronize do
      @done = true
      @sleeper.signal
    end
    @thread&.join
    @thread = nil
  end

  # @return [Boolean] true while the sweep thread exists and is alive
  def running?
    !@thread.nil? && @thread.alive?
  end

  # One cluster-gated sweep: a no-op (returns 0) unless this process wins
  # the interval's lock. Used by the loop.
  #
  # @return [Integer] number of jobs reclaimed
  def reap
    return 0 unless acquire_lock?

    reclaim!
  end

  # One unguarded sweep over every served queue. Returns the number of
  # jobs reclaimed (re-queued or killed). Public so boot paths and tests
  # can drive a deterministic pass without the cluster lock.
  #
  # @return [Integer]
  def reclaim!
    prefixes = live_process_prefixes
    served_queues.sum { |public_q| reclaim_queue(public_q, prefixes) }
  end

  private

  # Union of `queue:<name>` keys across every capsule this process serves.
  # Scoping the scan to these keeps the keyspace we touch bounded and lets
  # parallel test namespaces stay isolated.
  def served_queues
    @config.capsules.each_value
           .flat_map(&:queues)
           .uniq
           .map { |name| Keys.queue(name) }
  end

  # SCAN for this public queue's private lists, reclaim the orphaned ones.
  #
  # @return [Integer] jobs drained from dead owners' lists
  def reclaim_queue(public_q, prefixes)
    reclaimed = 0
    each_private_list(public_q) do |key, host, pid|
      next if owner_alive?(host, pid, prefixes)

      reclaimed += drain(key, public_q)
    end
    reclaimed
  end

  # Yields [private_list_key, host, pid] for each private list of
  # `public_q`. MATCH `<public_q>|*` matches only this queue's private
  # lists (public queue keys carry no `|`). The queue key is glob-escaped
  # first so a queue name containing `*`, `?`, `[`, `]` or `\` is matched
  # literally instead of being interpreted as a pattern.
  def each_private_list(public_q)
    pattern = "#{escape_glob(public_q)}|*"
    cursor = '0'
    loop do
      cursor, keys = redis { |c| c.call('SCAN', cursor, 'MATCH', pattern, 'COUNT', SCAN_COUNT) }
      keys.each do |key|
        host, pid = parse_owner(public_q, key)
        yield key, host, pid if pid
      end
      # SCAN signals a completed iteration by returning cursor "0".
      break if cursor == '0'
    end
  end

  # Backslash-escapes Redis glob metacharacters in `str`.
  def escape_glob(str)
    str.gsub(GLOB_SPECIAL) { |ch| "\\#{ch}" }
  end

  # `<public_q>|<host>|<pid>|<idx>` → [host, pid] (pid as Integer), or
  # [nil, nil] when the suffix isn't a well-formed `host|pid|idx` triple.
  # Splitting the suffix off the known public-queue prefix tolerates a
  # `|` inside the queue name itself. Exactly three fields are required:
  # a longer suffix belongs to some other queue whose name merely starts
  # with `<public_q>|`, and must not be attributed to an owner here.
  def parse_owner(public_q, key)
    suffix = key.delete_prefix("#{public_q}|")
    return [nil, nil] if suffix == key

    # limit -1 keeps trailing empty fields so "h|1|0|" is seen as 4 parts.
    host, pid, idx, *extra = suffix.split('|', -1)
    return [nil, nil] unless extra.empty? && host && integer?(pid) && integer?(idx)

    [host, pid.to_i]
  end

  # True when `str` is a String made only of decimal digits.
  def integer?(str)
    str.is_a?(String) && str.match?(/\A\d+\z/)
  end

  # Same host: ask the OS. Other host: trust the heartbeat-derived set of
  # live `host:pid` prefixes. (`hostname` is not defined in this class —
  # presumably supplied by Component; confirm it matches the host used in
  # private list names.)
  def owner_alive?(host, pid, prefixes)
    return local_pid_alive?(pid) if host == hostname

    prefixes.include?("#{host}:#{pid}")
  end

  # Signal 0 probes for existence without delivering a signal. EPERM
  # means the pid exists but belongs to someone else — still alive.
  def local_pid_alive?(pid)
    ::Process.kill(0, pid)
    true
  rescue Errno::ESRCH
    false
  rescue Errno::EPERM
    true
  end

  # `host:pid` of every live process — a member of `processes` whose
  # `info` hash still exists. A bare SET membership isn't enough: the
  # member lingers after its 60s hash TTL until ProcessSet#cleanup prunes
  # it, and we must treat that window as dead for cross-host reclaim.
  #
  # @return [Set<String>]
  def live_process_prefixes
    redis do |conn|
      members = conn.call('SMEMBERS', Keys::PROCESSES)
      next ::Set.new if members.empty?

      infos = conn.pipelined { |pipe| members.each { |m| pipe.call('HGET', m, 'info') } }
      members.zip(infos).each_with_object(::Set.new) do |(member, info), set|
        set << host_pid(member) if info
      end
    end
  end

  # identity is `<host>:<pid>:<nonce>`; the owner prefix is `<host>:<pid>`.
  def host_pid(identity)
    identity.split(':')[0..1].join(':')
  end

  # Drain one orphaned private list back to its public queue. Each job is
  # moved with an atomic LMOVE (private tail → public tail) BEFORE the
  # poison check, so a crash mid-drain leaves the job safely in the public
  # queue (at-least-once), never lost. Poison jobs are killed to the dead
  # set by PoisonPill.track! and then LREM'd out of the public queue.
  #
  # Errors are reported and the count reached so far is returned, so one
  # failing list does not abort the rest of the sweep.
  #
  # @return [Integer] jobs moved out of `private_list`
  def drain(private_list, public_q)
    queue_name = public_q.delete_prefix(Keys::QUEUE_PREFIX)
    count = 0
    loop do
      job = redis { |c| c.call('LMOVE', private_list, public_q, 'RIGHT', 'RIGHT') }
      break unless job

      count += 1
      poison_off(public_q, job, queue_name)
    end
    count
  rescue StandardError => e
    handle_exception(e, context: THREAD_NAME)
    count
  end

  def poison_off(public_q, job, queue_name)
    return unless Middleware::PoisonPill.track!(job, queue: queue_name) == :poison

    # track! already ZADDed the payload to the dead set; pull the copy we
    # just LMOVE'd onto the public tail so it isn't also re-run.
    redis { |c| c.call('LREM', public_q, -1, job) }
  end

  # Tries to take the cluster-wide lock for one interval.
  def acquire_lock?
    redis { |c| c.call('SET', @lock_key, '1', 'NX', 'EX', @interval) } == 'OK'
  end

  def spawn_loop_thread
    t = Thread.new { run_loop }
    t.name = THREAD_NAME
    t.report_on_exception = false
    t
  end

  # Sleep-then-sweep until stopped; `done?` is re-checked after waking so
  # a stop during the sleep never triggers one last sweep.
  def run_loop
    until done?
      wait_next
      break if done?

      tick_once
    end
  end

  # One guarded sweep: any StandardError is reported, never raised, so the
  # loop thread survives transient failures.
  def tick_once
    reap
  rescue StandardError => e
    handle_exception(e, context: THREAD_NAME) if @config.respond_to?(:handle_exception)
  end

  # Sleeps up to one interval; `stop` wakes it early via the condvar.
  def wait_next
    @mutex.synchronize { @sleeper.wait(@mutex, @interval) unless @done }
  end

  def done?
    @mutex.synchronize { @done }
  end
end
|
|
263
|
+
end
|
|
264
|
+
end
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'socket'
|
|
4
|
+
require_relative '../component'
|
|
5
|
+
require_relative '../keys'
|
|
6
|
+
require_relative '../fetcher'
|
|
7
|
+
|
|
8
|
+
module Wurk
|
|
9
|
+
class Fetcher
|
|
10
|
+
# Default fetcher. Each public queue is paired with a per-process
|
|
11
|
+
# private list (`queue:<name>|<host>|<pid>|<idx>`); a job is moved
|
|
12
|
+
# atomically from the public tail to the private head via LMOVE, and
|
|
13
|
+
# stays there until the Processor explicitly ACKs (LREM). SIGKILL
|
|
14
|
+
# between fetch and ack leaves the job in the private list, where the
|
|
15
|
+
# next boot of this process reclaims it via bulk_requeue.
|
|
16
|
+
#
|
|
17
|
+
# Priority handling: iterate queues_cmd in order with non-blocking
|
|
18
|
+
# LMOVE, then fall back to a 2s BLMOVE on the first queue so an
|
|
19
|
+
# empty poll doesn't spin Redis. BLMOVE has no multi-key form, so
|
|
20
|
+
# blocking on a single queue is the best Redis gives us.
|
|
21
|
+
#
|
|
22
|
+
# Spec: docs/target/sidekiq-pro.md §3 (super_fetch),
|
|
23
|
+
# docs/target/sidekiq-free.md §15 (TIMEOUT=2).
|
|
24
|
+
class Reliable < Fetcher
  include Component

  # BLMOVE timeout in seconds for the blocking fallback fetch.
  TIMEOUT = 2

  # Carries the public queue key, the raw (still-JSON) job payload,
  # and the capsule we use to reach Redis. ACK removes from the private
  # list; requeue pushes back to the public queue's fetch end so the job
  # is next pulled, and drops the private-list copy. LREM count=1 is
  # idempotent for our payloads since each job's JSON contains a unique
  # `jid`.
  UnitOfWork = Struct.new(:queue, :job, :config, keyword_init: true) do
    # Removes the finished job from this process's private list.
    def acknowledge
      config.redis do |conn|
        conn.call('LREM', Reliable.private_queue_name(queue), 1, job)
      end
    end

    # Public queue name without the `queue:` key prefix.
    def queue_name
      queue.delete_prefix(Keys::QUEUE_PREFIX)
    end

    # Hands the job back to the public queue and removes it from the
    # private list. Without the LREM the payload would sit in both lists,
    # and the orphan sweep would later push the private copy again — a
    # duplicate run that is also counted as a crash recovery. RPUSH is
    # issued first so an interruption can duplicate but never lose the job.
    #
    # @return the RPUSH reply (new length of the public queue)
    def requeue
      private_list = Reliable.private_queue_name(queue)
      replies = config.redis do |conn|
        conn.pipelined do |pipe|
          pipe.call('RPUSH', queue, job)
          pipe.call('LREM', private_list, 1, job)
        end
      end
      replies.first
    end
  end

  # Class-level so UnitOfWork can compute the private list without
  # carrying a back-reference to its parent fetcher. Index defaults to
  # 0 — we run one fetcher per capsule today. Multi-processor topology
  # (one private list per processor slot) is a future Manager concern.
  #
  # @return [String] `<public_queue>|<host>|<pid>|<index>`
  def self.private_queue_name(public_queue, index = 0)
    host = ENV['DYNO'] || Socket.gethostname
    "#{public_queue}|#{host}|#{::Process.pid}|#{index}"
  end

  def initialize(capsule)
    super()
    @config = capsule
    @done = false
  end

  # Tries every queue in fetch order with a non-blocking LMOVE, then
  # blocks up to TIMEOUT seconds on the first queue so an empty poll
  # doesn't spin Redis.
  #
  # @return [UnitOfWork, nil]
  def retrieve_work
    return nil if @done

    queues = queues_cmd
    return nil if queues.empty?

    queues.each do |public_q|
      uow = lmove(public_q)
      return uow if uow
    end
    blmove(queues.first)
  end

  # Called on shutdown for jobs the Processor couldn't finish in time.
  # One pipelined RPUSH per public queue (fetch-end insert) so on next
  # boot they're picked again ahead of fresh enqueues, followed by an
  # LREM per job so the payload does not linger in this process's private
  # list, where the orphan sweep would re-queue it a second time.
  #
  # @param in_progress [Array<UnitOfWork>, nil]
  def bulk_requeue(in_progress)
    return if in_progress.nil? || in_progress.empty?

    config.redis do |conn|
      conn.pipelined do |pipe|
        in_progress.group_by(&:queue).each do |public_q, uows|
          jobs = uows.map(&:job)
          private_list = self.class.private_queue_name(public_q)
          # Push first, remove second: at-least-once, never lost.
          pipe.call('RPUSH', public_q, *jobs)
          jobs.each { |job| pipe.call('LREM', private_list, 1, job) }
        end
      end
    end
  end

  # Prefixed queue keys (`queue:<name>`) in fetch order. Strict mode
  # preserves declaration order. Random/weighted shuffle each call —
  # @queues is pre-expanded by weight in Capsule#queues=, so uniform
  # shuffle yields weighted fairness; .uniq trims duplicates. Paused
  # queues are filtered after shuffle so the membership test runs on
  # the smallest possible set.
  def queues_cmd
    names = config.mode == :strict ? config.queues : config.queues.shuffle.uniq
    paused = paused_names
    names = names.reject { |q| paused.include?(q) } unless paused.empty?
    names.map { |q| "#{Keys::QUEUE_PREFIX}#{q}" }
  end

  # Makes every later retrieve_work return nil.
  def terminate
    @done = true
  end

  private

  # SMEMBERS of the `paused` SET. One round-trip per fetch pass; the
  # set is tiny in practice (one entry per paused queue) so the cost
  # is dominated by the BLMOVE that follows. Returns a Set for O(1)
  # lookup against the (often weighted-expanded) queue list.
  def paused_names
    config.redis { |conn| conn.call('SMEMBERS', Keys::PAUSED_SET) }.to_set
  end

  # Non-blocking move: public tail → private head.
  def lmove(public_q)
    priv = self.class.private_queue_name(public_q)
    job = config.redis { |conn| conn.call('LMOVE', public_q, priv, 'RIGHT', 'LEFT') }
    job ? UnitOfWork.new(queue: public_q, job: job, config: config) : nil
  end

  # Blocking move with the same direction as #lmove.
  def blmove(public_q)
    priv = self.class.private_queue_name(public_q)
    # Extend the socket read-timeout past BLMOVE's own timeout so the
    # default 1s pool timeout doesn't fire before BLMOVE returns.
    job = config.redis do |conn|
      conn.blocking_call(TIMEOUT + 1, 'BLMOVE', public_q, priv, 'RIGHT', 'LEFT', TIMEOUT)
    end
    job ? UnitOfWork.new(queue: public_q, job: job, config: config) : nil
  end
end
|
|
137
|
+
end
|
|
138
|
+
end
|
data/lib/wurk/fetcher.rb
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Wurk
|
|
4
|
+
# Abstract fetcher. Wurk::Fetcher::Reliable is the only implementation
|
|
5
|
+
# we ship and the only one we recommend — BLMOVE-based reliable fetch.
|
|
6
|
+
# No "basic fetch" mode.
|
|
7
|
+
class Fetcher
  # Returns the next unit of work. The abstract base never has any, so
  # it always answers nil; concrete fetchers override this.
  def retrieve_work
    nil
  end

  # Gives unfinished units of work back to their queues. The abstract
  # base has nothing to give back and answers nil.
  def bulk_requeue(in_progress)
    nil
  end
end
|
|
11
|
+
end
|
data/lib/wurk/health.rb
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'socket'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
module Wurk
|
|
7
|
+
# Thin HTTP listener for k8s liveness/readiness probes. Optional, off by
|
|
8
|
+
# default — opt in with `config.health_check(port: 7433)`.
|
|
9
|
+
#
|
|
10
|
+
# Endpoints:
|
|
11
|
+
# * GET /live → 200 while the Launcher is running (not in quiet/stop).
|
|
12
|
+
# * GET /ready → 200 only when Redis is reachable AND the heartbeat has
|
|
13
|
+
# fired within `ready_window` seconds. 503 otherwise.
|
|
14
|
+
# Anything else returns 404 JSON.
|
|
15
|
+
#
|
|
16
|
+
# The server uses a raw TCPServer and one accept thread. No Rack, no
|
|
17
|
+
# dependencies — it lives inside every worker process where Rails may or
|
|
18
|
+
# may not exist (standalone CLI, Embedded, swarm child). Bound to a
|
|
19
|
+
# dedicated port so it does not collide with the host application's HTTP.
|
|
20
|
+
#
|
|
21
|
+
# Spec: docs/target/sidekiq-ent.md §7.1.2 (`config.health_check`).
|
|
22
|
+
module Health
  DEFAULT_PORT = 7433
  DEFAULT_BIND = '0.0.0.0'
  DEFAULT_READY_WINDOW = 30

  # The HTTP listener. Owns one TCPServer + one accept thread. Idempotent
  # start/stop; safe to call from Launcher#run / Launcher#stop.
  class Server
    # Seconds the accept loop waits in select before re-checking @done.
    ACCEPT_TIMEOUT = 0.2
    # Total seconds a client gets to deliver its request head.
    READ_TIMEOUT = 1.0
    # Upper bound on buffered request-head bytes.
    MAX_HEAD_BYTES = 8192
    # Blank line that ends the HTTP request head.
    HEAD_TERMINATOR = "\r\n\r\n"

    attr_reader :port, :bind

    # @param launcher     object answering `stopping?`; its @config and
    #                     @heartbeat instance variables are read directly
    # @param port         [Integer] TCP port; 0 lets the kernel choose
    # @param bind         [String] address to listen on
    # @param ready_window [Numeric] max heartbeat age (seconds) for /ready
    def initialize(launcher, port: DEFAULT_PORT, bind: DEFAULT_BIND, ready_window: DEFAULT_READY_WINDOW)
      @launcher = launcher
      @config = launcher.instance_variable_get(:@config)
      @port = port
      @bind = bind
      @ready_window = ready_window
      @server = nil
      @thread = nil
      @done = false
    end

    # Binds the listener and spawns the accept thread. A second call while
    # the listener is running is a no-op: rebinding would hit EADDRINUSE
    # on our own port and the rescue below would then drop the references
    # to the live socket and thread.
    #
    # @return [self]
    def start
      return self if running?

      # A listener left behind by a dead accept thread would block the bind.
      release_listener
      server = ::TCPServer.new(@bind, @port)
      # Capture the OS-assigned port when caller passed 0 (test pattern,
      # also lets the kernel pick a free port at boot).
      @port = server.addr[1]
      @done = false
      @server = server
      # The thread gets its own reference so `stop` niling @server can
      # never hand it a nil mid-loop.
      @thread = ::Thread.new { run(server) }
      @thread.name = 'wurk-health'
      self
    rescue ::Errno::EADDRINUSE => e
      # Swarm children all try to bind the same port — only the first wins.
      # Don't crash the worker; just log and skip.
      logger&.warn { "Wurk::Health: port #{@port} in use; health server NOT started (#{e.message})" }
      @server = nil
      @thread = nil
      self
    end

    # Closes the listener and waits up to 2s for the accept thread.
    def stop
      @done = true
      release_listener
      @thread&.join(2)
      @thread = nil
    end

    # @return [Boolean] true while the accept thread is alive
    def running?
      @thread&.alive? == true
    end

    private

    # Detaches and closes the current listener socket, if any.
    def release_listener
      stale = @server
      @server = nil
      stale&.close
    end

    # Accept loop: polls `server` for readability so @done is re-checked
    # at least every ACCEPT_TIMEOUT seconds, and serves one client at a
    # time on this thread.
    def run(server)
      until @done
        next unless ::IO.select([server], nil, nil, ACCEPT_TIMEOUT)

        client = accept_client(server)
        handle(client) if client
      end
    rescue ::IOError, ::Errno::EBADF
      # Server was closed during shutdown — expected.
    end

    # Accepts one pending connection without blocking. Returns nil when
    # nothing is pending after all (accept_nonblock reports that as the
    # symbol :wait_readable, which must not be treated as a socket) or
    # when the accept failed with a loggable error. Closed-listener
    # errors are re-raised so the loop can end quietly.
    def accept_client(server)
      client = server.accept_nonblock(exception: false)
      client == :wait_readable ? nil : client
    rescue ::IOError, ::Errno::EBADF
      raise
    rescue ::StandardError => e
      logger&.error { "Wurk::Health accept: #{e.class}: #{e.message}" }
      nil
    end

    # Serves one request: reads the head, answers, always closes the
    # socket. A client that sends nothing gets no response.
    def handle(client)
      head = read_head(client)
      return if head.empty?

      # Only the request line matters; headers and body are ignored
      # (probes don't send one).
      request_line = head.split("\r\n", 2).first.to_s
      method, path, = request_line.strip.split(' ', 3)

      body, status = response_for(method, path)
      write_response(client, status, body)
    rescue ::StandardError => e
      logger&.error { "Wurk::Health request: #{e.class}: #{e.message}" }
    ensure
      client&.close
    end

    # Reads the request head with non-blocking reads under one overall
    # deadline, so a client that stalls mid-line (or trickles bytes) can
    # hold the single accept thread for at most READ_TIMEOUT seconds.
    # Stops at the blank line, at EOF, at the deadline, or at
    # MAX_HEAD_BYTES — whichever comes first — and returns what arrived.
    def read_head(client)
      deadline = monotonic_now + READ_TIMEOUT
      head = ::String.new
      until head.include?(HEAD_TERMINATOR) || head.bytesize >= MAX_HEAD_BYTES
        remaining = deadline - monotonic_now
        break unless remaining.positive? && ::IO.select([client], nil, nil, remaining)

        chunk = client.read_nonblock(1024, exception: false)
        break if chunk.nil? # peer closed the connection

        head << chunk unless chunk == :wait_readable
      end
      # Socket reads are binary; treat the head as text like `gets` would.
      head.force_encoding(::Encoding::UTF_8)
    end

    def monotonic_now
      ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
    end

    # Routes a request to its [json_body, http_status] pair.
    def response_for(method, path)
      return [json('error', message: 'method not allowed'), 405] unless method == 'GET'

      case path
      when '/live' then live_response
      when '/ready' then ready_response
      else [json('error', message: 'not found', path: path), 404]
      end
    end

    # 200 unless the launcher reports that it is stopping.
    def live_response
      if @launcher.stopping?
        [json('down', check: 'live', reason: 'stopping'), 503]
      else
        [json('ok', check: 'live'), 200]
      end
    end

    # 200 only when Redis answers PING and the heartbeat is recent;
    # otherwise 503 naming the first failing check.
    def ready_response
      redis_ok = ping_redis
      beat_fresh = heartbeat_fresh?

      if redis_ok && beat_fresh
        [json('ok', check: 'ready'), 200]
      else
        reason = redis_ok ? 'heartbeat stale' : 'redis unreachable'
        [json('down', check: 'ready', reason: reason), 503]
      end
    end

    # Any error while pinging counts as "unreachable".
    def ping_redis
      @config.redis { |conn| conn.call('PING') } == 'PONG'
    rescue ::StandardError
      false
    end

    # True when the launcher's heartbeat exposes `last_beat_at` and that
    # timestamp (epoch seconds) is younger than the ready window.
    def heartbeat_fresh?
      hb = @launcher.instance_variable_get(:@heartbeat)
      return false unless hb.respond_to?(:last_beat_at)

      last = hb.last_beat_at
      return false if last.nil?

      (::Time.now.to_f - last) < @ready_window
    end

    def json(status, **extra)
      ::JSON.generate({ status: status }.merge(extra))
    end

    # Writes a complete HTTP/1.1 response with a JSON body.
    def write_response(client, status, body)
      reason = case status
               when 200 then 'OK'
               when 404 then 'Not Found'
               when 405 then 'Method Not Allowed'
               when 503 then 'Service Unavailable'
               else 'Status'
               end

      client.write(
        "HTTP/1.1 #{status} #{reason}\r\n" \
        "Content-Type: application/json\r\n" \
        "Content-Length: #{body.bytesize}\r\n" \
        "Connection: close\r\n\r\n" \
        "#{body}"
      )
    end

    def logger
      @config&.logger
    end
  end
end
|
|
193
|
+
end
|