wurk 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +43 -0
- data/CONTRIBUTING.md +73 -0
- data/LICENSE +21 -0
- data/README.md +137 -0
- data/SECURITY.md +39 -0
- data/app/controllers/wurk/api/pagination.rb +67 -0
- data/app/controllers/wurk/api/serializers.rb +131 -0
- data/app/controllers/wurk/api_controller.rb +248 -0
- data/app/controllers/wurk/application_controller.rb +7 -0
- data/app/controllers/wurk/dashboard_controller.rb +48 -0
- data/config/locales/en.yml +15 -0
- data/config/routes.rb +34 -0
- data/exe/wurk +22 -0
- data/lib/active_job/queue_adapters/wurk_adapter.rb +96 -0
- data/lib/generators/wurk/install/install_generator.rb +22 -0
- data/lib/generators/wurk/install/templates/wurk.rb +16 -0
- data/lib/wurk/active_job/wrapper.rb +32 -0
- data/lib/wurk/api/fast.rb +78 -0
- data/lib/wurk/batch/buffer.rb +26 -0
- data/lib/wurk/batch/callback_job.rb +37 -0
- data/lib/wurk/batch/callbacks.rb +176 -0
- data/lib/wurk/batch/client_middleware.rb +27 -0
- data/lib/wurk/batch/death_handler.rb +39 -0
- data/lib/wurk/batch/empty.rb +21 -0
- data/lib/wurk/batch/server_middleware.rb +62 -0
- data/lib/wurk/batch/status.rb +140 -0
- data/lib/wurk/batch.rb +351 -0
- data/lib/wurk/batch_set.rb +67 -0
- data/lib/wurk/capsule.rb +176 -0
- data/lib/wurk/cli.rb +349 -0
- data/lib/wurk/client/buffered.rb +372 -0
- data/lib/wurk/client.rb +330 -0
- data/lib/wurk/compat.rb +136 -0
- data/lib/wurk/component.rb +136 -0
- data/lib/wurk/configuration.rb +373 -0
- data/lib/wurk/context.rb +35 -0
- data/lib/wurk/cron.rb +636 -0
- data/lib/wurk/dashboard_manifest.rb +39 -0
- data/lib/wurk/dead_set.rb +78 -0
- data/lib/wurk/deploy.rb +91 -0
- data/lib/wurk/embedded.rb +94 -0
- data/lib/wurk/encryption.rb +276 -0
- data/lib/wurk/engine.rb +81 -0
- data/lib/wurk/fetcher/reaper.rb +264 -0
- data/lib/wurk/fetcher/reliable.rb +138 -0
- data/lib/wurk/fetcher.rb +11 -0
- data/lib/wurk/health.rb +193 -0
- data/lib/wurk/heartbeat.rb +211 -0
- data/lib/wurk/iterable_job.rb +292 -0
- data/lib/wurk/job/options.rb +70 -0
- data/lib/wurk/job.rb +33 -0
- data/lib/wurk/job_logger.rb +68 -0
- data/lib/wurk/job_record.rb +156 -0
- data/lib/wurk/job_retry.rb +320 -0
- data/lib/wurk/job_set.rb +212 -0
- data/lib/wurk/job_util.rb +162 -0
- data/lib/wurk/keys.rb +52 -0
- data/lib/wurk/launcher.rb +289 -0
- data/lib/wurk/leader.rb +221 -0
- data/lib/wurk/limiter/base.rb +138 -0
- data/lib/wurk/limiter/bucket.rb +80 -0
- data/lib/wurk/limiter/concurrent.rb +132 -0
- data/lib/wurk/limiter/leaky.rb +91 -0
- data/lib/wurk/limiter/points.rb +89 -0
- data/lib/wurk/limiter/server_middleware.rb +77 -0
- data/lib/wurk/limiter/unlimited.rb +48 -0
- data/lib/wurk/limiter/window.rb +80 -0
- data/lib/wurk/limiter.rb +255 -0
- data/lib/wurk/logger.rb +81 -0
- data/lib/wurk/lua/loader.rb +53 -0
- data/lib/wurk/lua.rb +187 -0
- data/lib/wurk/manager.rb +132 -0
- data/lib/wurk/metrics/history.rb +151 -0
- data/lib/wurk/metrics/query.rb +173 -0
- data/lib/wurk/metrics/rollup.rb +169 -0
- data/lib/wurk/metrics/statsd.rb +197 -0
- data/lib/wurk/metrics.rb +7 -0
- data/lib/wurk/middleware/chain.rb +128 -0
- data/lib/wurk/middleware/current_attributes.rb +87 -0
- data/lib/wurk/middleware/expiry.rb +50 -0
- data/lib/wurk/middleware/i18n.rb +63 -0
- data/lib/wurk/middleware/interrupt_handler.rb +45 -0
- data/lib/wurk/middleware/poison_pill.rb +149 -0
- data/lib/wurk/middleware.rb +34 -0
- data/lib/wurk/process_set.rb +243 -0
- data/lib/wurk/processor.rb +247 -0
- data/lib/wurk/queue.rb +108 -0
- data/lib/wurk/queues.rb +80 -0
- data/lib/wurk/rails.rb +9 -0
- data/lib/wurk/railtie.rb +28 -0
- data/lib/wurk/redis_pool.rb +79 -0
- data/lib/wurk/retry_set.rb +17 -0
- data/lib/wurk/scheduled.rb +189 -0
- data/lib/wurk/scheduled_set.rb +18 -0
- data/lib/wurk/sorted_entry.rb +95 -0
- data/lib/wurk/stats.rb +190 -0
- data/lib/wurk/swarm/child_boot.rb +105 -0
- data/lib/wurk/swarm.rb +260 -0
- data/lib/wurk/testing.rb +102 -0
- data/lib/wurk/topology.rb +74 -0
- data/lib/wurk/unique.rb +240 -0
- data/lib/wurk/version.rb +5 -0
- data/lib/wurk/web/config.rb +180 -0
- data/lib/wurk/web/enterprise.rb +138 -0
- data/lib/wurk/web/search.rb +139 -0
- data/lib/wurk/web.rb +25 -0
- data/lib/wurk/work_set.rb +116 -0
- data/lib/wurk/worker/setter.rb +93 -0
- data/lib/wurk/worker.rb +216 -0
- data/lib/wurk.rb +238 -0
- data/vendor/assets/dashboard/assets/index-8P3N_m1X.js +152 -0
- data/vendor/assets/dashboard/assets/index-Bqz4_SOQ.css +1 -0
- data/vendor/assets/dashboard/index.html +13 -0
- data/vendor/assets/dashboard/wurk-manifest.json +4 -0
- metadata +232 -0
data/lib/wurk/lua.rb
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'digest'
|
|
4
|
+
|
|
5
|
+
module Wurk
|
|
6
|
+
# EVALSHA-cached Lua scripts. Loaded once per pool, never re-uploaded.
|
|
7
|
+
# Bulk enqueue, multi-pop, atomic schedule promotion, batch ops.
|
|
8
|
+
#
|
|
9
|
+
# Source strings are intentionally bare — the SHA1 of each is computed
|
|
10
|
+
# at load time and is the same value Redis reports from `SCRIPT LOAD`.
|
|
11
|
+
# Whitespace edits change the SHA, which forces a re-upload at runtime.
|
|
12
|
+
#
|
|
13
|
+
# `:zpopbyscore` is reproduced verbatim from sidekiq-free.md §1.8 and
|
|
14
|
+
# MUST NOT diverge — parity tests will fail on a single byte change.
|
|
15
|
+
module Lua
|
|
16
|
+
ZPOPBYSCORE = <<~LUA
|
|
17
|
+
local key, now = KEYS[1], ARGV[1]
|
|
18
|
+
local jobs = redis.call("zrange", key, "-inf", now, "byscore", "limit", 0, 1)
|
|
19
|
+
if jobs[1] then
|
|
20
|
+
redis.call("zrem", key, jobs[1])
|
|
21
|
+
return jobs[1]
|
|
22
|
+
end
|
|
23
|
+
LUA
|
|
24
|
+
|
|
25
|
+
# Bulk enqueue to a single queue.
|
|
26
|
+
# KEYS = [queue_list, queues_set]
|
|
27
|
+
# ARGV = [queue_name, job_json, ...]
|
|
28
|
+
# Returns the number of jobs pushed.
|
|
29
|
+
BULK_PUSH = <<~LUA
|
|
30
|
+
redis.call("sadd", KEYS[2], ARGV[1])
|
|
31
|
+
for i = 2, #ARGV do
|
|
32
|
+
redis.call("lpush", KEYS[1], ARGV[i])
|
|
33
|
+
end
|
|
34
|
+
return #ARGV - 1
|
|
35
|
+
LUA
|
|
36
|
+
|
|
37
|
+
# Pro reliable scheduler: atomically promote all due jobs in a sorted
|
|
38
|
+
# set to their target queues. Pure-Ruby promotion does ZRANGE → ZREM →
|
|
39
|
+
# LPUSH non-atomically and can lose jobs on a mid-step crash.
|
|
40
|
+
# KEYS = [sorted_set, queues_set]
|
|
41
|
+
# ARGV = [now, queue_prefix]
|
|
42
|
+
# Returns the number of jobs promoted.
|
|
43
|
+
# Order matters: decode + push BEFORE zrem. Redis Lua has no rollback,
|
|
44
|
+
# so a failed cjson.decode after a zrem would lose the job. Decode first;
|
|
45
|
+
# push first; only then remove from the sorted set. Worst case is a
|
|
46
|
+
# crash between lpush and zrem → at-least-once redelivery, never loss.
|
|
47
|
+
RELIABLE_SCHEDULE_PROMOTE = <<~LUA
|
|
48
|
+
local jobs = redis.call("zrangebyscore", KEYS[1], "-inf", ARGV[1])
|
|
49
|
+
for i = 1, #jobs do
|
|
50
|
+
local job = jobs[i]
|
|
51
|
+
local q = cjson.decode(job)["queue"]
|
|
52
|
+
redis.call("sadd", KEYS[2], q)
|
|
53
|
+
redis.call("lpush", ARGV[2] .. q, job)
|
|
54
|
+
redis.call("zrem", KEYS[1], job)
|
|
55
|
+
end
|
|
56
|
+
return #jobs
|
|
57
|
+
LUA
|
|
58
|
+
|
|
59
|
+
# Pro Batch: register a job into a batch and push it to its queue
|
|
60
|
+
# atomically. Keeps total/pending in sync with the jids set.
|
|
61
|
+
# KEYS = [b-<bid>, b-<bid>-jids, queue_list, queues_set]
|
|
62
|
+
# ARGV = [queue_name, jid, job_json]
|
|
63
|
+
# Returns 1.
|
|
64
|
+
BATCH_PUSH = <<~LUA
|
|
65
|
+
redis.call("hincrby", KEYS[1], "total", 1)
|
|
66
|
+
redis.call("hincrby", KEYS[1], "pending", 1)
|
|
67
|
+
redis.call("sadd", KEYS[2], ARGV[2])
|
|
68
|
+
redis.call("sadd", KEYS[4], ARGV[1])
|
|
69
|
+
redis.call("lpush", KEYS[3], ARGV[3])
|
|
70
|
+
return 1
|
|
71
|
+
LUA
|
|
72
|
+
|
|
73
|
+
# Pro Batch: ACK a job that completed successfully. SREM from the live
|
|
74
|
+
# jids set and decrement pending iff the jid was a member (idempotent
|
|
75
|
+
# against double-success on a flaky retry).
|
|
76
|
+
# KEYS = [b-<bid>, b-<bid>-jids]
|
|
77
|
+
# ARGV = [jid]
|
|
78
|
+
# Returns [new_pending, live_jids_remaining], or [-1, -1] when the jid
|
|
79
|
+
# was not a member (treat as already acked).
|
|
80
|
+
BATCH_ACK_SUCCESS = <<~LUA
|
|
81
|
+
local removed = redis.call("srem", KEYS[2], ARGV[1])
|
|
82
|
+
if removed == 1 then
|
|
83
|
+
local pending = redis.call("hincrby", KEYS[1], "pending", -1)
|
|
84
|
+
return { pending, redis.call("scard", KEYS[2]) }
|
|
85
|
+
end
|
|
86
|
+
return { -1, -1 }
|
|
87
|
+
LUA
|
|
88
|
+
|
|
89
|
+
# Pro Batch: ACK a job that exhausted retries and died. Records death,
|
|
90
|
+
# bumps failures, and SREMs from live jids so the batch can fire
|
|
91
|
+
# `:complete` even with terminally failed jobs.
|
|
92
|
+
# KEYS = [b-<bid>, b-<bid>-jids, b-<bid>-died, b-<bid>-failed]
|
|
93
|
+
# ARGV = [jid]
|
|
94
|
+
# Returns [live_jids_remaining, died_count, first_death]. `first_death`
|
|
95
|
+
# is 1 the first time *any* jid is SADDed into the died set, 0 thereafter
|
|
96
|
+
# — caller uses it to fire `:death` exactly once per batch.
|
|
97
|
+
BATCH_ACK_COMPLETE = <<~LUA
|
|
98
|
+
local was_pre_existing_death = redis.call("scard", KEYS[3])
|
|
99
|
+
redis.call("srem", KEYS[2], ARGV[1])
|
|
100
|
+
redis.call("sadd", KEYS[4], ARGV[1])
|
|
101
|
+
local died_added = redis.call("sadd", KEYS[3], ARGV[1])
|
|
102
|
+
redis.call("hincrby", KEYS[1], "failures", 1)
|
|
103
|
+
local first_death = 0
|
|
104
|
+
if was_pre_existing_death == 0 and died_added == 1 then
|
|
105
|
+
first_death = 1
|
|
106
|
+
end
|
|
107
|
+
return { redis.call("scard", KEYS[2]), redis.call("scard", KEYS[3]), first_death }
|
|
108
|
+
LUA
|
|
109
|
+
|
|
110
|
+
# Pro Batch: invalidate all pending jobs. The jobs themselves stay
|
|
111
|
+
# in their queues — the server middleware short-circuits when it sees
|
|
112
|
+
# the invalidated flag — but the jids set is cleared so the batch can
|
|
113
|
+
# no longer accept completion callbacks.
|
|
114
|
+
# KEYS = [b-<bid>, b-<bid>-jids]
|
|
115
|
+
# ARGV = []
|
|
116
|
+
# Returns 1.
|
|
117
|
+
BATCH_INVALIDATE = <<~LUA
|
|
118
|
+
redis.call("del", KEYS[2])
|
|
119
|
+
redis.call("hset", KEYS[1], "invalidated", "1")
|
|
120
|
+
return 1
|
|
121
|
+
LUA
|
|
122
|
+
|
|
123
|
+
# Pro Fast API (§11): server-side LRANGE+LREM to delete a single job by
|
|
124
|
+
# jid from a queue list. Pure-Ruby Queue#find_job + JobRecord#delete is
|
|
125
|
+
# O(N) round-trips; this is O(1) round-trip with O(N) Lua work.
|
|
126
|
+
# KEYS = [queue:<name>]
|
|
127
|
+
# ARGV = [jid]
|
|
128
|
+
# Returns the number of payloads removed (0 or 1; can be >1 in pathological
|
|
129
|
+
# duplicate-jid corruption — caller doesn't rely on the value).
|
|
130
|
+
FAST_DELETE_JOB = <<~LUA
|
|
131
|
+
local items = redis.call("lrange", KEYS[1], 0, -1)
|
|
132
|
+
local removed = 0
|
|
133
|
+
for i = 1, #items do
|
|
134
|
+
if string.find(items[i], '"jid":"' .. ARGV[1] .. '"', 1, true) then
|
|
135
|
+
removed = removed + redis.call("lrem", KEYS[1], 1, items[i])
|
|
136
|
+
end
|
|
137
|
+
end
|
|
138
|
+
return removed
|
|
139
|
+
LUA
|
|
140
|
+
|
|
141
|
+
# Pro Fast API (§11): server-side LRANGE+LREM removing every payload whose
|
|
142
|
+
# `"class":"<klass>"` field matches. Plain-text scan (no JSON parse) so
|
|
143
|
+
# it tolerates partial corruption — caller drops only well-formed matches.
|
|
144
|
+
# KEYS = [queue:<name>]
|
|
145
|
+
# ARGV = [klass]
|
|
146
|
+
# Returns the number of payloads removed.
|
|
147
|
+
FAST_DELETE_BY_CLASS = <<~LUA
|
|
148
|
+
local items = redis.call("lrange", KEYS[1], 0, -1)
|
|
149
|
+
local removed = 0
|
|
150
|
+
local needle = '"class":"' .. ARGV[1] .. '"'
|
|
151
|
+
for i = 1, #items do
|
|
152
|
+
if string.find(items[i], needle, 1, true) then
|
|
153
|
+
removed = removed + redis.call("lrem", KEYS[1], 1, items[i])
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
return removed
|
|
157
|
+
LUA
|
|
158
|
+
|
|
159
|
+
# File-backed scripts: every `*.lua` file directly under `lib/wurk/lua/`
# is read at load time (the limiter scripts are kept there, one file per
# limiter type). The file's basename (minus `.lua`) becomes the SCRIPTS key
# as a symbol. Keeping them as separate files makes diffing individual
# rate-limiter changes painless and keeps each script self-contained for
# the `redis-cli --eval` debug workflow.
#
# Sources are frozen as they are read: `File.read` returns a mutable
# String (the frozen_string_literal magic comment only covers literals),
# and an in-place edit after load would silently desync a script from its
# precomputed SHA below.
LUA_DIR = File.expand_path('lua', __dir__)
FILE_SCRIPTS = Dir.glob(File.join(LUA_DIR, '*.lua')).each_with_object({}) do |path, h|
  h[File.basename(path, '.lua').to_sym] = File.read(path).freeze
end.freeze

# Name → Lua source for every script. File-backed entries are merged last,
# so a file whose basename matches an inline name replaces the inline
# source.
SCRIPTS = {
  zpopbyscore: ZPOPBYSCORE,
  bulk_push: BULK_PUSH,
  reliable_schedule_promote: RELIABLE_SCHEDULE_PROMOTE,
  batch_push: BATCH_PUSH,
  batch_ack_success: BATCH_ACK_SUCCESS,
  batch_ack_complete: BATCH_ACK_COMPLETE,
  batch_invalidate: BATCH_INVALIDATE,
  fast_delete_job: FAST_DELETE_JOB,
  fast_delete_by_class: FAST_DELETE_BY_CLASS
}.merge(FILE_SCRIPTS).freeze

# SHA1 of each script source — matches what `SCRIPT LOAD` returns.
# Precomputing keeps `eval_cached` allocation-free in the hot path.
SHAS = SCRIPTS.transform_values { |src| Digest::SHA1.hexdigest(src) }.freeze
|
|
184
|
+
end
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
require_relative 'lua/loader'
|
data/lib/wurk/manager.rb
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'component'
|
|
4
|
+
require_relative 'processor'
|
|
5
|
+
|
|
6
|
+
module Wurk
|
|
7
|
+
# One per Capsule. Lives inside each forked child and owns the Processor
|
|
8
|
+
# pool. Replaces dead processors on the fly (replace-on-die), forwards
|
|
9
|
+
# the quiet/stop signals received by the Swarm to its processors, and
|
|
10
|
+
# ensures in-flight UnitsOfWork are bulk_requeued before threads are killed.
|
|
11
|
+
#
|
|
12
|
+
# Lifecycle:
|
|
13
|
+
# * `start` — spawn each processor thread.
|
|
14
|
+
# * `quiet` — stop fetching; in-flight jobs run to completion.
|
|
15
|
+
# * `stop(deadline)`— quiet + wait for drain; hard_shutdown on timeout.
|
|
16
|
+
#
|
|
17
|
+
# Spec: docs/target/sidekiq-free.md §13.
|
|
18
|
+
class Manager
  include Component

  # Poll interval used while waiting for processors to drain: 0.1s when
  # stdout is a TTY so interactive shutdown feels snappy, 0.5s otherwise so
  # the supervisor isn't spinning while threads finish.
  PAUSE_TIME = $stdout.tty? ? 0.1 : 0.5

  attr_reader :workers, :capsule

  # Builds (but does not start) one Processor per unit of capsule
  # concurrency.
  #
  # @param capsule the capsule this manager serves; must answer
  #   `concurrency`, and is also stored as `@config`
  # @raise [ArgumentError] when the capsule's concurrency is below 1
  def initialize(capsule)
    @config = @capsule = capsule
    @count = capsule.concurrency
    raise ArgumentError, "Concurrency of #{@count} is not supported" if @count < 1

    @done = false
    @plock = ::Mutex.new
    @workers = Set.new
    @count.times { @workers.add(Processor.new(@capsule, &method(:processor_result))) }
  end

  # Starts every processor in the pool.
  def start
    @workers.each { |worker| worker.start }
  end

  # Flags the manager as done and asks each processor to terminate.
  # Only the first call has any effect.
  def quiet
    unless @done
      @done = true
      logger.info { "Terminating quiet threads for #{capsule.name} capsule" }
      @workers.each { |worker| worker.terminate }
    end
  end

  # Graceful shutdown: quiet, pause one PAUSE_TIME tick so asynchronous
  # lifecycle hooks can settle, then poll until the pool is empty or the
  # deadline passes. Workers still present after the deadline are handed
  # to hard_shutdown. `capsule.stop` runs on every exit path.
  #
  # @param deadline [Numeric] a CLOCK_MONOTONIC timestamp, in seconds
  def stop(deadline)
    quiet
    sleep PAUSE_TIME
    unless @workers.empty?
      logger.info { 'Pausing to allow jobs to finish...' }
      wait_for(deadline) { @workers.empty? }
      hard_shutdown unless @workers.empty?
    end
  ensure
    capsule.stop
  end

  # True once quiet (or stop) has been requested.
  def stopped?
    @done
  end

  # Callback handed to every Processor. Under the pool lock it drops the
  # reporting processor and, unless the manager is already done, creates
  # and starts a replacement so the pool size stays constant.
  def processor_result(processor, _reason = nil)
    @plock.synchronize do
      @workers.delete(processor)
      next if @done

      replacement = Processor.new(@capsule, &method(:processor_result))
      @workers << replacement
      replacement.start
    end
  end

  # Last resort once the stop deadline has expired with processors still
  # busy. Their in-flight jobs are requeued through the capsule's fetcher
  # BEFORE the processors are killed — running a job twice is preferable
  # to losing it.
  def hard_shutdown # rubocop:disable Metrics/AbcSize
    # Snapshot under the lock; processors may still report in concurrently.
    stragglers = @plock.synchronize { @workers.dup }

    unless stragglers.empty?
      jobs = stragglers.map { |worker| worker.job }.compact

      logger.warn { "Terminating #{stragglers.size} busy threads" }
      logger.debug { "Jobs still in progress #{jobs.inspect}" }

      capsule.fetcher.bulk_requeue(jobs)
    end

    stragglers.each { |worker| worker.kill }

    # The caller typically exits right after this returns, so allow up to
    # three seconds for the killed threads to run their `ensure` blocks.
    grace_deadline = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC) + 3
    wait_for(grace_deadline) { @workers.empty? }
  end

  private

  # Yields to the given block every PAUSE_TIME seconds until it returns a
  # truthy value or no more than PAUSE_TIME remains before the monotonic
  # deadline. That floor keeps us from spinning when only a few
  # milliseconds are left.
  def wait_for(deadline)
    until (deadline - ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)) <= PAUSE_TIME
      return if yield

      sleep PAUSE_TIME
    end
  end
end
|
|
132
|
+
end
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../middleware'
|
|
4
|
+
|
|
5
|
+
module Wurk
|
|
6
|
+
module Metrics
|
|
7
|
+
# Ent feature parity (§5): server middleware that records per-job-class
|
|
8
|
+
# execution metrics into Redis time-buckets. The on-the-wire schema is
|
|
9
|
+
# wire-compat with Sidekiq 8.x's history pane so dashboards built against
|
|
10
|
+
# `j|YYMMDD|H:M` HASH keys keep working unchanged.
|
|
11
|
+
#
|
|
12
|
+
# Bucket layout (spec: docs/target/sidekiq-free.md §1.6):
|
|
13
|
+
#
|
|
14
|
+
# j|YYMMDD|H:M HASH per-minute bucket, TTL = MID_TERM (3 days)
|
|
15
|
+
# <klass>|p INT processed count
|
|
16
|
+
# <klass>|f INT failed count
|
|
17
|
+
# <klass>|ms INT total ms spent
|
|
18
|
+
#
|
|
19
|
+
# j|YYMMDD|H:m0 HASH 10-minute rollup (last digit zeroed),
|
|
20
|
+
# TTL = SHORT_TERM (8 hours) — short window for
|
|
21
|
+
# quick aggregate queries without scanning 600 minute keys.
|
|
22
|
+
#
|
|
23
|
+
# <klass>-YYMMDD-H HASH per-class hourly histogram, TTL = MID_TERM
|
|
24
|
+
#
|
|
25
|
+
# Every bucket's TTL is re-asserted on every write (plain EXPIRE, not an
# EXPIRE NX-style "only on first write"), so retention is measured from
# the *last* write rather than the first: as long as a class keeps
# running within a bucket's window, that bucket stays alive.
#
# The middleware is hot-path — every job, successful or failed, pays for
# it. Writes are pipelined in a single round-trip per job (2 HINCRBY +
# 1 EXPIRE per bucket × up to 3 buckets = at most 9 commands, batched).
|
|
35
|
+
class History
  include Wurk::Middleware::ServerMiddleware

  # Per spec §1.6 — naming mirrors the upstream constants so anyone
  # grepping the Sidekiq source for `MID_TERM` lands here.
  MID_TERM = 3 * 24 * 60 * 60 # 3 days, in seconds
  SHORT_TERM = 8 * 60 * 60 # 8 hours, in seconds

  MINUTE_KEY_PREFIX = 'j|'
  DATE_FORMAT = '%y%m%d' # YYMMDD — two-digit year per spec

  # Server-middleware entry point: times the wrapped job and records one
  # sample for its class, whether the job returned or raised.
  #
  # @param job [Hash] job payload; only `job['class']` is read
  # @return whatever the wrapped block returns (its exceptions propagate)
  def call(_worker, job, _queue)
    klass = job['class']
    started = monotonic_ms
    success = false
    begin
      result = yield
      success = true
      result
    ensure
      duration = (monotonic_ms - started).round
      # Best-effort: a metrics write failure must never propagate into
      # the job result, so StandardErrors are routed to handle_error.
      # NOTE(review): `redis_pool` is presumably supplied by
      # ServerMiddleware — confirm.
      begin
        self.class.record(klass, duration, success: success, redis_pool: redis_pool)
      rescue StandardError => e
        handle_error(e)
      end
    end
  end

  class << self
    # Writes one sample in a single Redis round-trip. `success: true`
    # bumps `<klass>|p`, `success: false` bumps `<klass>|f`; `<klass>|ms`
    # accumulates runtime for *both* outcomes. A nil or empty `klass` is
    # ignored; a negative duration is clamped to 0.
    #
    # @param klass [String] job class name
    # @param duration_ms [#to_i] runtime in milliseconds
    # @param success [Boolean] whether the job completed without raising
    # @param redis_pool pool answering `with`; falls back to `Wurk.redis`
    # @param at [Time] timestamp selecting the buckets (not modified)
    # @return [nil]
    def record(klass, duration_ms, success:, redis_pool: nil, at: ::Time.now)
      return if klass.nil? || klass.empty?

      ms = duration_ms.to_i
      ms = 0 if ms.negative?
      buckets = { minute: minute_key(at), rollup: rollup_key(at), hour: hour_key(klass, at) }
      with_pool(redis_pool) { |conn| pipeline_write(conn, klass, ms, success, buckets) }
      nil
    end

    # Minute + 10-min rollup share a per-class `|p|f|ms` field layout;
    # the hourly bucket is already class-scoped so its fields are bare
    # `p|f|ms`. Pipelines up to 9 commands in one round-trip.
    def pipeline_write(conn, klass, ms, success, buckets)
      outcome = success ? 'p' : 'f'
      class_outcome = "#{klass}|#{outcome}"
      class_ms = "#{klass}|ms"
      conn.pipelined do |pipe|
        incr_bucket(pipe, buckets[:minute], [class_outcome, class_ms, ms, MID_TERM])
        # The minute key and the 10-min rollup key coincide whenever the
        # minute is a multiple of 10. Writing both would double-count the
        # shared field — the minute write above already lands on it — so
        # skip the rollup write then.
        unless buckets[:rollup] == buckets[:minute]
          incr_bucket(pipe, buckets[:rollup], [class_outcome, class_ms, ms, SHORT_TERM])
        end
        incr_bucket(pipe, buckets[:hour], [outcome, 'ms', ms, MID_TERM])
      end
    end

    # Queues the three commands for one bucket: bump the outcome counter,
    # add the elapsed ms, and (re)assert the TTL.
    #
    # @param parts [Array] `[outcome_field, ms_field, ms, ttl_seconds]`
    def incr_bucket(pipe, key, parts)
      outcome_field, ms_field, ms, ttl = parts
      pipe.call('HINCRBY', key, outcome_field, 1)
      pipe.call('HINCRBY', key, ms_field, ms)
      pipe.call('EXPIRE', key, ttl)
    end

    # Public formatters — Wurk::Metrics::Query reuses these so the two
    # cannot drift on bucket-naming convention. They read the time through
    # `getutc`, which returns a new UTC Time: `Time#utc` would convert the
    # caller's object in place (and raise FrozenError on a frozen
    # local-zone Time).

    # Per-minute bucket key: prefix, YYMMDD date, then unpadded hour and
    # minute.
    def minute_key(time)
      t = time.getutc
      format("#{MINUTE_KEY_PREFIX}%<date>s|%<hr>d:%<min>d",
             date: t.strftime(DATE_FORMAT), hr: t.hour, min: t.min)
    end

    # 10-minute rollup key: same layout as minute_key with the minute
    # rounded down to a multiple of 10.
    def rollup_key(time)
      t = time.getutc
      format("#{MINUTE_KEY_PREFIX}%<date>s|%<hr>d:%<min>d",
             date: t.strftime(DATE_FORMAT), hr: t.hour, min: (t.min / 10) * 10)
    end

    # Per-class hourly bucket key: `<klass>-YYMMDD-H` (hour unpadded).
    def hour_key(klass, time)
      t = time.getutc
      "#{klass}-#{t.strftime(DATE_FORMAT)}-#{t.hour}"
    end
  end

  private

  # Monotonic clock reading in fractional milliseconds.
  def monotonic_ms
    ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :float_millisecond)
  end

  # Reports a metrics-write failure through the configured exception
  # handler instead of raising into the job.
  # NOTE(review): `config` is presumably supplied by ServerMiddleware —
  # confirm.
  def handle_error(err)
    cfg = config || Wurk.configuration
    cfg.handle_exception(err, context: 'Wurk::Metrics::History')
  end

  # Yields a connection from `pool` when one is given, otherwise from the
  # global `Wurk.redis`.
  def self.with_pool(pool, &)
    if pool
      pool.with(&)
    else
      Wurk.redis(&)
    end
  end
  private_class_method :with_pool
end
|
|
150
|
+
end
|
|
151
|
+
end
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'history'
|
|
4
|
+
require_relative 'rollup'
|
|
5
|
+
|
|
6
|
+
module Wurk
|
|
7
|
+
module Metrics
|
|
8
|
+
# Read-side for the per-class HASH bucket schema written by
|
|
9
|
+
# `Wurk::Metrics::History`. Backs the Web UI's history pane and any
|
|
10
|
+
# external dashboarding code; both rely on the same HGETALL fan-out
|
|
11
|
+
# over a contiguous range of minute / hour keys.
|
|
12
|
+
#
|
|
13
|
+
# Window caps are spec-enforced:
|
|
14
|
+
#
|
|
15
|
+
# minutes ≤ 480 (8h — same as the 10-minute rollup retention)
|
|
16
|
+
# hours ≤ 72 (3d — same as MID_TERM retention)
|
|
17
|
+
#
|
|
18
|
+
# A wider window has no data to read anyway (the buckets are TTL'd out),
|
|
19
|
+
# so we fail loudly rather than silently returning sparse results.
|
|
20
|
+
module Query # rubocop:disable Metrics/ModuleLength
  # Widest windows accepted by cap_minutes! / cap_hours!.
  MAX_MINUTES = 480
  MAX_HOURS = 72
  # Field suffixes summed by accumulate!.
  TOTAL_FIELDS = %w[p f ms].freeze
  private_constant :TOTAL_FIELDS

  # Raised when a requested window exceeds MAX_MINUTES / MAX_HOURS.
  class WindowTooWide < ::ArgumentError; end

  module_function

  # Aggregate per-job-class totals over a recent window of minute
  # buckets. Returns an array of `[class_name, {p:, f:, ms:}]` tuples
  # sorted by volume (p + f) descending so the UI's "top jobs" table
  # renders without a second sort pass.
  #
  # @param class_filter [String, nil] keep only classes with this prefix
  # @param minutes [Integer, String] window width in minutes
  # @param hours [Integer, String, nil] when given, overrides `minutes`
  #   with `hours * 60`; the result must still fit MAX_MINUTES
  # @param now [Time] newest edge of the window (not modified)
  # @raise [ArgumentError] on a non-positive or non-numeric window
  # @raise [WindowTooWide] when the window exceeds the caps
  def top_jobs(class_filter: nil, minutes: 60, hours: nil, now: ::Time.now)
    if hours
      # Validate and coerce BEFORE multiplying: `"2" * 60` on a String
      # (e.g. a raw request parameter) repeats the text instead of
      # scaling the number. Numeric input is multiplied as given.
      whole_hours = cap_hours!(hours)
      minutes = (hours.is_a?(::Numeric) ? hours : whole_hours) * 60
    end
    rows = aggregate_minutes(now, cap_minutes!(minutes)).to_a
    rows = rows.select { |(k, _)| k.start_with?(class_filter) } if class_filter && !class_filter.empty?
    rows.sort_by { |(_k, s)| -(s[:p] + s[:f]) }
  end

  # Per-class time-series. `minutes` reads the per-minute bucket; `hours`
  # reads the per-class hourly bucket (separate keys per spec, so a long
  # window doesn't fan out over 4320 minute hashes). Exactly one of the
  # two must be given.
  #
  # @return [Array<Hash>] `{at:, p:, f:, ms:}` rows, oldest→newest
  def for_job(klass, minutes: nil, hours: nil, now: ::Time.now)
    validate_for_job!(klass, minutes, hours)
    minutes ? minute_series(klass, now, cap_minutes!(minutes)) : hour_series(klass, now, cap_hours!(hours))
  end

  # Cluster-total time-series for the dashboard throughput/failures charts,
  # read from the compact buckets written by Wurk::Metrics::Rollup. `bucket`
  # must be a key of Rollup::BUCKETS; `window_seconds` is clamped to that
  # bucket's retention. Returns `[{at:, p:, f:, ms:}, ...]` oldest→newest
  # (`at` is an epoch Integer), gap-filled with zeros so a chart has a
  # continuous x-axis.
  def history(bucket, window_seconds, now: ::Time.now)
    step, ttl = bucket_spec!(bucket)
    starts = bucket_starts(now, step, clamp_history_window!(window_seconds, ttl))
    keys = starts.map { |start| Wurk::Metrics::Rollup.bucket_key(bucket, start) }
    zip_rows(starts, pipeline_hmget(keys, %w[p f ms]))
  end

  # Looks up the `[step, ttl]` pair for a rollup bucket name.
  #
  # @raise [ArgumentError] for an unknown bucket name
  def bucket_spec!(bucket)
    Wurk::Metrics::Rollup::BUCKETS.fetch(bucket) do
      raise ArgumentError, "bucket must be one of #{Wurk::Metrics::Rollup::BUCKETS.keys.inspect}"
    end
  end

  # Coerces the window to an Integer and clamps it to the bucket's TTL.
  #
  # @raise [ArgumentError] when the window is not positive
  def clamp_history_window!(window_seconds, ttl)
    window = Integer(window_seconds)
    raise ArgumentError, 'window must be positive' if window <= 0

    [window, ttl].min
  end

  # The last `window/step` step-aligned bucket starts, oldest→newest, so
  # they match the keys the rollup writes.
  def bucket_starts(now, step, window)
    last = (now.to_i / step) * step
    (0...(window / step)).map { |i| last - (i * step) }.reverse
  end

  # Guards for_job: a class name plus exactly one of minutes / hours.
  def validate_for_job!(klass, minutes, hours)
    raise ArgumentError, 'klass required' if klass.nil? || klass.empty?
    raise ArgumentError, 'pass exactly one of minutes: or hours:' if minutes && hours
    raise ArgumentError, 'pass minutes: or hours:' if minutes.nil? && hours.nil?
  end

  # Integer-coerces `minutes` and enforces 1..MAX_MINUTES.
  def cap_minutes!(minutes)
    check_window!(Integer(minutes), MAX_MINUTES, 'minutes')
  end

  # Integer-coerces `hours` and enforces 1..MAX_HOURS.
  def cap_hours!(hours)
    check_window!(Integer(hours), MAX_HOURS, 'hours')
  end

  # Returns `value` when it lies in 1..max.
  #
  # @raise [ArgumentError] when value <= 0
  # @raise [WindowTooWide] when value > max
  def check_window!(value, max, label)
    raise ArgumentError, "#{label} must be positive" if value <= 0
    raise WindowTooWide, "#{label} must be <= #{max} (got #{value})" if value > max

    value
  end

  # Sums every minute bucket in the window into `{klass => {p:, f:, ms:}}`.
  def aggregate_minutes(now, minutes)
    totals = ::Hash.new { |h, k| h[k] = { p: 0, f: 0, ms: 0 } }
    pipeline_hgetall(minute_keys(now, minutes)).each { |hash| accumulate!(totals, hash) }
    totals
  end

  # Folds one bucket's `<klass>|<kind>` fields into `totals`; fields whose
  # kind is not p / f / ms (or that have no `|`) are skipped.
  def accumulate!(totals, hash)
    return if hash.nil? || hash.empty?

    hash.each do |field, value|
      klass, kind = field.split('|', 2)
      next unless kind && TOTAL_FIELDS.include?(kind)

      totals[klass][kind.to_sym] += Integer(value)
    end
  end

  # Per-minute series for one class, read from the shared minute buckets.
  def minute_series(klass, now, minutes)
    rows = pipeline_hmget(minute_keys(now, minutes), %W[#{klass}|p #{klass}|f #{klass}|ms])
    zip_rows(minute_timestamps(now, minutes), rows)
  end

  # Per-hour series for one class, read from its class-scoped hour buckets.
  def hour_series(klass, now, hours)
    timestamps = hour_timestamps(now, hours)
    keys = timestamps.map { |t| Wurk::Metrics::History.hour_key(klass, t) }
    zip_rows(timestamps, pipeline_hmget(keys, %w[p f ms]))
  end

  # Pairs each timestamp with its `[p, f, ms]` row; missing (nil) values
  # become 0.
  def zip_rows(timestamps, rows)
    timestamps.zip(rows).map do |at, (processed, failed, millis)|
      { at: at, p: processed.to_i, f: failed.to_i, ms: millis.to_i }
    end
  end

  # Minute-bucket keys for the window, oldest→newest.
  def minute_keys(now, minutes)
    minute_timestamps(now, minutes).map { |t| Wurk::Metrics::History.minute_key(t) }
  end

  # Truncate to the minute so the bucket boundary matches what the
  # writer used. Fractional-second drift would otherwise pull in an
  # unrelated minute on the edge of the window.
  def minute_timestamps(now, minutes)
    floor = floor_to(now, :min)
    (0...minutes).map { |i| floor - (i * 60) }.reverse
  end

  # Hour-aligned timestamps for the window, oldest→newest.
  def hour_timestamps(now, hours)
    floor = floor_to(now, :hour)
    (0...hours).map { |i| floor - (i * 3600) }.reverse
  end

  # Truncates `time` to the start of its UTC minute or hour. Reads the
  # time through `getutc`, which returns a new object: `Time#utc` would
  # convert the caller's Time in place (and raise FrozenError on a frozen
  # local-zone Time). Returns nil for any other `unit`.
  def floor_to(time, unit)
    t = time.getutc
    case unit
    when :min then ::Time.utc(t.year, t.month, t.day, t.hour, t.min)
    when :hour then ::Time.utc(t.year, t.month, t.day, t.hour)
    end
  end

  # One pipelined HGETALL per key; returns the replies in key order.
  def pipeline_hgetall(keys)
    return [] if keys.empty?

    Wurk.redis { |conn| conn.pipelined { |pipe| keys.each { |key| pipe.call('HGETALL', key) } } }
  end

  # One pipelined HMGET of `fields` per key; returns the replies in key
  # order.
  def pipeline_hmget(keys, fields)
    return [] if keys.empty?

    Wurk.redis { |conn| conn.pipelined { |pipe| keys.each { |key| pipe.call('HMGET', key, *fields) } } }
  end
end
|
|
172
|
+
end
|
|
173
|
+
end
|