sidekiq-routing 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +233 -0
- data/lib/sidekiq/routing/auto/batch_rerouter.rb +67 -0
- data/lib/sidekiq/routing/auto/configuration.rb +62 -0
- data/lib/sidekiq/routing/auto/job_duration_tracker.rb +78 -0
- data/lib/sidekiq/routing/auto/noisy_neighbor_detector.rb +52 -0
- data/lib/sidekiq/routing/auto/reroute_job.rb +93 -0
- data/lib/sidekiq/routing/auto/router.rb +25 -0
- data/lib/sidekiq/routing/configuration.rb +41 -0
- data/lib/sidekiq/routing/middleware/client.rb +36 -0
- data/lib/sidekiq/routing/middleware/server.rb +38 -0
- data/lib/sidekiq/routing/mover.rb +34 -0
- data/lib/sidekiq/routing/parked_processor.rb +41 -0
- data/lib/sidekiq/routing/store.rb +41 -0
- data/lib/sidekiq/routing/sweeper.rb +61 -0
- data/lib/sidekiq/routing/version.rb +7 -0
- data/lib/sidekiq/routing/web/views/routing.erb +74 -0
- data/lib/sidekiq/routing/web.rb +28 -0
- data/lib/sidekiq/routing/web_extension.rb +27 -0
- data/lib/sidekiq/routing.rb +208 -0
- data/lib/sidekiq-routing.rb +28 -0
- metadata +88 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Sidekiq
  module Routing
    module Middleware
      # Enqueue-time half of routing: sees new jobs as they are pushed
      # (perform_async/perform_in, push_bulk) and applies the route stored for
      # the job's class.
      #
      #   * blackhole - returns false without yielding, which aborts the push;
      #   * park      - rewrites job["queue"] to the parking queue, then yields
      #     so the job is pushed there instead.
      #
      # There is intentionally no per-job log or metric here: during a flood
      # that would emit millions of lines. Observability is the Web tab's live
      # aggregates + operator-action logs only (see docs/routing-how-it-works.md).
      class Client
        # worker_class - the job class (or class name) handed to the middleware
        # job          - the job payload hash; mutated in place when parking
        # queue        - the queue the job was headed for
        #
        # Returns the result of the rest of the chain (yield), or false when
        # the job's class is blackholed.
        def call(worker_class, job, queue, _redis_pool = nil)
          return yield if passthrough?(job, queue)

          route = Routing.route_for(job["wrapped"] || worker_class)
          return yield unless route

          mode = route["mode"]
          return false if MODE_BLACKHOLE == mode # abort push; never enters Redis

          if MODE_PARK == mode
            # Keep the first recorded origin so re-diverts don't overwrite it.
            job[ORIGINAL_QUEUE_KEY] = queue.to_s unless job[ORIGINAL_QUEUE_KEY]
            job["queue"] = Routing.parked_queue
          end

          yield
        end

        private

        # True when the job must be pushed untouched: routing is disabled, the
        # payload carries the explicit bypass stamp (process_parked), or the job
        # is already headed for the parking queue (never re-divert parked jobs).
        def passthrough?(job, queue)
          !Routing.enabled? || job[NO_DIVERT_KEY] || queue.to_s == Routing.parked_queue
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Sidekiq
  module Routing
    module Middleware
      # Execution-time half of routing. It covers the backlog: jobs that were
      # already enqueued before the route was added, plus scheduled/retry-set
      # jobs that re-enter their real queue (those do not pass through client
      # middleware, so this is the only thing that catches them).
      #
      #   * park      - copies the job into the parking queue via Mover and
      #     returns without yielding, so the original is acked and removed;
      #   * blackhole - returns without yielding, so the job is acked and
      #     dropped (never the Dead set).
      #
      # Registered AFTER SidekiqUniqueJobs::Middleware::Server so that returning
      # without yield still lets unique-jobs release the original's lock.
      class Server
        # worker - the job instance about to run
        # job    - the job payload hash
        # queue  - the queue the job was fetched from
        #
        # Returns the result of yield when the job runs, nil when it is
        # parked or blackholed.
        def call(worker, job, queue)
          return yield unless divertible?(job, queue)

          route = Routing.route_for(job["wrapped"] || worker.class)
          return yield unless route

          mode = route["mode"]
          return nil if MODE_BLACKHOLE == mode # ack & drop; NOT the Dead set
          return yield unless MODE_PARK == mode

          origin = job[ORIGINAL_QUEUE_KEY] || queue.to_s
          Mover.move(job, Routing.parked_queue, ORIGINAL_QUEUE_KEY => origin)
          nil # ack & remove original; the copy now lives in the parking queue
        end

        private

        # False when the job must simply run: routing is disabled, the job was
        # fetched from the parking queue itself (loop guard: run parked jobs),
        # or the payload carries the explicit bypass stamp.
        def divertible?(job, queue)
          Routing.enabled? && queue.to_s != Routing.parked_queue && !job[NO_DIVERT_KEY]
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Sidekiq
  module Routing
    # Relocates an already-enqueued job payload to another queue by rewriting the
    # "queue" field *inside* the payload and writing it straight to Redis.
    #
    # It deliberately bypasses the client middleware chain. Going through
    # Sidekiq::Client.push would (a) risk re-diverting the job via our own client
    # middleware and (b) make sidekiq-unique-jobs try to re-acquire a lock that
    # the in-flight original still holds, which can fail the push and drop the job
    # on a server-side park. Moving the raw payload sidesteps both.
    #
    # Used by the server middleware (park), the Sweeper, and the ParkedProcessor.
    module Mover
      class << self
        # Pushes a copy of +item+ onto +to_queue+. The caller is responsible
        # for removing the original; this method only writes.
        #
        # item     - the existing job hash (a Sidekiq::JobRecord#item or job
        #            hash); it is not mutated
        # to_queue - destination queue name (anything responding to #to_s)
        # extra    - payload keys to merge in (e.g. the original-queue stamp)
        #
        # Returns true.
        def move(item, to_queue, extra = {})
          target = to_queue.to_s
          # The destination is merged last so a stray "queue" key in +extra+
          # can never make the payload disagree with the list it is pushed to.
          payload = item.merge(extra).merge("queue" => target)
          json = Sidekiq.dump_json(payload)

          Sidekiq.redis do |conn|
            conn.sadd("queues", target) # keep the queue visible in the Web UI
            conn.lpush("queue:#{target}", json)
          end
          true
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Sidekiq
  module Routing
    # Releases parked jobs back to the queue they came from. For each job the
    # payload is copied with its "queue" field rewritten (read item -> set queue
    # -> push -> delete), so a released job that later fails retries to its
    # original queue, NOT the parking queue.
    #
    # Every released payload is stamped with NO_DIVERT_KEY so the middleware
    # does not bounce it straight back to parked even if the route is still
    # active (recommended order is still: unpark, then process_parked).
    class ParkedProcessor
      # klass      - optional class name; when given, only parked jobs whose
      #              display_class equals it are released
      # limit      - maximum number of jobs to release; defaults to the
      #              configured batch_limit
      # batch_size - accepted for interface compatibility; not referenced here
      #
      # Returns the number of jobs released (counted when the parked copy was
      # successfully deleted).
      def call(klass: nil, limit: nil, batch_size: nil)
        cap = limit || Routing.configuration.batch_limit
        fallback_queue = Routing.configuration.process_parked_fallback_queue
        released = 0

        Sidekiq::Queue.new(Routing.parked_queue).each do |parked|
          break if cap && released >= cap
          next if klass && parked.display_class != klass

          destination = parked.item[ORIGINAL_QUEUE_KEY] || fallback_with_warning(parked, fallback_queue)

          # Drop the origin stamp: once released, the job is no longer parked.
          restored = parked.item.dup
          restored.delete(ORIGINAL_QUEUE_KEY)

          Mover.move(restored, destination, NO_DIVERT_KEY => true)
          released += 1 if parked.delete
        end

        Routing.logger.warn("[Routing] processed #{released} parked job(s) from #{Routing.parked_queue}")
        released
      end

      private

      # Logs that a parked job carries no origin stamp and returns the
      # configured fallback queue for it.
      def fallback_with_warning(job, fallback)
        Routing.logger.warn(
          "[Routing] #{job.display_class} #{job.jid} had no original queue; processing parked job to #{fallback}"
        )
        fallback
      end
    end
  end
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "time"
|
|
4
|
+
|
|
5
|
+
module Sidekiq
  module Routing
    # Persistence for manual route state: one Redis hash (HASH_KEY) in
    # Sidekiq's Redis, where each field is a job class name and each value is
    # the JSON document {mode, routed_at}.
    #
    # Every read here goes to Redis (no caching); it backs the operator API and
    # the Web tab. The middleware hot path uses Sidekiq::Routing.route_for
    # instead.
    module Store
      HASH_KEY = "sidekiq:routing:routes"

      # Stores (or overwrites) the route for +name+ with the given mode and the
      # current UTC time as an ISO 8601 "routed_at" stamp.
      def self.set(name, mode:)
        record = { "mode" => mode.to_s, "routed_at" => Time.now.utc.iso8601 }
        encoded = JSON.dump(record)

        Sidekiq.redis do |conn|
          conn.hset(HASH_KEY, name, encoded)
        end
      end

      # Removes the route for +name+.
      def self.delete(name)
        Sidekiq.redis do |conn|
          conn.hdel(HASH_KEY, name)
        end
      end

      # -> {"mode"=>...} or nil when no route is stored for +name+
      def self.fetch(name)
        raw = Sidekiq.redis do |conn|
          conn.hget(HASH_KEY, name)
        end
        raw && JSON.parse(raw)
      end

      # -> { "ClassName" => {"mode"=>...}, ... }
      def self.all
        stored = Sidekiq.redis do |conn|
          conn.hgetall(HASH_KEY)
        end
        stored.transform_values do |raw|
          JSON.parse(raw)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Sidekiq
  module Routing
    # Eagerly clears the already-enqueued backlog of a parked class out of its
    # live queue(s) and into the parking queue. Explicit operator action — never
    # automatic on park. For a 100Ks+ backlog, prefer drain-in-place over moving
    # millions of jobs.
    class Sweeper
      # klass_name - job class name; jobs are matched on display_class
      # queue      - queue name (or array of names) to sweep; when nil or
      #              empty, the class's configured queue is used
      # limit      - maximum number of jobs to move; defaults to the configured
      #              batch_limit
      # batch_size - accepted for interface compatibility; not referenced here
      #
      # Returns the number of jobs moved (counted when the original was
      # successfully deleted).
      # Raises ArgumentError when no queue is given and none can be resolved
      # from the class.
      def call(klass_name, queue: nil, limit: nil, batch_size: nil)
        limit ||= Routing.configuration.batch_limit
        requested_queue = queue.to_s.empty? ? nil : queue
        target_queues = Array(requested_queue || default_queues_for(klass_name))
        moved = 0

        target_queues.each do |source|
          # Stop before touching further queues once the limit is reached;
          # otherwise every remaining queue is still opened and iterated
          # just to break on its first job.
          break if limit_reached?(limit, moved)
          next if source.to_s == Routing.parked_queue

          Sidekiq::Queue.new(source).each do |job|
            break if limit_reached?(limit, moved)
            next unless job.display_class == klass_name

            # Push the copy first, then delete the original.
            Mover.move(
              job.item, Routing.parked_queue,
              ORIGINAL_QUEUE_KEY => job.item[ORIGINAL_QUEUE_KEY] || source.to_s
            )
            moved += 1 if job.delete
          end
        end

        Routing.logger.warn(
          "[Routing] swept #{moved} #{klass_name} job(s) into #{Routing.parked_queue}"
        )
        moved
      end

      private

      # True once a limit is set and at least that many jobs have been moved.
      def limit_reached?(limit, moved)
        limit && moved >= limit
      end

      # The class's configured queue if we can resolve it. We deliberately do NOT
      # fall back to scanning every queue: during an incident that would hammer
      # Redis and slow healthy queues. If the queue can't be resolved, the operator
      # must say which queue(s) to sweep.
      def default_queues_for(klass_name)
        klass = safe_constantize(klass_name)
        configured = klass.respond_to?(:get_sidekiq_options) ? klass.get_sidekiq_options["queue"] : nil
        return [configured.to_s] if configured

        raise ArgumentError,
              "Cannot resolve a queue for #{klass_name}; pass queue: explicitly so the sweep " \
              "does not scan every queue, e.g. Sidekiq::Routing.sweep(#{klass_name.inspect}, queue: \"within_1_minute\")."
      end

      # Plain-Ruby stand-in for ActiveSupport's String#safe_constantize:
      # resolve a (possibly namespaced) class name, or nil if it isn't loaded.
      # Anything that is not a String or Symbol (e.g. nil) also yields nil, so
      # the caller reports ArgumentError instead of const_get's TypeError.
      def safe_constantize(name)
        return nil unless name.is_a?(String) || name.is_a?(Symbol)

        Object.const_get(name)
      rescue NameError
        nil
      end
    end
  end
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
<%# Read-only "Routing" tab: lists active routes and a summary of the parking queue. %>
<%# Reads @parked_queue, @routes, @parked_size, @breakdown_sample and @parked_breakdown. %>
<div class="row header">
  <div class="col-sm-12">
    <h3>Routing</h3>
    <p class="text-muted">
      Read-only view of runtime job-class routing.
      <strong>Park</strong> diverts a class's jobs to the worker-less
      <code><%= h @parked_queue %></code> queue (reversible);
      <strong>Blackhole</strong> drops them entirely.
      Actions (park, blackhole, unpark, sweep, process_parked) are performed from the
      Rails console — see the routing runbook.
    </p>
  </div>
</div>

<%# Active routes: one row per class name in @routes, sorted by name. %>
<h4>Active routes</h4>
<% if @routes.empty? %>
  <p class="text-muted">No active routes.</p>
<% else %>
  <table class="table table-striped table-bordered table-white">
    <thead>
      <tr>
        <th>Job class</th>
        <th>Mode</th>
        <th>Since</th>
      </tr>
    </thead>
    <tbody>
      <% @routes.sort.each do |class_name, info| %>
        <tr>
          <td><code><%= h class_name %></code></td>
          <td>
            <%# blackhole is styled as danger; any other mode as warning %>
            <span class="label <%= info["mode"] == "blackhole" ? "label-danger" : "label-warning" %>">
              <%= h info["mode"] %>
            </span>
          </td>
          <td><%= h info["routed_at"] %></td>
        </tr>
      <% end %>
    </tbody>
  </table>
<% end %>

<%# Parking queue summary: @parked_size is the total; the table below is a sampled breakdown. %>
<h4>Parking queue: <code><%= h @parked_queue %></code> (<%= @parked_size %> jobs)</h4>
<p class="text-muted">
  Recover by backlog size: <strong>process_parked</strong> moves parked jobs back to their original queue (modest volume);
  for 100Ks+, scale a worker on <code><%= h @parked_queue %></code> to <strong>drain in place</strong> instead of moving them.
  Run these from the Rails console — see the routing runbook.
</p>

<% if @parked_breakdown.any? %>
  <p class="text-muted">Distribution over the first <%= @breakdown_sample %> parked jobs (sampled; the count above is the true total).</p>
  <table class="table table-striped table-bordered table-white">
    <thead>
      <tr>
        <th>Job class</th>
        <th>Parked</th>
        <th>By original queue</th>
      </tr>
    </thead>
    <tbody>
      <% @parked_breakdown.sort.each do |class_name, stats| %>
        <tr>
          <td><code><%= h class_name %></code></td>
          <td><%= stats["count"] %></td>
          <td>
            <% stats["by_original_queue"].sort.each do |queue, count| %>
              <%= h queue %>: <%= count %><br/>
            <% end %>
          </td>
        </tr>
      <% end %>
    </tbody>
  </table>
<% end %>
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "sidekiq/routing/web_extension"
|
|
4
|
+
|
|
5
|
+
# Registers the "Routing" tab. The registration API changed across Sidekiq
|
|
6
|
+
# versions, so branch the way sidekiq-cron does (apps run 7.3.x; commons locks 8.x).
|
|
7
|
+
# Only register when the host app has loaded Sidekiq::Web. Three registration
# styles are covered: 8.0+ (Sidekiq::Web.configure), 7.3.x (Sidekiq::Web.register
# with keyword options) and anything older (bare register + tabs entry).
if defined?(Sidekiq::Web)
  sidekiq_version = Gem::Version.new(Sidekiq::VERSION)
  tab_options = { name: "routing", tab: "Routing", index: "routing" }

  if sidekiq_version >= Gem::Version.new("8.0.0")
    Sidekiq::Web.configure do |config|
      config.register(Sidekiq::Routing::WebExtension, **tab_options)
    end
  elsif sidekiq_version >= Gem::Version.new("7.3.0")
    Sidekiq::Web.register(Sidekiq::Routing::WebExtension, **tab_options)
  else
    Sidekiq::Web.register Sidekiq::Routing::WebExtension
    Sidekiq::Web.tabs["Routing"] = "routing"
  end
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Sidekiq
  module Routing
    # Sidekiq Web "Routing" tab. Strictly read-only: it shows routing state
    # (active routes + parking-queue depth/breakdown) and nothing else. Every
    # mutating operation (park, blackhole, unpark, sweep, process_parked) is
    # performed from the Rails console via the Sidekiq::Routing API — never
    # from the dashboard — so destructive actions stay deliberate and don't
    # ride on the shared Sidekiq Web credentials. Reads aggregates live from
    # Redis; never emits per-job telemetry. Inherits the existing Sidekiq Web
    # auth.
    module WebExtension
      VIEWS = File.expand_path("web/views", __dir__)

      class << self
        # Extension hook: adds the GET /routing action to +app+. The action
        # loads the values the template reads into instance variables and
        # renders web/views/routing.erb from its file contents.
        def registered(app)
          template_path = File.join(VIEWS, "routing.erb")

          app.get("/routing") do
            routing = Sidekiq::Routing

            @parked_queue = routing.parked_queue
            @routes = routing.routes
            @parked_size = routing.parked_size
            @breakdown_sample = routing.configuration.breakdown_sample_size
            @parked_breakdown = routing.parked_breakdown

            erb(File.read(template_path))
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "sidekiq"
|
|
4
|
+
require "json"
|
|
5
|
+
|
|
6
|
+
# Runtime, per-job-class parking/blackhole mechanism for Sidekiq incident response.
|
|
7
|
+
#
|
|
8
|
+
# Two modes per job class (see lib/sidekiq/CONTEXT.md and
|
|
9
|
+
# docs/routing-how-it-works.md):
|
|
10
|
+
# - park (default, reversible): divert jobs to a worker-less parking queue.
|
|
11
|
+
# - blackhole: drop jobs entirely (only for classes safe to lose).
|
|
12
|
+
#
|
|
13
|
+
# State lives in a single Redis hash; the hot path reads a whole-hash snapshot
|
|
14
|
+
# refreshed at most once per cache_ttl_seconds, so per-job cost is an in-memory
|
|
15
|
+
# lookup rather than a Redis round-trip.
|
|
16
|
+
module Sidekiq
  module Routing
    PARKED_QUEUE_DEFAULT = "routing_parked"

    # Keys stamped into the job payload.
    ORIGINAL_QUEUE_KEY = "routing_original_queue"
    NO_DIVERT_KEY = "routing_no_divert"

    MODE_PARK = "park"
    MODE_BLACKHOLE = "blackhole"
    MODES = [MODE_PARK, MODE_BLACKHOLE].freeze

    # Process-local route cache: the frozen routes hash, the monotonic time it
    # was taken at, and the mutex that serializes refreshes and resets.
    @snapshot = nil
    @snapshot_at = nil
    @snapshot_mutex = Mutex.new

    class << self
      # The memoized Configuration instance for this process.
      def configuration
        @_configuration ||= Configuration.new
      end

      # Yields the configuration for in-place setup when a block is given.
      def setup
        yield configuration if block_given?
      end

      # Registers the manual routing client + server middleware on both client
      # and server configurations. This mirrors Sidekiq::Lock.install! so host
      # apps can add the gem first and opt in to routing per app.
      def install!
        # Required here rather than at file load so the middleware classes are
        # only pulled in when routing is actually installed.
        require "sidekiq/routing/middleware/client"
        require "sidekiq/routing/middleware/server"

        prepend_routing = ->(chain) { chain.prepend Middleware::Client }

        Sidekiq.configure_client do |config|
          config.client_middleware(&prepend_routing)
        end

        Sidekiq.configure_server do |config|
          # Server processes enqueue jobs too, so they get the client
          # middleware as well as the server one.
          config.client_middleware(&prepend_routing)
          config.server_middleware { |chain| install_server_middleware(chain) }
        end
      end

      def enabled?
        configuration.enabled
      end

      def logger
        configuration.logger
      end

      def parked_queue
        configuration.parked_queue
      end

      # ---- operator API: managing manual routes ----

      # Routes +klass+ (Class or class-name String) to the parking queue.
      # Returns the class name.
      def park(klass)
        write(klass, MODE_PARK)
      end

      # Routes +klass+ (Class or class-name String) to blackhole mode.
      # Returns the class name.
      def blackhole(klass)
        write(klass, MODE_BLACKHOLE)
      end

      # Removes any route for +klass+ and drops this process's cached
      # snapshot. Returns the class name.
      def unpark(klass)
        name = class_name(klass)
        Store.delete(name)
        reset_cache!
        logger.warn("[Routing] unparked #{name}")
        name
      end

      # True when the stored (uncached) mode for +klass+ is park.
      def parked?(klass)
        mode(klass) == MODE_PARK
      end

      # True when any route is stored (uncached) for +klass+.
      def routed?(klass)
        !Store.fetch(class_name(klass)).nil?
      end

      # The stored (uncached) mode string for +klass+, or nil.
      def mode(klass)
        Store.fetch(class_name(klass))&.fetch("mode", nil)
      end

      # All active manual routes, read straight from Redis (uncached) so operators
      # and the Web tab always see the truth.
      def routes
        Store.all
      end

      # ---- hot path: snapshot lookup, used by the middleware ----

      # Returns the route hash ({"mode"=>...}) for a class, or nil. Reads from a
      # process-local snapshot of all routes, refreshed at most once per
      # cache_ttl_seconds.
      def route_for(klass_or_name)
        snapshot[class_name(klass_or_name)]
      end

      # Drops the process-local snapshot so the next route_for reloads it.
      def reset_cache!
        @snapshot_mutex.synchronize do
          @snapshot = nil
          @snapshot_at = nil
        end
      end

      # ---- parking queue introspection ----

      def parked_size
        Sidekiq::Queue.new(parked_queue).size
      end

      # { "SomeJob" => { "count" => 12, "by_original_queue" => { "within_1_minute" => 12 } } }
      #
      # Scans at most `sample` jobs (default Configuration#breakdown_sample_size), not the
      # whole queue: the parking queue can hold millions during a flood and this is called on
      # every Web tab load. The result is a distribution over the sampled head, not exact
      # totals — use parked_size (O(1) LLEN) for the true total. Pass sample: nil to scan all.
      def parked_breakdown(sample: configuration.breakdown_sample_size)
        result = Hash.new { |h, k| h[k] = { "count" => 0, "by_original_queue" => Hash.new(0) } }
        scanned = 0
        Sidekiq::Queue.new(parked_queue).each do |job|
          break if sample && scanned >= sample

          klass = job.display_class
          original = job.item[ORIGINAL_QUEUE_KEY] || "unknown"
          result[klass]["count"] += 1
          result[klass]["by_original_queue"][original] += 1
          scanned += 1
        end
        result
      end

      # ---- recovery (thin wrappers; logic in Sweeper/ParkedProcessor) ----

      def sweep(klass, queue: nil, limit: nil, batch_size: nil)
        Sweeper.new.call(class_name(klass), queue:, limit:, batch_size:)
      end

      def process_parked(klass: nil, limit: nil, batch_size: nil)
        ParkedProcessor.new.call(klass: klass && class_name(klass), limit:, batch_size:)
      end

      # Normalizes a job-class reference to its name: a String is returned
      # as-is, nil gives nil, anything else is asked for #name. Callers pass
      # the ActiveJob "wrapped" class name when present, so a wrapped job is
      # matched by its real class, not the JobWrapper (mirrors Sidekiq's own
      # display_class).
      def class_name(klass_or_name)
        return klass_or_name if klass_or_name.is_a?(String)

        klass_or_name&.name
      end

      private

      # Places the server middleware right after sidekiq-unique-jobs' server
      # middleware when that is loaded and present in the chain; otherwise
      # appends it to the chain.
      def install_server_middleware(chain)
        unique_jobs = defined?(SidekiqUniqueJobs::Middleware::Server) && SidekiqUniqueJobs::Middleware::Server
        if unique_jobs && chain.exists?(unique_jobs)
          chain.insert_after unique_jobs, Middleware::Server
        else
          chain.add Middleware::Server
        end
      end

      # Persists the route, drops this process's cached snapshot, and logs the
      # operator action. Returns the class name.
      def write(klass, mode)
        name = class_name(klass)
        Store.set(name, mode:)
        reset_cache!
        logger.warn("[Routing] #{name} routed to #{mode}")
        name
      end

      # Returns the frozen routes hash, reloading it from the Store when the
      # cached copy is missing, older than cache_ttl_seconds, or caching is
      # disabled (non-positive TTL).
      def snapshot
        ttl = configuration.cache_ttl_seconds.to_f
        now = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)

        # Fast path: no lock taken while the cached copy is still fresh.
        cached = fresh_snapshot(ttl, now)
        return cached if cached

        @snapshot_mutex.synchronize do
          # Re-check under the lock: another thread may have refreshed while
          # this one was waiting.
          fresh_snapshot(ttl, now) || refresh_snapshot(now)
        end
      end

      # The cached routes if they are still within +ttl+ of +now+, else nil.
      # Both instance variables are copied to locals and nil-checked because
      # reset_cache! can clear them between the two reads on the unlocked fast
      # path; subtracting a nil timestamp would raise TypeError.
      def fresh_snapshot(ttl, now)
        routes = @snapshot
        taken_at = @snapshot_at
        return nil unless routes && taken_at && ttl.positive?

        (now - taken_at) < ttl ? routes : nil
      end

      # Loads all routes from the Store, caches them (frozen) with +now+ as
      # their timestamp, and returns them. Call with @snapshot_mutex held.
      def refresh_snapshot(now)
        routes = Store.all.freeze
        @snapshot = routes
        @snapshot_at = now
        routes
      end
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "sidekiq/routing/version"
|
|
4
|
+
|
|
5
|
+
# Routing core: defines Sidekiq::Routing and the manual driver (operator
|
|
6
|
+
# park / blackhole overrides). Loaded first so Sidekiq::Routing exists before
|
|
7
|
+
# the Auto sub-namespace reopens it.
|
|
8
|
+
#
|
|
9
|
+
# The Web tab (sidekiq/routing/web) is intentionally NOT required here. Host
|
|
10
|
+
# apps require it at the Sidekiq::Web mount site so worker processes never load
|
|
11
|
+
# the web framework.
|
|
12
|
+
require "sidekiq/routing"
|
|
13
|
+
require "sidekiq/routing/configuration"
|
|
14
|
+
require "sidekiq/routing/store"
|
|
15
|
+
require "sidekiq/routing/mover"
|
|
16
|
+
require "sidekiq/routing/sweeper"
|
|
17
|
+
require "sidekiq/routing/parked_processor"
|
|
18
|
+
require "sidekiq/routing/middleware/client"
|
|
19
|
+
require "sidekiq/routing/middleware/server"
|
|
20
|
+
|
|
21
|
+
# Routing::Auto: the automatic driver — latency-driven movement of jobs between
|
|
22
|
+
# SLA tiers. Opt-in via SIDEKIQ_ROUTING_AUTO_REROUTE_ENABLED.
|
|
23
|
+
require "sidekiq/routing/auto/configuration"
|
|
24
|
+
require "sidekiq/routing/auto/job_duration_tracker"
|
|
25
|
+
require "sidekiq/routing/auto/noisy_neighbor_detector"
|
|
26
|
+
require "sidekiq/routing/auto/batch_rerouter"
|
|
27
|
+
require "sidekiq/routing/auto/router"
|
|
28
|
+
require "sidekiq/routing/auto/reroute_job"
|