sidekiq-routing 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +233 -0
- data/lib/sidekiq/routing/auto/batch_rerouter.rb +67 -0
- data/lib/sidekiq/routing/auto/configuration.rb +62 -0
- data/lib/sidekiq/routing/auto/job_duration_tracker.rb +78 -0
- data/lib/sidekiq/routing/auto/noisy_neighbor_detector.rb +52 -0
- data/lib/sidekiq/routing/auto/reroute_job.rb +93 -0
- data/lib/sidekiq/routing/auto/router.rb +25 -0
- data/lib/sidekiq/routing/configuration.rb +41 -0
- data/lib/sidekiq/routing/middleware/client.rb +36 -0
- data/lib/sidekiq/routing/middleware/server.rb +38 -0
- data/lib/sidekiq/routing/mover.rb +34 -0
- data/lib/sidekiq/routing/parked_processor.rb +41 -0
- data/lib/sidekiq/routing/store.rb +41 -0
- data/lib/sidekiq/routing/sweeper.rb +61 -0
- data/lib/sidekiq/routing/version.rb +7 -0
- data/lib/sidekiq/routing/web/views/routing.erb +74 -0
- data/lib/sidekiq/routing/web.rb +28 -0
- data/lib/sidekiq/routing/web_extension.rb +27 -0
- data/lib/sidekiq/routing.rb +208 -0
- data/lib/sidekiq-routing.rb +28 -0
- metadata +88 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: '0119411113d408ffff9aa353faf8a1d9b4dfd7572edd26efb39dc97b048d204e'
|
|
4
|
+
data.tar.gz: cec0724470361bdbf22d98b9be14387acbd46c1f895ad3f0f4f27d3f77a3f477
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 6f299c288a571de2e21cfb6a8ece1be70df3b30f16b6251c1eb81eacb29ab788eea272d51187f6b919fbdbef07f8f176d6291637aefc32190699c90d20b3e702
|
|
7
|
+
data.tar.gz: fa64c7f1d15224a0011e9f5a7f16747ca1699781cdb50a6d84b69281814885481fc1ebf555bb9ea59a02727f2fa1536627900ebcc94631fb30af413b1d4f0537
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 BigBinary Technologies Pvt. Ltd.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# sidekiq-routing
|
|
2
|
+
|
|
3
|
+
Runtime, per-job-class **queue routing for Sidekiq** — park, blackhole, or
|
|
4
|
+
auto-reroute a job class **without a deploy**.
|
|
5
|
+
|
|
6
|
+
Background jobs often share a handful of latency-tiered queues
|
|
7
|
+
(`within_5_seconds`, `within_1_minute`, …). The queue name is a contract: every
|
|
8
|
+
job on a tier should start within that window. That sharing is efficient until
|
|
9
|
+
one class misbehaves — a flood, a runaway argument, a broken downstream — and
|
|
10
|
+
then the only built-in lever, pausing the whole queue, punishes every other
|
|
11
|
+
class on the tier. `sidekiq-routing` gives you a finer lever, applied at
|
|
12
|
+
runtime to a single job class:
|
|
13
|
+
|
|
14
|
+
| Kind | Purpose | Driver | Target |
|
|
15
|
+
|---|---|---|---|
|
|
16
|
+
| **Manual routing** | Incident response for one misbehaving class. | Operator, from a console. | `park` (reversible) or `blackhole` (drop). |
|
|
17
|
+
| **Auto rerouting** | Capacity management when an SLA tier is overloaded. | Background job, latency-driven, opt-in. | The next live SLA tier. Never parks or blackholes. |
|
|
18
|
+
|
|
19
|
+
Both are per-job-class. To halt a *whole* queue, use Sidekiq's own pause button.
|
|
20
|
+
|
|
21
|
+
The hot path is cheap: routing state lives in a single Redis hash, read from a
|
|
22
|
+
process-local snapshot refreshed at most once per `cache_ttl_seconds`, so the
|
|
23
|
+
per-job cost is an in-memory lookup, not a Redis round-trip.
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```ruby
|
|
28
|
+
gem "sidekiq-routing"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
```sh
|
|
32
|
+
bundle install
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Requires Ruby >= 3.1 and Sidekiq >= 7.0 (tested on Sidekiq 7.3.x and 8.x). The
|
|
36
|
+
gem depends only on `sidekiq` — no Rails or ActiveSupport required.
|
|
37
|
+
|
|
38
|
+
## Quick start
|
|
39
|
+
|
|
40
|
+
Add an initializer (e.g. `config/initializers/sidekiq_routing.rb`):
|
|
41
|
+
|
|
42
|
+
```ruby
|
|
43
|
+
require "sidekiq-routing"
|
|
44
|
+
|
|
45
|
+
Sidekiq::Routing.setup do |config|
|
|
46
|
+
# All optional — these are the defaults.
|
|
47
|
+
# config.parked_queue = "routing_parked"
|
|
48
|
+
# config.cache_ttl_seconds = 5
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
Sidekiq::Routing.install! # registers the client + server middleware
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
`install!` prepends the client middleware (so enqueues are diverted) and adds
|
|
55
|
+
the server middleware (so in-flight jobs are diverted). If
|
|
56
|
+
[sidekiq-unique-jobs](https://github.com/mhenrixon/sidekiq-unique-jobs) is
|
|
57
|
+
present, the routing server middleware is inserted *after* it automatically.
|
|
58
|
+
|
|
59
|
+
Then, from a console during an incident:
|
|
60
|
+
|
|
61
|
+
```ruby
|
|
62
|
+
Sidekiq::Routing.park("FlakyReportJob") # divert to the parking queue (reversible)
|
|
63
|
+
Sidekiq::Routing.blackhole("SpamWebhookJob") # drop its jobs entirely
|
|
64
|
+
Sidekiq::Routing.unpark("FlakyReportJob") # remove the route
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Manual routing — incident response
|
|
68
|
+
|
|
69
|
+
One job class is misbehaving and you want to disable *just that class*, at
|
|
70
|
+
runtime, without pausing the rest of its queue or shipping a deploy.
|
|
71
|
+
|
|
72
|
+
- **Park** (reversible) — newly enqueued and in-flight jobs for the class are
|
|
73
|
+
diverted to a worker-less parking queue (`routing_parked` by default) and
|
|
74
|
+
held. The work is preserved; recover it later.
|
|
75
|
+
- **Blackhole** (irreversible) — the class's jobs are dropped entirely (never
|
|
76
|
+
added to the Dead set). Only for classes you can afford to lose.
|
|
77
|
+
|
|
78
|
+
```ruby
|
|
79
|
+
Sidekiq::Routing.park("HardJob")
|
|
80
|
+
Sidekiq::Routing.blackhole("FireAndForgetJob")
|
|
81
|
+
Sidekiq::Routing.unpark("HardJob")
|
|
82
|
+
|
|
83
|
+
Sidekiq::Routing.routed?("HardJob") # => true/false
|
|
84
|
+
Sidekiq::Routing.parked?("HardJob") # => true only when in park mode
|
|
85
|
+
Sidekiq::Routing.mode("HardJob") # => "park" | "blackhole" | nil
|
|
86
|
+
Sidekiq::Routing.routes # => { "HardJob" => { "mode" => "park", ... } }
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
A route accepts a Class or a String. ActiveJob jobs are matched by their real
|
|
90
|
+
("wrapped") class, not the adapter's job wrapper.
|
|
91
|
+
|
|
92
|
+
### Clearing an existing backlog into the parking queue
|
|
93
|
+
|
|
94
|
+
`park` only diverts jobs from the moment it's set. To move a class's jobs that
|
|
95
|
+
are *already enqueued* on a live queue into the parking queue, sweep them:
|
|
96
|
+
|
|
97
|
+
```ruby
|
|
98
|
+
Sidekiq::Routing.sweep("HardJob", queue: "within_1_minute")
|
|
99
|
+
Sidekiq::Routing.sweep("HardJob", queue: "within_1_minute", limit: 10_000)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
A queue must be resolvable — pass `queue:` explicitly (the sweep deliberately
|
|
103
|
+
never scans every queue, which would hammer Redis during an incident).
|
|
104
|
+
|
|
105
|
+
### Recovering parked work
|
|
106
|
+
|
|
107
|
+
```ruby
|
|
108
|
+
# Move parked jobs back to their original queue (stamps them so an active
|
|
109
|
+
# route won't immediately bounce them back).
|
|
110
|
+
Sidekiq::Routing.process_parked
|
|
111
|
+
Sidekiq::Routing.process_parked(klass: "HardJob", limit: 1_000)
|
|
112
|
+
|
|
113
|
+
# Introspection
|
|
114
|
+
Sidekiq::Routing.parked_size # O(1) count of the parking queue
|
|
115
|
+
Sidekiq::Routing.parked_breakdown # { "HardJob" => { "count" => 12, "by_original_queue" => {...} } }
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
A processed parked job has its payload rewritten to target its original queue,
|
|
119
|
+
so if it later fails it retries to that queue — not back to the parking queue.
|
|
120
|
+
Jobs with no stamped original queue go to `process_parked_fallback_queue`
|
|
121
|
+
(`"default"`).
|
|
122
|
+
|
|
123
|
+
## Auto rerouting — capacity management
|
|
124
|
+
|
|
125
|
+
Optionally, move *noisy-neighbor* job classes between SLA tiers automatically
|
|
126
|
+
when a tier is overloaded. It is **off unless you opt in**:
|
|
127
|
+
|
|
128
|
+
```sh
|
|
129
|
+
export SIDEKIQ_ROUTING_AUTO_REROUTE_ENABLED=true
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Auto rerouting only ever moves jobs to the *next* live SLA tier
|
|
133
|
+
(`within_5_seconds` → `within_1_minute` → `within_5_minutes` → `within_1_hour`).
|
|
134
|
+
It never parks or blackholes.
|
|
135
|
+
|
|
136
|
+
Wire it up in your initializer:
|
|
137
|
+
|
|
138
|
+
```ruby
|
|
139
|
+
if Sidekiq::Routing::Auto.enabled?
|
|
140
|
+
# 1. Track per-class job durations (server middleware).
|
|
141
|
+
Sidekiq.configure_server do |config|
|
|
142
|
+
config.server_middleware do |chain|
|
|
143
|
+
chain.add Sidekiq::Routing::Auto::JobDurationTracker
|
|
144
|
+
end
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
Sidekiq::Routing::Auto.setup do |config|
|
|
149
|
+
# config.capacity_threshold_percent = 80
|
|
150
|
+
# config.sla_thresholds = { "within_5_seconds" => 5, "within_1_minute" => 60, ... }
|
|
151
|
+
end
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Then schedule the driver periodically (e.g. every minute) with
|
|
155
|
+
[sidekiq-cron](https://github.com/sidekiq-cron/sidekiq-cron):
|
|
156
|
+
|
|
157
|
+
```yaml
|
|
158
|
+
# config/schedule.yml
|
|
159
|
+
routing_auto_reroute:
|
|
160
|
+
cron: "* * * * *"
|
|
161
|
+
class: "Sidekiq::Routing::Auto::RerouteJob"
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
`RerouteJob` checks each SLA queue's estimated workload against its capacity and,
|
|
165
|
+
for any tier over `capacity_threshold_percent`, moves the noisiest classes to
|
|
166
|
+
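the next tier (or, when no class exceeds `noisy_neighbor_threshold_percent`, any eligible jobs), up to `batch_reroute_limit` jobs per tier per pass. `RerouteJob` itself is excluded from rerouting by default.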
|
|
167
|
+
|
|
168
|
+
## Web tab
|
|
169
|
+
|
|
170
|
+
A read-only "Routing" tab for Sidekiq Web shows active routes and parking-queue
|
|
171
|
+
depth/breakdown. Every mutating action stays on the console API — the tab never
|
|
172
|
+
exposes park/blackhole/unpark/sweep, so destructive operations stay deliberate.
|
|
173
|
+
|
|
174
|
+
Require it only where you mount Sidekiq Web (so worker processes never load the
|
|
175
|
+
web framework):
|
|
176
|
+
|
|
177
|
+
```ruby
|
|
178
|
+
require "sidekiq/web"
|
|
179
|
+
require "sidekiq/routing/web" # registers the "Routing" tab
|
|
180
|
+
mount Sidekiq::Web => "/sidekiq"
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Configuration reference
|
|
184
|
+
|
|
185
|
+
`Sidekiq::Routing.setup { |config| ... }`:
|
|
186
|
+
|
|
187
|
+
| Option | Default | Purpose |
|
|
188
|
+
|---|---|---|
|
|
189
|
+
| `enabled` | `true` | Master switch for the routing middleware. |
|
|
190
|
+
| `parked_queue` | `"routing_parked"` | Worker-less queue parked jobs divert to. |
|
|
191
|
+
| `process_parked_fallback_queue` | `"default"` | Target when a parked job has no stamped original queue. |
|
|
192
|
+
| `cache_ttl_seconds` | `5` | Hot-path snapshot freshness. `0` reads Redis every call. |
|
|
193
|
+
| `batch_limit` | `nil` | Default cap on jobs moved per recovery call (`nil` = all). |
|
|
194
|
+
| `batch_size` | `100` | Jobs moved per pass. |
|
|
195
|
+
| `breakdown_sample_size` | `1_000` | Max jobs `parked_breakdown` scans. |
|
|
196
|
+
| `logger` | `Rails.logger` or `Sidekiq.logger` | — |
|
|
197
|
+
|
|
198
|
+
`Sidekiq::Routing::Auto.setup { |config| ... }`:
|
|
199
|
+
|
|
200
|
+
| Option | Default | Purpose |
|
|
201
|
+
|---|---|---|
|
|
202
|
+
| `enabled` | `SIDEKIQ_ROUTING_AUTO_REROUTE_ENABLED == "true"` | Opt-in switch. |
|
|
203
|
+
| `sla_thresholds` | the four `within_*` tiers | Queue → SLA seconds. |
|
|
204
|
+
| `capacity_threshold_percent` | `80` | Reroute a tier above this % capacity. |
|
|
205
|
+
| `noisy_neighbor_threshold_percent` | `50` | Share of workload that marks a class noisy. |
|
|
206
|
+
| `batch_reroute_limit` | `50` | Max jobs moved per reroute pass. |
|
|
207
|
+
| `duration_tracking_window` | `3600` | Seconds of duration history retained. |
|
|
208
|
+
| `excluded_job_classes` | `["Sidekiq::Routing::Auto::RerouteJob"]` | Never auto-rerouted. |
|
|
209
|
+
|
|
210
|
+
## How it works
|
|
211
|
+
|
|
212
|
+
Manual routes live in one Redis hash. The middleware reads a frozen,
|
|
213
|
+
process-local snapshot of that hash, refreshed at most once per
|
|
214
|
+
`cache_ttl_seconds`, so the per-job decision is an in-memory lookup. The
|
|
215
|
+
operator API (`park`/`blackhole`/`unpark`) writes through and resets the
|
|
216
|
+
snapshot, so console changes take effect immediately for the writer and within
|
|
217
|
+
one TTL everywhere else. Parking rewrites the `queue` field *inside the job
|
|
218
|
+
payload*, which is what makes recovery and retry-to-original-queue correct.
|
|
219
|
+
|
|
220
|
+
## Development
|
|
221
|
+
|
|
222
|
+
```sh
|
|
223
|
+
bin/setup # or: bundle install
|
|
224
|
+
bundle exec rake test
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
The test suite talks to a real Redis. It uses logical DB 15 of
|
|
228
|
+
`redis://localhost:6379` by default; override with `REDIS_URL` (point it at a
|
|
229
|
+
disposable Redis). No Rails or database is involved.
|
|
230
|
+
|
|
231
|
+
## License
|
|
232
|
+
|
|
233
|
+
Released under the [MIT License](LICENSE).
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Sidekiq
|
|
4
|
+
module Routing::Auto
|
|
5
|
+
# Moves a bounded batch of already-enqueued jobs from one queue to another.
#
# A moved job is re-pushed with its "queue" rewritten and stamped with
# "auto_rerouted", "original_queue" and "rerouted_at"; the original entry is
# deleted from the source queue once the push has succeeded.
class BatchRerouter
  # Reroutes jobs from +from_queue+ to +to_queue+, stopping once
  # `batch_reroute_limit` jobs have been moved (a nil limit means no cap).
  #
  # A job is skipped when it already carries the "auto_rerouted" stamp, when
  # +job_classes+ is given and does not list it, or when it is listed in
  # `excluded_job_classes`. Class filters match either the payload's "class"
  # (job.klass) or its "wrapped" name, so ActiveJob jobs can be addressed by
  # their real class as well as by the adapter wrapper.
  #
  # @param from_queue [String] queue to take jobs from
  # @param to_queue [String] queue the jobs are re-pushed to
  # @param job_classes [#include?, nil] when given, only these classes are moved
  # @return [Integer] number of jobs moved
  def reroute_jobs(from_queue, to_queue, job_classes: nil)
    configuration = Routing::Auto.configuration
    limit = configuration.batch_reroute_limit
    excluded = Array(configuration.excluded_job_classes)
    moved_count = 0
    skipped = { already_rerouted: 0, wrong_class: 0, excluded: 0 }
    queue = Sidekiq::Queue.new(from_queue)

    Routing::Auto.logger.info(
      "[Routing::Auto] reroute_jobs: from=#{from_queue}, to=#{to_queue}, " \
      "limit=#{limit}, job_classes=#{job_classes.inspect}"
    )

    queue.each do |job|
      break if limit && moved_count >= limit

      reason = skip_reason(job, job_classes, excluded)
      if reason
        skipped[reason] += 1
        next
      end

      moved_count += 1 if reroute_single_job(job, from_queue, to_queue)
    end

    Routing::Auto.logger.info(
      "[Routing::Auto] reroute_jobs: moved=#{moved_count}, skipped=#{skipped.inspect}"
    )
    log_reroute(from_queue, to_queue, moved_count, job_classes) if moved_count.positive?

    moved_count
  end

  private

  # Returns the `skipped` counter key explaining why +job+ must stay where it
  # is, or nil when the job is eligible to be moved.
  def skip_reason(job, job_classes, excluded)
    return :already_rerouted if job.item["auto_rerouted"]

    names = class_names_for(job)
    return :wrong_class if job_classes && names.none? { |name| job_classes.include?(name) }
    return :excluded if names.any? { |name| excluded.include?(name) }

    nil
  end

  # Every name the job may be listed under: the payload class and, when the
  # payload carries one, the "wrapped" class name.
  def class_names_for(job)
    [job.klass, job.item["wrapped"]].compact.uniq
  end

  # Pushes a re-targeted copy of the job, then deletes the original.
  # Push-before-delete means a failure between the two steps leaves a
  # duplicate rather than losing the job.
  #
  # @return [Object, nil] the result of job.delete, or nil when the push was rejected
  def reroute_single_job(job, from_queue, to_queue)
    new_item = job.item.merge(
      "queue" => to_queue,
      "auto_rerouted" => true,
      "original_queue" => from_queue,
      "rerouted_at" => Time.now.to_i
    )

    job.delete if Sidekiq::Client.push(new_item)
  end

  # Emits a warn-level summary of a pass that moved at least one job.
  def log_reroute(from_queue, to_queue, count, job_classes)
    message = "[Routing::Auto] Moved #{count} jobs from #{from_queue} to #{to_queue}"
    message += " (noisy neighbors: #{job_classes.join(', ')})" if job_classes&.any?

    Routing::Auto.logger.warn(message)
  end
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Sidekiq
|
|
4
|
+
module Routing::Auto
|
|
5
|
+
class << self
  # Process-wide settings for auto rerouting, built on first access and
  # reused afterwards.
  #
  # @return [Configuration]
  def configuration
    @_configuration = Configuration.new unless @_configuration
    @_configuration
  end

  # Yields the shared configuration so the caller can adjust it.
  # Calling without a block is a no-op.
  #
  # @return [Object, nil] the block's value, or nil when no block is given
  def setup
    return unless block_given?

    yield configuration
  end

  # Whether auto rerouting is switched on in the current configuration.
  def enabled?
    configuration.enabled
  end

  # Logger taken from the current configuration.
  def logger
    configuration.logger
  end

  # Built-in default for `enabled`: true only when the
  # SIDEKIQ_ROUTING_AUTO_REROUTE_ENABLED environment variable is exactly "true".
  def default_enabled?
    ENV.fetch("SIDEKIQ_ROUTING_AUTO_REROUTE_ENABLED", nil) == "true"
  end
end
|
|
26
|
+
|
|
27
|
+
# Settings for latency-driven auto rerouting.
#
# Attributes:
#   enabled                           - opt-in switch; defaults to Auto.default_enabled?
#   logger                            - Rails.logger when available, otherwise Sidekiq.logger
#   sla_thresholds                    - queue name => SLA in seconds
#   capacity_threshold_percent        - a tier is rerouted above this % of capacity
#   noisy_neighbor_threshold_percent  - workload share that marks a class as noisy
#   batch_reroute_limit               - max jobs moved per reroute pass
#   duration_tracking_window          - seconds of duration history retained
#   excluded_job_classes              - class names that are never auto-rerouted
class Configuration
  attr_accessor :enabled,
                :logger,
                :sla_thresholds,
                :capacity_threshold_percent,
                :noisy_neighbor_threshold_percent,
                :batch_reroute_limit,
                :duration_tracking_window,
                :excluded_job_classes

  def initialize
    # Off unless explicitly opted in via SIDEKIQ_ROUTING_AUTO_REROUTE_ENABLED.
    # (In the Rails-engine ancestor this was wired by an initializer; the
    # standalone gem makes it the built-in default so auto stays safe-by-default.)
    @enabled = Sidekiq::Routing::Auto.default_enabled?
    @logger = default_logger
    @sla_thresholds = {
      "within_5_seconds" => 5,
      "within_1_minute" => 60,
      "within_5_minutes" => 300,
      "within_1_hour" => 3600
    }
    @capacity_threshold_percent = 80
    @noisy_neighbor_threshold_percent = 50
    @batch_reroute_limit = 50
    @duration_tracking_window = 3600
    # The internal reroute job must never reroute itself.
    @excluded_job_classes = ["Sidekiq::Routing::Auto::RerouteJob"]
  end

  private

  # Rails.logger when Rails is loaded and has one; otherwise the logger the
  # host configured for Sidekiq, so auto-reroute messages share the
  # destination and format of the rest of the Sidekiq log instead of going
  # to a separate stdout logger.
  def default_logger
    if defined?(::Rails) && ::Rails.respond_to?(:logger) && ::Rails.logger
      ::Rails.logger
    else
      ::Sidekiq.logger
    end
  end
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Sidekiq
|
|
4
|
+
module Routing::Auto
|
|
5
|
+
# Server middleware that records how long each job ran, per job class and
# queue, in a Redis sorted set. Members are "<unix_ts>:<duration_ms>" scored
# by the timestamp, which lets old samples be trimmed by score.
class JobDurationTracker
  REDIS_KEY_PREFIX = "sidekiq:auto_reroute:durations"

  class << self
    # Mean recorded duration, in rounded milliseconds, over the samples that
    # fall inside `duration_tracking_window`.
    #
    # @return [Integer, nil] nil when the window holds no samples
    def average_duration(job_class, queue)
      sorted_set = redis_key(job_class, queue)
      window_start = Time.now.to_i - Routing::Auto.configuration.duration_tracking_window
      samples = Sidekiq.redis { |conn| conn.zrangebyscore(sorted_set, window_start, "+inf") }
      return nil if samples.empty?

      millis = samples.map { |sample| sample.split(":").last.to_f }
      (millis.sum / millis.size).round
    end

    # Class names that have a duration key for +queue+.
    #
    # Class names contain "::", so the name is recovered by stripping the
    # known key prefix and queue suffix rather than splitting on ":".
    #
    # NOTE(review): this issues KEYS, which walks the whole keyspace;
    # consider SCAN if the Redis instance is large.
    def tracked_job_classes(queue)
      head = "#{REDIS_KEY_PREFIX}:"
      tail = ":#{queue}"
      matches = Sidekiq.redis { |conn| conn.keys("#{head}*#{tail}") }
      matches.map { |key| key.delete_prefix(head).delete_suffix(tail) }
    end

    # Redis key holding the samples for one class on one queue.
    def redis_key(job_class, queue)
      [REDIS_KEY_PREFIX, job_class, queue].join(":")
    end

    # Name durations are recorded under: the payload's "wrapped" class when
    # present (ActiveJob), otherwise the worker's own class name.
    #
    # NOTE(review): readers that key by job.klass must resolve to this same
    # name — verify that it yields the unwrapped class for ActiveJob payloads.
    def job_class_name(worker, job)
      wrapped = job["wrapped"]
      wrapped || worker.class.name
    end
  end

  # Middleware entry point: times the wrapped job and records the duration
  # whether it succeeds or raises. A failure to record is logged and
  # swallowed so it can never fail the job itself.
  def call(worker, job, queue)
    started = monotonic_now
    yield
  ensure
    elapsed_ms = ((monotonic_now - started) * 1000).round
    begin
      record_duration(self.class.job_class_name(worker, job), queue, elapsed_ms)
    rescue StandardError => error
      Routing::Auto.logger.warn("[JobDurationTracker] failed to record duration: #{error.message}")
    end
  end

  # Appends one sample, refreshes the key's TTL (twice the tracking window)
  # and trims samples older than the window.
  def record_duration(job_class, queue, duration_ms)
    sorted_set = redis_key(job_class, queue)
    now = Time.now.to_i

    Sidekiq.redis do |conn|
      conn.zadd(sorted_set, now, "#{now}:#{duration_ms}")
      conn.expire(sorted_set, Routing::Auto.configuration.duration_tracking_window * 2)

      oldest_kept = now - Routing::Auto.configuration.duration_tracking_window
      conn.zremrangebyscore(sorted_set, "-inf", oldest_kept)
    end
  end

  private

  def redis_key(job_class, queue)
    self.class.redis_key(job_class, queue)
  end

  # Monotonic clock reading in seconds; immune to wall-clock adjustments.
  def monotonic_now
    ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
  end
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Sidekiq
|
|
4
|
+
module Routing::Auto
|
|
5
|
+
# Estimates how much work is waiting on a queue, per job class, and picks out
# the classes responsible for a disproportionate share of it.
#
# Workload is estimated as (jobs waiting) x (average tracked duration of the
# class on this queue). Classes with no tracked duration are ignored.
class NoisyNeighborDetector
  # Below this total estimated workload (seconds) no class is reported.
  MIN_WORKLOAD_SECONDS = 10

  # @param queue_name [String] the queue to inspect
  def initialize(queue_name)
    @queue_name = queue_name
    @queue = Sidekiq::Queue.new(queue_name)
    # Per-instance memo of average durations so each class costs one lookup.
    @duration_cache = {}
  end

  # Job classes whose share of the queue's estimated workload exceeds
  # `noisy_neighbor_threshold_percent`.
  #
  # @return [Array<String>] empty when the total is under MIN_WORKLOAD_SECONDS
  def identify_noisy_neighbors
    workload = calculate_workload_by_class
    total_workload = workload.values.sum
    return [] if total_workload < MIN_WORKLOAD_SECONDS

    threshold = Routing::Auto.configuration.noisy_neighbor_threshold_percent

    workload.select do |_job_class, workload_seconds|
      ((workload_seconds / total_workload.to_f) * 100) > threshold
    end.keys
  end

  # Estimated seconds of queued work per job class.
  #
  # @return [Hash{String => Float}]
  def calculate_workload_by_class
    result = Hash.new(0.0)

    @queue.each do |job|
      job_class = tracked_class_name(job)
      duration_ms = get_duration_for_class(job_class)
      next unless duration_ms

      result[job_class] += duration_ms / 1000.0
    end

    result
  end

  # Total estimated seconds of queued work across all tracked classes.
  def total_estimated_workload
    calculate_workload_by_class.values.sum
  end

  private

  # Name the job's durations are stored under. JobDurationTracker records
  # under the payload's "wrapped" class when present, while job.klass is the
  # payload's raw "class" (the adapter wrapper for ActiveJob jobs), so the
  # wrapped name has to be preferred here for lookups to hit.
  def tracked_class_name(job)
    job["wrapped"] || job.klass
  end

  def get_duration_for_class(job_class)
    # key? rather than ||= so that a nil (untracked) result is cached too.
    return @duration_cache[job_class] if @duration_cache.key?(job_class)

    @duration_cache[job_class] = JobDurationTracker.average_duration(job_class, @queue_name)
  end
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "sidekiq"
|
|
4
|
+
|
|
5
|
+
module Sidekiq
|
|
6
|
+
module Routing::Auto
|
|
7
|
+
# Periodic worker that drives latency-based rerouting: for each SLA queue
# breaching its capacity threshold, move (noisy-neighbor) jobs to the next
# tier. Schedule it via sidekiq-cron in the host app's schedule file
# (e.g. config/schedule.yml).
#
# Plain Sidekiq::Job (no app base class) so the gem carries no dependency on
# the host's job hierarchy.
class RerouteJob
  include Sidekiq::Job

  sidekiq_options queue: "within_1_minute", retry: false

  # Checks every SLA queue in turn; a no-op when auto rerouting is disabled.
  def perform
    return unless Sidekiq::Routing::Auto.enabled?

    Sidekiq::Routing::Auto::Router.sla_queues.each do |queue_name|
      process_queue(queue_name)
    end
  end

  private

  # Reroutes one batch out of +queue_name+ when it is over capacity and a
  # next tier exists. Noisy-neighbor classes are moved when any are found;
  # otherwise any eligible jobs are moved.
  def process_queue(queue_name)
    # One detector serves both the capacity check and the noisy-neighbor
    # pass, so each class's average duration is looked up once.
    detector = Sidekiq::Routing::Auto::NoisyNeighborDetector.new(queue_name)
    return unless should_reroute?(queue_name, detector)

    target_queue = Sidekiq::Routing::Auto::Router.next_queue_for(queue_name)
    return unless target_queue

    noisy_neighbors = detector.identify_noisy_neighbors
    rerouter = Sidekiq::Routing::Auto::BatchRerouter.new

    if noisy_neighbors.any?
      rerouter.reroute_jobs(queue_name, target_queue, job_classes: noisy_neighbors)
    else
      rerouter.reroute_jobs(queue_name, target_queue)
    end
  end

  # Whether the queue's estimated workload exceeds
  # `capacity_threshold_percent` of its capacity. Logs the figures either way.
  #
  # @return [Boolean] false when the queue has no configured SLA
  def should_reroute?(queue_name, detector = Sidekiq::Routing::Auto::NoisyNeighborDetector.new(queue_name))
    sla_seconds = Sidekiq::Routing::Auto.configuration.sla_thresholds[queue_name]
    return false unless sla_seconds

    estimated_workload = detector.total_estimated_workload
    capacity = calculate_capacity(queue_name, sla_seconds)
    capacity_used_percent = (estimated_workload / capacity.to_f) * 100
    threshold = Sidekiq::Routing::Auto.configuration.capacity_threshold_percent
    queue_size = Sidekiq::Queue.new(queue_name).size
    should_reroute = capacity_used_percent > threshold

    Sidekiq::Routing::Auto.logger.info(
      "[Routing::Auto] #{queue_name}: size=#{queue_size}, " \
      "workload=#{estimated_workload.round(1)}s, capacity=#{capacity.round(1)}s, " \
      "used=#{capacity_used_percent.round(1)}%, threshold=#{threshold}%, reroute=#{should_reroute}"
    )

    should_reroute
  end

  # Seconds of work the queue can absorb within its SLA: total concurrency
  # across live processes x the queue's weight share x the SLA window.
  # Falls back to a concurrency of 10 when no process reports any.
  def calculate_capacity(queue_name, sla_seconds)
    total_concurrency = Sidekiq::ProcessSet.new.sum { |process| process["concurrency"] }.nonzero? || 10
    weight_fraction = queue_weight_fraction(queue_name)

    total_concurrency * weight_fraction * sla_seconds
  end

  # The queue's weight divided by the sum of all configured weights
  # (1.0 when nothing is configured or all weights are zero). A queue that
  # is not listed counts with weight 1.
  def queue_weight_fraction(queue_name)
    queues = Sidekiq.default_configuration[:queues] || []
    total_weight = 0
    queue_weight = 1

    queues.each do |entry|
      name, weight = parse_queue_entry(entry)
      total_weight += weight
      queue_weight = weight if name == queue_name
    end

    total_weight.zero? ? 1.0 : queue_weight.to_f / total_weight
  end

  # Normalizes one configured queue entry to [name, weight]. Accepts a bare
  # name, a "name,weight" string, or a [name, weight] array; a missing
  # weight counts as 1.
  def parse_queue_entry(entry)
    name, weight = entry.is_a?(String) ? entry.split(",", 2) : entry
    [name, weight.nil? ? 1 : weight.to_i]
  end
end
|
|
92
|
+
end
|
|
93
|
+
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Sidekiq
|
|
4
|
+
module Routing::Auto
|
|
5
|
+
# Knows the fixed ordering of SLA tiers, fastest first.
class Router
  QUEUE_HIERARCHY = %w[
    within_5_seconds
    within_1_minute
    within_5_minutes
    within_1_hour
  ].freeze

  class << self
    # The tier immediately after +current_queue+.
    #
    # @return [String, nil] nil for the last tier or for a queue that is not an SLA tier
    def next_queue_for(current_queue)
      position = QUEUE_HIERARCHY.index(current_queue)
      position && QUEUE_HIERARCHY[position + 1]
    end

    # All SLA tiers, in hierarchy order.
    def sla_queues
      QUEUE_HIERARCHY
    end
  end
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Sidekiq
|
|
4
|
+
module Routing
|
|
5
|
+
# Settings for manual routing (park / blackhole) and parked-job recovery.
class Configuration
  attr_accessor :enabled, :logger, :parked_queue, :process_parked_fallback_queue,
                :cache_ttl_seconds, :batch_limit, :batch_size, :breakdown_sample_size

  def initialize
    @enabled = true
    @logger = default_logger
    @parked_queue = PARKED_QUEUE_DEFAULT
    # Destination for a parked job that carries no stamped original queue.
    @process_parked_fallback_queue = "default"
    # Seconds a hot-path snapshot stays fresh; 0 turns caching off (Redis on every call).
    @cache_ttl_seconds = 5
    # Recovery: a nil limit moves everything, batch_size bounds each pass.
    @batch_limit = nil
    @batch_size = 100
    # Upper bound on jobs parked_breakdown scans (the parking queue can hold millions).
    @breakdown_sample_size = 1_000
  end

  private

  # Rails.logger when Rails is loaded and has one, otherwise Sidekiq.logger.
  def default_logger
    rails_logger = ::Rails.logger if defined?(::Rails) && ::Rails.respond_to?(:logger)
    rails_logger || ::Sidekiq.logger
  end
end
|
|
40
|
+
end
|
|
41
|
+
end
|