sidekiq-heroku-autoscale 0.0.1
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- checksums.yaml +7 -0
- data/README.md +165 -0
- data/lib/sidekiq-heroku-autoscale.rb +2 -0
- data/lib/sidekiq/heroku_autoscale.rb +69 -0
- data/lib/sidekiq/heroku_autoscale/heroku_app.rb +144 -0
- data/lib/sidekiq/heroku_autoscale/middleware.rb +21 -0
- data/lib/sidekiq/heroku_autoscale/poll_interval.rb +34 -0
- data/lib/sidekiq/heroku_autoscale/process.rb +266 -0
- data/lib/sidekiq/heroku_autoscale/queue_system.rb +102 -0
- data/lib/sidekiq/heroku_autoscale/scale_strategy.rb +53 -0
- data/lib/sidekiq/heroku_autoscale/version.rb +7 -0
- data/lib/sidekiq/heroku_autoscale/web.rb +7 -0
- data/lib/sidekiq/heroku_autoscale/web/inactive.erb +4 -0
- data/lib/sidekiq/heroku_autoscale/web/index.erb +38 -0
- data/lib/sidekiq/heroku_autoscale/web/index.js +182 -0
- data/lib/sidekiq/heroku_autoscale/web_extension.rb +38 -0
- metadata +86 -0
data/lib/sidekiq/heroku_autoscale/process.rb
@@ -0,0 +1,266 @@
module Sidekiq
  module HerokuAutoscale

    class Process
      WAKE_THROTTLE = PollInterval.new(:wait_for_update!, before_update: 2)
      SHUTDOWN_POLL = PollInterval.new(:wait_for_shutdown!, before_update: 10)

      attr_reader :app_name, :name, :throttle, :history, :client
      attr_reader :queue_system, :scale_strategy

      attr_accessor :active_at, :updated_at, :quieted_at
      attr_accessor :dynos, :quieted_to, :quiet_buffer

      def initialize(
        name: 'worker',
        app_name: nil,
        client: nil,
        throttle: 10, # 10 seconds
        history: 3600, # 1 hour
        quiet_buffer: 10,
        system: {},
        scale: {}
      )
        @app_name = app_name || name.to_s
        @name = name.to_s
        @client = client
        @queue_system = QueueSystem.new(system)
        @scale_strategy = ScaleStrategy.new(scale)

        @dynos = 0
        @active_at = nil
        @updated_at = nil
        @quieted_at = nil
        @quieted_to = nil

        @throttle = throttle
        @history = history
        @quiet_buffer = quiet_buffer
      end

      def status
        if shutting_down?
          'stopping'
        elsif quieting?
          'quieting'
        elsif @dynos > 0
          'running'
        else
          'stopped'
        end
      end

      # request a throttled update
      def ping!
        @active_at = Time.now.utc
        if ::Sidekiq.server?
          # submit the process for rescaling (up or down)
          # the process is polled until shutdown occurs
          SHUTDOWN_POLL.call(self)
        else
          # submit the process for upscaling (wake up)
          # the process is polled until an update is run
          WAKE_THROTTLE.call(self)
        end
      end

      # checks if the system is downscaling
      # no other scaling is allowed during a cooling period
      def quieting?
        !!(@quieted_to && @quieted_at)
      end

      def shutting_down?
        quieting? && @quieted_to.zero?
      end

      def fulfills_quietdown?
        !!(@quieted_at && Time.now.utc >= @quieted_at + @quiet_buffer)
      end

      # check if a probe time is newer than the last update
      def updated_since_last_activity?
        !!(@active_at && @updated_at && @updated_at > @active_at)
      end

      # check if the last update falls within the throttle window
      def throttled?
        !!(@updated_at && Time.now.utc < @updated_at + @throttle)
      end

      # starts a quietdown period in which excess workers are quieted
      # no formation changes are allowed during a quietdown window.
      def quietdown(to=0)
        quiet_to = [0, to].max
        quiet_at = Time.now.utc
        unless queue_system.quietdown!(quiet_to)
          # omit quiet buffer if no workers were actually quieted
          # allows direct downscaling without buffer delay
          # (though uptime buffer may still have an effect)
          quiet_at -= (@quiet_buffer + 1)
        end
        set_attributes(quieted_to: quiet_to, quieted_at: quiet_at)
      end

      # wrapper for throttling the upscale process (client)
      # polling runs until the next update has been called.
      def wait_for_update!
        # resolve (true) when already updated by another process
        # keep waiting (false) when:
        # - redundant updates are called within the throttle window
        # - the system has been fully quieted and must shutdown before upscaling
        return true if updated_since_last_activity?
        return false if throttled?

        # first round of checks use local (process-specific) settings
        # now hit the redis cache and double check settings from other processes
        sync_attributes
        return true if updated_since_last_activity?
        return false if throttled?

        update!
        true
      end

      # wrapper for monitoring the downscale process (server)
      # polling runs until an update returns zero dynos.
      def wait_for_shutdown!
        return false if throttled?

        sync_attributes
        return false if throttled?

        update!.zero?
      end

      # update the process with live dyno count from Heroku,
      # and then reassess workload and scale transitions.
      # this method shouldn't be called directly... just ping! it.
      def update!(current=nil, target=nil)
        current ||= fetch_dyno_count

        attrs = { dynos: current, updated_at: Time.now.utc }
        if current.zero?
          attrs[:quieted_to] = nil
          attrs[:quieted_at] = nil
        end
        set_attributes(attrs)

        # No changes are allowed while quieting...
        # the quieted dyno needs to be removed (downscaled)
        # before making other changes to the formation.
        unless quieting?
          # select a new scale target to shoot for
          # (provides a trajectory, not necessarily a destination)
          target ||= scale_strategy.call(queue_system)

          # idle
          if current == target
            ::Sidekiq.logger.info("IDLE at #{ target } dynos")
            return current

          # upscale
          elsif current < target
            return set_dyno_count!(target)

          # quietdown
          elsif current > target
            ::Sidekiq.logger.info("QUIET to #{ current - 1 } dynos")
            quietdown(current - 1)
            # do NOT return...
            # allows downscale conditions to run during the same update
          end
        end

        # downscale
        if quieting? && fulfills_quietdown?
          return set_dyno_count!(@quieted_to)
        end

        current
      end

      # gets a live dyno count from Heroku
      def fetch_dyno_count
        if @client
          @client.formation.list(app_name)
            .select { |item| item['type'] == name }
            .map { |item| item['quantity'] }
            .reduce(0, &:+)
        else
          @dynos
        end
      rescue StandardError => e
        ::Sidekiq::HerokuAutoscale.exception_handler.call(e)
        0
      end

      # sets the live dyno count on Heroku
      def set_dyno_count!(count)
        ::Sidekiq.logger.info("SCALE to #{ count } dynos")
        @client.formation.update(app_name, name, { quantity: count }) if @client
        set_attributes(dynos: count, quieted_to: nil, quieted_at: nil, history_at: Time.now.utc)
        count
      rescue StandardError => e
        ::Sidekiq::HerokuAutoscale.exception_handler.call(e)
        @dynos
      end

      # sets redis-cached process attributes
      def set_attributes(attrs)
        cache = {}
        prev_dynos = @dynos
        if attrs.key?(:dynos)
          cache['dynos'] = @dynos = attrs[:dynos]
        end
        if attrs.key?(:quieted_to)
          cache['quieted_to'] = @quieted_to = attrs[:quieted_to]
        end
        if attrs.key?(:quieted_at)
          @quieted_at = attrs[:quieted_at]
          cache['quieted_at'] = @quieted_at ? @quieted_at.to_i : nil
        end
        if attrs.key?(:updated_at)
          @updated_at = attrs[:updated_at]
          cache['updated_at'] = @updated_at ? @updated_at.to_i : nil
        end

        ::Sidekiq.redis do |c|
          c.pipelined do
            # set new keys, delete expired keys
            del, set = cache.partition { |k, v| v.nil? }
            c.hmset(cache_key, *set.flatten) if set.any?
            c.hdel(cache_key, *del.map(&:first)) if del.any?

            if attrs[:history_at]
              # set a dyno count history marker
              event_time = (attrs[:history_at].to_f / @throttle).floor * @throttle
              history_page = (attrs[:history_at].to_f / @history).floor * @history
              history_key = "#{ cache_key }:#{ history_page }"

              c.hmset(history_key, (event_time - @throttle).to_s, prev_dynos, event_time.to_s, @dynos)
              c.expire(history_key, @history * 2)
            end
          end
        end
      end

      # syncs configuration across process instances (dynos)
      def sync_attributes
        if cache = ::Sidekiq.redis { |c| c.hgetall(cache_key) }
          @dynos = cache['dynos'] ? cache['dynos'].to_i : 0
          @quieted_to = cache['quieted_to'] ? cache['quieted_to'].to_i : nil
          @quieted_at = cache['quieted_at'] ? Time.at(cache['quieted_at'].to_i).utc : nil
          @updated_at = cache['updated_at'] ? Time.at(cache['updated_at'].to_i).utc : nil
          return true
        end
        false
      end

      def cache_key
        [self.class.name.gsub('::', '/').downcase, app_name, name].join(':')
      end
    end

  end
end
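For orientation, here is a minimal sketch of how a `Process` might be constructed and pinged, based only on the constructor signature above. The `platform-api` client, token variable, app name, and option values are illustrative assumptions, not documented setup from this gem's README.

```ruby
# Illustrative sketch only -- names and values below are assumptions.
require 'platform-api'              # assumed Heroku API client gem
require 'sidekiq-heroku-autoscale'

heroku = PlatformAPI.connect_oauth(ENV['HEROKU_ACCESS_TOKEN'])  # hypothetical token variable

process = Sidekiq::HerokuAutoscale::Process.new(
  name: 'worker',               # Heroku process type running Sidekiq
  app_name: 'my-heroku-app',    # hypothetical Heroku app name
  client: heroku,               # used by fetch_dyno_count / set_dyno_count!
  throttle: 10,                 # seconds between formation updates
  history: 3600,                # seconds of dyno-count history to keep
  quiet_buffer: 10,             # seconds to wait after quieting workers
  system: { watch_queues: '*' },          # passed through to QueueSystem.new
  scale: { mode: :binary, max_dynos: 2 }  # passed through to ScaleStrategy.new
)

# ping! is throttled: web/client processes poll for an upscale update,
# while the Sidekiq server polls until the formation scales to zero.
process.ping!
```

In practice the gem's middleware and web extension presumably call `ping!` on your behalf; the point of the sketch is how the options above map onto `QueueSystem` and `ScaleStrategy`.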
data/lib/sidekiq/heroku_autoscale/queue_system.rb
@@ -0,0 +1,102 @@
require 'sidekiq/api'

module Sidekiq
  module HerokuAutoscale

    class QueueSystem
      ALL_QUEUES = '*'.freeze

      attr_accessor :watch_queues, :include_retrying, :include_scheduled

      def initialize(watch_queues: ALL_QUEUES, include_retrying: true, include_scheduled: true)
        @watch_queues = [watch_queues].flatten.uniq
        @include_retrying = include_retrying
        @include_scheduled = include_scheduled
      end

      def all_queues?
        @watch_queues.first == ALL_QUEUES
      end

      # number of dynos (process instances) running sidekiq
      # this may include one-or-more instances of one-or-more heroku process types
      # (though they should all be one process type if setup validation was observed)
      def dynos
        sidekiq_processes.size
      end

      # number of worker threads currently running sidekiq jobs
      # counts all queue-specific threads across all dynos (process instances)
      def threads
        # work => { 'queue' => name, 'run_at' => timestamp, 'payload' => msg }
        worker_set = ::Sidekiq::Workers.new.to_a
        worker_set = worker_set.select { |pid, tid, work| watch_queues.include?(work['queue']) } unless all_queues?
        worker_set.length
      end

      # number of jobs sitting in the active work queue
      def enqueued
        counts = all_queues? ? sidekiq_queues.values : sidekiq_queues.slice(*watch_queues).values
        counts.map(&:to_i).reduce(&:+) || 0
      end

      # number of jobs in the scheduled set
      def scheduled
        return 0 unless @include_scheduled
        count_jobs(::Sidekiq::ScheduledSet.new)
      end

      # number of jobs in the retry set
      def retrying
        return 0 unless @include_retrying
        count_jobs(::Sidekiq::RetrySet.new)
      end

      def total_work
        enqueued + scheduled + retrying + threads
      end

      def has_work?
        total_work > 0
      end

      # When scaling down workers, heroku stops the one with the highest number...
      # from https://stackoverflow.com/questions/25215334/scale-down-specific-heroku-worker-dynos
      def quietdown!(scale)
        quieted = false
        # processes have hostnames formatted as "worker.1", "worker.2", "sidekiq.1", etc...
        # this groups processes by type, then sorts by number, and then quiets beyond scale.
        sidekiq_processes.group_by { |p| p['hostname'].split('.').first }.each_pair do |type, group|
          # there should only ever be a single group here (assuming setup validations were observed)
          group.sort_by { |p| p['hostname'].split('.').last.to_i }.each_with_index do |process, index|
            if index + 1 > scale && !process.stopping?
              process.quiet!
              quieted = true
            end
          end
        end

        quieted
      end

      def sidekiq_queues
        ::Sidekiq::Stats.new.queues
      end

      def sidekiq_processes
        process_set = ::Sidekiq::ProcessSet.new
        # select all processes with queues that intersect watched queues
        process_set = process_set.select { |p| (p['queues'] & @watch_queues).any? } unless all_queues?
        process_set
      end

      private

      def count_jobs(job_set)
        return job_set.size if all_queues?
        job_set.count { |j| watch_queues.include?(j.queue) }
      end
    end

  end
end
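As a quick illustration of how the counters above compose, here is a sketch of a `QueueSystem` watching two hypothetical queues; the queue names are placeholders.

```ruby
# Hypothetical queue names; all counters come from Sidekiq's own API.
system = Sidekiq::HerokuAutoscale::QueueSystem.new(
  watch_queues: %w[default mailers],
  include_scheduled: true,
  include_retrying: false   # retrying then always reports 0
)

system.dynos       # Sidekiq processes serving the watched queues
system.threads     # busy worker threads on those queues
system.total_work  # enqueued + scheduled + retrying + threads
system.has_work?   # true while total_work > 0
```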
data/lib/sidekiq/heroku_autoscale/scale_strategy.rb
@@ -0,0 +1,53 @@
module Sidekiq
  module HerokuAutoscale

    class ScaleStrategy
      attr_accessor :mode, :max_dynos, :workers_per_dyno, :min_factor

      def initialize(mode: :binary, max_dynos: 1, workers_per_dyno: 25, min_factor: 0)
        @mode = mode
        @max_dynos = max_dynos
        @workers_per_dyno = workers_per_dyno
        @min_factor = min_factor
      end

      def call(sys)
        case @mode.to_s
        when 'linear'
          linear(sys)
        else
          binary(sys)
        end
      end

      def binary(sys)
        sys.has_work? ? @max_dynos : 0
      end

      def linear(sys)
        # total capacity of max workers
        total_capacity = (@max_dynos * @workers_per_dyno).to_f

        # min capacity required to scale first worker
        min_capacity = [0, @min_factor].max.to_f * @workers_per_dyno

        # min percentage of total capacity
        min_capacity_percentage = min_capacity / total_capacity
        requested_capacity_percentage = sys.total_work / total_capacity

        # Scale requested capacity taking into account the minimum required
        scale_factor = (requested_capacity_percentage - min_capacity_percentage) / (total_capacity - min_capacity_percentage)
        scale_factor = 0 if scale_factor.nan? # Handle DIVZERO
        scaled_capacity_percentage = scale_factor * total_capacity

        # don't scale down past number of currently engaged workers,
        # and don't scale up past maximum dynos
        ideal_dynos = ([0, scaled_capacity_percentage].max * @max_dynos).ceil
        minimum_dynos = [sys.dynos, ideal_dynos].max
        maximum_dynos = [minimum_dynos, @max_dynos].min
        [minimum_dynos, maximum_dynos].min
      end
    end

  end
end
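To make the two modes concrete, here is a sketch that feeds the strategy a stand-in for `QueueSystem` exposing only the readers `ScaleStrategy` touches. The stand-in struct is an illustration, not part of the gem; with `max_dynos: 2` and `workers_per_dyno: 25`, the commented results follow from this version's linear arithmetic above.

```ruby
# Stand-in for QueueSystem: only the readers ScaleStrategy uses.
FakeSystem = Struct.new(:total_work, :dynos) do
  def has_work?
    total_work > 0
  end
end

binary = Sidekiq::HerokuAutoscale::ScaleStrategy.new(mode: :binary, max_dynos: 2)
binary.call(FakeSystem.new(0, 0))   # => 0  no work, scale everything down
binary.call(FakeSystem.new(1, 0))   # => 2  any work jumps straight to max_dynos

linear = Sidekiq::HerokuAutoscale::ScaleStrategy.new(
  mode: :linear, max_dynos: 2, workers_per_dyno: 25
)
linear.call(FakeSystem.new(10, 0))  # => 1  10 jobs fit within one dyno's 25 workers
linear.call(FakeSystem.new(30, 0))  # => 2  30 jobs spill into a second dyno
```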
data/lib/sidekiq/heroku_autoscale/web/index.erb
@@ -0,0 +1,38 @@
<div class="dashboard clearfix">
  <h3>Heroku Dynos
    <span class="beacon" id="beacon">
      <span class="ring"></span>
      <span class="dot"></span>
    </span>
  </h3>
</div>

<div class="row chart">
  <div id="history" data-update-url="<%= root_path %>stats" data-dynos-url="<%= root_path %>dynos/stats"></div>
  <div id="history-legend"></div>
  <script id="history-data" type="text/json">
    <%= JSON.generate(@dyno_stats) %>
  </script>
</div>

<h5>Process types</h5>
<div class="table_container">
  <table class="processes table table-hover table-bordered table-striped table-white">
    <thead>
      <th>Name</th>
      <th>Updated at</th>
      <th>Status</th>
      <th>Dynos</th>
    </thead>
    <% @dyno_stats.each_pair do |key, stats| %>
      <tr>
        <td><%= key %></td>
        <td id="<%= key %>-updated"><%= stats[:updated] %></td>
        <td id="<%= key %>-status"><%= stats[:status] %></td>
        <td id="<%= key %>-dynos"><%= stats[:dynos] %></td>
      </tr>
    <% end %>
  </table>
</div>

<script type="text/javascript" src="<%= root_path %>dynos/index.js"></script>