sidekiq-amigo 1.11.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/amigo/autoscaler/checkers/fake.rb +22 -0
- data/lib/amigo/autoscaler/checkers/sidekiq.rb +19 -0
- data/lib/amigo/autoscaler/checkers/web_latency.rb +84 -0
- data/lib/amigo/autoscaler/handlers/chain.rb +28 -0
- data/lib/amigo/autoscaler/handlers/fake.rb +27 -0
- data/lib/amigo/autoscaler/handlers/heroku.rb +141 -0
- data/lib/amigo/autoscaler/handlers/log.rb +35 -0
- data/lib/amigo/autoscaler/handlers/sentry.rb +38 -0
- data/lib/amigo/autoscaler.rb +71 -96
- data/lib/amigo/version.rb +1 -1
- metadata +37 -2
- data/lib/amigo/autoscaler/heroku.rb +0 -145
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8f2d4776669bc7327b064ae2f0b2f22a83e35cd351da05a145d1b3b7bf086334
+  data.tar.gz: b3122b4fa37a8c6c93afbd485ce4b813536a2c1b91d96c266b2c2bb04be15d98
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c24ff6b6af38bb638be36dfa18aaca7500df9bf0126a4adf814e179fc05f3784b27004286e4ef796285cd047388d88486ee7da5fe9050454e33fc9b52d8f4698
+  data.tar.gz: 52b8edc30efe323786963559a7330058bd6b70552eedbdb4f7d3f3a8727373de6d033f2c0dd0113f9303f80e312d957df1e50c1f0fb5aec3e7bdcd9918b3a3bf
data/lib/amigo/autoscaler/checkers/fake.rb
ADDED
@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Checkers
+      class Fake < Amigo::Autoscaler::Checker
+        def initialize(latencies)
+          @latencies = latencies
+          super()
+        end
+
+        def get_latencies
+          return @latencies.call if @latencies.respond_to?(:call)
+          return @latencies.shift if @latencies.is_a?(Array)
+          return @latencies
+        end
+      end
+    end
+  end
+end
data/lib/amigo/autoscaler/checkers/sidekiq.rb
ADDED
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+require "sidekiq/api"
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Checkers
+      class Sidekiq < Amigo::Autoscaler::Checker
+        def get_latencies
+          return ::Sidekiq::Queue.all.
+              map { |q| [q.name, q.latency] }.
+              to_h
+        end
+      end
+    end
+  end
+end
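This checker is the extracted form of the queue polling that previously lived inside Autoscaler#check itself. A minimal wiring sketch against the new constructor shown later in this diff (the Log handler also ships in this release):

    require "amigo/autoscaler"
    require "amigo/autoscaler/checkers/sidekiq"
    require "amigo/autoscaler/handlers/log"

    Amigo::Autoscaler.new(
      checker: Amigo::Autoscaler::Checkers::Sidekiq.new, # {queue name => latency in seconds}
      handler: Amigo::Autoscaler::Handlers::Log.new,     # logs each scale_up/scale_down
    ).start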
data/lib/amigo/autoscaler/checkers/web_latency.rb
ADDED
@@ -0,0 +1,84 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Checkers
+      class WebLatency < Amigo::Autoscaler::Checker
+        NAMESPACE = "amigo/autoscaler/web_latency"
+        WINDOW = 60
+
+        # Set the latency.
+        # @param redis [RedisClient::Common] Redis connection.
+        # @param namespace [String] Key namespace.
+        # @param at [Time,Integer] Time this record was taken.
+        # @param duration [Numeric] Duration of the request in fractional seconds.
+        def self.set_latency(redis:, namespace:, at:, duration:)
+          bucket = at.to_i
+          key = "#{namespace}/latencies:#{bucket}"
+          duration_ms = (duration * 1000).round
+          redis.call("HINCRBY", key, "count", 1)
+          redis.call("HINCRBY", key, "sum", duration_ms)
+          redis.call("EXPIRE", key, WINDOW + 1)
+        end
+
+        def initialize(redis:, namespace: NAMESPACE)
+          @redis = redis
+          @namespace = namespace
+          super()
+        end
+
+        def get_latencies
+          now = Time.now.to_i
+          keys = (now - 59..now).map { |t| "#{@namespace}/latencies:#{t}" }
+          counts = 0
+          sums = 0
+          results = @redis.pipelined do |pipeline|
+            keys.each do |k|
+              pipeline.call("HMGET", k, "count", "sum")
+            end
+          end
+          results.each do |count, sum|
+            counts += count.to_i
+            sums += sum.to_i
+          end
+          return {} if counts.zero?
+          latency = sums.to_f / counts
+          return {"web" => latency.to_f / 1000}
+        end
+
+        class Middleware
+          # @param threshold [Float] Do not record the latency of requests faster than this.
+          #   These are usually just things like healthchecks, files, or other very fast requests
+          #   which do not represent the overall system slowness.
+          def initialize(app, redis:, threshold: 0.08, namespace: NAMESPACE)
+            @app = app
+            @redis = redis
+            @threshold = threshold
+            @namespace = namespace
+          end
+
+          def call(env)
+            start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+            status, headers, body = @app.call(env)
+            duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
+            if duration > @threshold
+              begin
+                WebLatency.set_latency(
+                  redis: @redis,
+                  namespace: @namespace,
+                  at: Time.now,
+                  duration:,
+                )
+              rescue StandardError => e
+                Amigo.log(nil, :error, "web_latency_error", exception: e)
+              end
+            end
+            [status, headers, body]
+          end
+        end
+      end
+    end
+  end
+end
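The middleware buckets request durations into per-second Redis hashes, and get_latencies averages the last ~60 buckets into a single "web" entry, so web latency can drive the same autoscaler as queue latency. A sketch of the Rack wiring, assuming the redis-client gem (the app and connection setup are illustrative):

    # config.ru
    require "redis-client"
    require "amigo/autoscaler/checkers/web_latency"

    redis = RedisClient.config(url: ENV["REDIS_URL"]).new_client
    use Amigo::Autoscaler::Checkers::WebLatency::Middleware, redis: redis
    run MyApp # stand-in for your Rack application

The scaling process would then poll the same buckets by constructing WebLatency.new(redis: redis) as its checker.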
data/lib/amigo/autoscaler/handlers/chain.rb
ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      class Chain < Amigo::Autoscaler::Handler
+        attr_accessor :chain
+
+        # Chain multiple handlers together.
+        # @param chain [Array<Amigo::Autoscaler::Handler>]
+        def initialize(chain)
+          @chain = chain
+          super()
+        end
+
+        def scale_up(*args, **kw)
+          @chain.each { |c| c.scale_up(*args, **kw) }
+        end
+
+        def scale_down(*args, **kw)
+          @chain.each { |c| c.scale_down(*args, **kw) }
+        end
+      end
+    end
+  end
+end
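Chain is the replacement for the old array-of-handlers argument: each scale_up/scale_down fans out to every handler in order, so alerting and real scaling can be combined. A sketch, assuming heroku_client was built with PlatformAPI.connect_oauth:

    handler = Amigo::Autoscaler::Handlers::Chain.new([
      Amigo::Autoscaler::Handlers::Log.new,
      Amigo::Autoscaler::Handlers::Heroku.new(client: heroku_client, formation: "worker"),
    ])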
data/lib/amigo/autoscaler/handlers/fake.rb
ADDED
@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      class Fake < Amigo::Autoscaler::Handler
+        attr_accessor :ups, :downs
+
+        def initialize
+          @ups = []
+          @downs = []
+          super()
+        end
+
+        def scale_up(checked_latencies, depth:, duration:, **kw)
+          @ups << [checked_latencies, depth, duration, kw]
+        end
+
+        def scale_down(depth:, duration:, **kw)
+          @downs << [depth, duration, kw]
+        end
+      end
+    end
+  end
+end
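Together with Checkers::Fake, this makes the polling loop testable without Sidekiq queues or a scaling backend. A test sketch, assuming the check cycle is driven directly (the latency numbers are illustrative; the default latency_threshold is 5):

    require "amigo/autoscaler"
    require "amigo/autoscaler/checkers/fake"
    require "amigo/autoscaler/handlers/fake"

    checker = Amigo::Autoscaler::Checkers::Fake.new([{"q1" => 10}, {}])
    handler = Amigo::Autoscaler::Handlers::Fake.new
    autoscaler = Amigo::Autoscaler.new(checker:, handler:)
    autoscaler.setup
    autoscaler.check # 10 > 5, so handler.ups gains [{"q1" => 10}, 1, duration, {}]

A second check only runs once alert_interval has elapsed, so a test would pair this with timecop (a dev dependency of this gem, per the metadata below) before asserting on handler.downs.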
data/lib/amigo/autoscaler/handlers/heroku.rb
ADDED
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+
+require "platform-api"
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      # Autoscaler to use on Heroku, that starts additional worker processes when there is a high latency event
+      # and scales them down after the event is finished.
+      #
+      # When the first call of a high latency event happens (depth: 1), this class
+      # will ask Heroku how many dynos are in the formation. This is known as +active_event_initial_workers+.
+      #
+      # If +active_event_initial_workers+ is 0, no autoscaling will be done.
+      # This avoids a situation where a high latency event is triggered
+      # due to workers being deprovisioned intentionally, perhaps for maintenance.
+      #
+      # Each time the alert fires (see +Amigo::Autoscaler#alert_interval+),
+      # an additional worker will be added to the formation, up to +max_additional_workers+.
+      # So with +active_event_initial_workers+ of 1 and +max_additional_workers+ of 2,
+      # the first time the alert fires, the formation will be set to 2 workers.
+      # The next time, it'll be set to 3 workers.
+      # After that, no additional workers will be provisioned.
+      #
+      # After the high latency event resolves,
+      # the dyno formation is restored to +active_event_initial_workers+.
+      #
+      # To use:
+      #
+      #    heroku = PlatformAPI.connect_oauth(heroku_oauth_token)
+      #    handler = Amigo::Autoscaler::Handlers::Heroku.new(client: heroku, formation: "worker")
+      #    Amigo::Autoscaler.new(
+      #      checker: Amigo::Autoscaler::Checkers::Sidekiq.new,
+      #      handler: handler,
+      #    ).start
+      #
+      # See instance attributes for additional options.
+      #
+      # Note that this class is provided as an example, and potentially a base or implementation class.
+      # Your actual implementation may also want to alert when a max depth or duration is reached,
+      # since it can indicate a bigger problem. Autoscaling, especially of workers, is a tough problem
+      # without a one-size-fits-all approach.
+      class Heroku < Amigo::Autoscaler::Handler
+        # Heroku client, usually created via PlatformAPI.connect_oauth.
+        # @return [PlatformAPI::Client]
+        attr_reader :client
+
+        # Captured at the start of a high latency event.
+        # Nil otherwise.
+        # @return [Integer]
+        attr_reader :active_event_initial_workers
+
+        # Maximum number of workers to add.
+        #
+        # As the 'depth' of the alert is increased,
+        # workers are added to the recorded worker count until the max is reached.
+        # By default, this is 2 (so the max workers will be the recorded number, plus 2).
+        # Do not set this too high, since it can for example exhaust database connections or just end up
+        # increasing load.
+        #
+        # See class docs for more information.
+        # @return [Integer]
+        attr_reader :max_additional_workers
+
+        # Defaults to HEROKU_APP_NAME, which should already be set if you use Heroku dyno metadata,
+        # as per https://devcenter.heroku.com/articles/dyno-metadata.
+        # This must be provided if the env var is missing.
+        # @return [String]
+        attr_reader :app_id_or_app_name
+
+        # Formation ID or name.
+        # Usually 'worker' to scale Sidekiq workers, or 'web' for the web worker.
+        # If you use multiple worker processes for different queues, this class probably isn't sufficient.
+        # You will probably need to look at the slow queue names and determine the formation name to scale up.
+        # @return [String]
+        attr_reader :formation
+
+        def initialize(
+          client:,
+          formation:,
+          max_additional_workers: 2,
+          app_id_or_app_name: ENV.fetch("HEROKU_APP_NAME")
+        )
+          super()
+          @client = client
+          @max_additional_workers = max_additional_workers
+          @app_id_or_app_name = app_id_or_app_name
+          @formation = formation
+          # Is nil outside a latency event, set during a latency event. So if this is initialized to non-nil,
+          # we're already in a latency event.
+          @active_event_initial_workers = Sidekiq.redis do |r|
+            v = r.get("#{namespace}/active_event_initial_workers")
+            v&.to_i
+          end
+        end
+
+        protected def namespace
+          return "amigo/autoscaler/heroku/#{self.formation}"
+        end
+
+        # Potentially add another worker to the formation.
+        # @return [:noscale, :maxscale, :scaled] One of :noscale (no +active_event_initial_workers+),
+        #   :maxscale (+max_additional_workers+ reached), or :scaled.
+        def scale_up(_queues_and_latencies, depth:, **)
+          # When the scaling event starts (or if this is the first time we've seen it
+          # but the event is already in progress), store how many workers we have.
+          # It needs to be stored in redis so it persists if
+          # the latency event continues through restarts.
+          if @active_event_initial_workers.nil?
+            @active_event_initial_workers = @client.formation.info(@app_id_or_app_name, @formation).
+              fetch("quantity")
+            Sidekiq.redis do |r|
+              r.set("#{namespace}/active_event_initial_workers", @active_event_initial_workers.to_s)
+            end
+          end
+          return :noscale if @active_event_initial_workers.zero?
+          new_quantity = @active_event_initial_workers + depth
+          max_quantity = @active_event_initial_workers + @max_additional_workers
+          return :maxscale if new_quantity > max_quantity
+          @client.formation.update(@app_id_or_app_name, @formation, {quantity: new_quantity})
+          return :scaled
+        end
+
+        # Reset the formation to +active_event_initial_workers+.
+        # @return [:noscale, :scaled] :noscale if +active_event_initial_workers+ is 0, otherwise :scaled.
+        def scale_down(**)
+          initial_workers = @active_event_initial_workers
+          Sidekiq.redis do |r|
+            r.del("#{namespace}/active_event_initial_workers")
+          end
+          @active_event_initial_workers = nil
+          return :noscale if initial_workers.zero?
+          @client.formation.update(@app_id_or_app_name, @formation, {quantity: initial_workers})
+          return :scaled
+        end
+      end
+    end
+  end
+end
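Because the Redis key now embeds the formation name (the removed handler at the bottom of this diff used one fixed namespace), a single app can scale its web and worker formations independently; the Autoscaler side then needs distinct namespace: values too, per its docs below. A sketch, assuming heroku is a PlatformAPI client and redis a RedisClient connection:

    worker_scaler = Amigo::Autoscaler.new(
      namespace: "amigo/autoscaler/worker",
      checker: Amigo::Autoscaler::Checkers::Sidekiq.new,
      handler: Amigo::Autoscaler::Handlers::Heroku.new(client: heroku, formation: "worker"),
    )
    web_scaler = Amigo::Autoscaler.new(
      namespace: "amigo/autoscaler/web",
      checker: Amigo::Autoscaler::Checkers::WebLatency.new(redis: redis),
      handler: Amigo::Autoscaler::Handlers::Heroku.new(client: heroku, formation: "web"),
    )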
data/lib/amigo/autoscaler/handlers/log.rb
ADDED
@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      class Log < Amigo::Autoscaler::Handler
+        DEFAULT_LOG = ->(level, message, params={}) { Amigo.log(nil, level, message, params) }
+
+        # @param message [String] Log message for structured logging.
+        #   Has "_restored" appended on +scale_down+.
+        # @param log [Proc] Proc/callable called with (level, message, params={}).
+        #   By default, use +Amigo.log+ (which logs to the Sidekiq logger).
+        def initialize(message: "high_latency_queues", log: DEFAULT_LOG)
+          @message = message
+          @log = log
+          super()
+        end
+
+        def scale_up(checked_latencies, depth:, duration:, **_kw)
+          self._log(:warn, @message, queues: checked_latencies, depth: depth, duration: duration)
+        end
+
+        def scale_down(depth:, duration:, **_kw)
+          self._log(:info, "#{@message}_restored", depth: depth, duration: duration)
+        end
+
+        protected def _log(level, msg, **kw)
+          @log[level, msg, kw]
+        end
+      end
+    end
+  end
+end
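The log: callable receives (level, message, params), so alerts can be routed anywhere. A sketch using Ruby's stdlib Logger (the JSON formatting is illustrative):

    require "json"
    require "logger"

    logger = Logger.new($stdout)
    handler = Amigo::Autoscaler::Handlers::Log.new(
      log: ->(level, message, params={}) { logger.public_send(level, "#{message} #{params.to_json}") },
    )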
data/lib/amigo/autoscaler/handlers/sentry.rb
ADDED
@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      class Sentry < Amigo::Autoscaler::Handler
+        # @param interval [Integer] How many seconds between Sentry alerts?
+        #   This is similar to +alert_interval+ on the Autoscaler,
+        #   but Sentry has its own interval, since it is used for reporting,
+        #   and not latency reduction.
+        # @param message [String] Message to capture.
+        # @param level [:debug,:info,:warning,:warn,:error,:fatal] Sentry level.
+        def initialize(interval: 300, message: "Some queues have a high latency", level: :warn)
+          @interval = interval
+          @message = message
+          @level = level
+          @last_alerted = Time.at(0)
+          super()
+        end
+
+        def scale_up(checked_latencies, depth:, duration:, **)
+          now = Time.now
+          call_sentry = @last_alerted < (now - @interval)
+          return unless call_sentry
+          ::Sentry.with_scope do |scope|
+            scope&.set_extras(high_latency_queues: checked_latencies, depth:, duration:)
+            ::Sentry.capture_message(@message, level: @level)
+          end
+          @last_alerted = now
+        end
+
+        def scale_down(**) = nil
+      end
+    end
+  end
+end
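The handler tracks its own @last_alerted, so Sentry reports can be throttled independently of how often the autoscaler re-alerts. A sketch pairing chatty logging with infrequent Sentry messages, assuming Sentry.init has already run:

    handler = Amigo::Autoscaler::Handlers::Chain.new([
      Amigo::Autoscaler::Handlers::Log.new,                   # every alert_interval
      Amigo::Autoscaler::Handlers::Sentry.new(interval: 600), # at most every 10 minutes
    ])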
data/lib/amigo/autoscaler.rb
CHANGED
@@ -4,37 +4,37 @@ require "sidekiq/api"
 
 require "amigo"
 
-#
-# take
+# Generic autoscaling handler that will check for latency
+# and take an action.
+# For Sidekiq on Heroku for instance,
+# this means checking queues for a latency above a threshold, and adding workers up to a limit.
+#
 # You should start this up at Web application startup:
 #
 #   # puma.rb or similar
-#   Amigo::Autoscaler.new
+#   checker = Amigo::Autoscaler::Checkers::Sidekiq.new
+#   heroku_client = PlatformAPI.connect_oauth(ENV['MYAPP_HEROKU_OAUTH_TOKEN'])
+#   handler = Amigo::Autoscaler::Handlers::Heroku.new(client: heroku_client, formation: 'worker')
+#   Amigo::Autoscaler.new(checker:, handler:).start
 #
 # When latency grows beyond +latency_threshold+,
 # a "high latency event" is started.
-# Some action
-# This includes logging, alerting, and/or autoscaling.
+# Some action should be taken, which is handled by the handler's +scale_up+ method.
+# This usually includes logging, alerting, and/or autoscaling.
 #
 # When latency returns to normal (defined by +latency_restored_threshold+),
 # the high latency event finishes.
-# Some additional action is taken,
+# Some additional action is taken, handled by the handler's +scale_down+ method.
 # Usually this is logging, and/or returning autoscaling to its original status.
 #
 # There are several parameters to control behavior, such as how often polling is done,
 # how often alerting/scaling is done, and more.
 #
-# As an example autoscaler that includes actual resource scaling,
-# check out +Amigo::Autoscaler::Heroku+.
-# Its ideas can easily be expanded to other platforms.
-#
 # Note that +Autoscaler+ maintains its state over multiple processes;
 # it needs to keep track of high latency events even if the process running the autoscaler
 # (usually a web process) restarts.
 module Amigo
   class Autoscaler
-    class InvalidHandler < StandardError; end
-
     # Struct representing data serialized to Redis.
     # Useful for diagnostics. Can be retrieved with +fetch_persisted+.
     # @!attribute last_alerted_at [Time] 0-time if there is no recent alert.
@@ -56,49 +56,32 @@ module Amigo
     # are generally easier to find).
     # @return [Regexp]
     attr_reader :hostname_regex
-    # Methods to call when alerting, as strings/symbols or procs.
-    # Valid string values are 'log' and 'sentry' (requires Sentry to be required already).
-    # Anything that responds to +call+ will be invoked with:
-    # - Positional argument which is a +Hash+ of `{queue name => latency in seconds}`
-    # - Keyword argument +:depth+: Number of alerts as part of this latency event.
-    #   For example, the first alert has a depth of 1, and if latency stays high,
-    #   it'll be 2 on the next call, etc. +depth+ can be used to incrementally provision
-    #   additional processing capacity, and stop adding capacity at a certain depth
-    #   to avoid problems with too many workers (like excessive DB load).
-    # - Keyword argument +:duration+: Number of seconds since this latency spike started.
-    # - Additional undefined keywords. Handlers should accept additional options,
-    #   like via `**kw` or `opts={}`, for compatibility.
-    # @return [Array<String,Symbol,Proc,#call>]
-    attr_reader :handlers
     # Only alert this often.
     # For example, with poll_interval of 10 seconds
     # and alert_interval of 200 seconds,
     # we'd alert once and then 210 seconds later.
     # @return [Integer]
     attr_reader :alert_interval
+
     # After an alert happens, what latency should be considered "back to normal" and
-    # +
+    # +scale_down+ will be called?
     # In most cases this should be the same as (and defaults to) +latency_threshold+
     # so that we're 'back to normal' once we're below the threshold.
     # It may also commonly be 0, so that the callback is fired when the queue is entirely clear.
     # Note that, if +latency_restored_threshold+ is less than +latency_threshold+,
     # while the latency is between the two, no alerts will fire.
     attr_reader :latency_restored_threshold
-
-    #
-
-    #
-
-
-    #
-    #
-    #
-
-
-    attr_reader :latency_restored_handlers
-    # Proc/callable called with (level, message, params={}).
-    # By default, use +Amigo.log+ (which logs to the Sidekiq logger).
-    attr_reader :log
+
+    # @return [Amigo::Autoscaler::Checker]
+    attr_reader :checker
+    # @return [Amigo::Autoscaler::Handler]
+    attr_reader :handler
+
+    # Store autoscaler keys in this Redis namespace.
+    # Note that if you are running multiple autoscalers for different services (web, worker),
+    # you will need different namespaces.
+    attr_reader :namespace
+
     # Proc called with an exception that occurs while the thread is running.
     # If the handler returns +true+, then the thread will keep going.
     # All other values will kill the thread, which breaks autoscaling.
@@ -108,15 +91,15 @@ module Amigo
     attr_reader :on_unhandled_exception
 
     def initialize(
+      handler:,
+      checker:,
       poll_interval: 20,
       latency_threshold: 5,
       hostname_regex: /^web\.1$/,
-      handlers: [:log],
       alert_interval: 120,
       latency_restored_threshold: latency_threshold,
-
-
-      on_unhandled_exception: nil
+      on_unhandled_exception: nil,
+      namespace: "amigo/autoscaler"
     )
       raise ArgumentError, "latency_threshold must be > 0" if
         latency_threshold <= 0
@@ -124,15 +107,15 @@ module Amigo
         latency_restored_threshold.negative?
       raise ArgumentError, "latency_restored_threshold must be <= latency_threshold" if
         latency_restored_threshold > latency_threshold
+      @handler = handler
+      @checker = checker
       @poll_interval = poll_interval
       @latency_threshold = latency_threshold
       @hostname_regex = hostname_regex
-      @handlers = handlers.freeze
       @alert_interval = alert_interval
       @latency_restored_threshold = latency_restored_threshold
-      @latency_restored_handlers = latency_restored_handlers.freeze
-      @log = log
       @on_unhandled_exception = on_unhandled_exception
+      @namespace = namespace
     end
 
     # @return [Thread]
@@ -143,8 +126,6 @@ module Amigo
     def setup
       # Store these as strings OR procs, rather than grabbing self.method here.
       # It gets extremely hard to test if we capture the method here.
-      @alert_methods = self.handlers.map { |a| _handler_to_method("alert_", a) }
-      @restored_methods = self.latency_restored_handlers.map { |a| _handler_to_method("alert_restored_", a) }
       @stop = false
       persisted = self.fetch_persisted
       @last_alerted = persisted.last_alerted_at
@@ -181,24 +162,13 @@ module Amigo
       end
     end
 
-    protected def namespace
-      return "amigo/autoscaler"
-    end
-
-    private def _handler_to_method(prefix, a)
-      return a if a.respond_to?(:call)
-      method_name = "#{prefix}#{a.to_s.strip}".to_sym
-      raise InvalidHandler, a.inspect unless (meth = self.method(method_name))
-      return meth
-    end
-
     def start
       raise "already started" unless @polling_thread.nil?
 
       hostname = ENV.fetch("DYNO") { Socket.gethostname }
       return false unless self.hostname_regex.match?(hostname)
 
-      self.
+      self._debug(:info, "async_autoscaler_starting")
       self.setup
       @polling_thread = Thread.new do
         until @stop
@@ -216,7 +186,7 @@ module Amigo
     def check
       self._check
     rescue StandardError => e
-      self.
+      self._debug(:error, "async_autoscaler_unhandled_error", exception: e)
       handled = self.on_unhandled_exception&.call(e)
       raise e unless handled.eql?(true)
     end
@@ -225,22 +195,18 @@ module Amigo
       now = Time.now
       skip_check = now < (@last_alerted + self.alert_interval)
       if skip_check
-        self.
+        self._debug(:debug, "async_autoscaler_skip_check")
         return
       end
-      self.
-      high_latency_queues =
-
-        select { |(_, latency)| latency > self.latency_threshold }.
-        to_h
+      self._debug(:info, "async_autoscaler_check")
+      high_latency_queues = self.checker.get_latencies.
+        select { |_, latency| latency > self.latency_threshold }
       if high_latency_queues.empty?
         # Whenever we are in a latency event, we have a depth > 0. So a depth of 0 means
         # we're not in a latency event, and still have no latency, so can noop.
         return if @depth.zero?
         # We WERE in a latency event, and now we're not, so report on it.
-        @
-        m.call(depth: @depth, duration: (Time.now - @latency_event_started).to_f)
-        end
+        self.handler.scale_down(depth: @depth, duration: (Time.now - @latency_event_started).to_f)
         # Reset back to 0 depth so we know we're not in a latency event.
         @depth = 0
         @latency_event_started = Time.at(0)
@@ -260,38 +226,47 @@ module Amigo
       end
       # Alert each handler. For legacy reasons, we support handlers that accept
       # ({queues and latencies}) and ({queues and latencies}, {keywords}).
-
-      @alert_methods.each do |m|
-        if m.respond_to?(:arity) && m.arity == 1
-          m.call(high_latency_queues)
-        else
-          m.call(high_latency_queues, **kw)
-        end
-      end
+      @handler.scale_up(high_latency_queues, depth: @depth, duration: duration)
       @last_alerted = now
       self.persist
     end
 
-    def
-
-
-      names = names_and_latencies.map(&:first).sort.join(", ")
-      Sentry.capture_message("Some queues have a high latency: #{names}")
-    end
+    def _debug(lvl, msg, **kw)
+      return unless ENV["DEBUG"]
+      Amigo.log(nil, lvl, msg, kw)
     end
 
-
-
+    class Checker
+      # Return relevant latencies for this checker.
+      # This could be the latencies of each Sidekiq queue, or web latencies, etc.
+      # @return [Hash] Key is the queue name (or some other value); value is the latency in seconds.
+      def get_latencies = raise NotImplementedError
     end
 
-
-
-
-
-
+    class Handler
+      # Called when a latency event starts, and as it fails to resolve.
+      # @param checked_latencies [Hash] The +Hash+ returned from +Amigo::Autoscaler::Checker#get_latencies+.
+      #   For Sidekiq, this will look like `{queue name => latency in seconds}`
+      # @param depth [Integer] Number of alerts as part of this latency event.
+      #   For example, the first alert has a depth of 1, and if latency stays high,
+      #   it'll be 2 on the next call, etc. +depth+ can be used to incrementally provision
+      #   additional processing capacity, and stop adding capacity at a certain depth
+      #   to avoid problems with too many workers (like excessive DB load).
+      # @param duration [Float] Number of seconds since this latency spike started.
+      # @param kw [Hash] Additional undefined keywords. Handlers should accept additional options,
+      #   like via `**kw` or `opts={}`, for compatibility.
+      # @return [Array<String,Symbol,Proc,#call>]
+      def scale_up(checked_latencies, depth:, duration:, **kw) = raise NotImplementedError
 
-
-
+      # Called when a latency of +latency_restored_threshold+ is reached
+      # (ie, when we get back to normal latency after a high latency event).
+      # Usually this handler will deprovision capacity procured as part of the +scale_up+.
+      # @param depth [Integer] The number of times an alert happened before
+      #   the latency spike was resolved.
+      # @param duration [Float] The number of seconds the latency spike has been going on.
+      # @param kw [Hash] Additional undefined keywords. Handlers should accept additional options,
+      #   like via `**kw` or `opts={}`, for compatibility.
+      def scale_down(depth:, duration:, **kw) = raise NotImplementedError
     end
   end
 end
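With the string/symbol/proc handlers (and InvalidHandler) gone, custom behavior comes from subclassing these bases. A minimal custom handler sketch under the new contract (the paging helper is hypothetical):

    class PagerHandler < Amigo::Autoscaler::Handler
      def scale_up(checked_latencies, depth:, duration:, **)
        # Only page once the event has survived a few alert cycles.
        page_oncall(queues: checked_latencies.keys) if depth >= 3 # hypothetical helper
      end

      # Alert-only handlers can no-op the restore side.
      def scale_down(**) = nil
    end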
data/lib/amigo/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: sidekiq-amigo
 version: !ruby/object:Gem::Version
-  version: 1.11.0
+  version: 1.12.0
 platform: ruby
 authors:
 - Lithic Technology
@@ -135,6 +135,34 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '5'
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.22'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.22'
+- !ruby/object:Gem::Dependency
+  name: simplecov-cobertura
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.1'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.1'
 - !ruby/object:Gem::Dependency
   name: timecop
   requirement: !ruby/object:Gem::Requirement
@@ -175,7 +203,14 @@ files:
 - lib/amigo.rb
 - lib/amigo/audit_logger.rb
 - lib/amigo/autoscaler.rb
-- lib/amigo/autoscaler/heroku.rb
+- lib/amigo/autoscaler/checkers/fake.rb
+- lib/amigo/autoscaler/checkers/sidekiq.rb
+- lib/amigo/autoscaler/checkers/web_latency.rb
+- lib/amigo/autoscaler/handlers/chain.rb
+- lib/amigo/autoscaler/handlers/fake.rb
+- lib/amigo/autoscaler/handlers/heroku.rb
+- lib/amigo/autoscaler/handlers/log.rb
+- lib/amigo/autoscaler/handlers/sentry.rb
 - lib/amigo/deprecated_jobs.rb
 - lib/amigo/job.rb
 - lib/amigo/memory_pressure.rb
data/lib/amigo/autoscaler/heroku.rb
REMOVED
@@ -1,145 +0,0 @@
-# frozen_string_literal: true
-
-require "platform-api"
-
-require "amigo/autoscaler"
-
-module Amigo
-  class Autoscaler
-    # Autoscaler to use on Heroku, that starts additional worker processes when there is a high latency event
-    # and scales them down after the event is finished.
-    #
-    # When the first call of a high latency event happens (depth: 1), this class
-    # will ask Heroku how many dynos are in the formation. This is known as +active_event_initial_workers+.
-    #
-    # If +active_event_initial_workers+ is 0, no autoscaling will be done.
-    # This avoids a situation where a high latency event is triggered
-    # due to workers being deprovisioned intentionally, perhaps for maintenance.
-    #
-    # Each time the alert fires (see +Amigo::Autoscaler#alert_interval+),
-    # an additional worker will be added to the formation, up to +max_additional_workers+.
-    # So with +active_event_initial_workers+ of 1 and +max_additional_workers+ of 2,
-    # the first time the alert times, the formation will be set to 2 workers.
-    # The next time, it'll be set to 3 workers.
-    # After that, no additional workers will be provisioned.
-    #
-    # After the high latency event resolves,
-    # the dyno formation is restored to +active_event_initial_workers+.
-    #
-    # To use:
-    #
-    #    heroku = PlatformAPI.connect_oauth(heroku_oauth_token)
-    #    heroku_scaler = Amigo::Autoscaler::Heroku.new(heroku:, default_workers: 1)
-    #    Amigo::Autoscaler.new(
-    #      handlers: [heroku_scaler.alert_callback],
-    #      latency_restored_handlers: [heroku_scaler.restored_callback],
-    #    )
-    #
-    # See instance attributes for additional options.
-    #
-    # Note that this class is provided as an example, and potentially a base or implementation class.
-    # Your actual implementation may also want to alert when a max depth or duration is reached,
-    # since it can indicate a bigger problem. Autoscaling, especially of workers, is a tough problem
-    # without a one-size-fits-all approach.
-    class Heroku
-      # Heroku client, usually created via PlatformAPI.oauth_connect.
-      # @return [PlatformAPI::Client]
-      attr_reader :heroku
-
-      # Captured at the start of a high latency event.
-      # Nil otherwise.
-      # @return [Integer]
-      attr_reader :active_event_initial_workers
-
-      # Maximum number of workers to add.
-      #
-      # As the 'depth' of the alert is increased,
-      # workers are added to the recorded worker count until the max is reached.
-      # By default, this is 2 (so the max workers will be the recorded number, plus 2).
-      # Do not set this too high, since it can for example exhaust database connections or just end up
-      # increasing load.
-      #
-      # See class docs for more information.
-      # @return [Integer]
-      attr_reader :max_additional_workers
-
-      # Defaults to HEROKU_APP_NAME, which should already be set if you use Heroku dyna metadata,
-      # as per https://devcenter.heroku.com/articles/dyno-metadata.
-      # This must be provided if the env var is missing.
-      # @return [String]
-      attr_reader :app_id_or_app_name
-
-      # Defaults to 'worker', which is what you'll probably use if you have a simple system.
-      # If you use multiple worker processes for different queues, this class probably isn't sufficient.
-      # You will probably need to look at the slow queue names and determine the formation name to scale up.
-      # @return [String]
-      attr_reader :formation_id_or_formation_type
-
-      def initialize(
-        heroku:,
-        max_additional_workers: 2,
-        app_id_or_app_name: ENV.fetch("HEROKU_APP_NAME"),
-        formation_id_or_formation_type: "worker"
-      )
-        @heroku = heroku
-        @max_additional_workers = max_additional_workers
-        @app_id_or_app_name = app_id_or_app_name
-        @formation_id_or_formation_type = formation_id_or_formation_type
-        # Is nil outside of a latency event, set during a latency event. So if this is initialized to non-nil,
-        # we're already in a latency event.
-        @active_event_initial_workers = Sidekiq.redis do |r|
-          v = r.get("#{namespace}/active_event_initial_workers")
-          v&.to_i
-        end
-      end
-
-      def alert_callback
-        self.method(:scale_up)
-      end
-
-      def restored_callback
-        self.method(:scale_down)
-      end
-
-      protected def namespace
-        return "amigo/autoscaler/heroku"
-      end
-
-      # Potentially add another worker to the formation.
-      # @return [:noscale, :maxscale, :scaled] One of :noscale (no +active_event_initial_workers+),
-      #   :maxscale (+max_additional_workers+ reached), or :scaled.
-      def scale_up(_queues_and_latencies, depth:, **)
-        # When the scaling event starts (or if this is the first time we've seen it
-        # but the event is already in progress), store how many workers we have.
-        # It needs to be stored in redis so it persists if
-        # the latency event continues through restarts.
-        if @active_event_initial_workers.nil?
-          @active_event_initial_workers = @heroku.formation.info(@app_id_or_app_name, @formation_id_or_formation_type).
-            fetch("quantity")
-          Sidekiq.redis do |r|
-            r.set("#{namespace}/active_event_initial_workers", @active_event_initial_workers.to_s)
-          end
-        end
-        return :noscale if @active_event_initial_workers.zero?
-        new_quantity = @active_event_initial_workers + depth
-        max_quantity = @active_event_initial_workers + @max_additional_workers
-        return :maxscale if new_quantity > max_quantity
-        @heroku.formation.update(@app_id_or_app_name, @formation_id_or_formation_type, {quantity: new_quantity})
-        return :scaled
-      end
-
-      # Reset the formation to +active_event_initial_workers+.
-      # @return [:noscale, :scaled] :noscale if +active_event_initial_workers+ is 0, otherwise :scaled.
-      def scale_down(**)
-        initial_workers = @active_event_initial_workers
-        Sidekiq.redis do |r|
-          r.del("#{namespace}/active_event_initial_workers")
-        end
-        @active_event_initial_workers = nil
-        return :noscale if initial_workers.zero?
-        @heroku.formation.update(@app_id_or_app_name, @formation_id_or_formation_type, {quantity: initial_workers})
-        return :scaled
-      end
-    end
-  end
-end