sidekiq-amigo 1.10.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/amigo/audit_logger.rb +1 -1
- data/lib/amigo/autoscaler/checkers/fake.rb +22 -0
- data/lib/amigo/autoscaler/checkers/sidekiq.rb +19 -0
- data/lib/amigo/autoscaler/checkers/web_latency.rb +84 -0
- data/lib/amigo/autoscaler/handlers/chain.rb +28 -0
- data/lib/amigo/autoscaler/handlers/fake.rb +27 -0
- data/lib/amigo/autoscaler/handlers/heroku.rb +141 -0
- data/lib/amigo/autoscaler/handlers/log.rb +35 -0
- data/lib/amigo/autoscaler/handlers/sentry.rb +38 -0
- data/lib/amigo/autoscaler.rb +90 -99
- data/lib/amigo/job.rb +1 -1
- data/lib/amigo/memory_pressure.rb +16 -3
- data/lib/amigo/retry.rb +5 -3
- data/lib/amigo/router.rb +1 -1
- data/lib/amigo/scheduled_job.rb +1 -1
- data/lib/amigo/semaphore_backoff_job.rb +4 -4
- data/lib/amigo/spec_helpers.rb +7 -6
- data/lib/amigo/version.rb +1 -1
- data/lib/amigo.rb +4 -5
- metadata +48 -16
- data/lib/amigo/autoscaler/heroku.rb +0 -145
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8f2d4776669bc7327b064ae2f0b2f22a83e35cd351da05a145d1b3b7bf086334
+  data.tar.gz: b3122b4fa37a8c6c93afbd485ce4b813536a2c1b91d96c266b2c2bb04be15d98
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c24ff6b6af38bb638be36dfa18aaca7500df9bf0126a4adf814e179fc05f3784b27004286e4ef796285cd047388d88486ee7da5fe9050454e33fc9b52d8f4698
+  data.tar.gz: 52b8edc30efe323786963559a7330058bd6b70552eedbdb4f7d3f3a8727373de6d033f2c0dd0113f9303f80e312d957df1e50c1f0fb5aec3e7bdcd9918b3a3bf
data/lib/amigo/audit_logger.rb
CHANGED
data/lib/amigo/autoscaler/checkers/fake.rb
ADDED
@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Checkers
+      class Fake < Amigo::Autoscaler::Checker
+        def initialize(latencies)
+          @latencies = latencies
+          super()
+        end
+
+        def get_latencies
+          return @latencies.call if @latencies.respond_to?(:call)
+          return @latencies.shift if @latencies.is_a?(Array)
+          return @latencies
+        end
+      end
+    end
+  end
+end
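The Fake checker accepts a callable, an array (shifted once per poll), or a single hash, which makes it easy to script a spike-then-recovery sequence in tests. A minimal sketch, assuming Sidekiq is pointed at a test Redis (the Autoscaler persists its state there), paired with the Handlers::Fake recorder that appears later in this diff:

    checker = Amigo::Autoscaler::Checkers::Fake.new([
      {"default" => 11.0}, # first poll: above threshold
      {"default" => 0.0},  # second poll: restored
    ])
    handler = Amigo::Autoscaler::Handlers::Fake.new
    scaler = Amigo::Autoscaler.new(checker:, handler:, latency_threshold: 10, alert_interval: 0)
    scaler.setup
    scaler.check # records a scale_up in handler.ups
    scaler.check # records a scale_down in handler.downs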
data/lib/amigo/autoscaler/checkers/sidekiq.rb
ADDED
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+require "sidekiq/api"
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Checkers
+      class Sidekiq < Amigo::Autoscaler::Checker
+        def get_latencies
+          return ::Sidekiq::Queue.all.
+              map { |q| [q.name, q.latency] }.
+              to_h
+        end
+      end
+    end
+  end
+end
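Here get_latencies maps each Sidekiq queue name to its latency in seconds, for example (values illustrative):

    Amigo::Autoscaler::Checkers::Sidekiq.new.get_latencies
    # => {"default" => 2.5, "mailers" => 0.0}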
data/lib/amigo/autoscaler/checkers/web_latency.rb
ADDED
@@ -0,0 +1,84 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Checkers
+      class WebLatency < Amigo::Autoscaler::Checker
+        NAMESPACE = "amigo/autoscaler/web_latency"
+        WINDOW = 60
+
+        # Set the latency.
+        # @param redis [RedisClient::Common] Redis connection.
+        # @param namespace [String] Key namespace.
+        # @param at [Time,Integer] Time this record was taken.
+        # @param duration [Numeric] Duration of the request in fractional seconds.
+        def self.set_latency(redis:, namespace:, at:, duration:)
+          bucket = at.to_i
+          key = "#{namespace}/latencies:#{bucket}"
+          duration_ms = (duration * 1000).round
+          redis.call("HINCRBY", key, "count", 1)
+          redis.call("HINCRBY", key, "sum", duration_ms)
+          redis.call("EXPIRE", key, WINDOW + 1)
+        end
+
+        def initialize(redis:, namespace: NAMESPACE)
+          @redis = redis
+          @namespace = namespace
+          super()
+        end
+
+        def get_latencies
+          now = Time.now.to_i
+          keys = (now - 59..now).map { |t| "#{@namespace}/latencies:#{t}" }
+          counts = 0
+          sums = 0
+          results = @redis.pipelined do |pipeline|
+            keys.each do |k|
+              pipeline.call("HMGET", k, "count", "sum")
+            end
+          end
+          results.each do |count, sum|
+            counts += count.to_i
+            sums += sum.to_i
+          end
+          return {} if counts.zero?
+          latency = sums.to_f / counts
+          return {"web" => latency.to_f / 1000}
+        end
+
+        class Middleware
+          # @param threshold [Float] Do not record the latency of requests faster than this.
+          #   These are usually just things like healthchecks, files, or other very fast requests
+          #   which do not represent the overall system slowness.
+          def initialize(app, redis:, threshold: 0.08, namespace: NAMESPACE)
+            @app = app
+            @redis = redis
+            @threshold = threshold
+            @namespace = namespace
+          end
+
+          def call(env)
+            start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+            status, headers, body = @app.call(env)
+            duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
+            if duration > @threshold
+              begin
+                WebLatency.set_latency(
+                  redis: @redis,
+                  namespace: @namespace,
+                  at: Time.now,
+                  duration:,
+                )
+              rescue StandardError => e
+                Amigo.log(nil, :error, "web_latency_error", exception: e)
+              end
+            end
+            [status, headers, body]
+          end
+        end
+      end
+    end
+  end
+end
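The two WebLatency pieces are meant to be used together: the Rack middleware writes per-second count/sum buckets, and the checker averages the last 60 of them into a single "web" latency. A sketch of the wiring in a Rack config.ru, assuming the redis-client gem (the checker only needs `call` and `pipelined` on the connection); MyApp and the env var name are illustrative:

    # config.ru (sketch)
    redis = RedisClient.config(url: ENV["REDIS_URL"]).new_client
    # Requests slower than the 0.08s default threshold get recorded.
    use Amigo::Autoscaler::Checkers::WebLatency::Middleware, redis: redis
    run MyApp

    # Wherever the autoscaler runs, read the same buckets back:
    checker = Amigo::Autoscaler::Checkers::WebLatency.new(redis: redis)
    checker.get_latencies # => {"web" => 0.42} (illustrative)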
data/lib/amigo/autoscaler/handlers/chain.rb
ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      class Chain < Amigo::Autoscaler::Handler
+        attr_accessor :chain
+
+        # Chain multiple handlers together.
+        # @param chain [Array<Amigo::Autoscaler::Handler>]
+        def initialize(chain)
+          @chain = chain
+          super()
+        end
+
+        def scale_up(*args, **kw)
+          @chain.each { |c| c.scale_up(*args, **kw) }
+        end
+
+        def scale_down(*args, **kw)
+          @chain.each { |c| c.scale_down(*args, **kw) }
+        end
+      end
+    end
+  end
+end
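Chain simply fans each scale_up/scale_down out to every handler in order, so logging and real scaling can be combined; for example (construction of heroku_client elided):

    handler = Amigo::Autoscaler::Handlers::Chain.new([
      Amigo::Autoscaler::Handlers::Log.new,
      Amigo::Autoscaler::Handlers::Heroku.new(client: heroku_client, formation: "worker"),
    ])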
data/lib/amigo/autoscaler/handlers/fake.rb
ADDED
@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      class Fake < Amigo::Autoscaler::Handler
+        attr_accessor :ups, :downs
+
+        def initialize
+          @ups = []
+          @downs = []
+          super()
+        end
+
+        def scale_up(checked_latencies, depth:, duration:, **kw)
+          @ups << [checked_latencies, depth, duration, kw]
+        end
+
+        def scale_down(depth:, duration:, **kw)
+          @downs << [depth, duration, kw]
+        end
+      end
+    end
+  end
+end
data/lib/amigo/autoscaler/handlers/heroku.rb
ADDED
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+
+require "platform-api"
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      # Autoscaler to use on Heroku, that starts additional worker processes when there is a high latency event
+      # and scales them down after the event is finished.
+      #
+      # When the first call of a high latency event happens (depth: 1), this class
+      # will ask Heroku how many dynos are in the formation. This is known as +active_event_initial_workers+.
+      #
+      # If +active_event_initial_workers+ is 0, no autoscaling will be done.
+      # This avoids a situation where a high latency event is triggered
+      # due to workers being deprovisioned intentionally, perhaps for maintenance.
+      #
+      # Each time the alert fires (see +Amigo::Autoscaler#alert_interval+),
+      # an additional worker will be added to the formation, up to +max_additional_workers+.
+      # So with +active_event_initial_workers+ of 1 and +max_additional_workers+ of 2,
+      # the first time the alert times, the formation will be set to 2 workers.
+      # The next time, it'll be set to 3 workers.
+      # After that, no additional workers will be provisioned.
+      #
+      # After the high latency event resolves,
+      # the dyno formation is restored to +active_event_initial_workers+.
+      #
+      # To use:
+      #
+      #   heroku = PlatformAPI.connect_oauth(heroku_oauth_token)
+      #   heroku_scaler = Amigo::Autoscaler::Heroku.new(heroku:, default_workers: 1)
+      #   Amigo::Autoscaler.new(
+      #     handlers: [heroku_scaler.alert_callback],
+      #     latency_restored_handlers: [heroku_scaler.restored_callback],
+      #   )
+      #
+      # See instance attributes for additional options.
+      #
+      # Note that this class is provided as an example, and potentially a base or implementation class.
+      # Your actual implementation may also want to alert when a max depth or duration is reached,
+      # since it can indicate a bigger problem. Autoscaling, especially of workers, is a tough problem
+      # without a one-size-fits-all approach.
+      class Heroku < Amigo::Autoscaler::Handler
+        # Heroku client, usually created via PlatformAPI.oauth_connect.
+        # @return [PlatformAPI::Client]
+        attr_reader :heroku
+
+        # Captured at the start of a high latency event.
+        # Nil otherwise.
+        # @return [Integer]
+        attr_reader :active_event_initial_workers
+
+        # Maximum number of workers to add.
+        #
+        # As the 'depth' of the alert is increased,
+        # workers are added to the recorded worker count until the max is reached.
+        # By default, this is 2 (so the max workers will be the recorded number, plus 2).
+        # Do not set this too high, since it can for example exhaust database connections or just end up
+        # increasing load.
+        #
+        # See class docs for more information.
+        # @return [Integer]
+        attr_reader :max_additional_workers
+
+        # Defaults to HEROKU_APP_NAME, which should already be set if you use Heroku dyna metadata,
+        # as per https://devcenter.heroku.com/articles/dyno-metadata.
+        # This must be provided if the env var is missing.
+        # @return [String]
+        attr_reader :app_id_or_app_name
+
+        # Formation ID or name.
+        # Usually 'worker' to scale Sidekiq workers, or 'web' for the web worker.
+        # If you use multiple worker processes for different queues, this class probably isn't sufficient.
+        # You will probably need to look at the slow queue names and determine the formation name to scale up.
+        # @return [String]
+        attr_reader :formation
+
+        def initialize(
+          client:,
+          formation:,
+          max_additional_workers: 2,
+          app_id_or_app_name: ENV.fetch("HEROKU_APP_NAME")
+        )
+          super()
+          @client = client
+          @max_additional_workers = max_additional_workers
+          @app_id_or_app_name = app_id_or_app_name
+          @formation = formation
+          # Is nil outside a latency event, set during a latency event. So if this is initialized to non-nil,
+          # we're already in a latency event.
+          @active_event_initial_workers = Sidekiq.redis do |r|
+            v = r.get("#{namespace}/active_event_initial_workers")
+            v&.to_i
+          end
+        end
+
+        protected def namespace
+          return "amigo/autoscaler/heroku/#{self.formation}"
+        end
+
+        # Potentially add another worker to the formation.
+        # @return [:noscale, :maxscale, :scaled] One of :noscale (no +active_event_initial_workers+),
+        #   :maxscale (+max_additional_workers+ reached), or :scaled.
+        def scale_up(_queues_and_latencies, depth:, **)
+          # When the scaling event starts (or if this is the first time we've seen it
+          # but the event is already in progress), store how many workers we have.
+          # It needs to be stored in redis so it persists if
+          # the latency event continues through restarts.
+          if @active_event_initial_workers.nil?
+            @active_event_initial_workers = @client.formation.info(@app_id_or_app_name, @formation).
+              fetch("quantity")
+            Sidekiq.redis do |r|
+              r.set("#{namespace}/active_event_initial_workers", @active_event_initial_workers.to_s)
+            end
+          end
+          return :noscale if @active_event_initial_workers.zero?
+          new_quantity = @active_event_initial_workers + depth
+          max_quantity = @active_event_initial_workers + @max_additional_workers
+          return :maxscale if new_quantity > max_quantity
+          @client.formation.update(@app_id_or_app_name, @formation, {quantity: new_quantity})
+          return :scaled
+        end
+
+        # Reset the formation to +active_event_initial_workers+.
+        # @return [:noscale, :scaled] :noscale if +active_event_initial_workers+ is 0, otherwise :scaled.
+        def scale_down(**)
+          initial_workers = @active_event_initial_workers
+          Sidekiq.redis do |r|
+            r.del("#{namespace}/active_event_initial_workers")
+          end
+          @active_event_initial_workers = nil
+          return :noscale if initial_workers.zero?
+          @client.formation.update(@app_id_or_app_name, @formation, {quantity: initial_workers})
+          return :scaled
+        end
+      end
+    end
+  end
+end
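Note the class comment above still shows the 1.10-era callback wiring; with the checker/handler split in this release, the setup would look more like the following sketch (env var name illustrative):

    heroku_client = PlatformAPI.connect_oauth(ENV.fetch("MYAPP_HEROKU_OAUTH_TOKEN"))
    handler = Amigo::Autoscaler::Handlers::Heroku.new(client: heroku_client, formation: "worker")
    checker = Amigo::Autoscaler::Checkers::Sidekiq.new
    Amigo::Autoscaler.new(checker:, handler:).start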
data/lib/amigo/autoscaler/handlers/log.rb
ADDED
@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      class Log < Amigo::Autoscaler::Handler
+        DEFAULT_LOG = ->(level, message, params={}) { Amigo.log(nil, level, message, params) }
+
+        # @param message [String] Log message for structured logging.
+        #   Has "_restored" appended on +scale_down+.
+        # @param log [Proc] Proc/callable called with (level, message, params={}).
+        #   By default, use +Amigo.log+ (which logs to the Sidekiq logger).
+        def initialize(message: "high_latency_queues", log: DEFAULT_LOG)
+          @message = message
+          @log = log
+          super()
+        end
+
+        def scale_up(checked_latencies, depth:, duration:, **_kw)
+          self._log(:warn, @message, queues: checked_latencies, depth: depth, duration: duration)
+        end
+
+        def scale_down(depth:, duration:, **_kw)
+          self._log(:info, "#{@message}_restored", depth: depth, duration: duration)
+        end
+
+        protected def _log(level, msg, **kw)
+          @log[level, msg, kw]
+        end
+      end
+    end
+  end
+end
data/lib/amigo/autoscaler/handlers/sentry.rb
ADDED
@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      class Sentry < Amigo::Autoscaler::Handler
+        # @param interval [Integer] How many seconds between Sentry alerts?
+        #   This is similar to +alert_interval+ on the Autoscaler,
+        #   but Sentry has its own interval, since it is used for reporting,
+        #   and not latency reduction.
+        # @param message [String] Message to capture.
+        # @param level [:debug,:info,:warning,:warn,:error,:fatal] Sentry level.
+        def initialize(interval: 300, message: "Some queues have a high latency", level: :warn)
+          @interval = interval
+          @message = message
+          @level = level
+          @last_alerted = Time.at(0)
+          super()
+        end
+
+        def scale_up(checked_latencies, depth:, duration:, **)
+          now = Time.now
+          call_sentry = @last_alerted < (now - @interval)
+          return unless call_sentry
+          ::Sentry.with_scope do |scope|
+            scope&.set_extras(high_latency_queues: checked_latencies, depth:, duration:)
+            ::Sentry.capture_message(@message, level: @level)
+          end
+          @last_alerted = now
+        end
+
+        def scale_down(**) = nil
+      end
+    end
+  end
+end
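Because the Sentry handler rate-limits itself (one report at most every +interval+ seconds), it can sit alongside a scaling handler in a Chain without spamming alerts; a sketch:

    handler = Amigo::Autoscaler::Handlers::Chain.new([
      Amigo::Autoscaler::Handlers::Sentry.new(interval: 600, level: :error),
      heroku_handler, # built as in the Heroku example above
    ])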
data/lib/amigo/autoscaler.rb
CHANGED
@@ -4,36 +4,43 @@ require "sidekiq/api"
|
|
4
4
|
|
5
5
|
require "amigo"
|
6
6
|
|
7
|
-
#
|
8
|
-
# take
|
7
|
+
# Generic autoscaling handler that will check for latency
|
8
|
+
# and take an action.
|
9
|
+
# For Sidekiq on Heroku for instance,
|
10
|
+
# this means checking queues for a latency above a threshold, and adding workers up to a limit.
|
11
|
+
#
|
9
12
|
# You should start this up at Web application startup:
|
10
13
|
#
|
11
14
|
# # puma.rb or similar
|
12
|
-
# Amigo::Autoscaler.new
|
15
|
+
# checker = Amigo::Autoscaler::Checkers::SidekiqLatency.new
|
16
|
+
# heroku_client = PlatformAPI.connect_oauth(ENV['MYAPP_HEROKU_OAUTH_TOKEN'])
|
17
|
+
# handler = Amigo::Autoscaler::Handlers::Heroku.new(client: heroku_client, formation: 'worker')
|
18
|
+
# Amigo::Autoscaler.new(checker:, handler:).start
|
13
19
|
#
|
14
20
|
# When latency grows beyond +latency_threshold+,
|
15
21
|
# a "high latency event" is started.
|
16
|
-
# Some action
|
17
|
-
# This includes logging, alerting, and/or autoscaling.
|
22
|
+
# Some action should be taken, which is handled by the handler's +scale_up+ method.
|
23
|
+
# This usually includes logging, alerting, and/or autoscaling.
|
18
24
|
#
|
19
25
|
# When latency returns to normal (defined by +latency_restored_threshold+),
|
20
26
|
# the high latency event finishes.
|
21
|
-
# Some additional action is taken,
|
27
|
+
# Some additional action is taken, handled by the handler's +scale_down+ method.
|
22
28
|
# Usually this is logging, and/or returning autoscaling to its original status.
|
23
29
|
#
|
24
30
|
# There are several parameters to control behavior, such as how often polling is done,
|
25
31
|
# how often alerting/scaling is done, and more.
|
26
32
|
#
|
27
|
-
# As an example autoscaler that includes actual resource scaling,
|
28
|
-
# check out +Amigo::Autoscaler::Heroku+.
|
29
|
-
# Its ideas can easily be expanded to other platforms.
|
30
|
-
#
|
31
33
|
# Note that +Autoscaler+ maintains its state over multiple processes;
|
32
34
|
# it needs to keep track of high latency events even if the process running the autoscaler
|
33
35
|
# (usually a web process) restarts.
|
34
36
|
module Amigo
|
35
37
|
class Autoscaler
|
36
|
-
|
38
|
+
# Struct representing data serialized to Redis.
|
39
|
+
# Useful for diagnostics. Can be retried with +fetch_persisted+.
|
40
|
+
# @!attribute last_alerted_at [Time] 0-time if there is no recent alert.
|
41
|
+
# @!attribute depth [Integer] 0 if not in a latency event.
|
42
|
+
# @!attribute latency_event_started_at [Time] 0-time if not in a latency event.
|
43
|
+
Persisted = Struct.new(:last_alerted_at, :depth, :latency_event_started_at)
|
37
44
|
|
38
45
|
# How often should Autoscaler check for latency?
|
39
46
|
# @return [Integer]
|
@@ -49,49 +56,32 @@ module Amigo
|
|
49
56
|
# are generally easier to find).
|
50
57
|
# @return [Regexp]
|
51
58
|
attr_reader :hostname_regex
|
52
|
-
# Methods to call when alerting, as strings/symbols or procs.
|
53
|
-
# Valid string values are 'log' and 'sentry' (requires Sentry to be required already).
|
54
|
-
# Anything that responds to +call+ will be invoked with:
|
55
|
-
# - Positional argument which is a +Hash+ of `{queue name => latency in seconds}`
|
56
|
-
# - Keyword argument +:depth+: Number of alerts as part of this latency event.
|
57
|
-
# For example, the first alert has a depth of 1, and if latency stays high,
|
58
|
-
# it'll be 2 on the next call, etc. +depth+ can be used to incrementally provision
|
59
|
-
# additional processing capacity, and stop adding capacity at a certain depth
|
60
|
-
# to avoid problems with too many workers (like excessive DB load).
|
61
|
-
# - Keyword argument +:duration+: Number of seconds since this latency spike started.
|
62
|
-
# - Additional undefined keywords. Handlers should accept additional options,
|
63
|
-
# like via `**kw` or `opts={}`, for compatibility.
|
64
|
-
# @return [Array<String,Symbol,Proc,#call>]
|
65
|
-
attr_reader :handlers
|
66
59
|
# Only alert this often.
|
67
60
|
# For example, with poll_interval of 10 seconds
|
68
61
|
# and alert_interval of 200 seconds,
|
69
62
|
# we'd alert once and then 210 seconds later.
|
70
63
|
# @return [Integer]
|
71
64
|
attr_reader :alert_interval
|
65
|
+
|
72
66
|
# After an alert happens, what latency should be considered "back to normal" and
|
73
|
-
# +
|
67
|
+
# +scale_down+ will be called?
|
74
68
|
# In most cases this should be the same as (and defaults to) +latency_threshold+
|
75
69
|
# so that we're 'back to normal' once we're below the threshold.
|
76
70
|
# It may also commonly be 0, so that the callback is fired when the queue is entirely clear.
|
77
71
|
# Note that, if +latency_restored_threshold+ is less than +latency_threshold+,
|
78
72
|
# while the latency is between the two, no alerts will fire.
|
79
73
|
attr_reader :latency_restored_threshold
|
80
|
-
|
81
|
-
#
|
82
|
-
|
83
|
-
#
|
84
|
-
|
85
|
-
|
86
|
-
#
|
87
|
-
#
|
88
|
-
#
|
89
|
-
|
90
|
-
|
91
|
-
attr_reader :latency_restored_handlers
|
92
|
-
# Proc/callable called with (level, message, params={}).
|
93
|
-
# By default, use +Amigo.log+ (which logs to the Sidekiq logger).
|
94
|
-
attr_reader :log
|
74
|
+
|
75
|
+
# @return [Amigo::Autoscaler::Checker]
|
76
|
+
attr_reader :checker
|
77
|
+
# @return [Amigo::Autoscaler::Handler]
|
78
|
+
attr_reader :handler
|
79
|
+
|
80
|
+
# Store autoscaler keys in this Redis namespace.
|
81
|
+
# Note that if you are running multiple autoscalers for different services (web, worker),
|
82
|
+
# you will need different namespaces.
|
83
|
+
attr_reader :namespace
|
84
|
+
|
95
85
|
# Proc called with an exception that occurs while the thread is running.
|
96
86
|
# If the handler returns +true+, then the thread will keep going.
|
97
87
|
# All other values will kill the thread, which breaks autoscaling.
|
@@ -101,15 +91,15 @@ module Amigo
|
|
101
91
|
attr_reader :on_unhandled_exception
|
102
92
|
|
103
93
|
def initialize(
|
94
|
+
handler:,
|
95
|
+
checker:,
|
104
96
|
poll_interval: 20,
|
105
97
|
latency_threshold: 5,
|
106
98
|
hostname_regex: /^web\.1$/,
|
107
|
-
handlers: [:log],
|
108
99
|
alert_interval: 120,
|
109
100
|
latency_restored_threshold: latency_threshold,
|
110
|
-
|
111
|
-
|
112
|
-
on_unhandled_exception: nil
|
101
|
+
on_unhandled_exception: nil,
|
102
|
+
namespace: "amigo/autoscaler"
|
113
103
|
)
|
114
104
|
raise ArgumentError, "latency_threshold must be > 0" if
|
115
105
|
latency_threshold <= 0
|
@@ -117,15 +107,15 @@ module Amigo
|
|
117
107
|
latency_restored_threshold.negative?
|
118
108
|
raise ArgumentError, "latency_restored_threshold must be <= latency_threshold" if
|
119
109
|
latency_restored_threshold > latency_threshold
|
110
|
+
@handler = handler
|
111
|
+
@checker = checker
|
120
112
|
@poll_interval = poll_interval
|
121
113
|
@latency_threshold = latency_threshold
|
122
114
|
@hostname_regex = hostname_regex
|
123
|
-
@handlers = handlers.freeze
|
124
115
|
@alert_interval = alert_interval
|
125
116
|
@latency_restored_threshold = latency_restored_threshold
|
126
|
-
@latency_restored_handlers = latency_restored_handlers.freeze
|
127
|
-
@log = log
|
128
117
|
@on_unhandled_exception = on_unhandled_exception
|
118
|
+
@namespace = namespace
|
129
119
|
end
|
130
120
|
|
131
121
|
# @return [Thread]
|
@@ -136,13 +126,20 @@ module Amigo
|
|
136
126
|
def setup
|
137
127
|
# Store these as strings OR procs, rather than grabbing self.method here.
|
138
128
|
# It gets extremely hard ot test if we capture the method here.
|
139
|
-
@alert_methods = self.handlers.map { |a| _handler_to_method("alert_", a) }
|
140
|
-
@restored_methods = self.latency_restored_handlers.map { |a| _handler_to_method("alert_restored_", a) }
|
141
129
|
@stop = false
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
130
|
+
persisted = self.fetch_persisted
|
131
|
+
@last_alerted = persisted.last_alerted_at
|
132
|
+
@depth = persisted.depth
|
133
|
+
@latency_event_started = persisted.latency_event_started_at
|
134
|
+
end
|
135
|
+
|
136
|
+
def fetch_persisted
|
137
|
+
return Sidekiq.redis do |r|
|
138
|
+
Persisted.new(
|
139
|
+
Time.at((r.get("#{namespace}/last_alerted") || 0).to_f),
|
140
|
+
(r.get("#{namespace}/depth") || 0).to_i,
|
141
|
+
Time.at((r.get("#{namespace}/latency_event_started") || 0).to_f),
|
142
|
+
)
|
146
143
|
end
|
147
144
|
end
|
148
145
|
|
@@ -165,24 +162,13 @@ module Amigo
|
|
165
162
|
end
|
166
163
|
end
|
167
164
|
|
168
|
-
protected def namespace
|
169
|
-
return "amigo/autoscaler"
|
170
|
-
end
|
171
|
-
|
172
|
-
private def _handler_to_method(prefix, a)
|
173
|
-
return a if a.respond_to?(:call)
|
174
|
-
method_name = "#{prefix}#{a.to_s.strip}".to_sym
|
175
|
-
raise InvalidHandler, a.inspect unless (meth = self.method(method_name))
|
176
|
-
return meth
|
177
|
-
end
|
178
|
-
|
179
165
|
def start
|
180
166
|
raise "already started" unless @polling_thread.nil?
|
181
167
|
|
182
168
|
hostname = ENV.fetch("DYNO") { Socket.gethostname }
|
183
169
|
return false unless self.hostname_regex.match?(hostname)
|
184
170
|
|
185
|
-
self.
|
171
|
+
self._debug(:info, "async_autoscaler_starting")
|
186
172
|
self.setup
|
187
173
|
@polling_thread = Thread.new do
|
188
174
|
until @stop
|
@@ -200,7 +186,7 @@ module Amigo
|
|
200
186
|
def check
|
201
187
|
self._check
|
202
188
|
rescue StandardError => e
|
203
|
-
self.
|
189
|
+
self._debug(:error, "async_autoscaler_unhandled_error", exception: e)
|
204
190
|
handled = self.on_unhandled_exception&.call(e)
|
205
191
|
raise e unless handled.eql?(true)
|
206
192
|
end
|
@@ -209,22 +195,18 @@ module Amigo
|
|
209
195
|
now = Time.now
|
210
196
|
skip_check = now < (@last_alerted + self.alert_interval)
|
211
197
|
if skip_check
|
212
|
-
self.
|
198
|
+
self._debug(:debug, "async_autoscaler_skip_check")
|
213
199
|
return
|
214
200
|
end
|
215
|
-
self.
|
216
|
-
high_latency_queues =
|
217
|
-
|
218
|
-
select { |(_, latency)| latency > self.latency_threshold }.
|
219
|
-
to_h
|
201
|
+
self._debug(:info, "async_autoscaler_check")
|
202
|
+
high_latency_queues = self.checker.get_latencies.
|
203
|
+
select { |_, latency| latency > self.latency_threshold }
|
220
204
|
if high_latency_queues.empty?
|
221
205
|
# Whenever we are in a latency event, we have a depth > 0. So a depth of 0 means
|
222
206
|
# we're not in a latency event, and still have no latency, so can noop.
|
223
207
|
return if @depth.zero?
|
224
208
|
# We WERE in a latency event, and now we're not, so report on it.
|
225
|
-
@
|
226
|
-
m.call(depth: @depth, duration: (Time.now - @latency_event_started).to_f)
|
227
|
-
end
|
209
|
+
self.handler.scale_down(depth: @depth, duration: (Time.now - @latency_event_started).to_f)
|
228
210
|
# Reset back to 0 depth so we know we're not in a latency event.
|
229
211
|
@depth = 0
|
230
212
|
@latency_event_started = Time.at(0)
|
@@ -244,38 +226,47 @@ module Amigo
|
|
244
226
|
end
|
245
227
|
# Alert each handler. For legacy reasons, we support handlers that accept
|
246
228
|
# ({queues and latencies}) and ({queues and latencies}, {}keywords}).
|
247
|
-
|
248
|
-
@alert_methods.each do |m|
|
249
|
-
if m.respond_to?(:arity) && m.arity == 1
|
250
|
-
m.call(high_latency_queues)
|
251
|
-
else
|
252
|
-
m.call(high_latency_queues, **kw)
|
253
|
-
end
|
254
|
-
end
|
229
|
+
@handler.scale_up(high_latency_queues, depth: @depth, duration: duration)
|
255
230
|
@last_alerted = now
|
256
231
|
self.persist
|
257
232
|
end
|
258
233
|
|
259
|
-
def
|
260
|
-
|
261
|
-
|
262
|
-
names = names_and_latencies.map(&:first).sort.join(", ")
|
263
|
-
Sentry.capture_message("Some queues have a high latency: #{names}")
|
264
|
-
end
|
234
|
+
def _debug(lvl, msg, **kw)
|
235
|
+
return unless ENV["DEBUG"]
|
236
|
+
Amigo.log(nil, lvl, msg, kw)
|
265
237
|
end
|
266
238
|
|
267
|
-
|
268
|
-
|
239
|
+
class Checker
|
240
|
+
# Return relevant latencies for this checker.
|
241
|
+
# This could be the latencies of each Sidekiq queue, or web latencies, etc.
|
242
|
+
# @return [Hash] Key is the queue name (or some other value); value is the latency in seconds.
|
243
|
+
def get_latencies = raise NotImplementedError
|
269
244
|
end
|
270
245
|
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
246
|
+
class Handler
|
247
|
+
# Called when a latency event starts, and as it fails to resolve.
|
248
|
+
# @param checked_latencies [Hash] The +Hash+ returned from +Amigo::Autoscaler::Handler#check+.
|
249
|
+
# For Sidekiq, this will look like `{queue name => latency in seconds}`
|
250
|
+
# @param depth [Integer] Number of alerts as part of this latency event.
|
251
|
+
# For example, the first alert has a depth of 1, and if latency stays high,
|
252
|
+
# it'll be 2 on the next call, etc. +depth+ can be used to incrementally provision
|
253
|
+
# additional processing capacity, and stop adding capacity at a certain depth
|
254
|
+
# to avoid problems with too many workers (like excessive DB load).
|
255
|
+
# @param duration [Float] Number of seconds since this latency spike started.
|
256
|
+
# @param kw [Hash] Additional undefined keywords. Handlers should accept additional options,
|
257
|
+
# like via `**kw` or `opts={}`, for compatibility.
|
258
|
+
# @return [Array<String,Symbol,Proc,#call>]
|
259
|
+
def scale_up(checked_latencies, depth:, duration:, **kw) = raise NotImplementedError
|
276
260
|
|
277
|
-
|
278
|
-
|
261
|
+
# Called when a latency of +latency_restored_threshold+ is reached
|
262
|
+
# (ie, when we get back to normal latency after a high latency event).
|
263
|
+
# Usually this handler will deprovision capacity procured as part of the +scale_up+.
|
264
|
+
# @param depth [Integer] The number of times an alert happened before
|
265
|
+
# the latency spike was resolved.
|
266
|
+
# @param duration [Float] The number of seconds for the latency spike has been going on.
|
267
|
+
# @param kw [Hash] Additional undefined keywords. Handlers should accept additional options,
|
268
|
+
# like via `**kw` or `opts={}`, for compatibility.
|
269
|
+
def scale_down(depth:, duration:, **kw) = raise NotImplementedError
|
279
270
|
end
|
280
271
|
end
|
281
272
|
end
|
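The new Checker and Handler base classes at the bottom of this file are the whole integration surface: a custom handler only has to implement scale_up and scale_down. A minimal sketch (Pager is a hypothetical alerting client, not part of this gem):

    class PagerHandler < Amigo::Autoscaler::Handler
      def scale_up(checked_latencies, depth:, duration:, **)
        # Page once, when the event starts (depth 1). Pager is hypothetical.
        Pager.trigger("high-latency", queues: checked_latencies.keys) if depth == 1
      end

      def scale_down(depth:, duration:, **)
        Pager.resolve("high-latency")
      end
    end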
data/lib/amigo/job.rb
CHANGED
data/lib/amigo/memory_pressure.rb
CHANGED
@@ -74,10 +74,23 @@ module Amigo
       return percentage > self.threshold
     end
 
-
-
-
+    def get_memory_info
+      s = self.get_memory_info_string
+      return self.parse_memory_string(s)
+    end
+
+    protected def get_memory_info_string
+      s = Sidekiq.redis do |c|
+        c.call("INFO", "MEMORY")
       end
+      return s
+    end
+
+    protected def parse_memory_string(s)
+      # See bottom of https://redis.io/docs/latest/commands/info/ for format.
+      pairs = s.split("\r\n").reject { |line| line.start_with?("#") }.map { |pair| pair.split(":", 2) }
+      h = pairs.to_h
+      return h
     end
   end
 end
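The new parse_memory_string helper turns the raw Redis INFO MEMORY reply into a string-keyed hash by dropping comment lines and splitting each remaining line on the first colon; roughly:

    s = "# Memory\r\nused_memory:1048576\r\nmaxmemory:0" # illustrative reply
    s.split("\r\n").reject { |l| l.start_with?("#") }.
      map { |p| p.split(":", 2) }.to_h
    # => {"used_memory" => "1048576", "maxmemory" => "0"}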
data/lib/amigo/retry.rb
CHANGED
@@ -3,7 +3,7 @@
 require "sidekiq"
 require "sidekiq/api"
 
-# Middleware so Sidekiq
+# Middleware so Sidekiq jobs can use a custom retry logic.
 # See +Amigo::Retry::Retry+, +Amigo::Retry::Die+,
 # and +Amigo::Retry::OrDie+ for more details
 # on how these should be used.
@@ -83,6 +83,8 @@ module Amigo
     end
 
     class ServerMiddleware
+      include Sidekiq::ServerMiddleware
+
       def call(worker, job, _queue)
         yield
       rescue Amigo::Retry::Retry => e
@@ -120,14 +122,14 @@ module Amigo
        end
      end
 
-      def amigo_retry_in(
+      def amigo_retry_in(job_class, item, interval)
        # pulled from perform_in
        int = interval.to_f
        now = Time.now.to_f
        ts = (int < 1_000_000_000 ? now + int : int)
        item["at"] = ts if ts > now
        item["retry_count"] = item.fetch("retry_count", 0) + 1
-
+        job_class.client_push(item)
      end
    end
  end
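With the middleware installed, a job opts into this behavior by raising the gem's retry exceptions from perform; a sketch (the 30-second interval, ThirdParty client, and rescued error are illustrative):

    class SyncJob
      include Sidekiq::Job

      def perform(id)
        ThirdParty.sync(id) # hypothetical call that can be rate limited
      rescue ThirdParty::RateLimited
        raise Amigo::Retry::Retry, 30 # re-enqueue this payload to run in ~30s
      end
    end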
data/lib/amigo/router.rb
CHANGED
data/lib/amigo/scheduled_job.rb
CHANGED
@@ -33,7 +33,7 @@ require "amigo/memory_pressure"
|
|
33
33
|
# - `semaphore_expiry` should return the TTL of the semaphore key.
|
34
34
|
# Defaults to 30 seconds. See below for key expiry and negative semaphore value details.
|
35
35
|
# - `before_perform` is called before calling the `perform` method.
|
36
|
-
# This is required so that implementers can set
|
36
|
+
# This is required so that implementers can set job state, based on job arguments,
|
37
37
|
# that can be used for calculating the semaphore key.
|
38
38
|
#
|
39
39
|
# Note that we give the semaphore key an expiry. This is to avoid situation where
|
@@ -41,7 +41,7 @@ require "amigo/memory_pressure"
|
|
41
41
|
# have fewer than the expected number of jobs running.
|
42
42
|
#
|
43
43
|
# This does mean that, when a job runs longer than the semaphore expiry,
|
44
|
-
# another
|
44
|
+
# another job can be started, which would increment the counter back to 1.
|
45
45
|
# When the original job ends, the counter would be 0; then when the new job ends,
|
46
46
|
# the counter would be -1. To avoid negative counters (which create the same issue
|
47
47
|
# around missing decrements), if we ever detect a negative 'jobs running',
|
@@ -78,11 +78,11 @@ module Amigo
|
|
78
78
|
|
79
79
|
module InstanceMethods
|
80
80
|
def semaphore_key
|
81
|
-
raise NotImplementedError, "must be implemented on
|
81
|
+
raise NotImplementedError, "must be implemented on job"
|
82
82
|
end
|
83
83
|
|
84
84
|
def semaphore_size
|
85
|
-
raise NotImplementedError, "must be implemented on
|
85
|
+
raise NotImplementedError, "must be implemented on job"
|
86
86
|
end
|
87
87
|
|
88
88
|
def semaphore_backoff
|
data/lib/amigo/spec_helpers.rb
CHANGED
@@ -1,7 +1,6 @@
 # frozen_string_literal: true
 
 require "amigo"
-require "sidekiq/worker"
 
 module Amigo
   module SpecHelpers
@@ -248,7 +247,7 @@ module Amigo
       return PerformAsyncJobMatcher.new(job)
     end
 
-    # Like a Sidekiq
+    # Like a Sidekiq job's perform_inline,
     # but allows an arbitrary item to be used, rather than just the
     # given class and args. For example, when testing,
     # you may need to assume something like 'retry_count' is in the job payload,
@@ -256,18 +255,18 @@ module Amigo
     # This allows those arbitrary job payload fields
     # to be included when the job is run.
     module_function def sidekiq_perform_inline(klass, args, item=nil)
-      Sidekiq::
+      Sidekiq::Job::Setter.override_item = item
       begin
         klass.perform_inline(*args)
       ensure
-        Sidekiq::
+        Sidekiq::Job::Setter.override_item = nil
       end
     end
 
     module_function def drain_sidekiq_jobs(q)
       all_sidekiq_jobs(q).each do |job|
         klass = job.item.fetch("class")
-        klass =
+        klass = Object.const_get(klass) if klass.is_a?(String)
         sidekiq_perform_inline(klass, job.item["args"], job.item)
         job.delete
       end
@@ -282,6 +281,8 @@ module Amigo
     # Use this middleware to pass an arbitrary callback evaluated before a job runs.
     # Make sure to call +reset+ after the test.
     class ServerCallbackMiddleware
+      include Sidekiq::ServerMiddleware
+
       class << self
         attr_accessor :callback
       end
@@ -304,7 +305,7 @@ module Amigo
     end
 
     module ::Sidekiq
-      module
+      module Job
        class Setter
          class << self
            attr_accessor :override_item
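The Sidekiq::Job::Setter patch above is what lets sidekiq_perform_inline run a job with extra payload fields; in a spec that might look like this sketch (MyJob and the field values are illustrative):

    include Amigo::SpecHelpers

    # Run MyJob inline as though it had already been retried twice.
    item = {"class" => "MyJob", "args" => [123], "retry_count" => 2}
    sidekiq_perform_inline(MyJob, [123], item)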
data/lib/amigo/version.rb
CHANGED
data/lib/amigo.rb
CHANGED
@@ -1,6 +1,5 @@
 # frozen_string_literal: true
 
-require "redis"
 require "sidekiq"
 require "sidekiq-cron"
 
@@ -61,18 +60,18 @@ require "sidekiq-cron"
 # to control the matching rules more closely than File.fnmatch can provide.
 #
 # Jobs must implement a `_perform` method, which takes a Amigo::Event.
-# Note that normal Sidekiq
+# Note that normal Sidekiq jobs use a 'perform' method that takes a variable number of arguments;
 # the base Async::Job class has this method and delegates its business logic to the subclass _perform method.
 #
 # Routing
 #
-# There are two special
-# (and do not inherit from Job but rather than Sidekiq::
+# There are two special jobs that are important for the overall functioning of the system
+# (and do not inherit from Job but rather than Sidekiq::Job so they are not classified and treated as 'Jobs').
 #
 # The first is the AuditLogger, which is a basic job that logs all async events.
 # This acts as a useful change log for the state of the database.
 #
-# The second special
+# The second special job is the Router, which calls `perform` on the event Jobs
 # that match the routing information, as explained in Jobs.
 # It does this by filtering through all event-based jobs and performing the ones with a route match.
 #
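An event job under this scheme looks roughly like the following sketch; the mixin and `on` pattern macro are assumed from the routing description above (check the gem README for the exact DSL), while `Amigo.publish` is the gem's publish entry point:

    class CustomerCreatedNotifier
      extend Amigo::Job # assumed registration mixin
      on "myapp.customer.created" # File.fnmatch-style route

      def _perform(event)
        # event is an Amigo::Event carrying the published payload.
      end
    end

    Amigo.publish("myapp.customer.created", customer_id)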
metadata
CHANGED
@@ -1,43 +1,42 @@
 --- !ruby/object:Gem::Specification
 name: sidekiq-amigo
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: 1.12.0
 platform: ruby
 authors:
 - Lithic Technology
-autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: sidekiq
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '
+        version: '7'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '
+        version: '7'
 - !ruby/object:Gem::Dependency
   name: sidekiq-cron
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '2'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '2'
 - !ruby/object:Gem::Dependency
   name: platform-api
   requirement: !ruby/object:Gem::Requirement
@@ -58,14 +57,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '3.1'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '3.1'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -136,6 +135,34 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '5'
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.22'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.22'
+- !ruby/object:Gem::Dependency
+  name: simplecov-cobertura
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.1'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.1'
 - !ruby/object:Gem::Dependency
   name: timecop
   requirement: !ruby/object:Gem::Requirement
@@ -176,7 +203,14 @@ files:
 - lib/amigo.rb
 - lib/amigo/audit_logger.rb
 - lib/amigo/autoscaler.rb
-- lib/amigo/autoscaler/
+- lib/amigo/autoscaler/checkers/fake.rb
+- lib/amigo/autoscaler/checkers/sidekiq.rb
+- lib/amigo/autoscaler/checkers/web_latency.rb
+- lib/amigo/autoscaler/handlers/chain.rb
+- lib/amigo/autoscaler/handlers/fake.rb
+- lib/amigo/autoscaler/handlers/heroku.rb
+- lib/amigo/autoscaler/handlers/log.rb
+- lib/amigo/autoscaler/handlers/sentry.rb
 - lib/amigo/deprecated_jobs.rb
 - lib/amigo/job.rb
 - lib/amigo/memory_pressure.rb
@@ -193,7 +227,6 @@ licenses:
 - MIT
 metadata:
   rubygems_mfa_required: 'true'
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -201,15 +234,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
    - !ruby/object:Gem::Version
-      version: 3.
+      version: 3.2.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
    - !ruby/object:Gem::Version
      version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.6.7
 specification_version: 4
 summary: Pubsub system and other enhancements around Sidekiq.
 test_files: []
data/lib/amigo/autoscaler/heroku.rb
REMOVED
@@ -1,145 +0,0 @@
-# frozen_string_literal: true
-
-require "platform-api"
-
-require "amigo/autoscaler"
-
-module Amigo
-  class Autoscaler
-    # Autoscaler to use on Heroku, that starts additional worker processes when there is a high latency event
-    # and scales them down after the event is finished.
-    #
-    # When the first call of a high latency event happens (depth: 1), this class
-    # will ask Heroku how many dynos are in the formation. This is known as +active_event_initial_workers+.
-    #
-    # If +active_event_initial_workers+ is 0, no autoscaling will be done.
-    # This avoids a situation where a high latency event is triggered
-    # due to workers being deprovisioned intentionally, perhaps for maintenance.
-    #
-    # Each time the alert fires (see +Amigo::Autoscaler#alert_interval+),
-    # an additional worker will be added to the formation, up to +max_additional_workers+.
-    # So with +active_event_initial_workers+ of 1 and +max_additional_workers+ of 2,
-    # the first time the alert times, the formation will be set to 2 workers.
-    # The next time, it'll be set to 3 workers.
-    # After that, no additional workers will be provisioned.
-    #
-    # After the high latency event resolves,
-    # the dyno formation is restored to +active_event_initial_workers+.
-    #
-    # To use:
-    #
-    #   heroku = PlatformAPI.connect_oauth(heroku_oauth_token)
-    #   heroku_scaler = Amigo::Autoscaler::Heroku.new(heroku:, default_workers: 1)
-    #   Amigo::Autoscaler.new(
-    #     handlers: [heroku_scaler.alert_callback],
-    #     latency_restored_handlers: [heroku_scaler.restored_callback],
-    #   )
-    #
-    # See instance attributes for additional options.
-    #
-    # Note that this class is provided as an example, and potentially a base or implementation class.
-    # Your actual implementation may also want to alert when a max depth or duration is reached,
-    # since it can indicate a bigger problem. Autoscaling, especially of workers, is a tough problem
-    # without a one-size-fits-all approach.
-    class Heroku
-      # Heroku client, usually created via PlatformAPI.oauth_connect.
-      # @return [PlatformAPI::Client]
-      attr_reader :heroku
-
-      # Captured at the start of a high latency event.
-      # Nil otherwise.
-      # @return [Integer]
-      attr_reader :active_event_initial_workers
-
-      # Maximum number of workers to add.
-      #
-      # As the 'depth' of the alert is increased,
-      # workers are added to the recorded worker count until the max is reached.
-      # By default, this is 2 (so the max workers will be the recorded number, plus 2).
-      # Do not set this too high, since it can for example exhaust database connections or just end up
-      # increasing load.
-      #
-      # See class docs for more information.
-      # @return [Integer]
-      attr_reader :max_additional_workers
-
-      # Defaults to HEROKU_APP_NAME, which should already be set if you use Heroku dyna metadata,
-      # as per https://devcenter.heroku.com/articles/dyno-metadata.
-      # This must be provided if the env var is missing.
-      # @return [String]
-      attr_reader :app_id_or_app_name
-
-      # Defaults to 'worker', which is what you'll probably use if you have a simple system.
-      # If you use multiple worker processes for different queues, this class probably isn't sufficient.
-      # You will probably need to look at the slow queue names and determine the formation name to scale up.
-      # @return [String]
-      attr_reader :formation_id_or_formation_type
-
-      def initialize(
-        heroku:,
-        max_additional_workers: 2,
-        app_id_or_app_name: ENV.fetch("HEROKU_APP_NAME"),
-        formation_id_or_formation_type: "worker"
-      )
-        @heroku = heroku
-        @max_additional_workers = max_additional_workers
-        @app_id_or_app_name = app_id_or_app_name
-        @formation_id_or_formation_type = formation_id_or_formation_type
-        # Is nil outside of a latency event, set during a latency event. So if this is initialized to non-nil,
-        # we're already in a latency event.
-        @active_event_initial_workers = Sidekiq.redis do |r|
-          v = r.get("#{namespace}/active_event_initial_workers")
-          v&.to_i
-        end
-      end
-
-      def alert_callback
-        self.method(:scale_up)
-      end
-
-      def restored_callback
-        self.method(:scale_down)
-      end
-
-      protected def namespace
-        return "amigo/autoscaler/heroku"
-      end
-
-      # Potentially add another worker to the formation.
-      # @return [:noscale, :maxscale, :scaled] One of :noscale (no +active_event_initial_workers+),
-      #   :maxscale (+max_additional_workers+ reached), or :scaled.
-      def scale_up(_queues_and_latencies, depth:, **)
-        # When the scaling event starts (or if this is the first time we've seen it
-        # but the event is already in progress), store how many workers we have.
-        # It needs to be stored in redis so it persists if
-        # the latency event continues through restarts.
-        if @active_event_initial_workers.nil?
-          @active_event_initial_workers = @heroku.formation.info(@app_id_or_app_name, @formation_id_or_formation_type).
-            fetch("quantity")
-          Sidekiq.redis do |r|
-            r.set("#{namespace}/active_event_initial_workers", @active_event_initial_workers.to_s)
-          end
-        end
-        return :noscale if @active_event_initial_workers.zero?
-        new_quantity = @active_event_initial_workers + depth
-        max_quantity = @active_event_initial_workers + @max_additional_workers
-        return :maxscale if new_quantity > max_quantity
-        @heroku.formation.update(@app_id_or_app_name, @formation_id_or_formation_type, {quantity: new_quantity})
-        return :scaled
-      end
-
-      # Reset the formation to +active_event_initial_workers+.
-      # @return [:noscale, :scaled] :noscale if +active_event_initial_workers+ is 0, otherwise :scaled.
-      def scale_down(**)
-        initial_workers = @active_event_initial_workers
-        Sidekiq.redis do |r|
-          r.del("#{namespace}/active_event_initial_workers")
-        end
-        @active_event_initial_workers = nil
-        return :noscale if initial_workers.zero?
-        @heroku.formation.update(@app_id_or_app_name, @formation_id_or_formation_type, {quantity: initial_workers})
-        return :scaled
-      end
-    end
-  end
-end