sidekiq-amigo 1.11.0 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5e1aa001f6060b1e13decb95357627151007e8f17066cef3401021103a3cd7a2
4
- data.tar.gz: 955cb548d0739f5fa51bc0fe887a4e0df5d15c154ee5470e4bdba4883a308af4
3
+ metadata.gz: 8f2d4776669bc7327b064ae2f0b2f22a83e35cd351da05a145d1b3b7bf086334
4
+ data.tar.gz: b3122b4fa37a8c6c93afbd485ce4b813536a2c1b91d96c266b2c2bb04be15d98
5
5
  SHA512:
6
- metadata.gz: 9fb081204d9465257ac48df3193925bb547961971fc0db39ea235584e9f4b9acef7ab2f93eaed60c1dcd4d1de759d09daadc5a94ec89473e600eb72b67c4afc7
7
- data.tar.gz: bea97ecddf27912029035a941e6ee3aff653c0f8ba958e27f21a539f984a987c017311aa21b5f306832bb2240a0f487522e0cd63bf5a87988e5e568ebf503c7b
6
+ metadata.gz: c24ff6b6af38bb638be36dfa18aaca7500df9bf0126a4adf814e179fc05f3784b27004286e4ef796285cd047388d88486ee7da5fe9050454e33fc9b52d8f4698
7
+ data.tar.gz: 52b8edc30efe323786963559a7330058bd6b70552eedbdb4f7d3f3a8727373de6d033f2c0dd0113f9303f80e312d957df1e50c1f0fb5aec3e7bdcd9918b3a3bf
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amigo/autoscaler"
4
+
5
+ module Amigo
6
+ class Autoscaler
7
+ module Checkers
8
+ class Fake < Amigo::Autoscaler::Checker
9
+ def initialize(latencies)
10
+ @latencies = latencies
11
+ super()
12
+ end
13
+
14
+ def get_latencies
15
+ return @latencies.call if @latencies.respond_to?(:call)
16
+ return @latencies.shift if @latencies.is_a?(Array)
17
+ return @latencies
18
+ end
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sidekiq/api"
4
+
5
+ require "amigo/autoscaler"
6
+
7
+ module Amigo
8
+ class Autoscaler
9
+ module Checkers
10
+ class Sidekiq < Amigo::Autoscaler::Checker
11
+ def get_latencies
12
+ return ::Sidekiq::Queue.all.
13
+ map { |q| [q.name, q.latency] }.
14
+ to_h
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amigo/autoscaler"
4
+
5
+ module Amigo
6
+ class Autoscaler
7
+ module Checkers
8
+ class WebLatency < Amigo::Autoscaler::Checker
9
+ NAMESPACE = "amigo/autoscaler/web_latency"
10
+ WINDOW = 60
11
+
12
+ # Set the latency.
13
+ # @param redis [RedisClient::Common] Redis connection.
14
+ # @param namespace [String] Key namespace.
15
+ # @param at [Time,Integer] Time this record was taken.
16
+ # @param duration [Numeric] Duration of the request in fractional seconds.
17
+ def self.set_latency(redis:, namespace:, at:, duration:)
18
+ bucket = at.to_i
19
+ key = "#{namespace}/latencies:#{bucket}"
20
+ duration_ms = (duration * 1000).round
21
+ redis.call("HINCRBY", key, "count", 1)
22
+ redis.call("HINCRBY", key, "sum", duration_ms)
23
+ redis.call("EXPIRE", key, WINDOW + 1)
24
+ end
25
+
26
+ def initialize(redis:, namespace: NAMESPACE)
27
+ @redis = redis
28
+ @namespace = namespace
29
+ super()
30
+ end
31
+
32
+ def get_latencies
33
+ now = Time.now.to_i
34
+ keys = (now - 59..now).map { |t| "#{@namespace}/latencies:#{t}" }
35
+ counts = 0
36
+ sums = 0
37
+ results = @redis.pipelined do |pipeline|
38
+ keys.each do |k|
39
+ pipeline.call("HMGET", k, "count", "sum")
40
+ end
41
+ end
42
+ results.each do |count, sum|
43
+ counts += count.to_i
44
+ sums += sum.to_i
45
+ end
46
+ return {} if counts.zero?
47
+ latency = sums.to_f / counts
48
+ return {"web" => latency.to_f / 1000}
49
+ end
50
+
51
+ class Middleware
52
+ # @param threshold [Float] Do not record the latency of requests faster than this.
53
+ # These are usually just things like healthchecks, files, or other very fast requests
54
+ # which do not represent the overall system slowness.
55
+ def initialize(app, redis:, threshold: 0.08, namespace: NAMESPACE)
56
+ @app = app
57
+ @redis = redis
58
+ @threshold = threshold
59
+ @namespace = namespace
60
+ end
61
+
62
+ def call(env)
63
+ start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
64
+ status, headers, body = @app.call(env)
65
+ duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
66
+ if duration > @threshold
67
+ begin
68
+ WebLatency.set_latency(
69
+ redis: @redis,
70
+ namespace: @namespace,
71
+ at: Time.now,
72
+ duration:,
73
+ )
74
+ rescue StandardError => e
75
+ Amigo.log(nil, :error, "web_latency_error", exception: e)
76
+ end
77
+ end
78
+ [status, headers, body]
79
+ end
80
+ end
81
+ end
82
+ end
83
+ end
84
+ end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amigo/autoscaler"
4
+
5
+ module Amigo
6
+ class Autoscaler
7
+ module Handlers
8
+ class Chain < Amigo::Autoscaler::Handler
9
+ attr_accessor :chain
10
+
11
+ # Chain multiple handlers together.
12
+ # @param chain [Array<Amigo::Autoscaler::Handler>]
13
+ def initialize(chain)
14
+ @chain = chain
15
+ super()
16
+ end
17
+
18
+ def scale_up(*args, **kw)
19
+ @chain.each { |c| c.scale_up(*args, **kw) }
20
+ end
21
+
22
+ def scale_down(*args, **kw)
23
+ @chain.each { |c| c.scale_down(*args, **kw) }
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amigo/autoscaler"
4
+
5
+ module Amigo
6
+ class Autoscaler
7
+ module Handlers
8
+ class Fake < Amigo::Autoscaler::Handler
9
+ attr_accessor :ups, :downs
10
+
11
+ def initialize
12
+ @ups = []
13
+ @downs = []
14
+ super()
15
+ end
16
+
17
+ def scale_up(checked_latencies, depth:, duration:, **kw)
18
+ @ups << [checked_latencies, depth, duration, kw]
19
+ end
20
+
21
+ def scale_down(depth:, duration:, **kw)
22
+ @downs << [depth, duration, kw]
23
+ end
24
+ end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,141 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "platform-api"
4
+
5
+ require "amigo/autoscaler"
6
+
7
+ module Amigo
8
+ class Autoscaler
9
+ module Handlers
10
+ # Autoscaler to use on Heroku, that starts additional worker processes when there is a high latency event
11
+ # and scales them down after the event is finished.
12
+ #
13
+ # When the first call of a high latency event happens (depth: 1), this class
14
+ # will ask Heroku how many dynos are in the formation. This is known as +active_event_initial_workers+.
15
+ #
16
+ # If +active_event_initial_workers+ is 0, no autoscaling will be done.
17
+ # This avoids a situation where a high latency event is triggered
18
+ # due to workers being deprovisioned intentionally, perhaps for maintenance.
19
+ #
20
+ # Each time the alert fires (see +Amigo::Autoscaler#alert_interval+),
21
+ # an additional worker will be added to the formation, up to +max_additional_workers+.
22
+ # So with +active_event_initial_workers+ of 1 and +max_additional_workers+ of 2,
23
+ # the first time the alert fires, the formation will be set to 2 workers.
24
+ # The next time, it'll be set to 3 workers.
25
+ # After that, no additional workers will be provisioned.
26
+ #
27
+ # After the high latency event resolves,
28
+ # the dyno formation is restored to +active_event_initial_workers+.
29
+ #
30
+ # To use:
31
+ #
32
+ # client = PlatformAPI.connect_oauth(heroku_oauth_token)
33
+ # handler = Amigo::Autoscaler::Handlers::Heroku.new(client:, formation: "worker")
34
+ # Amigo::Autoscaler.new(
35
+ # checker: Amigo::Autoscaler::Checkers::Sidekiq.new,
36
+ # handler:,
37
+ # )
38
+ #
39
+ # See instance attributes for additional options.
40
+ #
41
+ # Note that this class is provided as an example, and potentially a base or implementation class.
42
+ # Your actual implementation may also want to alert when a max depth or duration is reached,
43
+ # since it can indicate a bigger problem. Autoscaling, especially of workers, is a tough problem
44
+ # without a one-size-fits-all approach.
45
+ class Heroku < Amigo::Autoscaler::Handler
46
+ # Heroku client, usually created via PlatformAPI.connect_oauth.
47
+ # @return [PlatformAPI::Client]
48
+ attr_reader :heroku
49
+
50
+ # Captured at the start of a high latency event.
51
+ # Nil otherwise.
52
+ # @return [Integer]
53
+ attr_reader :active_event_initial_workers
54
+
55
+ # Maximum number of workers to add.
56
+ #
57
+ # As the 'depth' of the alert is increased,
58
+ # workers are added to the recorded worker count until the max is reached.
59
+ # By default, this is 2 (so the max workers will be the recorded number, plus 2).
60
+ # Do not set this too high, since it can for example exhaust database connections or just end up
61
+ # increasing load.
62
+ #
63
+ # See class docs for more information.
64
+ # @return [Integer]
65
+ attr_reader :max_additional_workers
66
+
67
+ # Defaults to HEROKU_APP_NAME, which should already be set if you use Heroku dyno metadata,
68
+ # as per https://devcenter.heroku.com/articles/dyno-metadata.
69
+ # This must be provided if the env var is missing.
70
+ # @return [String]
71
+ attr_reader :app_id_or_app_name
72
+
73
+ # Formation ID or name.
74
+ # Usually 'worker' to scale Sidekiq workers, or 'web' for the web worker.
75
+ # If you use multiple worker processes for different queues, this class probably isn't sufficient.
76
+ # You will probably need to look at the slow queue names and determine the formation name to scale up.
77
+ # @return [String]
78
+ attr_reader :formation
79
+
80
+ def initialize(
81
+ client:,
82
+ formation:,
83
+ max_additional_workers: 2,
84
+ app_id_or_app_name: ENV.fetch("HEROKU_APP_NAME")
85
+ )
86
+ super()
87
+ @client = client
88
+ @max_additional_workers = max_additional_workers
89
+ @app_id_or_app_name = app_id_or_app_name
90
+ @formation = formation
91
+ # Is nil outside a latency event, set during a latency event. So if this is initialized to non-nil,
92
+ # we're already in a latency event.
93
+ @active_event_initial_workers = Sidekiq.redis do |r|
94
+ v = r.get("#{namespace}/active_event_initial_workers")
95
+ v&.to_i
96
+ end
97
+ end
98
+
99
+ protected def namespace
100
+ return "amigo/autoscaler/heroku/#{self.formation}"
101
+ end
102
+
103
+ # Potentially add another worker to the formation.
104
+ # @return [:noscale, :maxscale, :scaled] One of :noscale (no +active_event_initial_workers+),
105
+ # :maxscale (+max_additional_workers+ reached), or :scaled.
106
+ def scale_up(_queues_and_latencies, depth:, **)
107
+ # When the scaling event starts (or if this is the first time we've seen it
108
+ # but the event is already in progress), store how many workers we have.
109
+ # It needs to be stored in redis so it persists if
110
+ # the latency event continues through restarts.
111
+ if @active_event_initial_workers.nil?
112
+ @active_event_initial_workers = @client.formation.info(@app_id_or_app_name, @formation).
113
+ fetch("quantity")
114
+ Sidekiq.redis do |r|
115
+ r.set("#{namespace}/active_event_initial_workers", @active_event_initial_workers.to_s)
116
+ end
117
+ end
118
+ return :noscale if @active_event_initial_workers.zero?
119
+ new_quantity = @active_event_initial_workers + depth
120
+ max_quantity = @active_event_initial_workers + @max_additional_workers
121
+ return :maxscale if new_quantity > max_quantity
122
+ @client.formation.update(@app_id_or_app_name, @formation, {quantity: new_quantity})
123
+ return :scaled
124
+ end
125
+
126
+ # Reset the formation to +active_event_initial_workers+.
127
+ # @return [:noscale, :scaled] :noscale if +active_event_initial_workers+ is 0, otherwise :scaled.
128
+ def scale_down(**)
129
+ initial_workers = @active_event_initial_workers
130
+ Sidekiq.redis do |r|
131
+ r.del("#{namespace}/active_event_initial_workers")
132
+ end
133
+ @active_event_initial_workers = nil
134
+ return :noscale if initial_workers.zero?
135
+ @client.formation.update(@app_id_or_app_name, @formation, {quantity: initial_workers})
136
+ return :scaled
137
+ end
138
+ end
139
+ end
140
+ end
141
+ end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amigo/autoscaler"
4
+
5
+ module Amigo
6
+ class Autoscaler
7
+ module Handlers
8
+ class Log < Amigo::Autoscaler::Handler
9
+ DEFAULT_LOG = ->(level, message, params={}) { Amigo.log(nil, level, message, params) }
10
+
11
+ # @param message [String] Log message for structured logging.
12
+ # Has "_restored" appended on +scale_down+.
13
+ # @param log [Proc] Proc/callable called with (level, message, params={}).
14
+ # By default, use +Amigo.log+ (which logs to the Sidekiq logger).
15
+ def initialize(message: "high_latency_queues", log: DEFAULT_LOG)
16
+ @message = message
17
+ @log = log
18
+ super()
19
+ end
20
+
21
+ def scale_up(checked_latencies, depth:, duration:, **_kw)
22
+ self._log(:warn, @message, queues: checked_latencies, depth: depth, duration: duration)
23
+ end
24
+
25
+ def scale_down(depth:, duration:, **_kw)
26
+ self._log(:info, "#{@message}_restored", depth: depth, duration: duration)
27
+ end
28
+
29
+ protected def _log(level, msg, **kw)
30
+ @log[level, msg, kw]
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amigo/autoscaler"
4
+
5
+ module Amigo
6
+ class Autoscaler
7
+ module Handlers
8
+ class Sentry < Amigo::Autoscaler::Handler
9
+ # @param interval [Integer] How many seconds between Sentry alerts?
10
+ # This is similar to +alert_interval+ on the Autoscaler,
11
+ # but Sentry has its own interval, since it is used for reporting,
12
+ # and not latency reduction.
13
+ # @param message [String] Message to capture.
14
+ # @param level [:debug,:info,:warning,:warn,:error,:fatal] Sentry level.
15
+ def initialize(interval: 300, message: "Some queues have a high latency", level: :warn)
16
+ @interval = interval
17
+ @message = message
18
+ @level = level
19
+ @last_alerted = Time.at(0)
20
+ super()
21
+ end
22
+
23
+ def scale_up(checked_latencies, depth:, duration:, **)
24
+ now = Time.now
25
+ call_sentry = @last_alerted < (now - @interval)
26
+ return unless call_sentry
27
+ ::Sentry.with_scope do |scope|
28
+ scope&.set_extras(high_latency_queues: checked_latencies, depth:, duration:)
29
+ ::Sentry.capture_message(@message, level: @level)
30
+ end
31
+ @last_alerted = now
32
+ end
33
+
34
+ def scale_down(**) = nil
35
+ end
36
+ end
37
+ end
38
+ end
@@ -4,37 +4,37 @@ require "sidekiq/api"
4
4
 
5
5
  require "amigo"
6
6
 
7
- # When queues achieve a latency that is too high,
8
- # take some action.
7
+ # Generic autoscaling handler that will check for latency
8
+ # and take an action.
9
+ # For Sidekiq on Heroku for instance,
10
+ # this means checking queues for a latency above a threshold, and adding workers up to a limit.
11
+ #
9
12
  # You should start this up at Web application startup:
10
13
  #
11
14
  # # puma.rb or similar
12
- # Amigo::Autoscaler.new.start
15
+ # checker = Amigo::Autoscaler::Checkers::Sidekiq.new
16
+ # heroku_client = PlatformAPI.connect_oauth(ENV['MYAPP_HEROKU_OAUTH_TOKEN'])
17
+ # handler = Amigo::Autoscaler::Handlers::Heroku.new(client: heroku_client, formation: 'worker')
18
+ # Amigo::Autoscaler.new(checker:, handler:).start
13
19
  #
14
20
  # When latency grows beyond +latency_threshold+,
15
21
  # a "high latency event" is started.
16
- # Some action is taken, which is defined by the +handlers+ argument.
17
- # This includes logging, alerting, and/or autoscaling.
22
+ # Some action should be taken, which is handled by the handler's +scale_up+ method.
23
+ # This usually includes logging, alerting, and/or autoscaling.
18
24
  #
19
25
  # When latency returns to normal (defined by +latency_restored_threshold+),
20
26
  # the high latency event finishes.
21
- # Some additional action is taken, which is defined by the +latency_restored_handlers+ argument.
27
+ # Some additional action is taken, handled by the handler's +scale_down+ method.
22
28
  # Usually this is logging, and/or returning autoscaling to its original status.
23
29
  #
24
30
  # There are several parameters to control behavior, such as how often polling is done,
25
31
  # how often alerting/scaling is done, and more.
26
32
  #
27
- # As an example autoscaler that includes actual resource scaling,
28
- # check out +Amigo::Autoscaler::Heroku+.
29
- # Its ideas can easily be expanded to other platforms.
30
- #
31
33
  # Note that +Autoscaler+ maintains its state over multiple processes;
32
34
  # it needs to keep track of high latency events even if the process running the autoscaler
33
35
  # (usually a web process) restarts.
34
36
  module Amigo
35
37
  class Autoscaler
36
- class InvalidHandler < StandardError; end
37
-
38
38
  # Struct representing data serialized to Redis.
39
39
  # Useful for diagnostics. Can be retried with +fetch_persisted+.
40
40
  # @!attribute last_alerted_at [Time] 0-time if there is no recent alert.
@@ -56,49 +56,32 @@ module Amigo
56
56
  # are generally easier to find).
57
57
  # @return [Regexp]
58
58
  attr_reader :hostname_regex
59
- # Methods to call when alerting, as strings/symbols or procs.
60
- # Valid string values are 'log' and 'sentry' (requires Sentry to be required already).
61
- # Anything that responds to +call+ will be invoked with:
62
- # - Positional argument which is a +Hash+ of `{queue name => latency in seconds}`
63
- # - Keyword argument +:depth+: Number of alerts as part of this latency event.
64
- # For example, the first alert has a depth of 1, and if latency stays high,
65
- # it'll be 2 on the next call, etc. +depth+ can be used to incrementally provision
66
- # additional processing capacity, and stop adding capacity at a certain depth
67
- # to avoid problems with too many workers (like excessive DB load).
68
- # - Keyword argument +:duration+: Number of seconds since this latency spike started.
69
- # - Additional undefined keywords. Handlers should accept additional options,
70
- # like via `**kw` or `opts={}`, for compatibility.
71
- # @return [Array<String,Symbol,Proc,#call>]
72
- attr_reader :handlers
73
59
  # Only alert this often.
74
60
  # For example, with poll_interval of 10 seconds
75
61
  # and alert_interval of 200 seconds,
76
62
  # we'd alert once and then 210 seconds later.
77
63
  # @return [Integer]
78
64
  attr_reader :alert_interval
65
+
79
66
  # After an alert happens, what latency should be considered "back to normal" and
80
- # +latency_restored_handlers+ will be called?
67
+ # +scale_down+ will be called?
81
68
  # In most cases this should be the same as (and defaults to) +latency_threshold+
82
69
  # so that we're 'back to normal' once we're below the threshold.
83
70
  # It may also commonly be 0, so that the callback is fired when the queue is entirely clear.
84
71
  # Note that, if +latency_restored_threshold+ is less than +latency_threshold+,
85
72
  # while the latency is between the two, no alerts will fire.
86
73
  attr_reader :latency_restored_threshold
87
- # Methods to call when a latency of +latency_restored_threshold+ is reached
88
- # (ie, when we get back to normal latency after a high latency event).
89
- # Valid string values are 'log'.
90
- # Usually this handler will deprovision capacity procured as part of the alert +handlers+.
91
- # Anything that responds to +call+ will be invoked with:
92
- # - Keyword +:depth+, the number of times an alert happened before
93
- # the latency spike was resolved.
94
- # - Keyword +:duration+, the number of seconds for the latency spike has been going on.
95
- # - Additional undefined keywords. Handlers should accept additional options,
96
- # like via `**kw`, for compatibility.
97
- # @return [Array<String,Symbol,Proc,#call>]
98
- attr_reader :latency_restored_handlers
99
- # Proc/callable called with (level, message, params={}).
100
- # By default, use +Amigo.log+ (which logs to the Sidekiq logger).
101
- attr_reader :log
74
+
75
+ # @return [Amigo::Autoscaler::Checker]
76
+ attr_reader :checker
77
+ # @return [Amigo::Autoscaler::Handler]
78
+ attr_reader :handler
79
+
80
+ # Store autoscaler keys in this Redis namespace.
81
+ # Note that if you are running multiple autoscalers for different services (web, worker),
82
+ # you will need different namespaces.
83
+ attr_reader :namespace
84
+
102
85
  # Proc called with an exception that occurs while the thread is running.
103
86
  # If the handler returns +true+, then the thread will keep going.
104
87
  # All other values will kill the thread, which breaks autoscaling.
@@ -108,15 +91,15 @@ module Amigo
108
91
  attr_reader :on_unhandled_exception
109
92
 
110
93
  def initialize(
94
+ handler:,
95
+ checker:,
111
96
  poll_interval: 20,
112
97
  latency_threshold: 5,
113
98
  hostname_regex: /^web\.1$/,
114
- handlers: [:log],
115
99
  alert_interval: 120,
116
100
  latency_restored_threshold: latency_threshold,
117
- latency_restored_handlers: [:log],
118
- log: ->(level, message, params={}) { Amigo.log(nil, level, message, params) },
119
- on_unhandled_exception: nil
101
+ on_unhandled_exception: nil,
102
+ namespace: "amigo/autoscaler"
120
103
  )
121
104
  raise ArgumentError, "latency_threshold must be > 0" if
122
105
  latency_threshold <= 0
@@ -124,15 +107,15 @@ module Amigo
124
107
  latency_restored_threshold.negative?
125
108
  raise ArgumentError, "latency_restored_threshold must be <= latency_threshold" if
126
109
  latency_restored_threshold > latency_threshold
110
+ @handler = handler
111
+ @checker = checker
127
112
  @poll_interval = poll_interval
128
113
  @latency_threshold = latency_threshold
129
114
  @hostname_regex = hostname_regex
130
- @handlers = handlers.freeze
131
115
  @alert_interval = alert_interval
132
116
  @latency_restored_threshold = latency_restored_threshold
133
- @latency_restored_handlers = latency_restored_handlers.freeze
134
- @log = log
135
117
  @on_unhandled_exception = on_unhandled_exception
118
+ @namespace = namespace
136
119
  end
137
120
 
138
121
  # @return [Thread]
@@ -143,8 +126,6 @@ module Amigo
143
126
  def setup
144
127
  # Store these as strings OR procs, rather than grabbing self.method here.
145
128
  # It gets extremely hard to test if we capture the method here.
146
- @alert_methods = self.handlers.map { |a| _handler_to_method("alert_", a) }
147
- @restored_methods = self.latency_restored_handlers.map { |a| _handler_to_method("alert_restored_", a) }
148
129
  @stop = false
149
130
  persisted = self.fetch_persisted
150
131
  @last_alerted = persisted.last_alerted_at
@@ -181,24 +162,13 @@ module Amigo
181
162
  end
182
163
  end
183
164
 
184
- protected def namespace
185
- return "amigo/autoscaler"
186
- end
187
-
188
- private def _handler_to_method(prefix, a)
189
- return a if a.respond_to?(:call)
190
- method_name = "#{prefix}#{a.to_s.strip}".to_sym
191
- raise InvalidHandler, a.inspect unless (meth = self.method(method_name))
192
- return meth
193
- end
194
-
195
165
  def start
196
166
  raise "already started" unless @polling_thread.nil?
197
167
 
198
168
  hostname = ENV.fetch("DYNO") { Socket.gethostname }
199
169
  return false unless self.hostname_regex.match?(hostname)
200
170
 
201
- self._log(:info, "async_autoscaler_starting")
171
+ self._debug(:info, "async_autoscaler_starting")
202
172
  self.setup
203
173
  @polling_thread = Thread.new do
204
174
  until @stop
@@ -216,7 +186,7 @@ module Amigo
216
186
  def check
217
187
  self._check
218
188
  rescue StandardError => e
219
- self._log(:error, "async_autoscaler_unhandled_error", exception: e)
189
+ self._debug(:error, "async_autoscaler_unhandled_error", exception: e)
220
190
  handled = self.on_unhandled_exception&.call(e)
221
191
  raise e unless handled.eql?(true)
222
192
  end
@@ -225,22 +195,18 @@ module Amigo
225
195
  now = Time.now
226
196
  skip_check = now < (@last_alerted + self.alert_interval)
227
197
  if skip_check
228
- self._log(:debug, "async_autoscaler_skip_check")
198
+ self._debug(:debug, "async_autoscaler_skip_check")
229
199
  return
230
200
  end
231
- self._log(:info, "async_autoscaler_check")
232
- high_latency_queues = Sidekiq::Queue.all.
233
- map { |q| [q.name, q.latency] }.
234
- select { |(_, latency)| latency > self.latency_threshold }.
235
- to_h
201
+ self._debug(:info, "async_autoscaler_check")
202
+ high_latency_queues = self.checker.get_latencies.
203
+ select { |_, latency| latency > self.latency_threshold }
236
204
  if high_latency_queues.empty?
237
205
  # Whenever we are in a latency event, we have a depth > 0. So a depth of 0 means
238
206
  # we're not in a latency event, and still have no latency, so can noop.
239
207
  return if @depth.zero?
240
208
  # We WERE in a latency event, and now we're not, so report on it.
241
- @restored_methods.each do |m|
242
- m.call(depth: @depth, duration: (Time.now - @latency_event_started).to_f)
243
- end
209
+ self.handler.scale_down(depth: @depth, duration: (Time.now - @latency_event_started).to_f)
244
210
  # Reset back to 0 depth so we know we're not in a latency event.
245
211
  @depth = 0
246
212
  @latency_event_started = Time.at(0)
@@ -260,38 +226,47 @@ module Amigo
260
226
  end
261
227
  # Alert each handler. For legacy reasons, we support handlers that accept
262
228
  # ({queues and latencies}) and ({queues and latencies}, {}keywords}).
263
- kw = {depth: @depth, duration: duration}
264
- @alert_methods.each do |m|
265
- if m.respond_to?(:arity) && m.arity == 1
266
- m.call(high_latency_queues)
267
- else
268
- m.call(high_latency_queues, **kw)
269
- end
270
- end
229
+ @handler.scale_up(high_latency_queues, depth: @depth, duration: duration)
271
230
  @last_alerted = now
272
231
  self.persist
273
232
  end
274
233
 
275
- def alert_sentry(names_and_latencies)
276
- Sentry.with_scope do |scope|
277
- scope.set_extras(high_latency_queues: names_and_latencies)
278
- names = names_and_latencies.map(&:first).sort.join(", ")
279
- Sentry.capture_message("Some queues have a high latency: #{names}")
280
- end
234
+ def _debug(lvl, msg, **kw)
235
+ return unless ENV["DEBUG"]
236
+ Amigo.log(nil, lvl, msg, kw)
281
237
  end
282
238
 
283
- def alert_log(names_and_latencies, depth:, duration:)
284
- self._log(:warn, "high_latency_queues", queues: names_and_latencies, depth: depth, duration: duration)
239
+ class Checker
240
+ # Return relevant latencies for this checker.
241
+ # This could be the latencies of each Sidekiq queue, or web latencies, etc.
242
+ # @return [Hash] Key is the queue name (or some other value); value is the latency in seconds.
243
+ def get_latencies = raise NotImplementedError
285
244
  end
286
245
 
287
- def alert_test(_names_and_latencies, _opts={}); end
288
-
289
- def alert_restored_log(depth:, duration:)
290
- self._log(:info, "high_latency_queues_restored", depth: depth, duration: duration)
291
- end
246
+ class Handler
247
+ # Called when a latency event starts, and as it fails to resolve.
248
+ # @param checked_latencies [Hash] The entries of the +Hash+ returned from +Amigo::Autoscaler::Checker#get_latencies+ that exceed +latency_threshold+.
249
+ # For Sidekiq, this will look like `{queue name => latency in seconds}`
250
+ # @param depth [Integer] Number of alerts as part of this latency event.
251
+ # For example, the first alert has a depth of 1, and if latency stays high,
252
+ # it'll be 2 on the next call, etc. +depth+ can be used to incrementally provision
253
+ # additional processing capacity, and stop adding capacity at a certain depth
254
+ # to avoid problems with too many workers (like excessive DB load).
255
+ # @param duration [Float] Number of seconds since this latency spike started.
256
+ # @param kw [Hash] Additional undefined keywords. Handlers should accept additional options,
257
+ # like via `**kw` or `opts={}`, for compatibility.
258
+ # @return [Object] The return value is not used by +Amigo::Autoscaler+.
259
+ def scale_up(checked_latencies, depth:, duration:, **kw) = raise NotImplementedError
292
260
 
293
- protected def _log(level, msg, **kw)
294
- self.log[level, msg, kw]
261
+ # Called when a latency of +latency_restored_threshold+ is reached
262
+ # (ie, when we get back to normal latency after a high latency event).
263
+ # Usually this handler will deprovision capacity procured as part of the +scale_up+.
264
+ # @param depth [Integer] The number of times an alert happened before
265
+ # the latency spike was resolved.
266
+ # @param duration [Float] The number of seconds the latency spike lasted.
267
+ # @param kw [Hash] Additional undefined keywords. Handlers should accept additional options,
268
+ # like via `**kw` or `opts={}`, for compatibility.
269
+ def scale_down(depth:, duration:, **kw) = raise NotImplementedError
295
270
  end
296
271
  end
297
272
  end
data/lib/amigo/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Amigo
4
- VERSION = "1.11.0"
4
+ VERSION = "1.12.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sidekiq-amigo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.0
4
+ version: 1.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lithic Technology
@@ -135,6 +135,34 @@ dependencies:
135
135
  - - "~>"
136
136
  - !ruby/object:Gem::Version
137
137
  version: '5'
138
+ - !ruby/object:Gem::Dependency
139
+ name: simplecov
140
+ requirement: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - "~>"
143
+ - !ruby/object:Gem::Version
144
+ version: '0.22'
145
+ type: :development
146
+ prerelease: false
147
+ version_requirements: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - "~>"
150
+ - !ruby/object:Gem::Version
151
+ version: '0.22'
152
+ - !ruby/object:Gem::Dependency
153
+ name: simplecov-cobertura
154
+ requirement: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - "~>"
157
+ - !ruby/object:Gem::Version
158
+ version: '3.1'
159
+ type: :development
160
+ prerelease: false
161
+ version_requirements: !ruby/object:Gem::Requirement
162
+ requirements:
163
+ - - "~>"
164
+ - !ruby/object:Gem::Version
165
+ version: '3.1'
138
166
  - !ruby/object:Gem::Dependency
139
167
  name: timecop
140
168
  requirement: !ruby/object:Gem::Requirement
@@ -175,7 +203,14 @@ files:
175
203
  - lib/amigo.rb
176
204
  - lib/amigo/audit_logger.rb
177
205
  - lib/amigo/autoscaler.rb
178
- - lib/amigo/autoscaler/heroku.rb
206
+ - lib/amigo/autoscaler/checkers/fake.rb
207
+ - lib/amigo/autoscaler/checkers/sidekiq.rb
208
+ - lib/amigo/autoscaler/checkers/web_latency.rb
209
+ - lib/amigo/autoscaler/handlers/chain.rb
210
+ - lib/amigo/autoscaler/handlers/fake.rb
211
+ - lib/amigo/autoscaler/handlers/heroku.rb
212
+ - lib/amigo/autoscaler/handlers/log.rb
213
+ - lib/amigo/autoscaler/handlers/sentry.rb
179
214
  - lib/amigo/deprecated_jobs.rb
180
215
  - lib/amigo/job.rb
181
216
  - lib/amigo/memory_pressure.rb
@@ -1,145 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "platform-api"
4
-
5
- require "amigo/autoscaler"
6
-
7
- module Amigo
8
- class Autoscaler
9
- # Autoscaler to use on Heroku, that starts additional worker processes when there is a high latency event
10
- # and scales them down after the event is finished.
11
- #
12
- # When the first call of a high latency event happens (depth: 1), this class
13
- # will ask Heroku how many dynos are in the formation. This is known as +active_event_initial_workers+.
14
- #
15
- # If +active_event_initial_workers+ is 0, no autoscaling will be done.
16
- # This avoids a situation where a high latency event is triggered
17
- # due to workers being deprovisioned intentionally, perhaps for maintenance.
18
- #
19
- # Each time the alert fires (see +Amigo::Autoscaler#alert_interval+),
20
- # an additional worker will be added to the formation, up to +max_additional_workers+.
21
- # So with +active_event_initial_workers+ of 1 and +max_additional_workers+ of 2,
22
- # the first time the alert fires, the formation will be set to 2 workers.
23
- # The next time, it'll be set to 3 workers.
24
- # After that, no additional workers will be provisioned.
25
- #
26
- # After the high latency event resolves,
27
- # the dyno formation is restored to +active_event_initial_workers+.
28
- #
29
- # To use:
30
- #
31
- # heroku = PlatformAPI.connect_oauth(heroku_oauth_token)
32
- # heroku_scaler = Amigo::Autoscaler::Heroku.new(heroku:, max_additional_workers: 2)
33
- # Amigo::Autoscaler.new(
34
- # handlers: [heroku_scaler.alert_callback],
35
- # latency_restored_handlers: [heroku_scaler.restored_callback],
36
- # )
37
- #
38
- # See instance attributes for additional options.
39
- #
40
- # Note that this class is provided as an example, and potentially a base or implementation class.
41
- # Your actual implementation may also want to alert when a max depth or duration is reached,
42
- # since it can indicate a bigger problem. Autoscaling, especially of workers, is a tough problem
43
- # without a one-size-fits-all approach.
44
- class Heroku
45
- # Heroku client, usually created via PlatformAPI.connect_oauth.
46
- # @return [PlatformAPI::Client]
47
- attr_reader :heroku
48
-
49
- # Captured at the start of a high latency event.
50
- # Nil otherwise.
51
- # @return [Integer]
52
- attr_reader :active_event_initial_workers
53
-
54
- # Maximum number of workers to add.
55
- #
56
- # As the 'depth' of the alert is increased,
57
- # workers are added to the recorded worker count until the max is reached.
58
- # By default, this is 2 (so the max workers will be the recorded number, plus 2).
59
- # Do not set this too high, since it can for example exhaust database connections or just end up
60
- # increasing load.
61
- #
62
- # See class docs for more information.
63
- # @return [Integer]
64
- attr_reader :max_additional_workers
65
-
66
- # Defaults to HEROKU_APP_NAME, which should already be set if you use Heroku dyno metadata,
67
- # as per https://devcenter.heroku.com/articles/dyno-metadata.
68
- # This must be provided if the env var is missing.
69
- # @return [String]
70
- attr_reader :app_id_or_app_name
71
-
72
- # Defaults to 'worker', which is what you'll probably use if you have a simple system.
73
- # If you use multiple worker processes for different queues, this class probably isn't sufficient.
74
- # You will probably need to look at the slow queue names and determine the formation name to scale up.
75
- # @return [String]
76
- attr_reader :formation_id_or_formation_type
77
-
78
- def initialize(
79
- heroku:,
80
- max_additional_workers: 2,
81
- app_id_or_app_name: ENV.fetch("HEROKU_APP_NAME"),
82
- formation_id_or_formation_type: "worker"
83
- )
84
- @heroku = heroku
85
- @max_additional_workers = max_additional_workers
86
- @app_id_or_app_name = app_id_or_app_name
87
- @formation_id_or_formation_type = formation_id_or_formation_type
88
- # Is nil outside of a latency event, set during a latency event. So if this is initialized to non-nil,
89
- # we're already in a latency event.
90
- @active_event_initial_workers = Sidekiq.redis do |r|
91
- v = r.get("#{namespace}/active_event_initial_workers")
92
- v&.to_i
93
- end
94
- end
95
-
96
- def alert_callback
97
- self.method(:scale_up)
98
- end
99
-
100
- def restored_callback
101
- self.method(:scale_down)
102
- end
103
-
104
- protected def namespace
105
- return "amigo/autoscaler/heroku"
106
- end
107
-
108
- # Potentially add another worker to the formation.
109
- # @return [:noscale, :maxscale, :scaled] One of :noscale (no +active_event_initial_workers+),
110
- # :maxscale (+max_additional_workers+ reached), or :scaled.
111
- def scale_up(_queues_and_latencies, depth:, **)
112
- # When the scaling event starts (or if this is the first time we've seen it
113
- # but the event is already in progress), store how many workers we have.
114
- # It needs to be stored in redis so it persists if
115
- # the latency event continues through restarts.
116
- if @active_event_initial_workers.nil?
117
- @active_event_initial_workers = @heroku.formation.info(@app_id_or_app_name, @formation_id_or_formation_type).
118
- fetch("quantity")
119
- Sidekiq.redis do |r|
120
- r.set("#{namespace}/active_event_initial_workers", @active_event_initial_workers.to_s)
121
- end
122
- end
123
- return :noscale if @active_event_initial_workers.zero?
124
- new_quantity = @active_event_initial_workers + depth
125
- max_quantity = @active_event_initial_workers + @max_additional_workers
126
- return :maxscale if new_quantity > max_quantity
127
- @heroku.formation.update(@app_id_or_app_name, @formation_id_or_formation_type, {quantity: new_quantity})
128
- return :scaled
129
- end
130
-
131
- # Reset the formation to +active_event_initial_workers+.
132
- # @return [:noscale, :scaled] :noscale if +active_event_initial_workers+ is 0, otherwise :scaled.
133
- def scale_down(**)
134
- initial_workers = @active_event_initial_workers
135
- Sidekiq.redis do |r|
136
- r.del("#{namespace}/active_event_initial_workers")
137
- end
138
- @active_event_initial_workers = nil
139
- return :noscale if initial_workers.zero?
140
- @heroku.formation.update(@app_id_or_app_name, @formation_id_or_formation_type, {quantity: initial_workers})
141
- return :scaled
142
- end
143
- end
144
- end
145
- end