sidekiq-amigo 1.10.0 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3d3e3fb7d4b61921787b7e2baf6f6cdb25008877b9e33b786fc967b40bc9e305
4
- data.tar.gz: 497eeba83785fe528b6053b6e1b8158a4459f21bdde74b7b056fe20a17a77e4e
3
+ metadata.gz: 8f2d4776669bc7327b064ae2f0b2f22a83e35cd351da05a145d1b3b7bf086334
4
+ data.tar.gz: b3122b4fa37a8c6c93afbd485ce4b813536a2c1b91d96c266b2c2bb04be15d98
5
5
  SHA512:
6
- metadata.gz: 9c41c3a9b2483ba59ad03b7fbde4bad4c21bb7516f857e5e287805236372cc71f9a08c03da0ee5015286607d71e069b38de3310d6c13bf74b85e2509b7de08b8
7
- data.tar.gz: 40ed4bd8c85cc532d6f31aa1bbeadb3c13fb949555d4f7e31099658bbd3021586466a0af6fe1bfefae102def7a0852fd57a4b72c57e6717415fee488f19edb1a
6
+ metadata.gz: c24ff6b6af38bb638be36dfa18aaca7500df9bf0126a4adf814e179fc05f3784b27004286e4ef796285cd047388d88486ee7da5fe9050454e33fc9b52d8f4698
7
+ data.tar.gz: 52b8edc30efe323786963559a7330058bd6b70552eedbdb4f7d3f3a8727373de6d033f2c0dd0113f9303f80e312d957df1e50c1f0fb5aec3e7bdcd9918b3a3bf
@@ -4,7 +4,7 @@ require "amigo"
4
4
 
5
5
  module Amigo
6
6
  class AuditLogger
7
- include Sidekiq::Worker
7
+ include Sidekiq::Job
8
8
 
9
9
  def audit_log_level
10
10
  return :info
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amigo/autoscaler"
4
+
5
+ module Amigo
6
+ class Autoscaler
7
+ module Checkers
8
+ class Fake < Amigo::Autoscaler::Checker
9
+ def initialize(latencies)
10
+ @latencies = latencies
11
+ super()
12
+ end
13
+
14
+ def get_latencies
15
+ return @latencies.call if @latencies.respond_to?(:call)
16
+ return @latencies.shift if @latencies.is_a?(Array)
17
+ return @latencies
18
+ end
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sidekiq/api"
4
+
5
+ require "amigo/autoscaler"
6
+
7
+ module Amigo
8
+ class Autoscaler
9
+ module Checkers
10
+ class Sidekiq < Amigo::Autoscaler::Checker
11
+ def get_latencies
12
+ return ::Sidekiq::Queue.all.
13
+ map { |q| [q.name, q.latency] }.
14
+ to_h
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amigo/autoscaler"
4
+
5
+ module Amigo
6
+ class Autoscaler
7
+ module Checkers
8
+ class WebLatency < Amigo::Autoscaler::Checker
9
+ NAMESPACE = "amigo/autoscaler/web_latency"
10
+ WINDOW = 60
11
+
12
+ # Set the latency.
13
+ # @param redis [RedisClient::Common] Redis connection.
14
+ # @param namespace [String] Key namespace.
15
+ # @param at [Time,Integer] Time this record was taken.
16
+ # @param duration [Numeric] Duration of the request in fractional seconds.
17
+ def self.set_latency(redis:, namespace:, at:, duration:)
18
+ bucket = at.to_i
19
+ key = "#{namespace}/latencies:#{bucket}"
20
+ duration_ms = (duration * 1000).round
21
+ redis.call("HINCRBY", key, "count", 1)
22
+ redis.call("HINCRBY", key, "sum", duration_ms)
23
+ redis.call("EXPIRE", key, WINDOW + 1)
24
+ end
25
+
26
+ def initialize(redis:, namespace: NAMESPACE)
27
+ @redis = redis
28
+ @namespace = namespace
29
+ super()
30
+ end
31
+
32
+ def get_latencies
33
+ now = Time.now.to_i
34
+ keys = (now - 59..now).map { |t| "#{@namespace}/latencies:#{t}" }
35
+ counts = 0
36
+ sums = 0
37
+ results = @redis.pipelined do |pipeline|
38
+ keys.each do |k|
39
+ pipeline.call("HMGET", k, "count", "sum")
40
+ end
41
+ end
42
+ results.each do |count, sum|
43
+ counts += count.to_i
44
+ sums += sum.to_i
45
+ end
46
+ return {} if counts.zero?
47
+ latency = sums.to_f / counts
48
+ return {"web" => latency.to_f / 1000}
49
+ end
50
+
51
+ class Middleware
52
+ # @param threshold [Float] Do not record the latency of requests faster than this.
53
+ # These are usually just things like healthchecks, files, or other very fast requests
54
+ # which do not represent the overall system slowness.
55
+ def initialize(app, redis:, threshold: 0.08, namespace: NAMESPACE)
56
+ @app = app
57
+ @redis = redis
58
+ @threshold = threshold
59
+ @namespace = namespace
60
+ end
61
+
62
+ def call(env)
63
+ start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
64
+ status, headers, body = @app.call(env)
65
+ duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
66
+ if duration > @threshold
67
+ begin
68
+ WebLatency.set_latency(
69
+ redis: @redis,
70
+ namespace: @namespace,
71
+ at: Time.now,
72
+ duration:,
73
+ )
74
+ rescue StandardError => e
75
+ Amigo.log(nil, :error, "web_latency_error", exception: e)
76
+ end
77
+ end
78
+ [status, headers, body]
79
+ end
80
+ end
81
+ end
82
+ end
83
+ end
84
+ end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amigo/autoscaler"
4
+
5
+ module Amigo
6
+ class Autoscaler
7
+ module Handlers
8
+ class Chain < Amigo::Autoscaler::Handler
9
+ attr_accessor :chain
10
+
11
+ # Chain multiple handlers together.
12
+ # @param chain [Array<Amigo::Autoscaler::Handler>]
13
+ def initialize(chain)
14
+ @chain = chain
15
+ super()
16
+ end
17
+
18
+ def scale_up(*args, **kw)
19
+ @chain.each { |c| c.scale_up(*args, **kw) }
20
+ end
21
+
22
+ def scale_down(*args, **kw)
23
+ @chain.each { |c| c.scale_down(*args, **kw) }
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amigo/autoscaler"
4
+
5
+ module Amigo
6
+ class Autoscaler
7
+ module Handlers
8
+ class Fake < Amigo::Autoscaler::Handler
9
+ attr_accessor :ups, :downs
10
+
11
+ def initialize
12
+ @ups = []
13
+ @downs = []
14
+ super()
15
+ end
16
+
17
+ def scale_up(checked_latencies, depth:, duration:, **kw)
18
+ @ups << [checked_latencies, depth, duration, kw]
19
+ end
20
+
21
+ def scale_down(depth:, duration:, **kw)
22
+ @downs << [depth, duration, kw]
23
+ end
24
+ end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,141 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "platform-api"
4
+
5
+ require "amigo/autoscaler"
6
+
7
+ module Amigo
8
+ class Autoscaler
9
+ module Handlers
10
+ # Autoscaler to use on Heroku, that starts additional worker processes when there is a high latency event
11
+ # and scales them down after the event is finished.
12
+ #
13
+ # When the first call of a high latency event happens (depth: 1), this class
14
+ # will ask Heroku how many dynos are in the formation. This is known as +active_event_initial_workers+.
15
+ #
16
+ # If +active_event_initial_workers+ is 0, no autoscaling will be done.
17
+ # This avoids a situation where a high latency event is triggered
18
+ # due to workers being deprovisioned intentionally, perhaps for maintenance.
19
+ #
20
+ # Each time the alert fires (see +Amigo::Autoscaler#alert_interval+),
21
+ # an additional worker will be added to the formation, up to +max_additional_workers+.
22
+ # So with +active_event_initial_workers+ of 1 and +max_additional_workers+ of 2,
23
+ # the first time the alert fires, the formation will be set to 2 workers.
24
+ # The next time, it'll be set to 3 workers.
25
+ # After that, no additional workers will be provisioned.
26
+ #
27
+ # After the high latency event resolves,
28
+ # the dyno formation is restored to +active_event_initial_workers+.
29
+ #
30
+ # To use:
31
+ #
32
+ # heroku = PlatformAPI.connect_oauth(heroku_oauth_token)
33
+ # heroku_scaler = Amigo::Autoscaler::Heroku.new(heroku:, default_workers: 1)
34
+ # Amigo::Autoscaler.new(
35
+ # handlers: [heroku_scaler.alert_callback],
36
+ # latency_restored_handlers: [heroku_scaler.restored_callback],
37
+ # )
38
+ #
39
+ # See instance attributes for additional options.
40
+ #
41
+ # Note that this class is provided as an example, and potentially a base or implementation class.
42
+ # Your actual implementation may also want to alert when a max depth or duration is reached,
43
+ # since it can indicate a bigger problem. Autoscaling, especially of workers, is a tough problem
44
+ # without a one-size-fits-all approach.
45
+ class Heroku < Amigo::Autoscaler::Handler
46
+ # Heroku client, usually created via PlatformAPI.oauth_connect.
47
+ # @return [PlatformAPI::Client]
48
+ attr_reader :heroku
49
+
50
+ # Captured at the start of a high latency event.
51
+ # Nil otherwise.
52
+ # @return [Integer]
53
+ attr_reader :active_event_initial_workers
54
+
55
+ # Maximum number of workers to add.
56
+ #
57
+ # As the 'depth' of the alert is increased,
58
+ # workers are added to the recorded worker count until the max is reached.
59
+ # By default, this is 2 (so the max workers will be the recorded number, plus 2).
60
+ # Do not set this too high, since it can for example exhaust database connections or just end up
61
+ # increasing load.
62
+ #
63
+ # See class docs for more information.
64
+ # @return [Integer]
65
+ attr_reader :max_additional_workers
66
+
67
+ # Defaults to HEROKU_APP_NAME, which should already be set if you use Heroku dyno metadata,
68
+ # as per https://devcenter.heroku.com/articles/dyno-metadata.
69
+ # This must be provided if the env var is missing.
70
+ # @return [String]
71
+ attr_reader :app_id_or_app_name
72
+
73
+ # Formation ID or name.
74
+ # Usually 'worker' to scale Sidekiq workers, or 'web' for the web worker.
75
+ # If you use multiple worker processes for different queues, this class probably isn't sufficient.
76
+ # You will probably need to look at the slow queue names and determine the formation name to scale up.
77
+ # @return [String]
78
+ attr_reader :formation
79
+
80
+ def initialize(
81
+ client:,
82
+ formation:,
83
+ max_additional_workers: 2,
84
+ app_id_or_app_name: ENV.fetch("HEROKU_APP_NAME")
85
+ )
86
+ super()
87
+ @client = client
88
+ @max_additional_workers = max_additional_workers
89
+ @app_id_or_app_name = app_id_or_app_name
90
+ @formation = formation
91
+ # Is nil outside a latency event, set during a latency event. So if this is initialized to non-nil,
92
+ # we're already in a latency event.
93
+ @active_event_initial_workers = Sidekiq.redis do |r|
94
+ v = r.get("#{namespace}/active_event_initial_workers")
95
+ v&.to_i
96
+ end
97
+ end
98
+
99
+ protected def namespace
100
+ return "amigo/autoscaler/heroku/#{self.formation}"
101
+ end
102
+
103
+ # Potentially add another worker to the formation.
104
+ # @return [:noscale, :maxscale, :scaled] One of :noscale (no +active_event_initial_workers+),
105
+ # :maxscale (+max_additional_workers+ reached), or :scaled.
106
+ def scale_up(_queues_and_latencies, depth:, **)
107
+ # When the scaling event starts (or if this is the first time we've seen it
108
+ # but the event is already in progress), store how many workers we have.
109
+ # It needs to be stored in redis so it persists if
110
+ # the latency event continues through restarts.
111
+ if @active_event_initial_workers.nil?
112
+ @active_event_initial_workers = @client.formation.info(@app_id_or_app_name, @formation).
113
+ fetch("quantity")
114
+ Sidekiq.redis do |r|
115
+ r.set("#{namespace}/active_event_initial_workers", @active_event_initial_workers.to_s)
116
+ end
117
+ end
118
+ return :noscale if @active_event_initial_workers.zero?
119
+ new_quantity = @active_event_initial_workers + depth
120
+ max_quantity = @active_event_initial_workers + @max_additional_workers
121
+ return :maxscale if new_quantity > max_quantity
122
+ @client.formation.update(@app_id_or_app_name, @formation, {quantity: new_quantity})
123
+ return :scaled
124
+ end
125
+
126
+ # Reset the formation to +active_event_initial_workers+.
127
+ # @return [:noscale, :scaled] :noscale if +active_event_initial_workers+ is 0, otherwise :scaled.
128
+ def scale_down(**)
129
+ initial_workers = @active_event_initial_workers
130
+ Sidekiq.redis do |r|
131
+ r.del("#{namespace}/active_event_initial_workers")
132
+ end
133
+ @active_event_initial_workers = nil
134
+ return :noscale if initial_workers.zero?
135
+ @client.formation.update(@app_id_or_app_name, @formation, {quantity: initial_workers})
136
+ return :scaled
137
+ end
138
+ end
139
+ end
140
+ end
141
+ end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amigo/autoscaler"
4
+
5
+ module Amigo
6
+ class Autoscaler
7
+ module Handlers
8
+ class Log < Amigo::Autoscaler::Handler
9
+ DEFAULT_LOG = ->(level, message, params={}) { Amigo.log(nil, level, message, params) }
10
+
11
+ # @param message [String] Log message for structured logging.
12
+ # Has "_restored" appended on +scale_down+.
13
+ # @param log [Proc] Proc/callable called with (level, message, params={}).
14
+ # By default, use +Amigo.log+ (which logs to the Sidekiq logger).
15
+ def initialize(message: "high_latency_queues", log: DEFAULT_LOG)
16
+ @message = message
17
+ @log = log
18
+ super()
19
+ end
20
+
21
+ def scale_up(checked_latencies, depth:, duration:, **_kw)
22
+ self._log(:warn, @message, queues: checked_latencies, depth: depth, duration: duration)
23
+ end
24
+
25
+ def scale_down(depth:, duration:, **_kw)
26
+ self._log(:info, "#{@message}_restored", depth: depth, duration: duration)
27
+ end
28
+
29
+ protected def _log(level, msg, **kw)
30
+ @log[level, msg, kw]
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amigo/autoscaler"
4
+
5
+ module Amigo
6
+ class Autoscaler
7
+ module Handlers
8
+ class Sentry < Amigo::Autoscaler::Handler
9
+ # @param interval [Integer] How many seconds between Sentry alerts?
10
+ # This is similar to +alert_interval+ on the Autoscaler,
11
+ # but Sentry has its own interval, since it is used for reporting,
12
+ # and not latency reduction.
13
+ # @param message [String] Message to capture.
14
+ # @param level [:debug,:info,:warning,:warn,:error,:fatal] Sentry level.
15
+ def initialize(interval: 300, message: "Some queues have a high latency", level: :warn)
16
+ @interval = interval
17
+ @message = message
18
+ @level = level
19
+ @last_alerted = Time.at(0)
20
+ super()
21
+ end
22
+
23
+ def scale_up(checked_latencies, depth:, duration:, **)
24
+ now = Time.now
25
+ call_sentry = @last_alerted < (now - @interval)
26
+ return unless call_sentry
27
+ ::Sentry.with_scope do |scope|
28
+ scope&.set_extras(high_latency_queues: checked_latencies, depth:, duration:)
29
+ ::Sentry.capture_message(@message, level: @level)
30
+ end
31
+ @last_alerted = now
32
+ end
33
+
34
+ def scale_down(**) = nil
35
+ end
36
+ end
37
+ end
38
+ end
@@ -4,36 +4,43 @@ require "sidekiq/api"
4
4
 
5
5
  require "amigo"
6
6
 
7
- # When queues achieve a latency that is too high,
8
- # take some action.
7
+ # Generic autoscaling handler that will check for latency
8
+ # and take an action.
9
+ # For Sidekiq on Heroku for instance,
10
+ # this means checking queues for a latency above a threshold, and adding workers up to a limit.
11
+ #
9
12
  # You should start this up at Web application startup:
10
13
  #
11
14
  # # puma.rb or similar
12
- # Amigo::Autoscaler.new.start
15
+ # checker = Amigo::Autoscaler::Checkers::Sidekiq.new
16
+ # heroku_client = PlatformAPI.connect_oauth(ENV['MYAPP_HEROKU_OAUTH_TOKEN'])
17
+ # handler = Amigo::Autoscaler::Handlers::Heroku.new(client: heroku_client, formation: 'worker')
18
+ # Amigo::Autoscaler.new(checker:, handler:).start
13
19
  #
14
20
  # When latency grows beyond +latency_threshold+,
15
21
  # a "high latency event" is started.
16
- # Some action is taken, which is defined by the +handlers+ argument.
17
- # This includes logging, alerting, and/or autoscaling.
22
+ # Some action should be taken, which is handled by the handler's +scale_up+ method.
23
+ # This usually includes logging, alerting, and/or autoscaling.
18
24
  #
19
25
  # When latency returns to normal (defined by +latency_restored_threshold+),
20
26
  # the high latency event finishes.
21
- # Some additional action is taken, which is defined by the +latency_restored_handlers+ argument.
27
+ # Some additional action is taken, handled by the handler's +scale_down+ method.
22
28
  # Usually this is logging, and/or returning autoscaling to its original status.
23
29
  #
24
30
  # There are several parameters to control behavior, such as how often polling is done,
25
31
  # how often alerting/scaling is done, and more.
26
32
  #
27
- # As an example autoscaler that includes actual resource scaling,
28
- # check out +Amigo::Autoscaler::Heroku+.
29
- # Its ideas can easily be expanded to other platforms.
30
- #
31
33
  # Note that +Autoscaler+ maintains its state over multiple processes;
32
34
  # it needs to keep track of high latency events even if the process running the autoscaler
33
35
  # (usually a web process) restarts.
34
36
  module Amigo
35
37
  class Autoscaler
36
- class InvalidHandler < StandardError; end
38
+ # Struct representing data serialized to Redis.
39
+ # Useful for diagnostics. Can be retried with +fetch_persisted+.
40
+ # @!attribute last_alerted_at [Time] 0-time if there is no recent alert.
41
+ # @!attribute depth [Integer] 0 if not in a latency event.
42
+ # @!attribute latency_event_started_at [Time] 0-time if not in a latency event.
43
+ Persisted = Struct.new(:last_alerted_at, :depth, :latency_event_started_at)
37
44
 
38
45
  # How often should Autoscaler check for latency?
39
46
  # @return [Integer]
@@ -49,49 +56,32 @@ module Amigo
49
56
  # are generally easier to find).
50
57
  # @return [Regexp]
51
58
  attr_reader :hostname_regex
52
- # Methods to call when alerting, as strings/symbols or procs.
53
- # Valid string values are 'log' and 'sentry' (requires Sentry to be required already).
54
- # Anything that responds to +call+ will be invoked with:
55
- # - Positional argument which is a +Hash+ of `{queue name => latency in seconds}`
56
- # - Keyword argument +:depth+: Number of alerts as part of this latency event.
57
- # For example, the first alert has a depth of 1, and if latency stays high,
58
- # it'll be 2 on the next call, etc. +depth+ can be used to incrementally provision
59
- # additional processing capacity, and stop adding capacity at a certain depth
60
- # to avoid problems with too many workers (like excessive DB load).
61
- # - Keyword argument +:duration+: Number of seconds since this latency spike started.
62
- # - Additional undefined keywords. Handlers should accept additional options,
63
- # like via `**kw` or `opts={}`, for compatibility.
64
- # @return [Array<String,Symbol,Proc,#call>]
65
- attr_reader :handlers
66
59
  # Only alert this often.
67
60
  # For example, with poll_interval of 10 seconds
68
61
  # and alert_interval of 200 seconds,
69
62
  # we'd alert once and then 210 seconds later.
70
63
  # @return [Integer]
71
64
  attr_reader :alert_interval
65
+
72
66
  # After an alert happens, what latency should be considered "back to normal" and
73
- # +latency_restored_handlers+ will be called?
67
+ # +scale_down+ will be called?
74
68
  # In most cases this should be the same as (and defaults to) +latency_threshold+
75
69
  # so that we're 'back to normal' once we're below the threshold.
76
70
  # It may also commonly be 0, so that the callback is fired when the queue is entirely clear.
77
71
  # Note that, if +latency_restored_threshold+ is less than +latency_threshold+,
78
72
  # while the latency is between the two, no alerts will fire.
79
73
  attr_reader :latency_restored_threshold
80
- # Methods to call when a latency of +latency_restored_threshold+ is reached
81
- # (ie, when we get back to normal latency after a high latency event).
82
- # Valid string values are 'log'.
83
- # Usually this handler will deprovision capacity procured as part of the alert +handlers+.
84
- # Anything that responds to +call+ will be invoked with:
85
- # - Keyword +:depth+, the number of times an alert happened before
86
- # the latency spike was resolved.
87
- # - Keyword +:duration+, the number of seconds for the latency spike has been going on.
88
- # - Additional undefined keywords. Handlers should accept additional options,
89
- # like via `**kw`, for compatibility.
90
- # @return [Array<String,Symbol,Proc,#call>]
91
- attr_reader :latency_restored_handlers
92
- # Proc/callable called with (level, message, params={}).
93
- # By default, use +Amigo.log+ (which logs to the Sidekiq logger).
94
- attr_reader :log
74
+
75
+ # @return [Amigo::Autoscaler::Checker]
76
+ attr_reader :checker
77
+ # @return [Amigo::Autoscaler::Handler]
78
+ attr_reader :handler
79
+
80
+ # Store autoscaler keys in this Redis namespace.
81
+ # Note that if you are running multiple autoscalers for different services (web, worker),
82
+ # you will need different namespaces.
83
+ attr_reader :namespace
84
+
95
85
  # Proc called with an exception that occurs while the thread is running.
96
86
  # If the handler returns +true+, then the thread will keep going.
97
87
  # All other values will kill the thread, which breaks autoscaling.
@@ -101,15 +91,15 @@ module Amigo
101
91
  attr_reader :on_unhandled_exception
102
92
 
103
93
  def initialize(
94
+ handler:,
95
+ checker:,
104
96
  poll_interval: 20,
105
97
  latency_threshold: 5,
106
98
  hostname_regex: /^web\.1$/,
107
- handlers: [:log],
108
99
  alert_interval: 120,
109
100
  latency_restored_threshold: latency_threshold,
110
- latency_restored_handlers: [:log],
111
- log: ->(level, message, params={}) { Amigo.log(nil, level, message, params) },
112
- on_unhandled_exception: nil
101
+ on_unhandled_exception: nil,
102
+ namespace: "amigo/autoscaler"
113
103
  )
114
104
  raise ArgumentError, "latency_threshold must be > 0" if
115
105
  latency_threshold <= 0
@@ -117,15 +107,15 @@ module Amigo
117
107
  latency_restored_threshold.negative?
118
108
  raise ArgumentError, "latency_restored_threshold must be <= latency_threshold" if
119
109
  latency_restored_threshold > latency_threshold
110
+ @handler = handler
111
+ @checker = checker
120
112
  @poll_interval = poll_interval
121
113
  @latency_threshold = latency_threshold
122
114
  @hostname_regex = hostname_regex
123
- @handlers = handlers.freeze
124
115
  @alert_interval = alert_interval
125
116
  @latency_restored_threshold = latency_restored_threshold
126
- @latency_restored_handlers = latency_restored_handlers.freeze
127
- @log = log
128
117
  @on_unhandled_exception = on_unhandled_exception
118
+ @namespace = namespace
129
119
  end
130
120
 
131
121
  # @return [Thread]
@@ -136,13 +126,20 @@ module Amigo
136
126
  def setup
137
127
  # Store these as strings OR procs, rather than grabbing self.method here.
138
128
 # It gets extremely hard to test if we capture the method here.
139
- @alert_methods = self.handlers.map { |a| _handler_to_method("alert_", a) }
140
- @restored_methods = self.latency_restored_handlers.map { |a| _handler_to_method("alert_restored_", a) }
141
129
  @stop = false
142
- Sidekiq.redis do |r|
143
- @last_alerted = Time.at((r.get("#{namespace}/last_alerted") || 0).to_f)
144
- @depth = (r.get("#{namespace}/depth") || 0).to_i
145
- @latency_event_started = Time.at((r.get("#{namespace}/latency_event_started") || 0).to_f)
130
+ persisted = self.fetch_persisted
131
+ @last_alerted = persisted.last_alerted_at
132
+ @depth = persisted.depth
133
+ @latency_event_started = persisted.latency_event_started_at
134
+ end
135
+
136
+ def fetch_persisted
137
+ return Sidekiq.redis do |r|
138
+ Persisted.new(
139
+ Time.at((r.get("#{namespace}/last_alerted") || 0).to_f),
140
+ (r.get("#{namespace}/depth") || 0).to_i,
141
+ Time.at((r.get("#{namespace}/latency_event_started") || 0).to_f),
142
+ )
146
143
  end
147
144
  end
148
145
 
@@ -165,24 +162,13 @@ module Amigo
165
162
  end
166
163
  end
167
164
 
168
- protected def namespace
169
- return "amigo/autoscaler"
170
- end
171
-
172
- private def _handler_to_method(prefix, a)
173
- return a if a.respond_to?(:call)
174
- method_name = "#{prefix}#{a.to_s.strip}".to_sym
175
- raise InvalidHandler, a.inspect unless (meth = self.method(method_name))
176
- return meth
177
- end
178
-
179
165
  def start
180
166
  raise "already started" unless @polling_thread.nil?
181
167
 
182
168
  hostname = ENV.fetch("DYNO") { Socket.gethostname }
183
169
  return false unless self.hostname_regex.match?(hostname)
184
170
 
185
- self._log(:info, "async_autoscaler_starting")
171
+ self._debug(:info, "async_autoscaler_starting")
186
172
  self.setup
187
173
  @polling_thread = Thread.new do
188
174
  until @stop
@@ -200,7 +186,7 @@ module Amigo
200
186
  def check
201
187
  self._check
202
188
  rescue StandardError => e
203
- self._log(:error, "async_autoscaler_unhandled_error", exception: e)
189
+ self._debug(:error, "async_autoscaler_unhandled_error", exception: e)
204
190
  handled = self.on_unhandled_exception&.call(e)
205
191
  raise e unless handled.eql?(true)
206
192
  end
@@ -209,22 +195,18 @@ module Amigo
209
195
  now = Time.now
210
196
  skip_check = now < (@last_alerted + self.alert_interval)
211
197
  if skip_check
212
- self._log(:debug, "async_autoscaler_skip_check")
198
+ self._debug(:debug, "async_autoscaler_skip_check")
213
199
  return
214
200
  end
215
- self._log(:info, "async_autoscaler_check")
216
- high_latency_queues = Sidekiq::Queue.all.
217
- map { |q| [q.name, q.latency] }.
218
- select { |(_, latency)| latency > self.latency_threshold }.
219
- to_h
201
+ self._debug(:info, "async_autoscaler_check")
202
+ high_latency_queues = self.checker.get_latencies.
203
+ select { |_, latency| latency > self.latency_threshold }
220
204
  if high_latency_queues.empty?
221
205
  # Whenever we are in a latency event, we have a depth > 0. So a depth of 0 means
222
206
  # we're not in a latency event, and still have no latency, so can noop.
223
207
  return if @depth.zero?
224
208
  # We WERE in a latency event, and now we're not, so report on it.
225
- @restored_methods.each do |m|
226
- m.call(depth: @depth, duration: (Time.now - @latency_event_started).to_f)
227
- end
209
+ self.handler.scale_down(depth: @depth, duration: (Time.now - @latency_event_started).to_f)
228
210
  # Reset back to 0 depth so we know we're not in a latency event.
229
211
  @depth = 0
230
212
  @latency_event_started = Time.at(0)
@@ -244,38 +226,47 @@ module Amigo
244
226
  end
245
227
  # Alert each handler. For legacy reasons, we support handlers that accept
246
228
 # ({queues and latencies}) and ({queues and latencies}, {keywords}).
247
- kw = {depth: @depth, duration: duration}
248
- @alert_methods.each do |m|
249
- if m.respond_to?(:arity) && m.arity == 1
250
- m.call(high_latency_queues)
251
- else
252
- m.call(high_latency_queues, **kw)
253
- end
254
- end
229
+ @handler.scale_up(high_latency_queues, depth: @depth, duration: duration)
255
230
  @last_alerted = now
256
231
  self.persist
257
232
  end
258
233
 
259
- def alert_sentry(names_and_latencies)
260
- Sentry.with_scope do |scope|
261
- scope.set_extras(high_latency_queues: names_and_latencies)
262
- names = names_and_latencies.map(&:first).sort.join(", ")
263
- Sentry.capture_message("Some queues have a high latency: #{names}")
264
- end
234
+ def _debug(lvl, msg, **kw)
235
+ return unless ENV["DEBUG"]
236
+ Amigo.log(nil, lvl, msg, kw)
265
237
  end
266
238
 
267
- def alert_log(names_and_latencies, depth:, duration:)
268
- self._log(:warn, "high_latency_queues", queues: names_and_latencies, depth: depth, duration: duration)
239
+ class Checker
240
+ # Return relevant latencies for this checker.
241
+ # This could be the latencies of each Sidekiq queue, or web latencies, etc.
242
+ # @return [Hash] Key is the queue name (or some other value); value is the latency in seconds.
243
+ def get_latencies = raise NotImplementedError
269
244
  end
270
245
 
271
- def alert_test(_names_and_latencies, _opts={}); end
272
-
273
- def alert_restored_log(depth:, duration:)
274
- self._log(:info, "high_latency_queues_restored", depth: depth, duration: duration)
275
- end
246
+ class Handler
247
+ # Called when a latency event starts, and as it fails to resolve.
248
+ # @param checked_latencies [Hash] The +Hash+ returned from +Amigo::Autoscaler::Checker#get_latencies+.
249
+ # For Sidekiq, this will look like `{queue name => latency in seconds}`
250
+ # @param depth [Integer] Number of alerts as part of this latency event.
251
+ # For example, the first alert has a depth of 1, and if latency stays high,
252
+ # it'll be 2 on the next call, etc. +depth+ can be used to incrementally provision
253
+ # additional processing capacity, and stop adding capacity at a certain depth
254
+ # to avoid problems with too many workers (like excessive DB load).
255
+ # @param duration [Float] Number of seconds since this latency spike started.
256
+ # @param kw [Hash] Additional undefined keywords. Handlers should accept additional options,
257
+ # like via `**kw` or `opts={}`, for compatibility.
258
+ # @return [Array<String,Symbol,Proc,#call>]
259
+ def scale_up(checked_latencies, depth:, duration:, **kw) = raise NotImplementedError
276
260
 
277
- protected def _log(level, msg, **kw)
278
- self.log[level, msg, kw]
261
+ # Called when a latency of +latency_restored_threshold+ is reached
262
+ # (ie, when we get back to normal latency after a high latency event).
263
+ # Usually this handler will deprovision capacity procured as part of the +scale_up+.
264
+ # @param depth [Integer] The number of times an alert happened before
265
+ # the latency spike was resolved.
266
+ # @param duration [Float] The number of seconds for the latency spike has been going on.
267
+ # @param kw [Hash] Additional undefined keywords. Handlers should accept additional options,
268
+ # like via `**kw` or `opts={}`, for compatibility.
269
+ def scale_down(depth:, duration:, **kw) = raise NotImplementedError
279
270
  end
280
271
  end
281
272
  end
data/lib/amigo/job.rb CHANGED
@@ -7,7 +7,7 @@ require "amigo"
7
7
  module Amigo
8
8
  module Job
9
9
  def self.extended(cls)
10
- cls.include(Sidekiq::Worker)
10
+ cls.include(Sidekiq::Job)
11
11
  cls.extend(ClassMethods)
12
12
  cls.pattern = ""
13
13
  cls.include(InstanceMethods)
@@ -74,10 +74,23 @@ module Amigo
74
74
  return percentage > self.threshold
75
75
  end
76
76
 
77
- protected def get_memory_info
78
- Sidekiq.redis do |c|
79
- c.info :memory
77
+ def get_memory_info
78
+ s = self.get_memory_info_string
79
+ return self.parse_memory_string(s)
80
+ end
81
+
82
+ protected def get_memory_info_string
83
+ s = Sidekiq.redis do |c|
84
+ c.call("INFO", "MEMORY")
80
85
  end
86
+ return s
87
+ end
88
+
89
+ protected def parse_memory_string(s)
90
+ # See bottom of https://redis.io/docs/latest/commands/info/ for format.
91
+ pairs = s.split("\r\n").reject { |line| line.start_with?("#") }.map { |pair| pair.split(":", 2) }
92
+ h = pairs.to_h
93
+ return h
81
94
  end
82
95
  end
83
96
  end
data/lib/amigo/retry.rb CHANGED
@@ -3,7 +3,7 @@
3
3
  require "sidekiq"
4
4
  require "sidekiq/api"
5
5
 
6
- # Middleware so Sidekiq workers can use a custom retry logic.
6
+ # Middleware so Sidekiq jobs can use a custom retry logic.
7
7
  # See +Amigo::Retry::Retry+, +Amigo::Retry::Die+,
8
8
  # and +Amigo::Retry::OrDie+ for more details
9
9
  # on how these should be used.
@@ -83,6 +83,8 @@ module Amigo
83
83
  end
84
84
 
85
85
  class ServerMiddleware
86
+ include Sidekiq::ServerMiddleware
87
+
86
88
  def call(worker, job, _queue)
87
89
  yield
88
90
  rescue Amigo::Retry::Retry => e
@@ -120,14 +122,14 @@ module Amigo
120
122
  end
121
123
  end
122
124
 
123
- def amigo_retry_in(worker_class, item, interval)
125
+ def amigo_retry_in(job_class, item, interval)
124
126
  # pulled from perform_in
125
127
  int = interval.to_f
126
128
  now = Time.now.to_f
127
129
  ts = (int < 1_000_000_000 ? now + int : int)
128
130
  item["at"] = ts if ts > now
129
131
  item["retry_count"] = item.fetch("retry_count", 0) + 1
130
- worker_class.client_push(item)
132
+ job_class.client_push(item)
131
133
  end
132
134
  end
133
135
  end
data/lib/amigo/router.rb CHANGED
@@ -6,7 +6,7 @@ require "amigo"
6
6
 
7
7
  module Amigo
8
8
  class Router
9
- include Sidekiq::Worker
9
+ include Sidekiq::Job
10
10
 
11
11
  def perform(event_json)
12
12
  event_name = event_json["name"]
@@ -8,7 +8,7 @@ require "amigo"
8
8
  module Amigo
9
9
  module ScheduledJob
10
10
  def self.extended(cls)
11
- cls.include(Sidekiq::Worker)
11
+ cls.include(Sidekiq::Job)
12
12
  cls.sidekiq_options(retry: false)
13
13
  cls.extend(ClassMethods)
14
14
  cls.splay_duration = 30
@@ -33,7 +33,7 @@ require "amigo/memory_pressure"
33
33
  # - `semaphore_expiry` should return the TTL of the semaphore key.
34
34
  # Defaults to 30 seconds. See below for key expiry and negative semaphore value details.
35
35
  # - `before_perform` is called before calling the `perform` method.
36
- # This is required so that implementers can set worker state, based on job arguments,
36
+ # This is required so that implementers can set job state, based on job arguments,
37
37
  # that can be used for calculating the semaphore key.
38
38
  #
39
39
  # Note that we give the semaphore key an expiry. This is to avoid situation where
@@ -41,7 +41,7 @@ require "amigo/memory_pressure"
41
41
  # have fewer than the expected number of jobs running.
42
42
  #
43
43
  # This does mean that, when a job runs longer than the semaphore expiry,
44
- # another worker can be started, which would increment the counter back to 1.
44
+ # another job can be started, which would increment the counter back to 1.
45
45
  # When the original job ends, the counter would be 0; then when the new job ends,
46
46
  # the counter would be -1. To avoid negative counters (which create the same issue
47
47
  # around missing decrements), if we ever detect a negative 'jobs running',
@@ -78,11 +78,11 @@ module Amigo
78
78
 
79
79
  module InstanceMethods
80
80
  def semaphore_key
81
- raise NotImplementedError, "must be implemented on worker"
81
+ raise NotImplementedError, "must be implemented on job"
82
82
  end
83
83
 
84
84
  def semaphore_size
85
- raise NotImplementedError, "must be implemented on worker"
85
+ raise NotImplementedError, "must be implemented on job"
86
86
  end
87
87
 
88
88
  def semaphore_backoff
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "amigo"
4
- require "sidekiq/worker"
5
4
 
6
5
  module Amigo
7
6
  module SpecHelpers
@@ -248,7 +247,7 @@ module Amigo
248
247
  return PerformAsyncJobMatcher.new(job)
249
248
  end
250
249
 
251
- # Like a Sidekiq worker's perform_inline,
250
+ # Like a Sidekiq job's perform_inline,
252
251
  # but allows an arbitrary item to be used, rather than just the
253
252
  # given class and args. For example, when testing,
254
253
  # you may need to assume something like 'retry_count' is in the job payload,
@@ -256,18 +255,18 @@ module Amigo
256
255
  # This allows those arbitrary job payload fields
257
256
  # to be included when the job is run.
258
257
  module_function def sidekiq_perform_inline(klass, args, item=nil)
259
- Sidekiq::Worker::Setter.override_item = item
258
+ Sidekiq::Job::Setter.override_item = item
260
259
  begin
261
260
  klass.perform_inline(*args)
262
261
  ensure
263
- Sidekiq::Worker::Setter.override_item = nil
262
+ Sidekiq::Job::Setter.override_item = nil
264
263
  end
265
264
  end
266
265
 
267
266
  module_function def drain_sidekiq_jobs(q)
268
267
  all_sidekiq_jobs(q).each do |job|
269
268
  klass = job.item.fetch("class")
270
- klass = Sidekiq::Testing.constantize(klass) if klass.is_a?(String)
269
+ klass = Object.const_get(klass) if klass.is_a?(String)
271
270
  sidekiq_perform_inline(klass, job.item["args"], job.item)
272
271
  job.delete
273
272
  end
@@ -282,6 +281,8 @@ module Amigo
282
281
  # Use this middleware to pass an arbitrary callback evaluated before a job runs.
283
282
  # Make sure to call +reset+ after the test.
284
283
  class ServerCallbackMiddleware
284
+ include Sidekiq::ServerMiddleware
285
+
285
286
  class << self
286
287
  attr_accessor :callback
287
288
  end
@@ -304,7 +305,7 @@ module Amigo
304
305
  end
305
306
 
306
307
  module ::Sidekiq
307
- module Worker
308
+ module Job
308
309
  class Setter
309
310
  class << self
310
311
  attr_accessor :override_item
data/lib/amigo/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Amigo
4
- VERSION = "1.10.0"
4
+ VERSION = "1.12.0"
5
5
  end
data/lib/amigo.rb CHANGED
@@ -1,6 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "redis"
4
3
  require "sidekiq"
5
4
  require "sidekiq-cron"
6
5
 
@@ -61,18 +60,18 @@ require "sidekiq-cron"
61
60
  # to control the matching rules more closely than File.fnmatch can provide.
62
61
  #
63
62
  # Jobs must implement a `_perform` method, which takes a Amigo::Event.
64
- # Note that normal Sidekiq workers use a 'perform' method that takes a variable number of arguments;
63
+ # Note that normal Sidekiq jobs use a 'perform' method that takes a variable number of arguments;
65
64
  # the base Async::Job class has this method and delegates its business logic to the subclass _perform method.
66
65
  #
67
66
  # Routing
68
67
  #
69
- # There are two special workers that are important for the overall functioning of the system
70
- # (and do not inherit from Job but rather than Sidekiq::Worker so they are not classified and treated as 'Jobs').
68
+ # There are two special jobs that are important for the overall functioning of the system
69
+ # (and do not inherit from Job but rather than Sidekiq::Job so they are not classified and treated as 'Jobs').
71
70
  #
72
71
  # The first is the AuditLogger, which is a basic job that logs all async events.
73
72
  # This acts as a useful change log for the state of the database.
74
73
  #
75
- # The second special worker is the Router, which calls `perform` on the event Jobs
74
+ # The second special job is the Router, which calls `perform` on the event Jobs
76
75
  # that match the routing information, as explained in Jobs.
77
76
  # It does this by filtering through all event-based jobs and performing the ones with a route match.
78
77
  #
metadata CHANGED
@@ -1,43 +1,42 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sidekiq-amigo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.10.0
4
+ version: 1.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lithic Technology
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2025-06-24 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: sidekiq
15
14
  requirement: !ruby/object:Gem::Requirement
16
15
  requirements:
17
- - - "~>"
16
+ - - ">="
18
17
  - !ruby/object:Gem::Version
19
- version: '6'
18
+ version: '7'
20
19
  type: :runtime
21
20
  prerelease: false
22
21
  version_requirements: !ruby/object:Gem::Requirement
23
22
  requirements:
24
- - - "~>"
23
+ - - ">="
25
24
  - !ruby/object:Gem::Version
26
- version: '6'
25
+ version: '7'
27
26
  - !ruby/object:Gem::Dependency
28
27
  name: sidekiq-cron
29
28
  requirement: !ruby/object:Gem::Requirement
30
29
  requirements:
31
30
  - - "~>"
32
31
  - !ruby/object:Gem::Version
33
- version: '1'
32
+ version: '2'
34
33
  type: :runtime
35
34
  prerelease: false
36
35
  version_requirements: !ruby/object:Gem::Requirement
37
36
  requirements:
38
37
  - - "~>"
39
38
  - !ruby/object:Gem::Version
40
- version: '1'
39
+ version: '2'
41
40
  - !ruby/object:Gem::Dependency
42
41
  name: platform-api
43
42
  requirement: !ruby/object:Gem::Requirement
@@ -58,14 +57,14 @@ dependencies:
58
57
  requirements:
59
58
  - - "~>"
60
59
  - !ruby/object:Gem::Version
61
- version: '2.2'
60
+ version: '3.1'
62
61
  type: :development
63
62
  prerelease: false
64
63
  version_requirements: !ruby/object:Gem::Requirement
65
64
  requirements:
66
65
  - - "~>"
67
66
  - !ruby/object:Gem::Version
68
- version: '2.2'
67
+ version: '3.1'
69
68
  - !ruby/object:Gem::Dependency
70
69
  name: rspec
71
70
  requirement: !ruby/object:Gem::Requirement
@@ -136,6 +135,34 @@ dependencies:
136
135
  - - "~>"
137
136
  - !ruby/object:Gem::Version
138
137
  version: '5'
138
+ - !ruby/object:Gem::Dependency
139
+ name: simplecov
140
+ requirement: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - "~>"
143
+ - !ruby/object:Gem::Version
144
+ version: '0.22'
145
+ type: :development
146
+ prerelease: false
147
+ version_requirements: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - "~>"
150
+ - !ruby/object:Gem::Version
151
+ version: '0.22'
152
+ - !ruby/object:Gem::Dependency
153
+ name: simplecov-cobertura
154
+ requirement: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - "~>"
157
+ - !ruby/object:Gem::Version
158
+ version: '3.1'
159
+ type: :development
160
+ prerelease: false
161
+ version_requirements: !ruby/object:Gem::Requirement
162
+ requirements:
163
+ - - "~>"
164
+ - !ruby/object:Gem::Version
165
+ version: '3.1'
139
166
  - !ruby/object:Gem::Dependency
140
167
  name: timecop
141
168
  requirement: !ruby/object:Gem::Requirement
@@ -176,7 +203,14 @@ files:
176
203
  - lib/amigo.rb
177
204
  - lib/amigo/audit_logger.rb
178
205
  - lib/amigo/autoscaler.rb
179
- - lib/amigo/autoscaler/heroku.rb
206
+ - lib/amigo/autoscaler/checkers/fake.rb
207
+ - lib/amigo/autoscaler/checkers/sidekiq.rb
208
+ - lib/amigo/autoscaler/checkers/web_latency.rb
209
+ - lib/amigo/autoscaler/handlers/chain.rb
210
+ - lib/amigo/autoscaler/handlers/fake.rb
211
+ - lib/amigo/autoscaler/handlers/heroku.rb
212
+ - lib/amigo/autoscaler/handlers/log.rb
213
+ - lib/amigo/autoscaler/handlers/sentry.rb
180
214
  - lib/amigo/deprecated_jobs.rb
181
215
  - lib/amigo/job.rb
182
216
  - lib/amigo/memory_pressure.rb
@@ -193,7 +227,6 @@ licenses:
193
227
  - MIT
194
228
  metadata:
195
229
  rubygems_mfa_required: 'true'
196
- post_install_message:
197
230
  rdoc_options: []
198
231
  require_paths:
199
232
  - lib
@@ -201,15 +234,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
201
234
  requirements:
202
235
  - - ">="
203
236
  - !ruby/object:Gem::Version
204
- version: 3.0.0
237
+ version: 3.2.0
205
238
  required_rubygems_version: !ruby/object:Gem::Requirement
206
239
  requirements:
207
240
  - - ">="
208
241
  - !ruby/object:Gem::Version
209
242
  version: '0'
210
243
  requirements: []
211
- rubygems_version: 3.3.7
212
- signing_key:
244
+ rubygems_version: 3.6.7
213
245
  specification_version: 4
214
246
  summary: Pubsub system and other enhancements around Sidekiq.
215
247
  test_files: []
@@ -1,145 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "platform-api"
4
-
5
- require "amigo/autoscaler"
6
-
7
- module Amigo
8
- class Autoscaler
9
- # Autoscaler to use on Heroku, that starts additional worker processes when there is a high latency event
10
- # and scales them down after the event is finished.
11
- #
12
- # When the first call of a high latency event happens (depth: 1), this class
13
- # will ask Heroku how many dynos are in the formation. This is known as +active_event_initial_workers+.
14
- #
15
- # If +active_event_initial_workers+ is 0, no autoscaling will be done.
16
- # This avoids a situation where a high latency event is triggered
17
- # due to workers being deprovisioned intentionally, perhaps for maintenance.
18
- #
19
- # Each time the alert fires (see +Amigo::Autoscaler#alert_interval+),
20
- # an additional worker will be added to the formation, up to +max_additional_workers+.
21
- # So with +active_event_initial_workers+ of 1 and +max_additional_workers+ of 2,
22
- # the first time the alert times, the formation will be set to 2 workers.
23
- # The next time, it'll be set to 3 workers.
24
- # After that, no additional workers will be provisioned.
25
- #
26
- # After the high latency event resolves,
27
- # the dyno formation is restored to +active_event_initial_workers+.
28
- #
29
- # To use:
30
- #
31
- # heroku = PlatformAPI.connect_oauth(heroku_oauth_token)
32
- # heroku_scaler = Amigo::Autoscaler::Heroku.new(heroku:, default_workers: 1)
33
- # Amigo::Autoscaler.new(
34
- # handlers: [heroku_scaler.alert_callback],
35
- # latency_restored_handlers: [heroku_scaler.restored_callback],
36
- # )
37
- #
38
- # See instance attributes for additional options.
39
- #
40
- # Note that this class is provided as an example, and potentially a base or implementation class.
41
- # Your actual implementation may also want to alert when a max depth or duration is reached,
42
- # since it can indicate a bigger problem. Autoscaling, especially of workers, is a tough problem
43
- # without a one-size-fits-all approach.
44
- class Heroku
45
- # Heroku client, usually created via PlatformAPI.oauth_connect.
46
- # @return [PlatformAPI::Client]
47
- attr_reader :heroku
48
-
49
- # Captured at the start of a high latency event.
50
- # Nil otherwise.
51
- # @return [Integer]
52
- attr_reader :active_event_initial_workers
53
-
54
- # Maximum number of workers to add.
55
- #
56
- # As the 'depth' of the alert is increased,
57
- # workers are added to the recorded worker count until the max is reached.
58
- # By default, this is 2 (so the max workers will be the recorded number, plus 2).
59
- # Do not set this too high, since it can for example exhaust database connections or just end up
60
- # increasing load.
61
- #
62
- # See class docs for more information.
63
- # @return [Integer]
64
- attr_reader :max_additional_workers
65
-
66
- # Defaults to HEROKU_APP_NAME, which should already be set if you use Heroku dyna metadata,
67
- # as per https://devcenter.heroku.com/articles/dyno-metadata.
68
- # This must be provided if the env var is missing.
69
- # @return [String]
70
- attr_reader :app_id_or_app_name
71
-
72
- # Defaults to 'worker', which is what you'll probably use if you have a simple system.
73
- # If you use multiple worker processes for different queues, this class probably isn't sufficient.
74
- # You will probably need to look at the slow queue names and determine the formation name to scale up.
75
- # @return [String]
76
- attr_reader :formation_id_or_formation_type
77
-
78
- def initialize(
79
- heroku:,
80
- max_additional_workers: 2,
81
- app_id_or_app_name: ENV.fetch("HEROKU_APP_NAME"),
82
- formation_id_or_formation_type: "worker"
83
- )
84
- @heroku = heroku
85
- @max_additional_workers = max_additional_workers
86
- @app_id_or_app_name = app_id_or_app_name
87
- @formation_id_or_formation_type = formation_id_or_formation_type
88
- # Is nil outside of a latency event, set during a latency event. So if this is initialized to non-nil,
89
- # we're already in a latency event.
90
- @active_event_initial_workers = Sidekiq.redis do |r|
91
- v = r.get("#{namespace}/active_event_initial_workers")
92
- v&.to_i
93
- end
94
- end
95
-
96
- def alert_callback
97
- self.method(:scale_up)
98
- end
99
-
100
- def restored_callback
101
- self.method(:scale_down)
102
- end
103
-
104
- protected def namespace
105
- return "amigo/autoscaler/heroku"
106
- end
107
-
108
- # Potentially add another worker to the formation.
109
- # @return [:noscale, :maxscale, :scaled] One of :noscale (no +active_event_initial_workers+),
110
- # :maxscale (+max_additional_workers+ reached), or :scaled.
111
- def scale_up(_queues_and_latencies, depth:, **)
112
- # When the scaling event starts (or if this is the first time we've seen it
113
- # but the event is already in progress), store how many workers we have.
114
- # It needs to be stored in redis so it persists if
115
- # the latency event continues through restarts.
116
- if @active_event_initial_workers.nil?
117
- @active_event_initial_workers = @heroku.formation.info(@app_id_or_app_name, @formation_id_or_formation_type).
118
- fetch("quantity")
119
- Sidekiq.redis do |r|
120
- r.set("#{namespace}/active_event_initial_workers", @active_event_initial_workers.to_s)
121
- end
122
- end
123
- return :noscale if @active_event_initial_workers.zero?
124
- new_quantity = @active_event_initial_workers + depth
125
- max_quantity = @active_event_initial_workers + @max_additional_workers
126
- return :maxscale if new_quantity > max_quantity
127
- @heroku.formation.update(@app_id_or_app_name, @formation_id_or_formation_type, {quantity: new_quantity})
128
- return :scaled
129
- end
130
-
131
- # Reset the formation to +active_event_initial_workers+.
132
- # @return [:noscale, :scaled] :noscale if +active_event_initial_workers+ is 0, otherwise :scaled.
133
- def scale_down(**)
134
- initial_workers = @active_event_initial_workers
135
- Sidekiq.redis do |r|
136
- r.del("#{namespace}/active_event_initial_workers")
137
- end
138
- @active_event_initial_workers = nil
139
- return :noscale if initial_workers.zero?
140
- @heroku.formation.update(@app_id_or_app_name, @formation_id_or_formation_type, {quantity: initial_workers})
141
- return :scaled
142
- end
143
- end
144
- end
145
- end