sidekiq-amigo 1.11.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/amigo/autoscaler/checkers/fake.rb +22 -0
- data/lib/amigo/autoscaler/checkers/sidekiq.rb +19 -0
- data/lib/amigo/autoscaler/checkers/web_latency.rb +84 -0
- data/lib/amigo/autoscaler/handlers/chain.rb +28 -0
- data/lib/amigo/autoscaler/handlers/fake.rb +27 -0
- data/lib/amigo/autoscaler/handlers/heroku.rb +141 -0
- data/lib/amigo/autoscaler/handlers/log.rb +35 -0
- data/lib/amigo/autoscaler/handlers/sentry.rb +38 -0
- data/lib/amigo/autoscaler.rb +71 -96
- data/lib/amigo/version.rb +1 -1
- metadata +37 -2
- data/lib/amigo/autoscaler/heroku.rb +0 -145
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8f2d4776669bc7327b064ae2f0b2f22a83e35cd351da05a145d1b3b7bf086334
+  data.tar.gz: b3122b4fa37a8c6c93afbd485ce4b813536a2c1b91d96c266b2c2bb04be15d98
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c24ff6b6af38bb638be36dfa18aaca7500df9bf0126a4adf814e179fc05f3784b27004286e4ef796285cd047388d88486ee7da5fe9050454e33fc9b52d8f4698
+  data.tar.gz: 52b8edc30efe323786963559a7330058bd6b70552eedbdb4f7d3f3a8727373de6d033f2c0dd0113f9303f80e312d957df1e50c1f0fb5aec3e7bdcd9918b3a3bf
data/lib/amigo/autoscaler/checkers/fake.rb
ADDED
@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Checkers
+      class Fake < Amigo::Autoscaler::Checker
+        def initialize(latencies)
+          @latencies = latencies
+          super()
+        end
+
+        def get_latencies
+          return @latencies.call if @latencies.respond_to?(:call)
+          return @latencies.shift if @latencies.is_a?(Array)
+          return @latencies
+        end
+      end
+    end
+  end
+end
data/lib/amigo/autoscaler/checkers/sidekiq.rb
ADDED
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+require "sidekiq/api"
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Checkers
+      class Sidekiq < Amigo::Autoscaler::Checker
+        def get_latencies
+          return ::Sidekiq::Queue.all.
+              map { |q| [q.name, q.latency] }.
+              to_h
+        end
+      end
+    end
+  end
+end
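This checker is the extracted form of the queue polling that previously lived inside Autoscaler#check itself. A minimal wiring sketch against the new constructor shown later in this diff (the Log handler also ships in this release):

    require "amigo/autoscaler"
    require "amigo/autoscaler/checkers/sidekiq"
    require "amigo/autoscaler/handlers/log"

    Amigo::Autoscaler.new(
      checker: Amigo::Autoscaler::Checkers::Sidekiq.new, # {queue name => latency in seconds}
      handler: Amigo::Autoscaler::Handlers::Log.new,     # logs each scale_up/scale_down
    ).start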
data/lib/amigo/autoscaler/checkers/web_latency.rb
ADDED
@@ -0,0 +1,84 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Checkers
+      class WebLatency < Amigo::Autoscaler::Checker
+        NAMESPACE = "amigo/autoscaler/web_latency"
+        WINDOW = 60
+
+        # Set the latency.
+        # @param redis [RedisClient::Common] Redis connection.
+        # @param namespace [String] Key namespace.
+        # @param at [Time,Integer] Time this record was taken.
+        # @param duration [Numeric] Duration of the request in fractional seconds.
+        def self.set_latency(redis:, namespace:, at:, duration:)
+          bucket = at.to_i
+          key = "#{namespace}/latencies:#{bucket}"
+          duration_ms = (duration * 1000).round
+          redis.call("HINCRBY", key, "count", 1)
+          redis.call("HINCRBY", key, "sum", duration_ms)
+          redis.call("EXPIRE", key, WINDOW + 1)
+        end
+
+        def initialize(redis:, namespace: NAMESPACE)
+          @redis = redis
+          @namespace = namespace
+          super()
+        end
+
+        def get_latencies
+          now = Time.now.to_i
+          keys = (now - 59..now).map { |t| "#{@namespace}/latencies:#{t}" }
+          counts = 0
+          sums = 0
+          results = @redis.pipelined do |pipeline|
+            keys.each do |k|
+              pipeline.call("HMGET", k, "count", "sum")
+            end
+          end
+          results.each do |count, sum|
+            counts += count.to_i
+            sums += sum.to_i
+          end
+          return {} if counts.zero?
+          latency = sums.to_f / counts
+          return {"web" => latency.to_f / 1000}
+        end
+
+        class Middleware
+          # @param threshold [Float] Do not record the latency of requests faster than this.
+          #   These are usually just things like healthchecks, files, or other very fast requests
+          #   which do not represent the overall system slowness.
+          def initialize(app, redis:, threshold: 0.08, namespace: NAMESPACE)
+            @app = app
+            @redis = redis
+            @threshold = threshold
+            @namespace = namespace
+          end
+
+          def call(env)
+            start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+            status, headers, body = @app.call(env)
+            duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
+            if duration > @threshold
+              begin
+                WebLatency.set_latency(
+                  redis: @redis,
+                  namespace: @namespace,
+                  at: Time.now,
+                  duration:,
+                )
+              rescue StandardError => e
+                Amigo.log(nil, :error, "web_latency_error", exception: e)
+              end
+            end
+            [status, headers, body]
+          end
+        end
+      end
+    end
+  end
+end
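The middleware buckets request durations into per-second Redis hashes, and get_latencies averages the last ~60 buckets into a single "web" entry, so web latency can drive the same autoscaler as queue latency. A sketch of the Rack wiring, assuming the redis-client gem (the app and connection setup are illustrative):

    # config.ru
    require "redis-client"
    require "amigo/autoscaler/checkers/web_latency"

    redis = RedisClient.config(url: ENV["REDIS_URL"]).new_client
    use Amigo::Autoscaler::Checkers::WebLatency::Middleware, redis: redis
    run MyApp # stand-in for your Rack application

The scaling process would then poll the same buckets by constructing WebLatency.new(redis: redis) as its checker.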
data/lib/amigo/autoscaler/handlers/chain.rb
ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      class Chain < Amigo::Autoscaler::Handler
+        attr_accessor :chain
+
+        # Chain multiple handlers together.
+        # @param chain [Array<Amigo::Autoscaler::Handler>]
+        def initialize(chain)
+          @chain = chain
+          super()
+        end
+
+        def scale_up(*args, **kw)
+          @chain.each { |c| c.scale_up(*args, **kw) }
+        end
+
+        def scale_down(*args, **kw)
+          @chain.each { |c| c.scale_down(*args, **kw) }
+        end
+      end
+    end
+  end
+end
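Chain is the replacement for the old array-of-handlers argument: each scale_up/scale_down fans out to every handler in order, so alerting and real scaling can be combined. A sketch, assuming heroku_client was built with PlatformAPI.connect_oauth:

    handler = Amigo::Autoscaler::Handlers::Chain.new([
      Amigo::Autoscaler::Handlers::Log.new,
      Amigo::Autoscaler::Handlers::Heroku.new(client: heroku_client, formation: "worker"),
    ])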
data/lib/amigo/autoscaler/handlers/fake.rb
ADDED
@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      class Fake < Amigo::Autoscaler::Handler
+        attr_accessor :ups, :downs
+
+        def initialize
+          @ups = []
+          @downs = []
+          super()
+        end
+
+        def scale_up(checked_latencies, depth:, duration:, **kw)
+          @ups << [checked_latencies, depth, duration, kw]
+        end
+
+        def scale_down(depth:, duration:, **kw)
+          @downs << [depth, duration, kw]
+        end
+      end
+    end
+  end
+end
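Together with Checkers::Fake, this makes the polling loop testable without Sidekiq queues or a scaling backend. A test sketch, assuming the check cycle is driven directly (the latency numbers are illustrative; the default latency_threshold is 5):

    require "amigo/autoscaler"
    require "amigo/autoscaler/checkers/fake"
    require "amigo/autoscaler/handlers/fake"

    checker = Amigo::Autoscaler::Checkers::Fake.new([{"q1" => 10}, {}])
    handler = Amigo::Autoscaler::Handlers::Fake.new
    autoscaler = Amigo::Autoscaler.new(checker:, handler:)
    autoscaler.setup
    autoscaler.check # 10 > 5, so handler.ups gains [{"q1" => 10}, 1, duration, {}]

A second check only runs once alert_interval has elapsed, so a test would pair this with timecop (a dev dependency of this gem, per the metadata below) before asserting on handler.downs.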
data/lib/amigo/autoscaler/handlers/heroku.rb
ADDED
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+
+require "platform-api"
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      # Autoscaler to use on Heroku, that starts additional worker processes when there is a high latency event
+      # and scales them down after the event is finished.
+      #
+      # When the first call of a high latency event happens (depth: 1), this class
+      # will ask Heroku how many dynos are in the formation. This is known as +active_event_initial_workers+.
+      #
+      # If +active_event_initial_workers+ is 0, no autoscaling will be done.
+      # This avoids a situation where a high latency event is triggered
+      # due to workers being deprovisioned intentionally, perhaps for maintenance.
+      #
+      # Each time the alert fires (see +Amigo::Autoscaler#alert_interval+),
+      # an additional worker will be added to the formation, up to +max_additional_workers+.
+      # So with +active_event_initial_workers+ of 1 and +max_additional_workers+ of 2,
+      # the first time the alert fires, the formation will be set to 2 workers.
+      # The next time, it'll be set to 3 workers.
+      # After that, no additional workers will be provisioned.
+      #
+      # After the high latency event resolves,
+      # the dyno formation is restored to +active_event_initial_workers+.
+      #
+      # To use:
+      #
+      #    heroku = PlatformAPI.connect_oauth(heroku_oauth_token)
+      #    handler = Amigo::Autoscaler::Handlers::Heroku.new(client: heroku, formation: "worker")
+      #    Amigo::Autoscaler.new(
+      #      checker: Amigo::Autoscaler::Checkers::Sidekiq.new,
+      #      handler: handler,
+      #    ).start
+      #
+      # See instance attributes for additional options.
+      #
+      # Note that this class is provided as an example, and potentially a base or implementation class.
+      # Your actual implementation may also want to alert when a max depth or duration is reached,
+      # since it can indicate a bigger problem. Autoscaling, especially of workers, is a tough problem
+      # without a one-size-fits-all approach.
+      class Heroku < Amigo::Autoscaler::Handler
+        # Heroku client, usually created via PlatformAPI.connect_oauth.
+        # @return [PlatformAPI::Client]
+        attr_reader :client
+
+        # Captured at the start of a high latency event.
+        # Nil otherwise.
+        # @return [Integer]
+        attr_reader :active_event_initial_workers
+
+        # Maximum number of workers to add.
+        #
+        # As the 'depth' of the alert is increased,
+        # workers are added to the recorded worker count until the max is reached.
+        # By default, this is 2 (so the max workers will be the recorded number, plus 2).
+        # Do not set this too high, since it can for example exhaust database connections or just end up
+        # increasing load.
+        #
+        # See class docs for more information.
+        # @return [Integer]
+        attr_reader :max_additional_workers
+
+        # Defaults to HEROKU_APP_NAME, which should already be set if you use Heroku dyno metadata,
+        # as per https://devcenter.heroku.com/articles/dyno-metadata.
+        # This must be provided if the env var is missing.
+        # @return [String]
+        attr_reader :app_id_or_app_name
+
+        # Formation ID or name.
+        # Usually 'worker' to scale Sidekiq workers, or 'web' for the web worker.
+        # If you use multiple worker processes for different queues, this class probably isn't sufficient.
+        # You will probably need to look at the slow queue names and determine the formation name to scale up.
+        # @return [String]
+        attr_reader :formation
+
+        def initialize(
+          client:,
+          formation:,
+          max_additional_workers: 2,
+          app_id_or_app_name: ENV.fetch("HEROKU_APP_NAME")
+        )
+          super()
+          @client = client
+          @max_additional_workers = max_additional_workers
+          @app_id_or_app_name = app_id_or_app_name
+          @formation = formation
+          # Is nil outside a latency event, set during a latency event. So if this is initialized to non-nil,
+          # we're already in a latency event.
+          @active_event_initial_workers = Sidekiq.redis do |r|
+            v = r.get("#{namespace}/active_event_initial_workers")
+            v&.to_i
+          end
+        end
+
+        protected def namespace
+          return "amigo/autoscaler/heroku/#{self.formation}"
+        end
+
+        # Potentially add another worker to the formation.
+        # @return [:noscale, :maxscale, :scaled] One of :noscale (no +active_event_initial_workers+),
+        #   :maxscale (+max_additional_workers+ reached), or :scaled.
+        def scale_up(_queues_and_latencies, depth:, **)
+          # When the scaling event starts (or if this is the first time we've seen it
+          # but the event is already in progress), store how many workers we have.
+          # It needs to be stored in redis so it persists if
+          # the latency event continues through restarts.
+          if @active_event_initial_workers.nil?
+            @active_event_initial_workers = @client.formation.info(@app_id_or_app_name, @formation).
+              fetch("quantity")
+            Sidekiq.redis do |r|
+              r.set("#{namespace}/active_event_initial_workers", @active_event_initial_workers.to_s)
+            end
+          end
+          return :noscale if @active_event_initial_workers.zero?
+          new_quantity = @active_event_initial_workers + depth
+          max_quantity = @active_event_initial_workers + @max_additional_workers
+          return :maxscale if new_quantity > max_quantity
+          @client.formation.update(@app_id_or_app_name, @formation, {quantity: new_quantity})
+          return :scaled
+        end
+
+        # Reset the formation to +active_event_initial_workers+.
+        # @return [:noscale, :scaled] :noscale if +active_event_initial_workers+ is 0, otherwise :scaled.
+        def scale_down(**)
+          initial_workers = @active_event_initial_workers
+          Sidekiq.redis do |r|
+            r.del("#{namespace}/active_event_initial_workers")
+          end
+          @active_event_initial_workers = nil
+          return :noscale if initial_workers.zero?
+          @client.formation.update(@app_id_or_app_name, @formation, {quantity: initial_workers})
+          return :scaled
+        end
+      end
+    end
+  end
+end
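Because the Redis key now embeds the formation name (the removed handler at the bottom of this diff used one fixed namespace), a single app can scale its web and worker formations independently; the Autoscaler side then needs distinct namespace: values too, per its docs below. A sketch, assuming heroku is a PlatformAPI client and redis a RedisClient connection:

    worker_scaler = Amigo::Autoscaler.new(
      namespace: "amigo/autoscaler/worker",
      checker: Amigo::Autoscaler::Checkers::Sidekiq.new,
      handler: Amigo::Autoscaler::Handlers::Heroku.new(client: heroku, formation: "worker"),
    )
    web_scaler = Amigo::Autoscaler.new(
      namespace: "amigo/autoscaler/web",
      checker: Amigo::Autoscaler::Checkers::WebLatency.new(redis: redis),
      handler: Amigo::Autoscaler::Handlers::Heroku.new(client: heroku, formation: "web"),
    )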
data/lib/amigo/autoscaler/handlers/log.rb
ADDED
@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      class Log < Amigo::Autoscaler::Handler
+        DEFAULT_LOG = ->(level, message, params={}) { Amigo.log(nil, level, message, params) }
+
+        # @param message [String] Log message for structured logging.
+        #   Has "_restored" appended on +scale_down+.
+        # @param log [Proc] Proc/callable called with (level, message, params={}).
+        #   By default, use +Amigo.log+ (which logs to the Sidekiq logger).
+        def initialize(message: "high_latency_queues", log: DEFAULT_LOG)
+          @message = message
+          @log = log
+          super()
+        end
+
+        def scale_up(checked_latencies, depth:, duration:, **_kw)
+          self._log(:warn, @message, queues: checked_latencies, depth: depth, duration: duration)
+        end
+
+        def scale_down(depth:, duration:, **_kw)
+          self._log(:info, "#{@message}_restored", depth: depth, duration: duration)
+        end
+
+        protected def _log(level, msg, **kw)
+          @log[level, msg, kw]
+        end
+      end
+    end
+  end
+end
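The log: callable receives (level, message, params), so alerts can be routed anywhere. A sketch using Ruby's stdlib Logger (the JSON formatting is illustrative):

    require "json"
    require "logger"

    logger = Logger.new($stdout)
    handler = Amigo::Autoscaler::Handlers::Log.new(
      log: ->(level, message, params={}) { logger.public_send(level, "#{message} #{params.to_json}") },
    )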
data/lib/amigo/autoscaler/handlers/sentry.rb
ADDED
@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+
+require "amigo/autoscaler"
+
+module Amigo
+  class Autoscaler
+    module Handlers
+      class Sentry < Amigo::Autoscaler::Handler
+        # @param interval [Integer] How many seconds between Sentry alerts?
+        #   This is similar to +alert_interval+ on the Autoscaler,
+        #   but Sentry has its own interval, since it is used for reporting,
+        #   and not latency reduction.
+        # @param message [String] Message to capture.
+        # @param level [:debug,:info,:warning,:warn,:error,:fatal] Sentry level.
+        def initialize(interval: 300, message: "Some queues have a high latency", level: :warn)
+          @interval = interval
+          @message = message
+          @level = level
+          @last_alerted = Time.at(0)
+          super()
+        end
+
+        def scale_up(checked_latencies, depth:, duration:, **)
+          now = Time.now
+          call_sentry = @last_alerted < (now - @interval)
+          return unless call_sentry
+          ::Sentry.with_scope do |scope|
+            scope&.set_extras(high_latency_queues: checked_latencies, depth:, duration:)
+            ::Sentry.capture_message(@message, level: @level)
+          end
+          @last_alerted = now
+        end
+
+        def scale_down(**) = nil
+      end
+    end
+  end
+end
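The handler tracks its own @last_alerted, so Sentry reports can be throttled independently of how often the autoscaler re-alerts. A sketch pairing chatty logging with infrequent Sentry messages, assuming Sentry.init has already run:

    handler = Amigo::Autoscaler::Handlers::Chain.new([
      Amigo::Autoscaler::Handlers::Log.new,                   # every alert_interval
      Amigo::Autoscaler::Handlers::Sentry.new(interval: 600), # at most every 10 minutes
    ])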
data/lib/amigo/autoscaler.rb
CHANGED
@@ -4,37 +4,37 @@ require "sidekiq/api"
 
 require "amigo"
 
-#
-# take
+# Generic autoscaling handler that will check for latency
+# and take an action.
+# For Sidekiq on Heroku for instance,
+# this means checking queues for a latency above a threshold, and adding workers up to a limit.
+#
 # You should start this up at Web application startup:
 #
 #   # puma.rb or similar
-#   Amigo::Autoscaler.new
+#   checker = Amigo::Autoscaler::Checkers::Sidekiq.new
+#   heroku_client = PlatformAPI.connect_oauth(ENV['MYAPP_HEROKU_OAUTH_TOKEN'])
+#   handler = Amigo::Autoscaler::Handlers::Heroku.new(client: heroku_client, formation: 'worker')
+#   Amigo::Autoscaler.new(checker:, handler:).start
 #
 # When latency grows beyond +latency_threshold+,
 # a "high latency event" is started.
-# Some action
-# This includes logging, alerting, and/or autoscaling.
+# Some action should be taken, which is handled by the handler's +scale_up+ method.
+# This usually includes logging, alerting, and/or autoscaling.
 #
 # When latency returns to normal (defined by +latency_restored_threshold+),
 # the high latency event finishes.
-# Some additional action is taken,
+# Some additional action is taken, handled by the handler's +scale_down+ method.
 # Usually this is logging, and/or returning autoscaling to its original status.
 #
 # There are several parameters to control behavior, such as how often polling is done,
 # how often alerting/scaling is done, and more.
 #
-# As an example autoscaler that includes actual resource scaling,
-# check out +Amigo::Autoscaler::Heroku+.
-# Its ideas can easily be expanded to other platforms.
-#
 # Note that +Autoscaler+ maintains its state over multiple processes;
 # it needs to keep track of high latency events even if the process running the autoscaler
 # (usually a web process) restarts.
 module Amigo
   class Autoscaler
-    class InvalidHandler < StandardError; end
-
     # Struct representing data serialized to Redis.
     # Useful for diagnostics. Can be retrieved with +fetch_persisted+.
     # @!attribute last_alerted_at [Time] 0-time if there is no recent alert.
@@ -56,49 +56,32 @@ module Amigo
     # are generally easier to find).
     # @return [Regexp]
     attr_reader :hostname_regex
-    # Methods to call when alerting, as strings/symbols or procs.
-    # Valid string values are 'log' and 'sentry' (requires Sentry to be required already).
-    # Anything that responds to +call+ will be invoked with:
-    # - Positional argument which is a +Hash+ of `{queue name => latency in seconds}`
-    # - Keyword argument +:depth+: Number of alerts as part of this latency event.
-    #   For example, the first alert has a depth of 1, and if latency stays high,
-    #   it'll be 2 on the next call, etc. +depth+ can be used to incrementally provision
-    #   additional processing capacity, and stop adding capacity at a certain depth
-    #   to avoid problems with too many workers (like excessive DB load).
-    # - Keyword argument +:duration+: Number of seconds since this latency spike started.
-    # - Additional undefined keywords. Handlers should accept additional options,
-    #   like via `**kw` or `opts={}`, for compatibility.
-    # @return [Array<String,Symbol,Proc,#call>]
-    attr_reader :handlers
     # Only alert this often.
     # For example, with poll_interval of 10 seconds
     # and alert_interval of 200 seconds,
     # we'd alert once and then 210 seconds later.
     # @return [Integer]
     attr_reader :alert_interval
+
     # After an alert happens, what latency should be considered "back to normal" and
-    # +
+    # +scale_down+ will be called?
     # In most cases this should be the same as (and defaults to) +latency_threshold+
     # so that we're 'back to normal' once we're below the threshold.
     # It may also commonly be 0, so that the callback is fired when the queue is entirely clear.
     # Note that, if +latency_restored_threshold+ is less than +latency_threshold+,
     # while the latency is between the two, no alerts will fire.
     attr_reader :latency_restored_threshold
-
-    #
-
-    #
-
-
-    #
-    #
-    #
-
-
-    attr_reader :latency_restored_handlers
-    # Proc/callable called with (level, message, params={}).
-    # By default, use +Amigo.log+ (which logs to the Sidekiq logger).
-    attr_reader :log
+
+    # @return [Amigo::Autoscaler::Checker]
+    attr_reader :checker
+    # @return [Amigo::Autoscaler::Handler]
+    attr_reader :handler
+
+    # Store autoscaler keys in this Redis namespace.
+    # Note that if you are running multiple autoscalers for different services (web, worker),
+    # you will need different namespaces.
+    attr_reader :namespace
+
     # Proc called with an exception that occurs while the thread is running.
     # If the handler returns +true+, then the thread will keep going.
     # All other values will kill the thread, which breaks autoscaling.
@@ -108,15 +91,15 @@ module Amigo
     attr_reader :on_unhandled_exception
 
     def initialize(
+      handler:,
+      checker:,
       poll_interval: 20,
       latency_threshold: 5,
       hostname_regex: /^web\.1$/,
-      handlers: [:log],
       alert_interval: 120,
       latency_restored_threshold: latency_threshold,
-
-
-      on_unhandled_exception: nil
+      on_unhandled_exception: nil,
+      namespace: "amigo/autoscaler"
     )
       raise ArgumentError, "latency_threshold must be > 0" if
         latency_threshold <= 0
@@ -124,15 +107,15 @@ module Amigo
         latency_restored_threshold.negative?
       raise ArgumentError, "latency_restored_threshold must be <= latency_threshold" if
         latency_restored_threshold > latency_threshold
+      @handler = handler
+      @checker = checker
       @poll_interval = poll_interval
       @latency_threshold = latency_threshold
       @hostname_regex = hostname_regex
-      @handlers = handlers.freeze
       @alert_interval = alert_interval
       @latency_restored_threshold = latency_restored_threshold
-      @latency_restored_handlers = latency_restored_handlers.freeze
-      @log = log
       @on_unhandled_exception = on_unhandled_exception
+      @namespace = namespace
     end
 
     # @return [Thread]
@@ -143,8 +126,6 @@ module Amigo
     def setup
       # Store these as strings OR procs, rather than grabbing self.method here.
       # It gets extremely hard to test if we capture the method here.
-      @alert_methods = self.handlers.map { |a| _handler_to_method("alert_", a) }
-      @restored_methods = self.latency_restored_handlers.map { |a| _handler_to_method("alert_restored_", a) }
       @stop = false
       persisted = self.fetch_persisted
       @last_alerted = persisted.last_alerted_at
@@ -181,24 +162,13 @@ module Amigo
       end
     end
 
-    protected def namespace
-      return "amigo/autoscaler"
-    end
-
-    private def _handler_to_method(prefix, a)
-      return a if a.respond_to?(:call)
-      method_name = "#{prefix}#{a.to_s.strip}".to_sym
-      raise InvalidHandler, a.inspect unless (meth = self.method(method_name))
-      return meth
-    end
-
     def start
       raise "already started" unless @polling_thread.nil?
 
       hostname = ENV.fetch("DYNO") { Socket.gethostname }
       return false unless self.hostname_regex.match?(hostname)
 
-      self.
+      self._debug(:info, "async_autoscaler_starting")
       self.setup
       @polling_thread = Thread.new do
         until @stop
@@ -216,7 +186,7 @@ module Amigo
     def check
       self._check
     rescue StandardError => e
-      self.
+      self._debug(:error, "async_autoscaler_unhandled_error", exception: e)
       handled = self.on_unhandled_exception&.call(e)
       raise e unless handled.eql?(true)
     end
@@ -225,22 +195,18 @@ module Amigo
       now = Time.now
       skip_check = now < (@last_alerted + self.alert_interval)
       if skip_check
-        self.
+        self._debug(:debug, "async_autoscaler_skip_check")
         return
       end
-      self.
-      high_latency_queues =
-
-        select { |(_, latency)| latency > self.latency_threshold }.
-        to_h
+      self._debug(:info, "async_autoscaler_check")
+      high_latency_queues = self.checker.get_latencies.
+        select { |_, latency| latency > self.latency_threshold }
       if high_latency_queues.empty?
         # Whenever we are in a latency event, we have a depth > 0. So a depth of 0 means
         # we're not in a latency event, and still have no latency, so can noop.
         return if @depth.zero?
         # We WERE in a latency event, and now we're not, so report on it.
-        @
-        m.call(depth: @depth, duration: (Time.now - @latency_event_started).to_f)
-        end
+        self.handler.scale_down(depth: @depth, duration: (Time.now - @latency_event_started).to_f)
         # Reset back to 0 depth so we know we're not in a latency event.
         @depth = 0
         @latency_event_started = Time.at(0)
@@ -260,38 +226,47 @@ module Amigo
       end
       # Alert each handler. For legacy reasons, we support handlers that accept
       # ({queues and latencies}) and ({queues and latencies}, {keywords}).
-
-      @alert_methods.each do |m|
-        if m.respond_to?(:arity) && m.arity == 1
-          m.call(high_latency_queues)
-        else
-          m.call(high_latency_queues, **kw)
-        end
-      end
+      @handler.scale_up(high_latency_queues, depth: @depth, duration: duration)
       @last_alerted = now
       self.persist
     end
 
-    def
-
-
-      names = names_and_latencies.map(&:first).sort.join(", ")
-      Sentry.capture_message("Some queues have a high latency: #{names}")
-    end
+    def _debug(lvl, msg, **kw)
+      return unless ENV["DEBUG"]
+      Amigo.log(nil, lvl, msg, kw)
     end
 
-
-
+    class Checker
+      # Return relevant latencies for this checker.
+      # This could be the latencies of each Sidekiq queue, or web latencies, etc.
+      # @return [Hash] Key is the queue name (or some other value); value is the latency in seconds.
+      def get_latencies = raise NotImplementedError
     end
 
-
-
-
-
-
+    class Handler
+      # Called when a latency event starts, and as it fails to resolve.
+      # @param checked_latencies [Hash] The +Hash+ returned from +Amigo::Autoscaler::Checker#get_latencies+.
+      #   For Sidekiq, this will look like `{queue name => latency in seconds}`
+      # @param depth [Integer] Number of alerts as part of this latency event.
+      #   For example, the first alert has a depth of 1, and if latency stays high,
+      #   it'll be 2 on the next call, etc. +depth+ can be used to incrementally provision
+      #   additional processing capacity, and stop adding capacity at a certain depth
+      #   to avoid problems with too many workers (like excessive DB load).
+      # @param duration [Float] Number of seconds since this latency spike started.
+      # @param kw [Hash] Additional undefined keywords. Handlers should accept additional options,
+      #   like via `**kw` or `opts={}`, for compatibility.
+      # @return [Array<String,Symbol,Proc,#call>]
+      def scale_up(checked_latencies, depth:, duration:, **kw) = raise NotImplementedError
 
-
-
+      # Called when a latency of +latency_restored_threshold+ is reached
+      # (ie, when we get back to normal latency after a high latency event).
+      # Usually this handler will deprovision capacity procured as part of the +scale_up+.
+      # @param depth [Integer] The number of times an alert happened before
+      #   the latency spike was resolved.
+      # @param duration [Float] The number of seconds the latency spike has been going on.
+      # @param kw [Hash] Additional undefined keywords. Handlers should accept additional options,
+      #   like via `**kw` or `opts={}`, for compatibility.
+      def scale_down(depth:, duration:, **kw) = raise NotImplementedError
     end
   end
 end
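With the string/symbol/proc handlers (and InvalidHandler) gone, custom behavior comes from subclassing these bases. A minimal custom handler sketch under the new contract (the paging helper is hypothetical):

    class PagerHandler < Amigo::Autoscaler::Handler
      def scale_up(checked_latencies, depth:, duration:, **)
        # Only page once the event has survived a few alert cycles.
        page_oncall(queues: checked_latencies.keys) if depth >= 3 # hypothetical helper
      end

      # Alert-only handlers can no-op the restore side.
      def scale_down(**) = nil
    end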
data/lib/amigo/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: sidekiq-amigo
 version: !ruby/object:Gem::Version
-  version: 1.11.0
+  version: 1.12.0
 platform: ruby
 authors:
 - Lithic Technology
@@ -135,6 +135,34 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '5'
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.22'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.22'
+- !ruby/object:Gem::Dependency
+  name: simplecov-cobertura
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.1'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.1'
 - !ruby/object:Gem::Dependency
   name: timecop
   requirement: !ruby/object:Gem::Requirement
@@ -175,7 +203,14 @@ files:
 - lib/amigo.rb
 - lib/amigo/audit_logger.rb
 - lib/amigo/autoscaler.rb
-- lib/amigo/autoscaler/heroku.rb
+- lib/amigo/autoscaler/checkers/fake.rb
+- lib/amigo/autoscaler/checkers/sidekiq.rb
+- lib/amigo/autoscaler/checkers/web_latency.rb
+- lib/amigo/autoscaler/handlers/chain.rb
+- lib/amigo/autoscaler/handlers/fake.rb
+- lib/amigo/autoscaler/handlers/heroku.rb
+- lib/amigo/autoscaler/handlers/log.rb
+- lib/amigo/autoscaler/handlers/sentry.rb
 - lib/amigo/deprecated_jobs.rb
 - lib/amigo/job.rb
 - lib/amigo/memory_pressure.rb
data/lib/amigo/autoscaler/heroku.rb
REMOVED
@@ -1,145 +0,0 @@
-# frozen_string_literal: true
-
-require "platform-api"
-
-require "amigo/autoscaler"
-
-module Amigo
-  class Autoscaler
-    # Autoscaler to use on Heroku, that starts additional worker processes when there is a high latency event
-    # and scales them down after the event is finished.
-    #
-    # When the first call of a high latency event happens (depth: 1), this class
-    # will ask Heroku how many dynos are in the formation. This is known as +active_event_initial_workers+.
-    #
-    # If +active_event_initial_workers+ is 0, no autoscaling will be done.
-    # This avoids a situation where a high latency event is triggered
-    # due to workers being deprovisioned intentionally, perhaps for maintenance.
-    #
-    # Each time the alert fires (see +Amigo::Autoscaler#alert_interval+),
-    # an additional worker will be added to the formation, up to +max_additional_workers+.
-    # So with +active_event_initial_workers+ of 1 and +max_additional_workers+ of 2,
-    # the first time the alert times, the formation will be set to 2 workers.
-    # The next time, it'll be set to 3 workers.
-    # After that, no additional workers will be provisioned.
-    #
-    # After the high latency event resolves,
-    # the dyno formation is restored to +active_event_initial_workers+.
-    #
-    # To use:
-    #
-    #    heroku = PlatformAPI.connect_oauth(heroku_oauth_token)
-    #    heroku_scaler = Amigo::Autoscaler::Heroku.new(heroku:, default_workers: 1)
-    #    Amigo::Autoscaler.new(
-    #      handlers: [heroku_scaler.alert_callback],
-    #      latency_restored_handlers: [heroku_scaler.restored_callback],
-    #    )
-    #
-    # See instance attributes for additional options.
-    #
-    # Note that this class is provided as an example, and potentially a base or implementation class.
-    # Your actual implementation may also want to alert when a max depth or duration is reached,
-    # since it can indicate a bigger problem. Autoscaling, especially of workers, is a tough problem
-    # without a one-size-fits-all approach.
-    class Heroku
-      # Heroku client, usually created via PlatformAPI.oauth_connect.
-      # @return [PlatformAPI::Client]
-      attr_reader :heroku
-
-      # Captured at the start of a high latency event.
-      # Nil otherwise.
-      # @return [Integer]
-      attr_reader :active_event_initial_workers
-
-      # Maximum number of workers to add.
-      #
-      # As the 'depth' of the alert is increased,
-      # workers are added to the recorded worker count until the max is reached.
-      # By default, this is 2 (so the max workers will be the recorded number, plus 2).
-      # Do not set this too high, since it can for example exhaust database connections or just end up
-      # increasing load.
-      #
-      # See class docs for more information.
-      # @return [Integer]
-      attr_reader :max_additional_workers
-
-      # Defaults to HEROKU_APP_NAME, which should already be set if you use Heroku dyna metadata,
-      # as per https://devcenter.heroku.com/articles/dyno-metadata.
-      # This must be provided if the env var is missing.
-      # @return [String]
-      attr_reader :app_id_or_app_name
-
-      # Defaults to 'worker', which is what you'll probably use if you have a simple system.
-      # If you use multiple worker processes for different queues, this class probably isn't sufficient.
-      # You will probably need to look at the slow queue names and determine the formation name to scale up.
-      # @return [String]
-      attr_reader :formation_id_or_formation_type
-
-      def initialize(
-        heroku:,
-        max_additional_workers: 2,
-        app_id_or_app_name: ENV.fetch("HEROKU_APP_NAME"),
-        formation_id_or_formation_type: "worker"
-      )
-        @heroku = heroku
-        @max_additional_workers = max_additional_workers
-        @app_id_or_app_name = app_id_or_app_name
-        @formation_id_or_formation_type = formation_id_or_formation_type
-        # Is nil outside of a latency event, set during a latency event. So if this is initialized to non-nil,
-        # we're already in a latency event.
-        @active_event_initial_workers = Sidekiq.redis do |r|
-          v = r.get("#{namespace}/active_event_initial_workers")
-          v&.to_i
-        end
-      end
-
-      def alert_callback
-        self.method(:scale_up)
-      end
-
-      def restored_callback
-        self.method(:scale_down)
-      end
-
-      protected def namespace
-        return "amigo/autoscaler/heroku"
-      end
-
-      # Potentially add another worker to the formation.
-      # @return [:noscale, :maxscale, :scaled] One of :noscale (no +active_event_initial_workers+),
-      #   :maxscale (+max_additional_workers+ reached), or :scaled.
-      def scale_up(_queues_and_latencies, depth:, **)
-        # When the scaling event starts (or if this is the first time we've seen it
-        # but the event is already in progress), store how many workers we have.
-        # It needs to be stored in redis so it persists if
-        # the latency event continues through restarts.
-        if @active_event_initial_workers.nil?
-          @active_event_initial_workers = @heroku.formation.info(@app_id_or_app_name, @formation_id_or_formation_type).
-            fetch("quantity")
-          Sidekiq.redis do |r|
-            r.set("#{namespace}/active_event_initial_workers", @active_event_initial_workers.to_s)
-          end
-        end
-        return :noscale if @active_event_initial_workers.zero?
-        new_quantity = @active_event_initial_workers + depth
-        max_quantity = @active_event_initial_workers + @max_additional_workers
-        return :maxscale if new_quantity > max_quantity
-        @heroku.formation.update(@app_id_or_app_name, @formation_id_or_formation_type, {quantity: new_quantity})
-        return :scaled
-      end
-
-      # Reset the formation to +active_event_initial_workers+.
-      # @return [:noscale, :scaled] :noscale if +active_event_initial_workers+ is 0, otherwise :scaled.
-      def scale_down(**)
-        initial_workers = @active_event_initial_workers
-        Sidekiq.redis do |r|
-          r.del("#{namespace}/active_event_initial_workers")
-        end
-        @active_event_initial_workers = nil
-        return :noscale if initial_workers.zero?
-        @heroku.formation.update(@app_id_or_app_name, @formation_id_or_formation_type, {quantity: initial_workers})
-        return :scaled
-      end
-    end
-  end
-end