sidekiq 3.5.4 → 4.0.0.pre1

@@ -96,7 +96,7 @@ module Sidekiq

       # App code can stuff all sorts of crazy binary data into the error message
       # that won't convert to JSON.
-      m = exception.message.to_s[0, 10_000]
+      m = exception.message[0..10_000]
       if m.respond_to?(:scrub!)
         m.force_encoding("utf-8")
         m.scrub!
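
The new slice keeps roughly the first 10KB of the message before scrubbing, and the `respond_to?` guard exists because `String#scrub!` only arrived in Ruby 2.1. A standalone sketch of the same truncate-and-scrub path (the sample message is invented):

    # Simulate a job raising an error whose message contains invalid UTF-8.
    raw = "failed: \xC3\x28 bad byte".b   # binary junk from app code
    m = raw[0..10_000]                    # truncate before doing anything else
    if m.respond_to?(:scrub!)             # Ruby 2.1+
      m.force_encoding("utf-8")
      m.scrub!                            # invalid bytes become "\uFFFD"
    end
    puts m                                # => "failed: �( bad byte"
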
@@ -1,59 +1,135 @@
 require 'sidekiq/util'
-require 'sidekiq/actor'
-
-require 'sidekiq/middleware/server/retry_jobs'
-require 'sidekiq/middleware/server/logging'
+require 'sidekiq/fetch'
+require 'thread'
+require 'concurrent/map'
+require 'concurrent/atomic/atomic_fixnum'

 module Sidekiq
   ##
-  # The Processor receives a message from the Manager and actually
-  # processes it. It instantiates the worker, runs the middleware
-  # chain and then calls Sidekiq::Worker#perform.
+  # The Processor is a standalone thread which:
+  #
+  # 1. fetches a job from Redis
+  # 2. executes the job
+  #   a. instantiate the Worker
+  #   b. run the middleware chain
+  #   c. call #perform
+  #
+  # A Processor can exit due to shutdown (processor_stopped)
+  # or due to an error during job execution (processor_died)
+  #
+  # If an error occurs in the job execution, the
+  # Processor calls the Manager to create a new one
+  # to replace itself and exits.
+  #
   class Processor
-    # To prevent a memory leak, ensure that stats expire. However, they should take up a minimal amount of storage
-    # so keep them around for a long time
-    STATS_TIMEOUT = 24 * 60 * 60 * 365 * 5

     include Util
-    include Actor
-
-    def self.default_middleware
-      Middleware::Chain.new do |m|
-        m.add Middleware::Server::Logging
-        m.add Middleware::Server::RetryJobs
-        if defined?(::ActiveRecord::Base)
-          require 'sidekiq/middleware/server/active_record'
-          m.add Sidekiq::Middleware::Server::ActiveRecord
+
+    attr_reader :thread
+    attr_reader :job
+
+    def initialize(mgr)
+      @mgr = mgr
+      @down = false
+      @done = false
+      @job = nil
+      @thread = nil
+      @strategy = (mgr.options[:fetch] || Sidekiq::BasicFetch).new(mgr.options)
+    end
+
+    def terminate(wait=false)
+      @done = true
+      return if !@thread
+      @thread.value if wait
+    end
+
+    def kill(wait=false)
+      @done = true
+      return if !@thread
+      # unlike the other actors, terminate does not wait
+      # for the thread to finish because we don't know how
+      # long the job will take to finish. Instead we
+      # provide a `kill` method to call after the shutdown
+      # timeout passes.
+      @thread.raise ::Sidekiq::Shutdown
+      @thread.value if wait
+    end
+
+    def start
+      @thread ||= safe_thread("processor", &method(:run))
+    end
+
+    private unless $TESTING
+
+    def run
+      begin
+        while !@done
+          process_one
         end
+        @mgr.processor_stopped(self)
+      rescue Sidekiq::Shutdown
+        @mgr.processor_stopped(self)
+      rescue Exception => ex
+        @mgr.processor_died(self, ex)
       end
     end

-    attr_accessor :proxy_id
+    def process_one
+      @job = fetch
+      process(@job) if @job
+      @job = nil
+    end
+
+    def get_one
+      begin
+        work = @strategy.retrieve_work
+        (logger.info { "Redis is online, #{Time.now - @down} sec downtime" }; @down = nil) if @down
+        work
+      rescue Sidekiq::Shutdown
+      rescue => ex
+        handle_fetch_exception(ex)
+      end
+    end
+
+    def fetch
+      j = get_one
+      if j && @done
+        j.requeue
+        nil
+      else
+        j
+      end
+    end

-    def initialize(boss)
-      @boss = boss
+    def handle_fetch_exception(ex)
+      if !@down
+        @down = Time.now
+        logger.error("Error fetching job: #{ex}")
+        ex.backtrace.each do |bt|
+          logger.error(bt)
+        end
+      end
+      sleep(1)
     end

     def process(work)
-      msgstr = work.message
+      jobstr = work.job
       queue = work.queue_name

-      @boss.async.real_thread(proxy_id, Thread.current)
-
       ack = false
       begin
-        msg = Sidekiq.load_json(msgstr)
-        klass = msg['class'.freeze].constantize
+        job = Sidekiq.load_json(jobstr)
+        klass = job['class'.freeze].constantize
         worker = klass.new
-        worker.jid = msg['jid'.freeze]
+        worker.jid = job['jid'.freeze]

-        stats(worker, msg, queue) do
-          Sidekiq.server_middleware.invoke(worker, msg, queue) do
+        stats(worker, job, queue) do
+          Sidekiq.server_middleware.invoke(worker, job, queue) do
             # Only ack if we either attempted to start this job or
             # successfully completed it. This prevents us from
             # losing jobs if a middleware raises an exception before yielding
             ack = true
-            execute_job(worker, cloned(msg['args'.freeze]))
+            execute_job(worker, cloned(job['args'.freeze]))
           end
         end
         ack = true
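
The hunk above swaps the Celluloid actor (`include Actor`, `@boss.async` callbacks) for a plain Ruby thread owned by the Manager. A minimal, framework-free sketch of that lifecycle contract, using invented stand-ins (`MiniProcessor`, a `Queue` of callables) rather than Sidekiq's real classes:

    # Hypothetical stand-ins to show the lifecycle, not Sidekiq's real API.
    class Shutdown < Interrupt; end

    class MiniProcessor
      attr_reader :thread

      def initialize(mgr, jobs)
        @mgr, @jobs, @done = mgr, jobs, false
      end

      def start
        @thread ||= Thread.new { run }
      end

      # Cooperative stop: the loop exits once the current job finishes.
      # (Sidekiq's retrieve_work blocks with a short timeout, so @done
      # is rechecked regularly even when no job arrives.)
      def terminate(wait = false)
        @done = true
        @thread.value if wait && @thread
      end

      # Forceful stop: interrupt a job that outlived the shutdown timeout.
      def kill
        @done = true
        @thread.raise(Shutdown) if @thread
      end

      private

      def run
        until @done
          job = @jobs.pop              # stand-in for @strategy.retrieve_work
          job.call if job
        end
        @mgr.processor_stopped(self)
      rescue Shutdown
        @mgr.processor_stopped(self)
      rescue Exception => ex
        @mgr.processor_died(self, ex)  # Manager replaces the dead processor
      end
    end
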
@@ -63,99 +139,46 @@ module Sidekiq
         # we didn't properly finish it.
         ack = false
       rescue Exception => ex
-        handle_exception(ex, msg || { :message => msgstr })
+        handle_exception(ex, job || { :job => jobstr })
         raise
       ensure
         work.acknowledge if ack
       end
-
-      @boss.async.processor_done(current_actor)
-    end
-
-    def inspect
-      "<Processor##{object_id.to_s(16)}>"
     end

     def execute_job(worker, cloned_args)
       worker.perform(*cloned_args)
     end

-    private
-
     def thread_identity
       @str ||= Thread.current.object_id.to_s(36)
     end

-    def stats(worker, msg, queue)
-      # Do not conflate errors from the job with errors caused by updating
-      # stats so calling code can react appropriately
-      retry_and_suppress_exceptions do
-        hash = Sidekiq.dump_json({:queue => queue, :payload => msg, :run_at => Time.now.to_i })
-        Sidekiq.redis do |conn|
-          conn.multi do
-            conn.hmset("#{identity}:workers", thread_identity, hash)
-            conn.expire("#{identity}:workers", 60*60*4)
-          end
-        end
-      end
+    WORKER_STATE = Concurrent::Map.new
+    PROCESSED = Concurrent::AtomicFixnum.new
+    FAILURE = Concurrent::AtomicFixnum.new
+
+    def stats(worker, job, queue)
+      tid = thread_identity
+      WORKER_STATE[tid] = {:queue => queue, :payload => job, :run_at => Time.now.to_i }

-      nowdate = Time.now.utc.strftime("%Y-%m-%d".freeze)
       begin
         yield
       rescue Exception
-        retry_and_suppress_exceptions do
-          failed = "stat:failed:#{nowdate}"
-          Sidekiq.redis do |conn|
-            conn.multi do
-              conn.incrby("stat:failed".freeze, 1)
-              conn.incrby(failed, 1)
-              conn.expire(failed, STATS_TIMEOUT)
-            end
-          end
-        end
+        FAILURE.increment
         raise
       ensure
-        retry_and_suppress_exceptions do
-          processed = "stat:processed:#{nowdate}"
-          Sidekiq.redis do |conn|
-            conn.multi do
-              conn.hdel("#{identity}:workers", thread_identity)
-              conn.incrby("stat:processed".freeze, 1)
-              conn.incrby(processed, 1)
-              conn.expire(processed, STATS_TIMEOUT)
-            end
-          end
-        end
+        WORKER_STATE.delete(tid)
+        PROCESSED.increment
       end
     end

     # Deep clone the arguments passed to the worker so that if
-    # the message fails, what is pushed back onto Redis hasn't
+    # the job fails, what is pushed back onto Redis hasn't
     # been mutated by the worker.
     def cloned(ary)
       Marshal.load(Marshal.dump(ary))
     end

-    # If an exception occurs in the block passed to this method, that block will be retried up to max_retries times.
-    # All exceptions will be swallowed and logged.
-    def retry_and_suppress_exceptions(max_retries = 5)
-      retry_count = 0
-      begin
-        yield
-      rescue => e
-        retry_count += 1
-        if retry_count <= max_retries
-          Sidekiq.logger.debug {"Suppressing and retrying error: #{e.inspect}"}
-          pause_for_recovery(retry_count)
-          retry
-        else
-          handle_exception(e, { :message => "Exhausted #{max_retries} retries"})
-        end
-      end
-    end
-
-    def pause_for_recovery(retry_count)
-      sleep(retry_count)
-    end
   end
 end
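
Job bookkeeping also moves off Redis: instead of per-day `stat:*` keys written in the hot path, with `retry_and_suppress_exceptions` wrapped around every write, each process now tracks state in concurrent-ruby primitives, which can be flushed to Redis in one place (the heartbeat, not shown in this diff). A small standalone sketch of how those primitives behave:

    require 'concurrent/map'
    require 'concurrent/atomic/atomic_fixnum'

    PROCESSED    = Concurrent::AtomicFixnum.new  # lock-free counter, starts at 0
    WORKER_STATE = Concurrent::Map.new           # thread-safe hash

    threads = 10.times.map do
      Thread.new do
        tid = Thread.current.object_id.to_s(36)
        WORKER_STATE[tid] = { :run_at => Time.now.to_i }  # "busy" while working
        begin
          # ... do the job ...
        ensure
          WORKER_STATE.delete(tid)
          PROCESSED.increment
        end
      end
    end
    threads.each(&:join)

    p PROCESSED.value    # => 10
    p WORKER_STATE.size  # => 0
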
@@ -9,10 +9,11 @@ module Sidekiq
       def create(options={})
         options[:url] ||= determine_redis_provider

-        # need a connection for Fetcher and Retry
         size = options[:size] || (Sidekiq.server? ? (Sidekiq.options[:concurrency] + 2) : 5)
-        pool_timeout = options[:pool_timeout] || 1

+        verify_sizing(size, Sidekiq.options[:concurrency]) if Sidekiq.server?
+
+        pool_timeout = options[:pool_timeout] || 1
         log_info(options)

         ConnectionPool.new(:timeout => pool_timeout, :size => size) do
@@ -22,13 +23,30 @@ module Sidekiq

       private

+      # Sidekiq needs a lot of concurrent Redis connections.
+      #
+      # We need a connection for each Processor.
+      # We need a connection for Pro's real-time change listener
+      # We need a connection to various features to call Redis every few seconds:
+      #   - the process heartbeat.
+      #   - enterprise's leader election
+      #   - enterprise's cron support
+      def verify_sizing(size, concurrency)
+        raise ArgumentError, "Your Redis connection pool is too small for Sidekiq to work, your pool has #{size} connections but really needs to have at least #{concurrency + 2}" if size <= concurrency
+      end
+
       def build_client(options)
         namespace = options[:namespace]

         client = Redis.new client_opts(options)
         if namespace
-          require 'redis/namespace'
-          Redis::Namespace.new(namespace, :redis => client)
+          begin
+            require 'redis/namespace'
+            Redis::Namespace.new(namespace, :redis => client)
+          rescue LoadError
+            Sidekiq.logger.error("redis-namespace gem not included in Gemfile, cannot use namespace '#{namespace}'")
+            exit(-127)
+          end
         else
           client
         end
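
The server pool must now be strictly larger than the concurrency, or `create` raises at boot instead of deadlocking under load later. With the default concurrency of 25 the pool defaults to 27 connections, so an explicit `:size` needs the same headroom; a hedged configuration sketch (the URL and size are illustrative):

    Sidekiq.configure_server do |config|
      # concurrency 25 -> pool must be > 25; the default (25 + 2 = 27) is safe
      config.redis = { :url => 'redis://localhost:6379/0', :size => 27 }
    end
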
@@ -1,6 +1,5 @@
 require 'sidekiq'
 require 'sidekiq/util'
-require 'sidekiq/actor'
 require 'sidekiq/api'

 module Sidekiq
@@ -17,7 +16,7 @@ module Sidekiq
             # We need to go through the list one at a time to reduce the risk of something
             # going wrong between the time jobs are popped from the scheduled queue and when
             # they are pushed onto a work queue and losing the jobs.
-            while job = conn.zrangebyscore(sorted_set, '-inf', now, :limit => [0, 1]).first do
+            while job = conn.zrangebyscore(sorted_set, '-inf'.freeze, now, :limit => [0, 1]).first do

               # Pop item off the queue and add it to the work queue. If the job can't be popped from
               # the queue, it's because another process already popped it so we can move on to the
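
The pop itself uses ZREM as the arbiter: only the process whose ZREM actually removes the member gets to enqueue the job, so concurrent pollers never double-enqueue. A hedged sketch of the pattern outside Sidekiq ('schedule' is the standard scheduled set; the push mirrors what Enq does):

    Sidekiq.redis do |conn|
      now = Time.now.to_f.to_s
      while job = conn.zrangebyscore('schedule', '-inf', now, :limit => [0, 1]).first
        # zrem returns true only for the one process that removed the member,
        # so a job seen by several pollers is still enqueued exactly once.
        Sidekiq::Client.push(Sidekiq.load_json(job)) if conn.zrem('schedule', job)
      end
    end
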
@@ -39,33 +38,56 @@ module Sidekiq
   # workers can pick it up like any other job.
   class Poller
     include Util
-    include Actor

     INITIAL_WAIT = 10

     def initialize
       @enq = (Sidekiq.options[:scheduled_enq] || Sidekiq::Scheduled::Enq).new
+      @sleeper = ConnectionPool::TimedStack.new
+      @done = false
     end

-    def poll(first_time=false)
-      watchdog('scheduling poller thread died!') do
-        initial_wait if first_time
-
-        begin
-          @enq.enqueue_jobs
-        rescue => ex
-          # Most likely a problem with redis networking.
-          # Punt and try again at the next interval
-          logger.error ex.message
-          logger.error ex.backtrace.first
+    # Shut down this instance, will pause until the thread is dead.
+    def terminate
+      @done = true
+      if @thread
+        t = @thread
+        @thread = nil
+        @sleeper << 0
+        t.value
+      end
+    end
+
+    def start
+      @thread ||= safe_thread("scheduler") do
+        initial_wait
+
+        while !@done
+          enqueue
+          wait
         end
+        Sidekiq.logger.info("Scheduler exiting...")
+      end
+    end

-        after(random_poll_interval) { poll }
+    def enqueue
+      begin
+        @enq.enqueue_jobs
+      rescue => ex
+        # Most likely a problem with redis networking.
+        # Punt and try again at the next interval
+        logger.error ex.message
+        logger.error ex.backtrace.first
       end
     end

     private

+    def wait
+      @sleeper.pop(random_poll_interval)
+    rescue Timeout::Error
+    end
+
     # Calculates a random interval that is ±50% the desired average.
     def random_poll_interval
       poll_interval_average * rand + poll_interval_average.to_f / 2
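
With a `poll_interval_average` of, say, 15 seconds (the value is configurable and may be scaled by process count), each cycle therefore sleeps a uniform random time in [7.5, 22.5):

    avg = 15.0                        # assume poll_interval_average == 15
    interval = avg * rand + avg / 2   # rand in [0,1) -> interval in [7.5, 22.5)
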
@@ -83,7 +105,7 @@ module Sidekiq
     # all your Sidekiq processes at the same time will lead to them all polling at
     # the same time: the thundering herd problem.
     #
-    # We only do this if poll_interval is unset (the default).
+    # We only do this if poll_interval_average is unset (the default).
     def poll_interval_average
       Sidekiq.options[:poll_interval_average] ||= scaled_poll_interval
     end
@@ -98,16 +120,15 @@ module Sidekiq
     end

     def initial_wait
-      begin
-        # Have all processes sleep between 5-15 seconds. 10 seconds
-        # to give time for the heartbeat to register (if the poll interval is going to be calculated by the number
-        # of workers), and 5 random seconds to ensure they don't all hit Redis at the same time.
-        sleep(INITIAL_WAIT) unless Sidekiq.options[:poll_interval_average]
-        sleep(5 * rand)
-      rescue Celluloid::TaskTerminated
-        # Hit Ctrl-C when Sidekiq is finished booting and we have a chance
-        # to get here.
-      end
+      # Have all processes sleep between 5-15 seconds. 10 seconds
+      # to give time for the heartbeat to register (if the poll interval is going to be calculated by the number
+      # of workers), and 5 random seconds to ensure they don't all hit Redis at the same time.
+      total = 0
+      total += INITIAL_WAIT unless Sidekiq.options[:poll_interval_average]
+      total += (5 * rand)
+
+      @sleeper.pop(total)
+    rescue Timeout::Error
     end

   end
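
`ConnectionPool::TimedStack` doubles here as an interruptible sleep: `pop` blocks for up to the given timeout and raises a `Timeout::Error` if nothing was pushed, while pushing any value wakes the sleeper immediately, which is how `terminate` cuts both `initial_wait` and the poll wait short. A standalone sketch:

    require 'connection_pool'

    sleeper = ConnectionPool::TimedStack.new

    t = Thread.new do
      begin
        sleeper.pop(30)        # "sleep" for up to 30 seconds...
        puts "woken early"
      rescue Timeout::Error
        puts "slept the full 30 seconds"
      end
    end

    sleep 0.1
    sleeper << 0               # ...but wake it immediately, as #terminate does
    t.join                     # prints "woken early"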