nogara-resque-scheduler 2.0.1 → 2.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -99,17 +99,6 @@ any nonempty value, they will take effect. `VERBOSE` simply dumps more output
99
99
  to stdout. `MUTE` does the opposite and silences all output. `MUTE`
100
100
  supersedes `VERBOSE`.
101
101
 
102
- NOTE: You DO NOT want to run >1 instance of the scheduler. Doing so will
103
- result in the same job being queued more than once. You only need one
104
- instance of the scheduler running per resque instance (regardless of number
105
- of machines).
106
-
107
- If the scheduler process goes down for whatever reason, the delayed items
108
- that should have fired during the outage will fire once the scheduler process
109
- is started back up again (regardless of it being on a new machine). Missed
110
- scheduled jobs, however, will not fire upon recovery of the scheduler process.
111
-
112
-
113
102
 
114
103
  ### Delayed jobs
115
104
 
@@ -280,6 +269,25 @@ custom job class to support the #scheduled method:
280
269
  end
281
270
  end
282
271
 
272
+ ### Redundancy and Fail-Over
273
+
274
+ *>= 2.0.1 only. Prior to 2.0.1, it is not recommended to run multiple resque-scheduler processes and will result in duplicate jobs.*
275
+
276
+ You may want to have resque-scheduler running on multiple machines for
277
 + redundancy. Electing a master and failover is built in and default. Simply
278
 + run resque-scheduler on as many machines as you want pointing to the same
279
+ redis instance and schedule. The scheduler processes will use redis to
280
+ elect a master process and detect failover when the master dies. Precautions are
281
+ taken to prevent jobs from potentially being queued twice during failover even
282
+ when the clocks of the scheduler machines are slightly out of sync (or load affects
283
+ scheduled job firing time). If you want the gory details, look at Resque::SchedulerLocking.
284
+
285
 + If the scheduler process(es) go down for whatever reason, the delayed items
286
+ that should have fired during the outage will fire once the scheduler process
287
+ is started back up again (regardless of it being on a new machine). Missed
288
+ scheduled jobs, however, will not fire upon recovery of the scheduler process.
289
+ Think of scheduled (recurring) jobs as cron jobs - if you stop cron, it doesn't fire
290
+ missed jobs once it starts back up.
283
291
 
284
292
 
285
293
  ### resque-web Additions
@@ -1,16 +1,16 @@
1
1
  require 'rufus/scheduler'
2
2
  require 'thwait'
3
+ require 'resque/scheduler_locking'
3
4
 
4
5
  module Resque
5
6
 
6
7
  class Scheduler
7
8
 
8
9
  extend Resque::Helpers
10
+ extend Resque::SchedulerLocking
9
11
 
10
12
  class << self
11
13
 
12
- LOCK_TIMEOUT = 60 * 5
13
-
14
14
  # If true, logs more stuff...
15
15
  attr_accessor :verbose
16
16
 
@@ -39,43 +39,30 @@ module Resque
39
39
  # trap signals
40
40
  register_signal_handlers
41
41
 
42
- loop do
43
- got_lock = can_lock_scheduler?
44
- if got_lock == true
45
-
46
- # Load the schedule into rufus
47
- # If dynamic is set, load that schedule otherwise use normal load
48
- if dynamic
49
- reload_schedule!
50
- else
51
- load_schedule!
52
- end
53
-
54
- first_time = false
55
-
56
- # Now start the scheduling part of the loop.
42
+ # Load the schedule into rufus
43
+ # If dynamic is set, load that schedule otherwise use normal load
44
+ if dynamic
45
+ reload_schedule!
46
+ else
47
+ load_schedule!
48
+ end
57
49
 
58
- 30.times do #30 * 5 seconds, it should be less than the timeout defined above
59
- # loop do
60
- begin
61
- handle_delayed_items
62
- update_schedule if dynamic
63
- rescue Errno::EAGAIN, Errno::ECONNRESET => e
64
- warn e.message
65
- end
66
- poll_sleep
50
+ # Now start the scheduling part of the loop.
51
+ loop do
52
+ if is_master?
53
+ begin
54
+ handle_delayed_items
55
+ update_schedule if dynamic
56
+ rescue Errno::EAGAIN, Errno::ECONNRESET => e
57
+ warn e.message
67
58
  end
68
-
69
- unlock_scheduler
70
- clear_schedule!
71
-
72
- else
73
- puts "Scheduler locked!!!"
74
- sleep 5
75
59
  end
60
+ poll_sleep
76
61
  end
62
+
77
63
  # never gets here.
78
64
  end
65
+
79
66
 
80
67
  # For all signals, set the shutdown flag and wait for current
81
68
  # poll/enqueuing to finish (should be almost instant). In the
@@ -151,8 +138,10 @@ module Resque
151
138
  if !config[interval_type].nil? && config[interval_type].length > 0
152
139
  args = optionizate_interval_value(config[interval_type])
153
140
  @@scheduled_jobs[name] = rufus_scheduler.send(interval_type, *args) do
154
- log! "queueing #{config['class']} (#{name})"
155
- handle_errors { enqueue_from_config(config) }
141
+ if is_master?
142
+ log! "queueing #{config['class']} (#{name})"
143
+ handle_errors { enqueue_from_config(config) }
144
+ end
156
145
  end
157
146
  interval_defined = true
158
147
  break
@@ -187,7 +176,8 @@ module Resque
187
176
  item = nil
188
177
  begin
189
178
  handle_shutdown do
190
- if item = Resque.next_item_for_timestamp(timestamp)
179
+ # Continually check that it is still the master
180
+ if is_master? && item = Resque.next_item_for_timestamp(timestamp)
191
181
  log "queuing #{item['class']} [delayed]"
192
182
  handle_errors { enqueue_from_config(item) }
193
183
  end
@@ -197,16 +187,8 @@ module Resque
197
187
  end
198
188
 
199
189
  def handle_shutdown
200
- begin
201
- unlock_scheduler if @shutdown
202
- rescue
203
- end
204
190
  exit if @shutdown
205
191
  yield
206
- begin
207
- unlock_scheduler if @shutdown
208
- rescue
209
- end
210
192
  exit if @shutdown
211
193
  end
212
194
 
@@ -324,37 +306,6 @@ module Resque
324
306
  $0 = "resque-scheduler-#{ResqueScheduler::VERSION}: #{string}"
325
307
  end
326
308
 
327
- def lock_timeout
328
- Time.now.utc.to_i + LOCK_TIMEOUT + 1
329
- end
330
-
331
- def can_lock_scheduler?
332
- #using logic from http://redis.io/commands/getset
333
- got_lock = Resque.redis.setnx('scheduler:lock', lock_timeout)
334
- puts "First get lock #{got_lock}"
335
- unless got_lock
336
- timestamp = Resque.redis.get('scheduler:lock').to_i
337
- puts "Timestamp: #{timestamp}"
338
- timestamp_now = Time.now.utc.to_i
339
- puts "Timestamp Now: #{timestamp_now}"
340
- if timestamp_now > timestamp
341
- timestamp_old = Resque.redis.getset('scheduler:lock', lock_timeout).to_i
342
- puts "Timestamp Old: #{timestamp_old}"
343
- if timestamp_old < timestamp_now
344
- puts "Got lock here"
345
- got_lock = true
346
- end
347
- end
348
- end
349
- puts "Second get lock #{got_lock}"
350
- got_lock
351
- end
352
-
353
- def unlock_scheduler
354
- puts "Unlocking scheduler lock"
355
- Resque.redis.del('scheduler:lock')
356
- end
357
-
358
309
  end
359
310
 
360
311
  end
@@ -0,0 +1,111 @@
1
+
2
+ # ### Locking the scheduler process
3
+ #
4
 + # There are two places in resque-scheduler that need to be synchronized
5
+ # in order to be able to run redundant scheduler processes while ensuring jobs don't
6
+ # get queued multiple times when the master process changes.
7
+ #
8
+ # 1) Processing the delayed queues (jobs that are created from enqueue_at/enqueue_in, etc)
9
+ # 2) Processing the scheduled (cron-like) jobs from rufus-scheduler
10
+ #
11
+ # Protecting the delayed queues (#1) is relatively easy. A simple SETNX in
12
+ # redis would suffice. However, protecting the scheduled jobs is trickier
13
+ # because the clocks on machines could be slightly off or actual firing times
14
+ # could vary slightly due to load. If scheduler A's clock is slightly ahead
15
+ # of scheduler B's clock (since they are on different machines), when
16
+ # scheduler A dies, we need to ensure that scheduler B doesn't queue jobs
17
 + # that A already queued before its death. (This all assumes that it is
18
+ # better to miss a few scheduled jobs than it is to run them multiple times
19
+ # for the same iteration.)
20
+ #
21
+ # To avoid queuing multiple jobs in the case of master fail-over, the master
22
+ # should remain the master as long as it can rather than a simple SETNX which
23
 + # would result in the master role being passed around frequently.
24
+ #
25
+ # Locking Scheme:
26
+ # Each resque-scheduler process attempts to get the master lock via SETNX.
27
+ # Once obtained, it sets the expiration for 3 minutes (configurable). The
28
+ # master process continually updates the timeout on the lock key to be 3
29
 + # minutes in the future in its loop(s) (see `run`) and when jobs come out of
30
+ # rufus-scheduler (see `load_schedule_job`). That ensures that a minimum of
31
+ # 3 minutes must pass since the last queuing operation before a new master is
32
+ # chosen. If, for whatever reason, the master fails to update the expiration
33
+ # for 3 minutes, the key expires and the lock is up for grabs. If
34
+ # miraculously the original master comes back to life, it will realize it is
35
+ # no longer the master and stop processing jobs.
36
+ #
37
+ # The clocks on the scheduler machines can then be up to 3 minutes off from
38
+ # each other without the risk of queueing the same scheduled job twice during
39
+ # a master change. The catch is, in the event of a master change, no
40
+ # scheduled jobs will be queued during those 3 minutes. So, there is a trade
41
+ # off: the higher the timeout, the less likely scheduled jobs will be fired
42
+ # twice but greater chances of missing scheduled jobs. The lower the timeout,
43
+ # less likely jobs will be missed, greater the chances of jobs firing twice. If
44
+ # you don't care about jobs firing twice or are certain your machines' clocks
45
+ # are well in sync, a lower timeout is preferable. One thing to keep in mind:
46
 + # this only affects *scheduled* jobs - delayed jobs will never be lost or
47
+ # skipped since eventually a master will come online and it will process
48
+ # everything that is ready (no matter how old it is). Scheduled jobs work
49
+ # like cron - if you stop cron, no jobs fire while it's stopped and it doesn't
50
+ # fire jobs that were missed when it starts up again.
51
+
52
+ module Resque
53
+
54
+ module SchedulerLocking
55
+
56
+ # The TTL (in seconds) for the master lock
57
+ def lock_timeout=(v)
58
+ @lock_timeout = v
59
+ end
60
+
61
+ def lock_timeout
62
+ @lock_timeout ||= 60 * 3 # 3 minutes
63
+ end
64
+
65
+ def hostname
66
+ Socket.gethostbyname(Socket.gethostname).first
67
+ end
68
+
69
+ def process_id
70
+ Process.pid
71
+ end
72
+
73
+ def is_master?
74
+ acquire_master_lock! || has_master_lock?
75
+ end
76
+
77
+ def master_lock_value
78
+ [hostname, process_id].join(':')
79
+ end
80
+
81
+ def master_lock_key
82
+ :master_lock
83
+ end
84
+
85
+ def extend_lock!
86
+ # If the master fails to checkin for 3 minutes, the lock is released and is up for grabs
87
+ Resque.redis.expire(master_lock_key, lock_timeout)
88
+ end
89
+
90
+ def acquire_master_lock!
91
+ if Resque.redis.setnx(master_lock_key, master_lock_value)
92
+ extend_lock!
93
+ true
94
+ end
95
+ end
96
+
97
+ def has_master_lock?
98
+ if Resque.redis.get(master_lock_key) == master_lock_value
99
+ extend_lock!
100
+ # Since this process could lose the lock between checking
101
+ # if it has it and extending the lock, check again to make
102
+ # sure it still has it.
103
+ if Resque.redis.get(master_lock_key) == master_lock_value
104
+ true
105
+ end
106
+ end
107
+ end
108
+
109
+ end
110
+
111
+ end
@@ -258,6 +258,9 @@ module ResqueScheduler
258
258
 
259
259
  def clean_up_timestamp(key, timestamp)
260
260
  # If the list is empty, remove it.
261
+
262
+ # Use a watch here to ensure nobody adds jobs to this delayed
263
+ # queue while we're removing it.
261
264
  redis.watch key
262
265
  if 0 == redis.llen(key).to_i
263
266
  redis.multi do
@@ -268,6 +271,7 @@ module ResqueScheduler
268
271
  redis.unwatch
269
272
  end
270
273
  end
274
+
271
275
  def validate_job!(klass)
272
276
  if klass.to_s.empty?
273
277
  raise Resque::NoClassError.new("Jobs must be given a class.")
@@ -1,3 +1,3 @@
1
1
  module ResqueScheduler
2
- VERSION = '2.0.1'
2
+ VERSION = '2.0.2'
3
3
  end
@@ -4,8 +4,7 @@ context "Resque::Scheduler" do
4
4
 
5
5
  setup do
6
6
  Resque::Scheduler.dynamic = false
7
- Resque.redis.del(:schedules)
8
- Resque.redis.del(:schedules_changed)
7
+ Resque.redis.flushall
9
8
  Resque::Scheduler.mute = true
10
9
  Resque::Scheduler.clear_schedule!
11
10
  Resque::Scheduler.send(:class_variable_set, :@@scheduled_jobs, {})
@@ -237,6 +236,36 @@ context "Resque::Scheduler" do
237
236
  assert Resque.redis.sismember(:schedules_changed, "some_ivar_job3")
238
237
  end
239
238
 
239
+ test "has_master_lock? returns false if lock is set to something else" do
240
+ Resque.redis.set(Resque::Scheduler.master_lock_key, "someothermachine:1234")
241
+ assert !Resque::Scheduler.has_master_lock?
242
+ end
243
+
244
+ test "has_master_lock? returns true if process has lock" do
245
+ assert Resque::Scheduler.acquire_master_lock!, "Should have acquired the master lock"
246
+ assert Resque::Scheduler.has_master_lock?, "Should have the master lock"
247
+ end
248
+
249
+ test "has_master_lock? extends the TTL of the lock key" do
250
+ Resque.redis.setex(Resque::Scheduler.master_lock_key, 5, Resque::Scheduler.master_lock_value)
251
+ Resque::Scheduler.has_master_lock?
252
+ assert Resque.redis.ttl(Resque::Scheduler.master_lock_key) > 5, "TTL should have been updated to 180"
253
+ end
254
+
255
+ test "acquire_master_lock! sets the TTL" do
256
+ assert Resque::Scheduler.acquire_master_lock!
257
+ assert (175..185).include?(Resque.redis.ttl(Resque::Scheduler.master_lock_key)), "TTL should have been updated to 180"
258
+ end
259
+
260
+ test "is_master? should return true if process already has master lock" do
261
+ assert Resque::Scheduler.acquire_master_lock!, "Should have acquired the master lock"
262
+ assert Resque::Scheduler.is_master?, "Should have the lock"
263
+ end
264
+
265
+ test "is_master? should return true if it needs to acquire the lock" do
266
+ assert Resque::Scheduler.is_master?, "Should acquire the lock"
267
+ end
268
+
240
269
  test "adheres to lint" do
241
270
  assert_nothing_raised do
242
271
  Resque::Plugin.lint(Resque::Scheduler)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nogara-resque-scheduler
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: 2.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-15 00:00:00.000000000 Z
12
+ date: 2012-08-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -91,6 +91,7 @@ files:
91
91
  - README.markdown
92
92
  - Rakefile
93
93
  - lib/resque/scheduler.rb
94
+ - lib/resque/scheduler_locking.rb
94
95
  - lib/resque_scheduler.rb
95
96
  - lib/resque_scheduler/plugin.rb
96
97
  - lib/resque_scheduler/server.rb