nogara-resque-scheduler 2.0.1 → 2.0.2

This diff shows the published contents of the two package versions as they appear in the public registry, and is provided for informational purposes only.
README.markdown CHANGED
@@ -99,17 +99,6 @@ any nonempty value, they will take effect. `VERBOSE` simply dumps more output
  to stdout. `MUTE` does the opposite and silences all output. `MUTE`
  supersedes `VERBOSE`.

- NOTE: You DO NOT want to run >1 instance of the scheduler. Doing so will
- result in the same job being queued more than once. You only need one
- instance of the scheduler running per resque instance (regardless of number
- of machines).
-
- If the scheduler process goes down for whatever reason, the delayed items
- that should have fired during the outage will fire once the scheduler process
- is started back up again (regardless of it being on a new machine). Missed
- scheduled jobs, however, will not fire upon recovery of the scheduler process.
-
-

  ### Delayed jobs

@@ -280,6 +269,25 @@ custom job class to support the #scheduled method:
  end
  end

+ ### Redundancy and Fail-Over
+
+ *>= 2.0.1 only. Prior to 2.0.1, running multiple resque-scheduler processes is not recommended, as it will result in duplicate jobs.*
+
+ You may want to have resque-scheduler running on multiple machines for
+ redundancy. Master election and fail-over are built in and on by default. Simply
+ run resque-scheduler on as many machines as you want, pointed at the same
+ redis instance and schedule. The scheduler processes will use redis to
+ elect a master process and to detect fail-over when the master dies. Precautions are
+ taken to prevent jobs from potentially being queued twice during fail-over, even
+ when the clocks of the scheduler machines are slightly out of sync (or load affects
+ scheduled job firing time). If you want the gory details, look at Resque::SchedulerLocking.
+
+ If the scheduler process(es) go down for whatever reason, the delayed items
+ that should have fired during the outage will fire once a scheduler process
+ is started back up again (regardless of whether it is on a new machine). Missed
+ scheduled jobs, however, will not fire upon recovery of the scheduler process.
+ Think of scheduled (recurring) jobs as cron jobs - if you stop cron, it doesn't fire
+ missed jobs once it starts back up.


  ### resque-web Additions
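For illustration, here is a minimal sketch of the redundant setup the new README section describes. The initializer path and redis host are placeholders, not part of this package; `Resque::Scheduler.lock_timeout=` comes from the `Resque::SchedulerLocking` module introduced in this release.

```ruby
# config/initializers/resque_scheduler.rb (hypothetical path)
require 'resque_scheduler'

# Every scheduler machine must point at the same redis instance so the
# processes compete for the same master lock.
Resque.redis = 'redis.example.com:6379'

# Optional: tune the master-lock TTL (defaults to 180 seconds). A lower
# value fails over faster but tolerates less clock drift between machines.
Resque::Scheduler.lock_timeout = 60
```

With something like this on each machine, start the scheduler everywhere (e.g. `rake resque:scheduler`); whichever process acquires the lock does the queuing, and the others stay idle until the master dies.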
lib/resque/scheduler.rb CHANGED
@@ -1,16 +1,16 @@
  require 'rufus/scheduler'
  require 'thwait'
+ require 'resque/scheduler_locking'

  module Resque

    class Scheduler

      extend Resque::Helpers
+     extend Resque::SchedulerLocking

      class << self

-       LOCK_TIMEOUT = 60 * 5
-
        # If true, logs more stuff...
        attr_accessor :verbose

@@ -39,43 +39,30 @@ module Resque
          # trap signals
          register_signal_handlers

-         loop do
-           got_lock = can_lock_scheduler?
-           if got_lock == true
-
-             # Load the schedule into rufus
-             # If dynamic is set, load that schedule otherwise use normal load
-             if dynamic
-               reload_schedule!
-             else
-               load_schedule!
-             end
-
-             first_time = false
-
-             # Now start the scheduling part of the loop.
+         # Load the schedule into rufus
+         # If dynamic is set, load that schedule otherwise use normal load
+         if dynamic
+           reload_schedule!
+         else
+           load_schedule!
+         end

-             30.times do #30 * 5 seconds, it should be less than the timeout defined above
-             # loop do
-               begin
-                 handle_delayed_items
-                 update_schedule if dynamic
-               rescue Errno::EAGAIN, Errno::ECONNRESET => e
-                 warn e.message
-               end
-               poll_sleep
+         # Now start the scheduling part of the loop.
+         loop do
+           if is_master?
+             begin
+               handle_delayed_items
+               update_schedule if dynamic
+             rescue Errno::EAGAIN, Errno::ECONNRESET => e
+               warn e.message
              end
-
-             unlock_scheduler
-             clear_schedule!
-
-           else
-             puts "Scheduler locked!!!"
-             sleep 5
            end
+           poll_sleep
          end
+
          # never gets here.
        end
+

        # For all signals, set the shutdown flag and wait for current
        # poll/enqueuing to finish (should be almost instant). In the
@@ -151,8 +138,10 @@ module Resque
            if !config[interval_type].nil? && config[interval_type].length > 0
              args = optionizate_interval_value(config[interval_type])
              @@scheduled_jobs[name] = rufus_scheduler.send(interval_type, *args) do
-               log! "queueing #{config['class']} (#{name})"
-               handle_errors { enqueue_from_config(config) }
+               if is_master?
+                 log! "queueing #{config['class']} (#{name})"
+                 handle_errors { enqueue_from_config(config) }
+               end
              end
              interval_defined = true
              break
@@ -187,7 +176,8 @@ module Resque
          item = nil
          begin
            handle_shutdown do
-             if item = Resque.next_item_for_timestamp(timestamp)
+             # Continually check that it is still the master
+             if is_master? && item = Resque.next_item_for_timestamp(timestamp)
                log "queuing #{item['class']} [delayed]"
                handle_errors { enqueue_from_config(item) }
              end
@@ -197,16 +187,8 @@ module Resque
        end

        def handle_shutdown
-         begin
-           unlock_scheduler if @shutdown
-         rescue
-         end
          exit if @shutdown
          yield
-         begin
-           unlock_scheduler if @shutdown
-         rescue
-         end
          exit if @shutdown
        end

@@ -324,37 +306,6 @@ module Resque
          $0 = "resque-scheduler-#{ResqueScheduler::VERSION}: #{string}"
        end

-       def lock_timeout
-         Time.now.utc.to_i + LOCK_TIMEOUT + 1
-       end
-
-       def can_lock_scheduler?
-         #using logic from http://redis.io/commands/getset
-         got_lock = Resque.redis.setnx('scheduler:lock', lock_timeout)
-         puts "First get lock #{got_lock}"
-         unless got_lock
-           timestamp = Resque.redis.get('scheduler:lock').to_i
-           puts "Timestamp: #{timestamp}"
-           timestamp_now = Time.now.utc.to_i
-           puts "Timestamp Now: #{timestamp_now}"
-           if timestamp_now > timestamp
-             timestamp_old = Resque.redis.getset('scheduler:lock', lock_timeout).to_i
-             puts "Timestamp Old: #{timestamp_old}"
-             if timestamp_old < timestamp_now
-               puts "Got lock here"
-               got_lock = true
-             end
-           end
-         end
-         puts "Second get lock #{got_lock}"
-         got_lock
-       end
-
-       def unlock_scheduler
-         puts "Unlocking scheduler lock"
-         Resque.redis.del('scheduler:lock')
-       end
-
      end

    end
lib/resque/scheduler_locking.rb ADDED
@@ -0,0 +1,111 @@
+
+ # ### Locking the scheduler process
+ #
+ # There are two places in resque-scheduler that need to be synchronized
+ # in order to be able to run redundant scheduler processes while ensuring jobs don't
+ # get queued multiple times when the master process changes.
+ #
+ # 1) Processing the delayed queues (jobs that are created from enqueue_at/enqueue_in, etc)
+ # 2) Processing the scheduled (cron-like) jobs from rufus-scheduler
+ #
+ # Protecting the delayed queues (#1) is relatively easy. A simple SETNX in
+ # redis would suffice. However, protecting the scheduled jobs is trickier
+ # because the clocks on machines could be slightly off or actual firing times
+ # could vary slightly due to load. If scheduler A's clock is slightly ahead
+ # of scheduler B's clock (since they are on different machines), when
+ # scheduler A dies, we need to ensure that scheduler B doesn't queue jobs
+ # that A already queued before its death. (This all assumes that it is
+ # better to miss a few scheduled jobs than it is to run them multiple times
+ # for the same iteration.)
+ #
+ # To avoid queuing multiple jobs in the case of master fail-over, the master
+ # should remain the master as long as it can, rather than using a simple SETNX, which
+ # would result in the master role being passed around frequently.
+ #
+ # Locking Scheme:
+ # Each resque-scheduler process attempts to get the master lock via SETNX.
+ # Once obtained, it sets the expiration for 3 minutes (configurable). The
+ # master process continually updates the timeout on the lock key to be 3
+ # minutes in the future in its loop(s) (see `run`) and when jobs come out of
+ # rufus-scheduler (see `load_schedule_job`). That ensures that a minimum of
+ # 3 minutes must pass since the last queuing operation before a new master is
+ # chosen. If, for whatever reason, the master fails to update the expiration
+ # for 3 minutes, the key expires and the lock is up for grabs. If
+ # miraculously the original master comes back to life, it will realize it is
+ # no longer the master and stop processing jobs.
+ #
+ # The clocks on the scheduler machines can then be up to 3 minutes off from
+ # each other without the risk of queueing the same scheduled job twice during
+ # a master change. The catch is, in the event of a master change, no
+ # scheduled jobs will be queued during those 3 minutes. So, there is a trade-off:
+ # the higher the timeout, the less likely scheduled jobs are to fire
+ # twice, but the greater the chance of missing scheduled jobs; the lower the timeout,
+ # the less likely jobs are to be missed, but the greater the chance of jobs firing twice. If
+ # you don't care about jobs firing twice or are certain your machines' clocks
+ # are well in sync, a lower timeout is preferable. One thing to keep in mind:
+ # this only affects *scheduled* jobs - delayed jobs will never be lost or
+ # skipped since eventually a master will come online and it will process
+ # everything that is ready (no matter how old it is). Scheduled jobs work
+ # like cron - if you stop cron, no jobs fire while it's stopped and it doesn't
+ # fire jobs that were missed when it starts up again.
+
+ module Resque
+
+   module SchedulerLocking
+
+     # The TTL (in seconds) for the master lock
+     def lock_timeout=(v)
+       @lock_timeout = v
+     end
+
+     def lock_timeout
+       @lock_timeout ||= 60 * 3 # 3 minutes
+     end
+
+     def hostname
+       Socket.gethostbyname(Socket.gethostname).first
+     end
+
+     def process_id
+       Process.pid
+     end
+
+     def is_master?
+       acquire_master_lock! || has_master_lock?
+     end
+
+     def master_lock_value
+       [hostname, process_id].join(':')
+     end
+
+     def master_lock_key
+       :master_lock
+     end
+
+     def extend_lock!
+       # If the master fails to check in for 3 minutes, the lock is released and is up for grabs
+       Resque.redis.expire(master_lock_key, lock_timeout)
+     end
+
+     def acquire_master_lock!
+       if Resque.redis.setnx(master_lock_key, master_lock_value)
+         extend_lock!
+         true
+       end
+     end
+
+     def has_master_lock?
+       if Resque.redis.get(master_lock_key) == master_lock_value
+         extend_lock!
+         # Since this process could lose the lock between checking
+         # if it has it and extending the lock, check again to make
+         # sure it still has it.
+         if Resque.redis.get(master_lock_key) == master_lock_value
+           true
+         end
+       end
+     end
+
+   end
+
+ end
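To make the locking scheme concrete, here is a standalone sketch (not part of the gem) that traces the same lock lifecycle with raw redis-rb calls; `'host-a:1234'` stands in for the `hostname:pid` value that `master_lock_value` builds.

```ruby
require 'redis'

redis = Redis.new

# acquire_master_lock!: the first process to SETNX wins and starts the TTL.
if redis.setnx('master_lock', 'host-a:1234')
  redis.expire('master_lock', 180)
end

# has_master_lock?: check, extend, then check again -- the key could
# expire (and be grabbed by another process) between the first GET and
# the EXPIRE, so a single check could end up extending someone else's lock.
mine = redis.get('master_lock') == 'host-a:1234'
if mine
  redis.expire('master_lock', 180)
  mine = redis.get('master_lock') == 'host-a:1234'
end

puts(mine ? 'still the master' : 'lost the lock')
```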
lib/resque_scheduler.rb CHANGED
@@ -258,6 +258,9 @@ module ResqueScheduler

    def clean_up_timestamp(key, timestamp)
      # If the list is empty, remove it.
+
+     # Use a watch here to ensure nobody adds jobs to this delayed
+     # queue while we're removing it.
      redis.watch key
      if 0 == redis.llen(key).to_i
        redis.multi do
@@ -268,6 +271,7 @@ module ResqueScheduler
        redis.unwatch
      end
    end
+
    def validate_job!(klass)
      if klass.to_s.empty?
        raise Resque::NoClassError.new("Jobs must be given a class.")
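For background, a small sketch of the redis-rb WATCH/MULTI behavior that the new comment relies on; the key names here are illustrative, not the gem's actual data.

```ruby
require 'redis'

redis = Redis.new

# WATCH makes the MULTI/EXEC below conditional: if another client writes
# the watched key between WATCH and EXEC, the queued commands are
# discarded and the multi block returns nil.
redis.watch('delayed:1345000000')
if redis.llen('delayed:1345000000').to_i == 0
  result = redis.multi do
    redis.del('delayed:1345000000')
    redis.zrem('delayed_queue_schedule', 1345000000)
  end
  warn 'lost the race, queue not deleted' if result.nil?
else
  redis.unwatch
end
```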
lib/resque_scheduler/version.rb CHANGED
@@ -1,3 +1,3 @@
  module ResqueScheduler
-   VERSION = '2.0.1'
+   VERSION = '2.0.2'
  end
test/scheduler_test.rb CHANGED
@@ -4,8 +4,7 @@ context "Resque::Scheduler" do

  setup do
    Resque::Scheduler.dynamic = false
-   Resque.redis.del(:schedules)
-   Resque.redis.del(:schedules_changed)
+   Resque.redis.flushall
    Resque::Scheduler.mute = true
    Resque::Scheduler.clear_schedule!
    Resque::Scheduler.send(:class_variable_set, :@@scheduled_jobs, {})
@@ -237,6 +236,36 @@ context "Resque::Scheduler" do
    assert Resque.redis.sismember(:schedules_changed, "some_ivar_job3")
  end

+ test "has_master_lock? returns false if lock is set to something else" do
+   Resque.redis.set(Resque::Scheduler.master_lock_key, "someothermachine:1234")
+   assert !Resque::Scheduler.has_master_lock?
+ end
+
+ test "has_master_lock? returns true if process has lock" do
+   assert Resque::Scheduler.acquire_master_lock!, "Should have acquired the master lock"
+   assert Resque::Scheduler.has_master_lock?, "Should have the master lock"
+ end
+
+ test "has_master_lock? extends the TTL of the lock key" do
+   Resque.redis.setex(Resque::Scheduler.master_lock_key, 5, Resque::Scheduler.master_lock_value)
+   Resque::Scheduler.has_master_lock?
+   assert Resque.redis.ttl(Resque::Scheduler.master_lock_key) > 5, "TTL should have been updated to 180"
+ end
+
+ test "acquire_master_lock! sets the TTL" do
+   assert Resque::Scheduler.acquire_master_lock!
+   assert (175..185).include?(Resque.redis.ttl(Resque::Scheduler.master_lock_key)), "TTL should have been updated to 180"
+ end
+
+ test "is_master? should return true if process already has master lock" do
+   assert Resque::Scheduler.acquire_master_lock!, "Should have acquired the master lock"
+   assert Resque::Scheduler.is_master?, "Should have the lock"
+ end
+
+ test "is_master? should return true if it needs to acquire the lock" do
+   assert Resque::Scheduler.is_master?, "Should acquire the lock"
+ end
+
  test "adheres to lint" do
    assert_nothing_raised do
      Resque::Plugin.lint(Resque::Scheduler)
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: nogara-resque-scheduler
  version: !ruby/object:Gem::Version
-   version: 2.0.1
+   version: 2.0.2
  prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-08-15 00:00:00.000000000 Z
+ date: 2012-08-16 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -91,6 +91,7 @@ files:
  - README.markdown
  - Rakefile
  - lib/resque/scheduler.rb
+ - lib/resque/scheduler_locking.rb
  - lib/resque_scheduler.rb
  - lib/resque_scheduler/plugin.rb
  - lib/resque_scheduler/server.rb