nogara-resque-scheduler 2.0.1 → 2.0.2

This diff shows the published contents of the two package versions as they appear in the public registry, and is provided for informational purposes only.
README.markdown CHANGED
@@ -99,17 +99,6 @@ any nonempty value, they will take effect. `VERBOSE` simply dumps more output
  to stdout. `MUTE` does the opposite and silences all output. `MUTE`
  supersedes `VERBOSE`.

- NOTE: You DO NOT want to run >1 instance of the scheduler. Doing so will
- result in the same job being queued more than once. You only need one
- instance of the scheduler running per resque instance (regardless of number
- of machines).
-
- If the scheduler process goes down for whatever reason, the delayed items
- that should have fired during the outage will fire once the scheduler process
- is started back up again (regardless of it being on a new machine). Missed
- scheduled jobs, however, will not fire upon recovery of the scheduler process.
-
-

  ### Delayed jobs

@@ -280,6 +269,25 @@ custom job class to support the #scheduled method:
  end
  end

+ ### Redundancy and Fail-Over
+
+ *>= 2.0.1 only. Prior to 2.0.1, running multiple resque-scheduler processes is not recommended, as it will result in duplicate jobs.*
+
+ You may want to have resque-scheduler running on multiple machines for
+ redundancy. Master election and fail-over are built in and on by default. Simply
+ run resque-scheduler on as many machines as you want, pointed at the same
+ redis instance and schedule. The scheduler processes will use redis to
+ elect a master process and to detect fail-over when the master dies. Precautions are
+ taken to prevent jobs from potentially being queued twice during fail-over, even
+ when the clocks of the scheduler machines are slightly out of sync (or load affects
+ scheduled job firing time). If you want the gory details, look at Resque::SchedulerLocking.
+
+ If the scheduler process(es) go down for whatever reason, the delayed items
+ that should have fired during the outage will fire once a scheduler process
+ is started back up again (regardless of whether it is on a new machine). Missed
+ scheduled jobs, however, will not fire upon recovery of the scheduler process.
+ Think of scheduled (recurring) jobs as cron jobs - if you stop cron, it doesn't fire
+ missed jobs once it starts back up.


  ### resque-web Additions
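For illustration, here is a minimal sketch of the redundant setup the new README section describes. The initializer path and redis host are placeholders, not part of this package; `Resque::Scheduler.lock_timeout=` comes from the `Resque::SchedulerLocking` module introduced in this release.

```ruby
# config/initializers/resque_scheduler.rb (hypothetical path)
require 'resque_scheduler'

# Every scheduler machine must point at the same redis instance so the
# processes compete for the same master lock.
Resque.redis = 'redis.example.com:6379'

# Optional: tune the master-lock TTL (defaults to 180 seconds). A lower
# value fails over faster but tolerates less clock drift between machines.
Resque::Scheduler.lock_timeout = 60
```

With something like this on each machine, start the scheduler everywhere (e.g. `rake resque:scheduler`); whichever process acquires the lock does the queuing, and the others stay idle until the master dies.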
lib/resque/scheduler.rb CHANGED
@@ -1,16 +1,16 @@
  require 'rufus/scheduler'
  require 'thwait'
+ require 'resque/scheduler_locking'

  module Resque

    class Scheduler

      extend Resque::Helpers
+     extend Resque::SchedulerLocking

      class << self

-       LOCK_TIMEOUT = 60 * 5
-
        # If true, logs more stuff...
        attr_accessor :verbose

@@ -39,43 +39,30 @@ module Resque
          # trap signals
          register_signal_handlers

-         loop do
-           got_lock = can_lock_scheduler?
-           if got_lock == true
-
-             # Load the schedule into rufus
-             # If dynamic is set, load that schedule otherwise use normal load
-             if dynamic
-               reload_schedule!
-             else
-               load_schedule!
-             end
-
-             first_time = false
-
-             # Now start the scheduling part of the loop.
+         # Load the schedule into rufus
+         # If dynamic is set, load that schedule otherwise use normal load
+         if dynamic
+           reload_schedule!
+         else
+           load_schedule!
+         end

-             30.times do #30 * 5 seconds, it should be less than the timeout defined above
-             # loop do
-               begin
-                 handle_delayed_items
-                 update_schedule if dynamic
-               rescue Errno::EAGAIN, Errno::ECONNRESET => e
-                 warn e.message
-               end
-               poll_sleep
+         # Now start the scheduling part of the loop.
+         loop do
+           if is_master?
+             begin
+               handle_delayed_items
+               update_schedule if dynamic
+             rescue Errno::EAGAIN, Errno::ECONNRESET => e
+               warn e.message
              end
-
-             unlock_scheduler
-             clear_schedule!
-
-           else
-             puts "Scheduler locked!!!"
-             sleep 5
            end
+           poll_sleep
          end
+
          # never gets here.
        end
+

        # For all signals, set the shutdown flag and wait for current
        # poll/enqueuing to finish (should be almost instant). In the
@@ -151,8 +138,10 @@ module Resque
            if !config[interval_type].nil? && config[interval_type].length > 0
              args = optionizate_interval_value(config[interval_type])
              @@scheduled_jobs[name] = rufus_scheduler.send(interval_type, *args) do
-               log! "queueing #{config['class']} (#{name})"
-               handle_errors { enqueue_from_config(config) }
+               if is_master?
+                 log! "queueing #{config['class']} (#{name})"
+                 handle_errors { enqueue_from_config(config) }
+               end
              end
              interval_defined = true
              break
@@ -187,7 +176,8 @@ module Resque
          item = nil
          begin
            handle_shutdown do
-             if item = Resque.next_item_for_timestamp(timestamp)
+             # Continually check that it is still the master
+             if is_master? && item = Resque.next_item_for_timestamp(timestamp)
                log "queuing #{item['class']} [delayed]"
                handle_errors { enqueue_from_config(item) }
              end
@@ -197,16 +187,8 @@ module Resque
        end

        def handle_shutdown
-         begin
-           unlock_scheduler if @shutdown
-         rescue
-         end
          exit if @shutdown
          yield
-         begin
-           unlock_scheduler if @shutdown
-         rescue
-         end
          exit if @shutdown
        end

@@ -324,37 +306,6 @@ module Resque
          $0 = "resque-scheduler-#{ResqueScheduler::VERSION}: #{string}"
        end

-       def lock_timeout
-         Time.now.utc.to_i + LOCK_TIMEOUT + 1
-       end
-
-       def can_lock_scheduler?
-         #using logic from http://redis.io/commands/getset
-         got_lock = Resque.redis.setnx('scheduler:lock', lock_timeout)
-         puts "First get lock #{got_lock}"
-         unless got_lock
-           timestamp = Resque.redis.get('scheduler:lock').to_i
-           puts "Timestamp: #{timestamp}"
-           timestamp_now = Time.now.utc.to_i
-           puts "Timestamp Now: #{timestamp_now}"
-           if timestamp_now > timestamp
-             timestamp_old = Resque.redis.getset('scheduler:lock', lock_timeout).to_i
-             puts "Timestamp Old: #{timestamp_old}"
-             if timestamp_old < timestamp_now
-               puts "Got lock here"
-               got_lock = true
-             end
-           end
-         end
-         puts "Second get lock #{got_lock}"
-         got_lock
-       end
-
-       def unlock_scheduler
-         puts "Unlocking scheduler lock"
-         Resque.redis.del('scheduler:lock')
-       end
-
      end

    end
lib/resque/scheduler_locking.rb ADDED
@@ -0,0 +1,111 @@
+
+ # ### Locking the scheduler process
+ #
+ # There are two places in resque-scheduler that need to be synchronized
+ # in order to be able to run redundant scheduler processes while ensuring jobs don't
+ # get queued multiple times when the master process changes.
+ #
+ # 1) Processing the delayed queues (jobs that are created from enqueue_at/enqueue_in, etc)
+ # 2) Processing the scheduled (cron-like) jobs from rufus-scheduler
+ #
+ # Protecting the delayed queues (#1) is relatively easy. A simple SETNX in
+ # redis would suffice. However, protecting the scheduled jobs is trickier
+ # because the clocks on machines could be slightly off or actual firing times
+ # could vary slightly due to load. If scheduler A's clock is slightly ahead
+ # of scheduler B's clock (since they are on different machines), when
+ # scheduler A dies, we need to ensure that scheduler B doesn't queue jobs
+ # that A already queued before its death. (This all assumes that it is
+ # better to miss a few scheduled jobs than it is to run them multiple times
+ # for the same iteration.)
+ #
+ # To avoid queuing multiple jobs in the case of master fail-over, the master
+ # should remain the master as long as it can, rather than using a simple SETNX, which
+ # would result in the master role being passed around frequently.
+ #
+ # Locking Scheme:
+ # Each resque-scheduler process attempts to get the master lock via SETNX.
+ # Once obtained, it sets the expiration for 3 minutes (configurable). The
+ # master process continually updates the timeout on the lock key to be 3
+ # minutes in the future in its loop(s) (see `run`) and when jobs come out of
+ # rufus-scheduler (see `load_schedule_job`). That ensures that a minimum of
+ # 3 minutes must pass since the last queuing operation before a new master is
+ # chosen. If, for whatever reason, the master fails to update the expiration
+ # for 3 minutes, the key expires and the lock is up for grabs. If
+ # miraculously the original master comes back to life, it will realize it is
+ # no longer the master and stop processing jobs.
+ #
+ # The clocks on the scheduler machines can then be up to 3 minutes off from
+ # each other without the risk of queueing the same scheduled job twice during
+ # a master change. The catch is, in the event of a master change, no
+ # scheduled jobs will be queued during those 3 minutes. So, there is a trade-off:
+ # the higher the timeout, the less likely scheduled jobs are to fire
+ # twice, but the greater the chance of missing scheduled jobs; the lower the timeout,
+ # the less likely jobs are to be missed, but the greater the chance of jobs firing twice. If
+ # you don't care about jobs firing twice or are certain your machines' clocks
+ # are well in sync, a lower timeout is preferable. One thing to keep in mind:
+ # this only affects *scheduled* jobs - delayed jobs will never be lost or
+ # skipped since eventually a master will come online and it will process
+ # everything that is ready (no matter how old it is). Scheduled jobs work
+ # like cron - if you stop cron, no jobs fire while it's stopped and it doesn't
+ # fire jobs that were missed when it starts up again.
+
+ module Resque
+
+   module SchedulerLocking
+
+     # The TTL (in seconds) for the master lock
+     def lock_timeout=(v)
+       @lock_timeout = v
+     end
+
+     def lock_timeout
+       @lock_timeout ||= 60 * 3 # 3 minutes
+     end
+
+     def hostname
+       Socket.gethostbyname(Socket.gethostname).first
+     end
+
+     def process_id
+       Process.pid
+     end
+
+     def is_master?
+       acquire_master_lock! || has_master_lock?
+     end
+
+     def master_lock_value
+       [hostname, process_id].join(':')
+     end
+
+     def master_lock_key
+       :master_lock
+     end
+
+     def extend_lock!
+       # If the master fails to check in for 3 minutes, the lock is released and is up for grabs
+       Resque.redis.expire(master_lock_key, lock_timeout)
+     end
+
+     def acquire_master_lock!
+       if Resque.redis.setnx(master_lock_key, master_lock_value)
+         extend_lock!
+         true
+       end
+     end
+
+     def has_master_lock?
+       if Resque.redis.get(master_lock_key) == master_lock_value
+         extend_lock!
+         # Since this process could lose the lock between checking
+         # if it has it and extending the lock, check again to make
+         # sure it still has it.
+         if Resque.redis.get(master_lock_key) == master_lock_value
+           true
+         end
+       end
+     end
+
+   end
+
+ end
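To make the locking scheme concrete, here is a standalone sketch (not part of the gem) that traces the same lock lifecycle with raw redis-rb calls; `'host-a:1234'` stands in for the `hostname:pid` value that `master_lock_value` builds.

```ruby
require 'redis'

redis = Redis.new

# acquire_master_lock!: the first process to SETNX wins and starts the TTL.
if redis.setnx('master_lock', 'host-a:1234')
  redis.expire('master_lock', 180)
end

# has_master_lock?: check, extend, then check again -- the key could
# expire (and be grabbed by another process) between the first GET and
# the EXPIRE, so a single check could end up extending someone else's lock.
mine = redis.get('master_lock') == 'host-a:1234'
if mine
  redis.expire('master_lock', 180)
  mine = redis.get('master_lock') == 'host-a:1234'
end

puts(mine ? 'still the master' : 'lost the lock')
```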
lib/resque_scheduler.rb CHANGED
@@ -258,6 +258,9 @@ module ResqueScheduler

    def clean_up_timestamp(key, timestamp)
      # If the list is empty, remove it.
+
+     # Use a watch here to ensure nobody adds jobs to this delayed
+     # queue while we're removing it.
      redis.watch key
      if 0 == redis.llen(key).to_i
        redis.multi do
@@ -268,6 +271,7 @@ module ResqueScheduler
        redis.unwatch
      end
    end
+
    def validate_job!(klass)
      if klass.to_s.empty?
        raise Resque::NoClassError.new("Jobs must be given a class.")
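For background, a small sketch of the redis-rb WATCH/MULTI behavior that the new comment relies on; the key names here are illustrative, not the gem's actual data.

```ruby
require 'redis'

redis = Redis.new

# WATCH makes the MULTI/EXEC below conditional: if another client writes
# the watched key between WATCH and EXEC, the queued commands are
# discarded and the multi block returns nil.
redis.watch('delayed:1345000000')
if redis.llen('delayed:1345000000').to_i == 0
  result = redis.multi do
    redis.del('delayed:1345000000')
    redis.zrem('delayed_queue_schedule', 1345000000)
  end
  warn 'lost the race, queue not deleted' if result.nil?
else
  redis.unwatch
end
```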
lib/resque_scheduler/version.rb CHANGED
@@ -1,3 +1,3 @@
  module ResqueScheduler
-   VERSION = '2.0.1'
+   VERSION = '2.0.2'
  end
test/scheduler_test.rb CHANGED
@@ -4,8 +4,7 @@ context "Resque::Scheduler" do

  setup do
    Resque::Scheduler.dynamic = false
-   Resque.redis.del(:schedules)
-   Resque.redis.del(:schedules_changed)
+   Resque.redis.flushall
    Resque::Scheduler.mute = true
    Resque::Scheduler.clear_schedule!
    Resque::Scheduler.send(:class_variable_set, :@@scheduled_jobs, {})
@@ -237,6 +236,36 @@ context "Resque::Scheduler" do
    assert Resque.redis.sismember(:schedules_changed, "some_ivar_job3")
  end

+ test "has_master_lock? returns false if lock is set to something else" do
+   Resque.redis.set(Resque::Scheduler.master_lock_key, "someothermachine:1234")
+   assert !Resque::Scheduler.has_master_lock?
+ end
+
+ test "has_master_lock? returns true if process has lock" do
+   assert Resque::Scheduler.acquire_master_lock!, "Should have acquired the master lock"
+   assert Resque::Scheduler.has_master_lock?, "Should have the master lock"
+ end
+
+ test "has_master_lock? extends the TTL of the lock key" do
+   Resque.redis.setex(Resque::Scheduler.master_lock_key, 5, Resque::Scheduler.master_lock_value)
+   Resque::Scheduler.has_master_lock?
+   assert Resque.redis.ttl(Resque::Scheduler.master_lock_key) > 5, "TTL should have been updated to 180"
+ end
+
+ test "acquire_master_lock! sets the TTL" do
+   assert Resque::Scheduler.acquire_master_lock!
+   assert (175..185).include?(Resque.redis.ttl(Resque::Scheduler.master_lock_key)), "TTL should have been updated to 180"
+ end
+
+ test "is_master? should return true if process already has master lock" do
+   assert Resque::Scheduler.acquire_master_lock!, "Should have acquired the master lock"
+   assert Resque::Scheduler.is_master?, "Should have the lock"
+ end
+
+ test "is_master? should return true if it needs to acquire the lock" do
+   assert Resque::Scheduler.is_master?, "Should acquire the lock"
+ end
+
  test "adheres to lint" do
    assert_nothing_raised do
      Resque::Plugin.lint(Resque::Scheduler)
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: nogara-resque-scheduler
  version: !ruby/object:Gem::Version
-   version: 2.0.1
+   version: 2.0.2
  prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-08-15 00:00:00.000000000 Z
+ date: 2012-08-16 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -91,6 +91,7 @@ files:
  - README.markdown
  - Rakefile
  - lib/resque/scheduler.rb
+ - lib/resque/scheduler_locking.rb
  - lib/resque_scheduler.rb
  - lib/resque_scheduler/plugin.rb
  - lib/resque_scheduler/server.rb