nogara-resque-scheduler 2.0.1 → 2.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -99,17 +99,6 @@ any nonempty value, they will take effect. `VERBOSE` simply dumps more output
99
99
  to stdout. `MUTE` does the opposite and silences all output. `MUTE`
100
100
  supersedes `VERBOSE`.
101
101
 
102
- NOTE: You DO NOT want to run >1 instance of the scheduler. Doing so will
103
- result in the same job being queued more than once. You only need one
104
- instance of the scheduler running per resque instance (regardless of number
105
- of machines).
106
-
107
- If the scheduler process goes down for whatever reason, the delayed items
108
- that should have fired during the outage will fire once the scheduler process
109
- is started back up again (regardless of it being on a new machine). Missed
110
- scheduled jobs, however, will not fire upon recovery of the scheduler process.
111
-
112
-
113
102
 
114
103
  ### Delayed jobs
115
104
 
@@ -280,6 +269,25 @@ custom job class to support the #scheduled method:
280
269
  end
281
270
  end
282
271
 
272
+ ### Redundancy and Fail-Over
273
+
274
+ *>= 2.0.1 only. Prior to 2.0.1, it is not recommended to run multiple resque-scheduler processes and will result in duplicate jobs.*
275
+
276
+ You may want to have resque-scheduler running on multiple machines for
277
 + redundancy. Electing a master and failover is built in and default. Simply
278
 + run resque-scheduler on as many machines as you want pointing to the same
279
+ redis instance and schedule. The scheduler processes will use redis to
280
+ elect a master process and detect failover when the master dies. Precautions are
281
+ taken to prevent jobs from potentially being queued twice during failover even
282
+ when the clocks of the scheduler machines are slightly out of sync (or load affects
283
+ scheduled job firing time). If you want the gory details, look at Resque::SchedulerLocking.
284
+
285
 + If the scheduler process(es) go down for whatever reason, the delayed items
286
+ that should have fired during the outage will fire once the scheduler process
287
+ is started back up again (regardless of it being on a new machine). Missed
288
+ scheduled jobs, however, will not fire upon recovery of the scheduler process.
289
+ Think of scheduled (recurring) jobs as cron jobs - if you stop cron, it doesn't fire
290
+ missed jobs once it starts back up.
283
291
 
284
292
 
285
293
  ### resque-web Additions
@@ -1,16 +1,16 @@
1
1
  require 'rufus/scheduler'
2
2
  require 'thwait'
3
+ require 'resque/scheduler_locking'
3
4
 
4
5
  module Resque
5
6
 
6
7
  class Scheduler
7
8
 
8
9
  extend Resque::Helpers
10
+ extend Resque::SchedulerLocking
9
11
 
10
12
  class << self
11
13
 
12
- LOCK_TIMEOUT = 60 * 5
13
-
14
14
  # If true, logs more stuff...
15
15
  attr_accessor :verbose
16
16
 
@@ -39,43 +39,30 @@ module Resque
39
39
  # trap signals
40
40
  register_signal_handlers
41
41
 
42
- loop do
43
- got_lock = can_lock_scheduler?
44
- if got_lock == true
45
-
46
- # Load the schedule into rufus
47
- # If dynamic is set, load that schedule otherwise use normal load
48
- if dynamic
49
- reload_schedule!
50
- else
51
- load_schedule!
52
- end
53
-
54
- first_time = false
55
-
56
- # Now start the scheduling part of the loop.
42
+ # Load the schedule into rufus
43
+ # If dynamic is set, load that schedule otherwise use normal load
44
+ if dynamic
45
+ reload_schedule!
46
+ else
47
+ load_schedule!
48
+ end
57
49
 
58
- 30.times do #30 * 5 seconds, it should be less than the timeout defined above
59
- # loop do
60
- begin
61
- handle_delayed_items
62
- update_schedule if dynamic
63
- rescue Errno::EAGAIN, Errno::ECONNRESET => e
64
- warn e.message
65
- end
66
- poll_sleep
50
+ # Now start the scheduling part of the loop.
51
+ loop do
52
+ if is_master?
53
+ begin
54
+ handle_delayed_items
55
+ update_schedule if dynamic
56
+ rescue Errno::EAGAIN, Errno::ECONNRESET => e
57
+ warn e.message
67
58
  end
68
-
69
- unlock_scheduler
70
- clear_schedule!
71
-
72
- else
73
- puts "Scheduler locked!!!"
74
- sleep 5
75
59
  end
60
+ poll_sleep
76
61
  end
62
+
77
63
  # never gets here.
78
64
  end
65
+
79
66
 
80
67
  # For all signals, set the shutdown flag and wait for current
81
68
  # poll/enqueuing to finish (should be almost instant). In the
@@ -151,8 +138,10 @@ module Resque
151
138
  if !config[interval_type].nil? && config[interval_type].length > 0
152
139
  args = optionizate_interval_value(config[interval_type])
153
140
  @@scheduled_jobs[name] = rufus_scheduler.send(interval_type, *args) do
154
- log! "queueing #{config['class']} (#{name})"
155
- handle_errors { enqueue_from_config(config) }
141
+ if is_master?
142
+ log! "queueing #{config['class']} (#{name})"
143
+ handle_errors { enqueue_from_config(config) }
144
+ end
156
145
  end
157
146
  interval_defined = true
158
147
  break
@@ -187,7 +176,8 @@ module Resque
187
176
  item = nil
188
177
  begin
189
178
  handle_shutdown do
190
- if item = Resque.next_item_for_timestamp(timestamp)
179
+ # Continually check that it is still the master
180
+ if is_master? && item = Resque.next_item_for_timestamp(timestamp)
191
181
  log "queuing #{item['class']} [delayed]"
192
182
  handle_errors { enqueue_from_config(item) }
193
183
  end
@@ -197,16 +187,8 @@ module Resque
197
187
  end
198
188
 
199
189
  def handle_shutdown
200
- begin
201
- unlock_scheduler if @shutdown
202
- rescue
203
- end
204
190
  exit if @shutdown
205
191
  yield
206
- begin
207
- unlock_scheduler if @shutdown
208
- rescue
209
- end
210
192
  exit if @shutdown
211
193
  end
212
194
 
@@ -324,37 +306,6 @@ module Resque
324
306
  $0 = "resque-scheduler-#{ResqueScheduler::VERSION}: #{string}"
325
307
  end
326
308
 
327
- def lock_timeout
328
- Time.now.utc.to_i + LOCK_TIMEOUT + 1
329
- end
330
-
331
- def can_lock_scheduler?
332
- #using logic from http://redis.io/commands/getset
333
- got_lock = Resque.redis.setnx('scheduler:lock', lock_timeout)
334
- puts "First get lock #{got_lock}"
335
- unless got_lock
336
- timestamp = Resque.redis.get('scheduler:lock').to_i
337
- puts "Timestamp: #{timestamp}"
338
- timestamp_now = Time.now.utc.to_i
339
- puts "Timestamp Now: #{timestamp_now}"
340
- if timestamp_now > timestamp
341
- timestamp_old = Resque.redis.getset('scheduler:lock', lock_timeout).to_i
342
- puts "Timestamp Old: #{timestamp_old}"
343
- if timestamp_old < timestamp_now
344
- puts "Got lock here"
345
- got_lock = true
346
- end
347
- end
348
- end
349
- puts "Second get lock #{got_lock}"
350
- got_lock
351
- end
352
-
353
- def unlock_scheduler
354
- puts "Unlocking scheduler lock"
355
- Resque.redis.del('scheduler:lock')
356
- end
357
-
358
309
  end
359
310
 
360
311
  end
@@ -0,0 +1,111 @@
1
+
2
+ # ### Locking the scheduler process
3
+ #
4
 + # There are two places in resque-scheduler that need to be synchronized
5
+ # in order to be able to run redundant scheduler processes while ensuring jobs don't
6
+ # get queued multiple times when the master process changes.
7
+ #
8
+ # 1) Processing the delayed queues (jobs that are created from enqueue_at/enqueue_in, etc)
9
+ # 2) Processing the scheduled (cron-like) jobs from rufus-scheduler
10
+ #
11
+ # Protecting the delayed queues (#1) is relatively easy. A simple SETNX in
12
+ # redis would suffice. However, protecting the scheduled jobs is trickier
13
+ # because the clocks on machines could be slightly off or actual firing times
14
+ # could vary slightly due to load. If scheduler A's clock is slightly ahead
15
+ # of scheduler B's clock (since they are on different machines), when
16
+ # scheduler A dies, we need to ensure that scheduler B doesn't queue jobs
17
 + # that A already queued before its death. (This all assumes that it is
18
+ # better to miss a few scheduled jobs than it is to run them multiple times
19
+ # for the same iteration.)
20
+ #
21
+ # To avoid queuing multiple jobs in the case of master fail-over, the master
22
+ # should remain the master as long as it can rather than a simple SETNX which
23
 + # would result in the master role being passed around frequently.
24
+ #
25
+ # Locking Scheme:
26
+ # Each resque-scheduler process attempts to get the master lock via SETNX.
27
+ # Once obtained, it sets the expiration for 3 minutes (configurable). The
28
+ # master process continually updates the timeout on the lock key to be 3
29
 + # minutes in the future in its loop(s) (see `run`) and when jobs come out of
30
+ # rufus-scheduler (see `load_schedule_job`). That ensures that a minimum of
31
+ # 3 minutes must pass since the last queuing operation before a new master is
32
+ # chosen. If, for whatever reason, the master fails to update the expiration
33
+ # for 3 minutes, the key expires and the lock is up for grabs. If
34
+ # miraculously the original master comes back to life, it will realize it is
35
+ # no longer the master and stop processing jobs.
36
+ #
37
+ # The clocks on the scheduler machines can then be up to 3 minutes off from
38
+ # each other without the risk of queueing the same scheduled job twice during
39
+ # a master change. The catch is, in the event of a master change, no
40
+ # scheduled jobs will be queued during those 3 minutes. So, there is a trade
41
+ # off: the higher the timeout, the less likely scheduled jobs will be fired
42
+ # twice but greater chances of missing scheduled jobs. The lower the timeout,
43
+ # less likely jobs will be missed, greater the chances of jobs firing twice. If
44
+ # you don't care about jobs firing twice or are certain your machines' clocks
45
+ # are well in sync, a lower timeout is preferable. One thing to keep in mind:
46
 + # this only affects *scheduled* jobs - delayed jobs will never be lost or
47
+ # skipped since eventually a master will come online and it will process
48
+ # everything that is ready (no matter how old it is). Scheduled jobs work
49
+ # like cron - if you stop cron, no jobs fire while it's stopped and it doesn't
50
+ # fire jobs that were missed when it starts up again.
51
+
52
+ module Resque
53
+
54
+ module SchedulerLocking
55
+
56
+ # The TTL (in seconds) for the master lock
57
+ def lock_timeout=(v)
58
+ @lock_timeout = v
59
+ end
60
+
61
+ def lock_timeout
62
+ @lock_timeout ||= 60 * 3 # 3 minutes
63
+ end
64
+
65
+ def hostname
66
+ Socket.gethostbyname(Socket.gethostname).first
67
+ end
68
+
69
+ def process_id
70
+ Process.pid
71
+ end
72
+
73
+ def is_master?
74
+ acquire_master_lock! || has_master_lock?
75
+ end
76
+
77
+ def master_lock_value
78
+ [hostname, process_id].join(':')
79
+ end
80
+
81
+ def master_lock_key
82
+ :master_lock
83
+ end
84
+
85
+ def extend_lock!
86
+ # If the master fails to checkin for 3 minutes, the lock is released and is up for grabs
87
+ Resque.redis.expire(master_lock_key, lock_timeout)
88
+ end
89
+
90
+ def acquire_master_lock!
91
+ if Resque.redis.setnx(master_lock_key, master_lock_value)
92
+ extend_lock!
93
+ true
94
+ end
95
+ end
96
+
97
+ def has_master_lock?
98
+ if Resque.redis.get(master_lock_key) == master_lock_value
99
+ extend_lock!
100
+ # Since this process could lose the lock between checking
101
+ # if it has it and extending the lock, check again to make
102
+ # sure it still has it.
103
+ if Resque.redis.get(master_lock_key) == master_lock_value
104
+ true
105
+ end
106
+ end
107
+ end
108
+
109
+ end
110
+
111
+ end
@@ -258,6 +258,9 @@ module ResqueScheduler
258
258
 
259
259
  def clean_up_timestamp(key, timestamp)
260
260
  # If the list is empty, remove it.
261
+
262
+ # Use a watch here to ensure nobody adds jobs to this delayed
263
+ # queue while we're removing it.
261
264
  redis.watch key
262
265
  if 0 == redis.llen(key).to_i
263
266
  redis.multi do
@@ -268,6 +271,7 @@ module ResqueScheduler
268
271
  redis.unwatch
269
272
  end
270
273
  end
274
+
271
275
  def validate_job!(klass)
272
276
  if klass.to_s.empty?
273
277
  raise Resque::NoClassError.new("Jobs must be given a class.")
@@ -1,3 +1,3 @@
1
1
  module ResqueScheduler
2
- VERSION = '2.0.1'
2
+ VERSION = '2.0.2'
3
3
  end
@@ -4,8 +4,7 @@ context "Resque::Scheduler" do
4
4
 
5
5
  setup do
6
6
  Resque::Scheduler.dynamic = false
7
- Resque.redis.del(:schedules)
8
- Resque.redis.del(:schedules_changed)
7
+ Resque.redis.flushall
9
8
  Resque::Scheduler.mute = true
10
9
  Resque::Scheduler.clear_schedule!
11
10
  Resque::Scheduler.send(:class_variable_set, :@@scheduled_jobs, {})
@@ -237,6 +236,36 @@ context "Resque::Scheduler" do
237
236
  assert Resque.redis.sismember(:schedules_changed, "some_ivar_job3")
238
237
  end
239
238
 
239
+ test "has_master_lock? returns false if lock is set to something else" do
240
+ Resque.redis.set(Resque::Scheduler.master_lock_key, "someothermachine:1234")
241
+ assert !Resque::Scheduler.has_master_lock?
242
+ end
243
+
244
+ test "has_master_lock? returns true if process has lock" do
245
+ assert Resque::Scheduler.acquire_master_lock!, "Should have acquired the master lock"
246
+ assert Resque::Scheduler.has_master_lock?, "Should have the master lock"
247
+ end
248
+
249
+ test "has_master_lock? extends the TTL of the lock key" do
250
+ Resque.redis.setex(Resque::Scheduler.master_lock_key, 5, Resque::Scheduler.master_lock_value)
251
+ Resque::Scheduler.has_master_lock?
252
+ assert Resque.redis.ttl(Resque::Scheduler.master_lock_key) > 5, "TTL should have been updated to 180"
253
+ end
254
+
255
+ test "acquire_master_lock! sets the TTL" do
256
+ assert Resque::Scheduler.acquire_master_lock!
257
+ assert (175..185).include?(Resque.redis.ttl(Resque::Scheduler.master_lock_key)), "TTL should have been updated to 180"
258
+ end
259
+
260
+ test "is_master? should return true if process already has master lock" do
261
+ assert Resque::Scheduler.acquire_master_lock!, "Should have acquired the master lock"
262
+ assert Resque::Scheduler.is_master?, "Should have the lock"
263
+ end
264
+
265
+ test "is_master? should return true if it needs to acquire the lock" do
266
+ assert Resque::Scheduler.is_master?, "Should acquire the lock"
267
+ end
268
+
240
269
  test "adheres to lint" do
241
270
  assert_nothing_raised do
242
271
  Resque::Plugin.lint(Resque::Scheduler)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nogara-resque-scheduler
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: 2.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-15 00:00:00.000000000 Z
12
+ date: 2012-08-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -91,6 +91,7 @@ files:
91
91
  - README.markdown
92
92
  - Rakefile
93
93
  - lib/resque/scheduler.rb
94
+ - lib/resque/scheduler_locking.rb
94
95
  - lib/resque_scheduler.rb
95
96
  - lib/resque_scheduler/plugin.rb
96
97
  - lib/resque_scheduler/server.rb