redis_failover 0.9.4 → 0.9.5

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -16,4 +16,4 @@ test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
18
  tags
19
-
19
+ .DS_Store
data/Changes.md CHANGED
@@ -1,3 +1,8 @@
1
+ 0.9.5
2
+ -----------
3
+ - Introduce a safer master node discovery process for the Node Manager (#34)
4
+ - Improved shutdown process for Node Manager
5
+
1
6
  0.9.4
2
7
  -----------
3
8
  - Preserve original master by reading from existing znode state.
data/README.md CHANGED
@@ -155,7 +155,7 @@ redis_failover uses YARD for its API documentation. Refer to the generated [API
155
155
 
156
156
  ## Requirements
157
157
 
158
- - redis_failover is actively tested against MRI 1.9.2/1.9.3 and JRuby 1.6.7 (1.9 mode only). Other rubies may work, although I don't actively test against them.
158
+ - redis_failover is actively tested against MRI 1.8.7/1.9.2/1.9.3 and JRuby 1.6.7 (1.9 mode only). Other rubies may work, although I don't actively test against them.
159
159
  - redis_failover requires a ZooKeeper service cluster to ensure reliability and data consistency. ZooKeeper is very simple and easy to get up and running. Please refer to this [Quick ZooKeeper Guide](https://github.com/ryanlecompte/redis_failover/wiki/Quick-ZooKeeper-Guide) to get up and running quickly if you don't already have ZooKeeper as a part of your environment.
160
160
 
161
161
  ## Considerations
@@ -25,6 +25,13 @@ module RedisFailover
25
25
  class NoMasterError < Error
26
26
  end
27
27
 
28
+ # Raised when more than one master is found on startup.
29
+ class MultipleMastersError < Error
30
+ def initialize(nodes)
31
+ super("Multiple nodes with master role: #{nodes.map(&:to_s)}")
32
+ end
33
+ end
34
+
28
35
  # Raised when no slave is currently available.
29
36
  class NoSlaveError < Error
30
37
  end
@@ -118,7 +118,6 @@ module RedisFailover
118
118
  end
119
119
  alias_method :eql?, :==
120
120
 
121
-
122
121
  # @return [Integer] a hash value for this node
123
122
  def hash
124
123
  to_s.hash
@@ -32,13 +32,11 @@ module RedisFailover
32
32
  @znode = @options[:znode_path] || Util::DEFAULT_ZNODE_PATH
33
33
  @manual_znode = ManualFailover::ZNODE_PATH
34
34
  @mutex = Mutex.new
35
-
36
- # Name for the znode that handles exclusive locking between multiple
37
- # Node Manager processes. Whoever holds the lock will be considered
38
- # the "master" Node Manager, and will be responsible for monitoring
39
- # the redis nodes. When a Node Manager that holds the lock disappears
40
- # or fails, another Node Manager process will grab the lock and
41
- # become the
35
+ @shutdown = false
36
+ @leader = false
37
+ @master = nil
38
+ @slaves = []
39
+ @unavailable = []
42
40
  @lock_path = "#{@znode}_lock".freeze
43
41
  end
44
42
 
@@ -46,21 +44,22 @@ module RedisFailover
46
44
  #
47
45
  # @note This method does not return until the manager terminates.
48
46
  def start
47
+ return unless running?
49
48
  @queue = Queue.new
50
- @leader = false
51
49
  setup_zk
52
50
  logger.info('Waiting to become master Node Manager ...')
53
51
  with_lock do
54
52
  @leader = true
55
53
  logger.info('Acquired master Node Manager lock')
56
- discover_nodes
57
- initialize_path
58
- spawn_watchers
59
- handle_state_reports
54
+ if discover_nodes
55
+ initialize_path
56
+ spawn_watchers
57
+ handle_state_reports
58
+ end
60
59
  end
61
60
  rescue *ZK_ERRORS => ex
62
61
  logger.error("ZK error while attempting to manage nodes: #{ex.inspect}")
63
- shutdown
62
+ reset
64
63
  retry
65
64
  end
66
65
 
@@ -73,16 +72,23 @@ module RedisFailover
73
72
  @queue << [node, state]
74
73
  end
75
74
 
76
- # Performs a graceful shutdown of the manager.
77
- def shutdown
78
- @queue.clear
79
- @queue << nil
75
+ # Performs a reset of the manager.
76
+ def reset
77
+ @leader = false
80
78
  @watchers.each(&:shutdown) if @watchers
81
- sleep(TIMEOUT)
79
+ @queue.clear
82
80
  @zk.close! if @zk
83
81
  @zk_lock = nil
84
82
  end
85
83
 
84
+ # Initiates a graceful shutdown.
85
+ def shutdown
86
+ logger.info('Shutting down ...')
87
+ @mutex.synchronize do
88
+ @shutdown = true
89
+ end
90
+ end
91
+
86
92
  private
87
93
 
88
94
  # Configures the ZooKeeper client.
@@ -92,17 +98,8 @@ module RedisFailover
92
98
  @zk.on_expired_session { notify_state(:zk_disconnected, nil) }
93
99
 
94
100
  @zk.register(@manual_znode) do |event|
95
- @mutex.synchronize do
96
- begin
97
- if event.node_created? || event.node_changed?
98
- schedule_manual_failover
99
- end
100
- rescue => ex
101
- logger.error("Error scheduling a manual failover: #{ex.inspect}")
102
- logger.error(ex.backtrace.join("\n"))
103
- ensure
104
- @zk.stat(@manual_znode, :watch => true)
105
- end
101
+ if event.node_created? || event.node_changed?
102
+ perform_manual_failover
106
103
  end
107
104
  end
108
105
 
@@ -112,23 +109,23 @@ module RedisFailover
112
109
 
113
110
  # Handles periodic state reports from {RedisFailover::NodeWatcher} instances.
114
111
  def handle_state_reports
115
- while state_report = @queue.pop
116
- # Ensure that we still have the master lock.
117
- @zk_lock.assert!
118
-
112
+ while running? && (state_report = @queue.pop)
119
113
  begin
120
- node, state = state_report
121
- case state
122
- when :unavailable then handle_unavailable(node)
123
- when :available then handle_available(node)
124
- when :syncing then handle_syncing(node)
125
- when :manual_failover then handle_manual_failover(node)
126
- when :zk_disconnected then raise ZKDisconnectedError
127
- else raise InvalidNodeStateError.new(node, state)
128
- end
114
+ @mutex.synchronize do
115
+ return unless running?
116
+ @zk_lock.assert!
117
+ node, state = state_report
118
+ case state
119
+ when :unavailable then handle_unavailable(node)
120
+ when :available then handle_available(node)
121
+ when :syncing then handle_syncing(node)
122
+ when :zk_disconnected then raise ZKDisconnectedError
123
+ else raise InvalidNodeStateError.new(node, state)
124
+ end
129
125
 
130
- # flush current state
131
- write_state
126
+ # flush current state
127
+ write_state
128
+ end
132
129
  rescue *ZK_ERRORS
133
130
  # fail hard if this is a ZK connection-related error
134
131
  raise
@@ -204,7 +201,7 @@ module RedisFailover
204
201
  logger.info("Handling manual failover")
205
202
 
206
203
  # make current master a slave, and promote new master
207
- @slaves << @master
204
+ @slaves << @master if @master
208
205
  @slaves.delete(node)
209
206
  promote_new_master(node)
210
207
  end
@@ -234,16 +231,35 @@ module RedisFailover
234
231
  end
235
232
 
236
233
  # Discovers the current master and slave nodes.
234
+ # @return [Boolean] true if nodes successfully discovered, false otherwise
237
235
  def discover_nodes
238
- nodes = @options[:nodes].map { |opts| Node.new(opts) }.uniq
239
- @master = find_existing_master || find_master(nodes)
240
- @unavailable = []
241
- @slaves = nodes - [@master]
242
- logger.info("Managing master (#{@master}) and slaves" +
243
- " (#{@slaves.map(&:to_s).join(', ')})")
244
-
245
- # ensure that slaves are correctly pointing to this master
246
- redirect_slaves_to(@master) if @master
236
+ @mutex.synchronize do
237
+ return false unless running?
238
+ nodes = @options[:nodes].map { |opts| Node.new(opts) }.uniq
239
+ if @master = find_existing_master
240
+ logger.info("Using master #{@master} from existing znode config.")
241
+ elsif @master = guess_master(nodes)
242
+ logger.info("Guessed master #{@master} from known redis nodes.")
243
+ end
244
+ @slaves = nodes - [@master]
245
+ logger.info("Managing master (#{@master}) and slaves " +
246
+ "(#{@slaves.map(&:to_s).join(', ')})")
247
+ # ensure that slaves are correctly pointing to this master
248
+ redirect_slaves_to(@master)
249
+ true
250
+ end
251
+ rescue NodeUnavailableError, NoMasterError, MultipleMastersError => ex
252
+ msg = <<-MSG.gsub(/\s+/, ' ')
253
+ Failed to discover master node: #{ex.inspect}
254
+ In order to ensure a safe startup, redis_failover requires that all redis
255
+ nodes be accessible, and only a single node indicating that it's the master.
256
+ In order to fix this, you can perform a manual failover via redis_failover,
257
+ or manually fix the individual redis servers. This discovery process will
258
+ retry in #{TIMEOUT}s.
259
+ MSG
260
+ logger.warn(msg)
261
+ sleep(TIMEOUT)
262
+ retry
247
263
  end
248
264
 
249
265
  # Seeds the initial node master from an existing znode config.
@@ -251,7 +267,7 @@ module RedisFailover
251
267
  if data = @zk.get(@znode).first
252
268
  nodes = symbolize_keys(decode(data))
253
269
  master = node_from(nodes[:master])
254
- logger.info("Master from existing config: #{master || 'none'}")
270
+ logger.info("Master from existing znode config: #{master || 'none'}")
255
271
  master
256
272
  end
257
273
  rescue ZK::Exceptions::NoNode
@@ -281,14 +297,11 @@ module RedisFailover
281
297
  #
282
298
  # @param [Array<Node>] nodes the nodes to search
283
299
  # @return [Node] the found master node, nil if not found
284
- def find_master(nodes)
285
- nodes.find do |node|
286
- begin
287
- node.master?
288
- rescue NodeUnavailableError
289
- false
290
- end
291
- end
300
+ def guess_master(nodes)
301
+ master_nodes = nodes.select { |node| node.master? }
302
+ raise NoMasterError if master_nodes.empty?
303
+ raise MultipleMastersError.new(master_nodes) if master_nodes.size > 1
304
+ master_nodes.first
292
305
  end
293
306
 
294
307
  # Redirects all slaves to the specified node.
@@ -378,32 +391,44 @@ module RedisFailover
378
391
  # Executes a block wrapped in a ZK exclusive lock.
379
392
  def with_lock
380
393
  @zk_lock = @zk.locker(@lock_path)
381
- @zk_lock.lock(true)
382
- yield
394
+ while running? && !@zk_lock.lock
395
+ sleep(TIMEOUT)
396
+ end
397
+
398
+ if running?
399
+ yield
400
+ end
383
401
  ensure
384
402
  @zk_lock.unlock! if @zk_lock
385
403
  end
386
404
 
387
- # Schedules a manual failover to a redis node.
388
- def schedule_manual_failover
389
- return unless @leader
390
- new_master = @zk.get(@manual_znode, :watch => true).first
391
- return unless new_master && new_master.size > 0
392
- logger.info("Received manual failover request for: #{new_master}")
393
- logger.info("Current nodes: #{current_nodes.inspect}")
394
-
395
- node = if new_master == ManualFailover::ANY_SLAVE
396
- @slaves.shuffle.first
397
- else
398
- host, port = new_master.split(':', 2)
399
- Node.new(:host => host, :port => port, :password => @options[:password])
405
+ # Perform a manual failover to a redis node.
406
+ def perform_manual_failover
407
+ @mutex.synchronize do
408
+ return unless running? && @leader && @zk_lock
409
+ @zk_lock.assert!
410
+ new_master = @zk.get(@manual_znode, :watch => true).first
411
+ return unless new_master && new_master.size > 0
412
+ logger.info("Received manual failover request for: #{new_master}")
413
+ logger.info("Current nodes: #{current_nodes.inspect}")
414
+ node = new_master == ManualFailover::ANY_SLAVE ?
415
+ @slaves.shuffle.first : node_from(new_master)
416
+ if node
417
+ handle_manual_failover(node)
418
+ else
419
+ logger.error('Failed to perform manual failover, no candidate found.')
420
+ end
400
421
  end
422
+ rescue => ex
423
+ logger.error("Error handling a manual failover: #{ex.inspect}")
424
+ logger.error(ex.backtrace.join("\n"))
425
+ ensure
426
+ @zk.stat(@manual_znode, :watch => true)
427
+ end
401
428
 
402
- if node
403
- notify_state(node, :manual_failover)
404
- else
405
- logger.error('Failed to perform manual failover, no candidate found.')
406
- end
429
+ # @return [Boolean] true if running, false otherwise
430
+ def running?
431
+ !@shutdown
407
432
  end
408
433
  end
409
434
  end
@@ -35,8 +35,8 @@ module RedisFailover
35
35
  @done = true
36
36
  @node.wakeup
37
37
  @monitor_thread.join if @monitor_thread
38
- rescue
39
- # best effort
38
+ rescue => ex
39
+ logger.warn("Failed to gracefully shutdown watcher for #{@node}")
40
40
  end
41
41
 
42
42
  private
@@ -8,22 +8,20 @@ module RedisFailover
8
8
  # Node Manager is gracefully stopped
9
9
  def self.run(options)
10
10
  options = CLI.parse(options)
11
- @node_manager = NodeManager.new(options)
12
- trap_signals
13
- @node_manager_thread = Thread.new { @node_manager.start }
14
- @node_manager_thread.join
11
+ node_manager = NodeManager.new(options)
12
+ trap_signals(node_manager)
13
+ node_manager.start
15
14
  end
16
15
 
17
16
  # Traps shutdown signals.
18
- def self.trap_signals
17
+ # @param [NodeManager] node_manager the node manager
18
+ def self.trap_signals(node_manager)
19
19
  [:INT, :TERM].each do |signal|
20
20
  trap(signal) do
21
- Util.logger.info('Shutting down ...')
22
- @node_manager.shutdown
23
- @node_manager_thread.join
24
- exit(0)
21
+ node_manager.shutdown
25
22
  end
26
23
  end
27
24
  end
25
+ private_class_method :trap_signals
28
26
  end
29
27
  end
@@ -1,3 +1,3 @@
1
1
  module RedisFailover
2
- VERSION = '0.9.4'
2
+ VERSION = '0.9.5'
3
3
  end
@@ -108,5 +108,29 @@ module RedisFailover
108
108
  end
109
109
  end
110
110
  end
111
+
112
+ describe '#guess_master' do
113
+ let(:node1) { Node.new(:host => 'node1').extend(RedisStubSupport) }
114
+ let(:node2) { Node.new(:host => 'node2').extend(RedisStubSupport) }
115
+ let(:node3) { Node.new(:host => 'node3').extend(RedisStubSupport) }
116
+
117
+ it 'raises error when no master is found' do
118
+ node1.make_slave!(node3)
119
+ node2.make_slave!(node3)
120
+ expect { manager.guess_master([node1, node2]) }.to raise_error(NoMasterError)
121
+ end
122
+
123
+ it 'raises error when multiple masters found' do
124
+ node1.make_master!
125
+ node2.make_master!
126
+ expect { manager.guess_master([node1, node2]) }.to raise_error(MultipleMastersError)
127
+ end
128
+
129
+ it 'raises error when a node can not be reached' do
130
+ node1.make_master!
131
+ node2.redis.make_unavailable!
132
+ expect { manager.guess_master([node1, node2]) }.to raise_error(NodeUnavailableError)
133
+ end
134
+ end
111
135
  end
112
136
  end
@@ -1,11 +1,12 @@
1
1
  module RedisFailover
2
2
  class NodeManagerStub < NodeManager
3
3
  attr_accessor :master
4
- public :current_nodes
4
+ # HACK - this will go away once we refactor the tests to use a real ZK/Redis server.
5
+ public :current_nodes, :guess_master
5
6
 
6
7
  def discover_nodes
7
8
  # only discover nodes once in testing
8
- return if @nodes_discovered
9
+ return true if @nodes_discovered
9
10
 
10
11
  master = Node.new(:host => 'master')
11
12
  slave = Node.new(:host => 'slave')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: redis_failover
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.4
4
+ version: 0.9.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-31 00:00:00.000000000 Z
12
+ date: 2012-09-01 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: redis
@@ -189,7 +189,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
189
189
  version: '0'
190
190
  segments:
191
191
  - 0
192
- hash: -3042115734438994013
192
+ hash: -2193925210006995870
193
193
  required_rubygems_version: !ruby/object:Gem::Requirement
194
194
  none: false
195
195
  requirements:
@@ -198,7 +198,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
198
198
  version: '0'
199
199
  segments:
200
200
  - 0
201
- hash: -3042115734438994013
201
+ hash: -2193925210006995870
202
202
  requirements: []
203
203
  rubyforge_project:
204
204
  rubygems_version: 1.8.23