redis_failover 0.9.4 → 0.9.5

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -16,4 +16,4 @@ test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
18
  tags
19
-
19
+ .DS_Store
data/Changes.md CHANGED
@@ -1,3 +1,8 @@
1
+ 0.9.5
2
+ -----------
3
+ - Introduce a safer master node discovery process for the Node Manager (#34)
4
+ - Improved shutdown process for Node Manager
5
+
1
6
  0.9.4
2
7
  -----------
3
8
  - Preserve original master by reading from existing znode state.
data/README.md CHANGED
@@ -155,7 +155,7 @@ redis_failover uses YARD for its API documentation. Refer to the generated [API
155
155
 
156
156
  ## Requirements
157
157
 
158
- - redis_failover is actively tested against MRI 1.9.2/1.9.3 and JRuby 1.6.7 (1.9 mode only). Other rubies may work, although I don't actively test against them.
158
+ - redis_failover is actively tested against MRI 1.8.7/1.9.2/1.9.3 and JRuby 1.6.7 (1.9 mode only). Other rubies may work, although I don't actively test against them.
159
159
  - redis_failover requires a ZooKeeper service cluster to ensure reliability and data consistency. ZooKeeper is very simple and easy to get up and running. Please refer to this [Quick ZooKeeper Guide](https://github.com/ryanlecompte/redis_failover/wiki/Quick-ZooKeeper-Guide) to get up and running quickly if you don't already have ZooKeeper as a part of your environment.
160
160
 
161
161
  ## Considerations
@@ -25,6 +25,13 @@ module RedisFailover
25
25
  class NoMasterError < Error
26
26
  end
27
27
 
28
+ # Raised when more than one master is found on startup.
29
+ class MultipleMastersError < Error
30
+ def initialize(nodes)
31
+ super("Multiple nodes with master role: #{nodes.map(&:to_s)}")
32
+ end
33
+ end
34
+
28
35
  # Raised when no slave is currently available.
29
36
  class NoSlaveError < Error
30
37
  end
@@ -118,7 +118,6 @@ module RedisFailover
118
118
  end
119
119
  alias_method :eql?, :==
120
120
 
121
-
122
121
  # @return [Integer] a hash value for this node
123
122
  def hash
124
123
  to_s.hash
@@ -32,13 +32,11 @@ module RedisFailover
32
32
  @znode = @options[:znode_path] || Util::DEFAULT_ZNODE_PATH
33
33
  @manual_znode = ManualFailover::ZNODE_PATH
34
34
  @mutex = Mutex.new
35
-
36
- # Name for the znode that handles exclusive locking between multiple
37
- # Node Manager processes. Whoever holds the lock will be considered
38
- # the "master" Node Manager, and will be responsible for monitoring
39
- # the redis nodes. When a Node Manager that holds the lock disappears
40
- # or fails, another Node Manager process will grab the lock and
41
- # become the
35
+ @shutdown = false
36
+ @leader = false
37
+ @master = nil
38
+ @slaves = []
39
+ @unavailable = []
42
40
  @lock_path = "#{@znode}_lock".freeze
43
41
  end
44
42
 
@@ -46,21 +44,22 @@ module RedisFailover
46
44
  #
47
45
  # @note This method does not return until the manager terminates.
48
46
  def start
47
+ return unless running?
49
48
  @queue = Queue.new
50
- @leader = false
51
49
  setup_zk
52
50
  logger.info('Waiting to become master Node Manager ...')
53
51
  with_lock do
54
52
  @leader = true
55
53
  logger.info('Acquired master Node Manager lock')
56
- discover_nodes
57
- initialize_path
58
- spawn_watchers
59
- handle_state_reports
54
+ if discover_nodes
55
+ initialize_path
56
+ spawn_watchers
57
+ handle_state_reports
58
+ end
60
59
  end
61
60
  rescue *ZK_ERRORS => ex
62
61
  logger.error("ZK error while attempting to manage nodes: #{ex.inspect}")
63
- shutdown
62
+ reset
64
63
  retry
65
64
  end
66
65
 
@@ -73,16 +72,23 @@ module RedisFailover
73
72
  @queue << [node, state]
74
73
  end
75
74
 
76
- # Performs a graceful shutdown of the manager.
77
- def shutdown
78
- @queue.clear
79
- @queue << nil
75
+ # Performs a reset of the manager.
76
+ def reset
77
+ @leader = false
80
78
  @watchers.each(&:shutdown) if @watchers
81
- sleep(TIMEOUT)
79
+ @queue.clear
82
80
  @zk.close! if @zk
83
81
  @zk_lock = nil
84
82
  end
85
83
 
84
+ # Initiates a graceful shutdown.
85
+ def shutdown
86
+ logger.info('Shutting down ...')
87
+ @mutex.synchronize do
88
+ @shutdown = true
89
+ end
90
+ end
91
+
86
92
  private
87
93
 
88
94
  # Configures the ZooKeeper client.
@@ -92,17 +98,8 @@ module RedisFailover
92
98
  @zk.on_expired_session { notify_state(:zk_disconnected, nil) }
93
99
 
94
100
  @zk.register(@manual_znode) do |event|
95
- @mutex.synchronize do
96
- begin
97
- if event.node_created? || event.node_changed?
98
- schedule_manual_failover
99
- end
100
- rescue => ex
101
- logger.error("Error scheduling a manual failover: #{ex.inspect}")
102
- logger.error(ex.backtrace.join("\n"))
103
- ensure
104
- @zk.stat(@manual_znode, :watch => true)
105
- end
101
+ if event.node_created? || event.node_changed?
102
+ perform_manual_failover
106
103
  end
107
104
  end
108
105
 
@@ -112,23 +109,23 @@ module RedisFailover
112
109
 
113
110
  # Handles periodic state reports from {RedisFailover::NodeWatcher} instances.
114
111
  def handle_state_reports
115
- while state_report = @queue.pop
116
- # Ensure that we still have the master lock.
117
- @zk_lock.assert!
118
-
112
+ while running? && (state_report = @queue.pop)
119
113
  begin
120
- node, state = state_report
121
- case state
122
- when :unavailable then handle_unavailable(node)
123
- when :available then handle_available(node)
124
- when :syncing then handle_syncing(node)
125
- when :manual_failover then handle_manual_failover(node)
126
- when :zk_disconnected then raise ZKDisconnectedError
127
- else raise InvalidNodeStateError.new(node, state)
128
- end
114
+ @mutex.synchronize do
115
+ return unless running?
116
+ @zk_lock.assert!
117
+ node, state = state_report
118
+ case state
119
+ when :unavailable then handle_unavailable(node)
120
+ when :available then handle_available(node)
121
+ when :syncing then handle_syncing(node)
122
+ when :zk_disconnected then raise ZKDisconnectedError
123
+ else raise InvalidNodeStateError.new(node, state)
124
+ end
129
125
 
130
- # flush current state
131
- write_state
126
+ # flush current state
127
+ write_state
128
+ end
132
129
  rescue *ZK_ERRORS
133
130
  # fail hard if this is a ZK connection-related error
134
131
  raise
@@ -204,7 +201,7 @@ module RedisFailover
204
201
  logger.info("Handling manual failover")
205
202
 
206
203
  # make current master a slave, and promote new master
207
- @slaves << @master
204
+ @slaves << @master if @master
208
205
  @slaves.delete(node)
209
206
  promote_new_master(node)
210
207
  end
@@ -234,16 +231,35 @@ module RedisFailover
234
231
  end
235
232
 
236
233
  # Discovers the current master and slave nodes.
234
+ # @return [Boolean] true if nodes successfully discovered, false otherwise
237
235
  def discover_nodes
238
- nodes = @options[:nodes].map { |opts| Node.new(opts) }.uniq
239
- @master = find_existing_master || find_master(nodes)
240
- @unavailable = []
241
- @slaves = nodes - [@master]
242
- logger.info("Managing master (#{@master}) and slaves" +
243
- " (#{@slaves.map(&:to_s).join(', ')})")
244
-
245
- # ensure that slaves are correctly pointing to this master
246
- redirect_slaves_to(@master) if @master
236
+ @mutex.synchronize do
237
+ return false unless running?
238
+ nodes = @options[:nodes].map { |opts| Node.new(opts) }.uniq
239
+ if @master = find_existing_master
240
+ logger.info("Using master #{@master} from existing znode config.")
241
+ elsif @master = guess_master(nodes)
242
+ logger.info("Guessed master #{@master} from known redis nodes.")
243
+ end
244
+ @slaves = nodes - [@master]
245
+ logger.info("Managing master (#{@master}) and slaves " +
246
+ "(#{@slaves.map(&:to_s).join(', ')})")
247
+ # ensure that slaves are correctly pointing to this master
248
+ redirect_slaves_to(@master)
249
+ true
250
+ end
251
+ rescue NodeUnavailableError, NoMasterError, MultipleMastersError => ex
252
+ msg = <<-MSG.gsub(/\s+/, ' ')
253
+ Failed to discover master node: #{ex.inspect}
254
+ In order to ensure a safe startup, redis_failover requires that all redis
255
+ nodes be accessible, and only a single node indicating that it's the master.
256
+ In order to fix this, you can perform a manual failover via redis_failover,
257
+ or manually fix the individual redis servers. This discovery process will
258
+ retry in #{TIMEOUT}s.
259
+ MSG
260
+ logger.warn(msg)
261
+ sleep(TIMEOUT)
262
+ retry
247
263
  end
248
264
 
249
265
  # Seeds the initial node master from an existing znode config.
@@ -251,7 +267,7 @@ module RedisFailover
251
267
  if data = @zk.get(@znode).first
252
268
  nodes = symbolize_keys(decode(data))
253
269
  master = node_from(nodes[:master])
254
- logger.info("Master from existing config: #{master || 'none'}")
270
+ logger.info("Master from existing znode config: #{master || 'none'}")
255
271
  master
256
272
  end
257
273
  rescue ZK::Exceptions::NoNode
@@ -281,14 +297,11 @@ module RedisFailover
281
297
  #
282
298
  # @param [Array<Node>] nodes the nodes to search
283
299
  # @return [Node] the found master node, nil if not found
284
- def find_master(nodes)
285
- nodes.find do |node|
286
- begin
287
- node.master?
288
- rescue NodeUnavailableError
289
- false
290
- end
291
- end
300
+ def guess_master(nodes)
301
+ master_nodes = nodes.select { |node| node.master? }
302
+ raise NoMasterError if master_nodes.empty?
303
+ raise MultipleMastersError.new(master_nodes) if master_nodes.size > 1
304
+ master_nodes.first
292
305
  end
293
306
 
294
307
  # Redirects all slaves to the specified node.
@@ -378,32 +391,44 @@ module RedisFailover
378
391
  # Executes a block wrapped in a ZK exclusive lock.
379
392
  def with_lock
380
393
  @zk_lock = @zk.locker(@lock_path)
381
- @zk_lock.lock(true)
382
- yield
394
+ while running? && !@zk_lock.lock
395
+ sleep(TIMEOUT)
396
+ end
397
+
398
+ if running?
399
+ yield
400
+ end
383
401
  ensure
384
402
  @zk_lock.unlock! if @zk_lock
385
403
  end
386
404
 
387
- # Schedules a manual failover to a redis node.
388
- def schedule_manual_failover
389
- return unless @leader
390
- new_master = @zk.get(@manual_znode, :watch => true).first
391
- return unless new_master && new_master.size > 0
392
- logger.info("Received manual failover request for: #{new_master}")
393
- logger.info("Current nodes: #{current_nodes.inspect}")
394
-
395
- node = if new_master == ManualFailover::ANY_SLAVE
396
- @slaves.shuffle.first
397
- else
398
- host, port = new_master.split(':', 2)
399
- Node.new(:host => host, :port => port, :password => @options[:password])
405
+ # Perform a manual failover to a redis node.
406
+ def perform_manual_failover
407
+ @mutex.synchronize do
408
+ return unless running? && @leader && @zk_lock
409
+ @zk_lock.assert!
410
+ new_master = @zk.get(@manual_znode, :watch => true).first
411
+ return unless new_master && new_master.size > 0
412
+ logger.info("Received manual failover request for: #{new_master}")
413
+ logger.info("Current nodes: #{current_nodes.inspect}")
414
+ node = new_master == ManualFailover::ANY_SLAVE ?
415
+ @slaves.shuffle.first : node_from(new_master)
416
+ if node
417
+ handle_manual_failover(node)
418
+ else
419
+ logger.error('Failed to perform manual failover, no candidate found.')
420
+ end
400
421
  end
422
+ rescue => ex
423
+ logger.error("Error handling a manual failover: #{ex.inspect}")
424
+ logger.error(ex.backtrace.join("\n"))
425
+ ensure
426
+ @zk.stat(@manual_znode, :watch => true)
427
+ end
401
428
 
402
- if node
403
- notify_state(node, :manual_failover)
404
- else
405
- logger.error('Failed to perform manual failover, no candidate found.')
406
- end
429
+ # @return [Boolean] true if running, false otherwise
430
+ def running?
431
+ !@shutdown
407
432
  end
408
433
  end
409
434
  end
@@ -35,8 +35,8 @@ module RedisFailover
35
35
  @done = true
36
36
  @node.wakeup
37
37
  @monitor_thread.join if @monitor_thread
38
- rescue
39
- # best effort
38
+ rescue => ex
39
+ logger.warn("Failed to gracefully shutdown watcher for #{@node}")
40
40
  end
41
41
 
42
42
  private
@@ -8,22 +8,20 @@ module RedisFailover
8
8
  # Node Manager is gracefully stopped
9
9
  def self.run(options)
10
10
  options = CLI.parse(options)
11
- @node_manager = NodeManager.new(options)
12
- trap_signals
13
- @node_manager_thread = Thread.new { @node_manager.start }
14
- @node_manager_thread.join
11
+ node_manager = NodeManager.new(options)
12
+ trap_signals(node_manager)
13
+ node_manager.start
15
14
  end
16
15
 
17
16
  # Traps shutdown signals.
18
- def self.trap_signals
17
+ # @param [NodeManager] node_manager the node manager
18
+ def self.trap_signals(node_manager)
19
19
  [:INT, :TERM].each do |signal|
20
20
  trap(signal) do
21
- Util.logger.info('Shutting down ...')
22
- @node_manager.shutdown
23
- @node_manager_thread.join
24
- exit(0)
21
+ node_manager.shutdown
25
22
  end
26
23
  end
27
24
  end
25
+ private_class_method :trap_signals
28
26
  end
29
27
  end
@@ -1,3 +1,3 @@
1
1
  module RedisFailover
2
- VERSION = '0.9.4'
2
+ VERSION = '0.9.5'
3
3
  end
@@ -108,5 +108,29 @@ module RedisFailover
108
108
  end
109
109
  end
110
110
  end
111
+
112
+ describe '#guess_master' do
113
+ let(:node1) { Node.new(:host => 'node1').extend(RedisStubSupport) }
114
+ let(:node2) { Node.new(:host => 'node2').extend(RedisStubSupport) }
115
+ let(:node3) { Node.new(:host => 'node3').extend(RedisStubSupport) }
116
+
117
+ it 'raises error when no master is found' do
118
+ node1.make_slave!(node3)
119
+ node2.make_slave!(node3)
120
+ expect { manager.guess_master([node1, node2]) }.to raise_error(NoMasterError)
121
+ end
122
+
123
+ it 'raises error when multiple masters found' do
124
+ node1.make_master!
125
+ node2.make_master!
126
+ expect { manager.guess_master([node1, node2]) }.to raise_error(MultipleMastersError)
127
+ end
128
+
129
+ it 'raises error when a node can not be reached' do
130
+ node1.make_master!
131
+ node2.redis.make_unavailable!
132
+ expect { manager.guess_master([node1, node2]) }.to raise_error(NodeUnavailableError)
133
+ end
134
+ end
111
135
  end
112
136
  end
@@ -1,11 +1,12 @@
1
1
  module RedisFailover
2
2
  class NodeManagerStub < NodeManager
3
3
  attr_accessor :master
4
- public :current_nodes
4
+ # HACK - this will go away once we refactor the tests to use a real ZK/Redis server.
5
+ public :current_nodes, :guess_master
5
6
 
6
7
  def discover_nodes
7
8
  # only discover nodes once in testing
8
- return if @nodes_discovered
9
+ return true if @nodes_discovered
9
10
 
10
11
  master = Node.new(:host => 'master')
11
12
  slave = Node.new(:host => 'slave')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: redis_failover
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.4
4
+ version: 0.9.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-31 00:00:00.000000000 Z
12
+ date: 2012-09-01 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: redis
@@ -189,7 +189,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
189
189
  version: '0'
190
190
  segments:
191
191
  - 0
192
- hash: -3042115734438994013
192
+ hash: -2193925210006995870
193
193
  required_rubygems_version: !ruby/object:Gem::Requirement
194
194
  none: false
195
195
  requirements:
@@ -198,7 +198,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
198
198
  version: '0'
199
199
  segments:
200
200
  - 0
201
- hash: -3042115734438994013
201
+ hash: -2193925210006995870
202
202
  requirements: []
203
203
  rubyforge_project:
204
204
  rubygems_version: 1.8.23