nogara-redis_failover 0.8.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,47 @@
1
module RedisFailover
  # Root of the RedisFailover exception hierarchy. Rescue this class to
  # catch any error raised by the library.
  class Error < StandardError; end

  # Signals that a node was described incorrectly.
  class InvalidNodeError < Error; end

  # Signals that a node was reported in an invalid/unknown state.
  class InvalidNodeStateError < Error
    # @param [Object] node the node whose state changed
    # @param [Object] state the unrecognized state
    def initialize(node, state)
      message = format('Invalid state change `%s` for node %s', state, node)
      super(message)
    end
  end

  # Signals that a node is unavailable (i.e., unreachable via network).
  class NodeUnavailableError < Error
    # @param [Object] node the node that could not be reached
    def initialize(node)
      message = format('Node: %s', node)
      super(message)
    end
  end

  # Signals that no master is currently available.
  class NoMasterError < Error; end

  # Signals that no slave is currently available.
  class NoSlaveError < Error; end

  # Signals that a redis server no longer has the role the client
  # previously assumed it had.
  class InvalidNodeRoleError < Error
    # @param [Object] node the node whose role changed
    # @param [Object] assumed the role the client believed the node had
    # @param [Object] actual the role the node reports now
    def initialize(node, assumed, actual)
      message = "Invalid role detected for node #{node}, client thought " \
                "it was a #{assumed}, but it's now a #{actual}"
      super(message)
    end
  end

  # Signals that an unsupported redis operation was attempted.
  class UnsupportedOperationError < Error
    # @param [Object] operation the name of the rejected operation
    def initialize(operation)
      message = format('Operation `%s` is currently unsupported', operation)
      super(message)
    end
  end
end
@@ -0,0 +1,40 @@
1
module RedisFailover
  # Provides manual failover support to a new master.
  #
  # A failover is requested by writing the desired candidate into the znode
  # at ZNODE_PATH: either a "host" / "host:port" string or the ANY_SLAVE
  # marker.
  class ManualFailover
    # Path for manual failover communication.
    ZNODE_PATH = '/redis_failover_manual'.freeze

    # Denotes that any slave can be used as a candidate for promotion.
    ANY_SLAVE = "ANY_SLAVE".freeze

    # Creates a new instance.
    #
    # @param [ZK] zk the ZooKeeper client
    # @param [Hash] options the options used for manual failover
    # @option options [String] :host the host of the failover candidate
    # @option options [String] :port the port of the failover candidate
    #   (optional; when omitted only the host is written)
    # @note
    #   If options is empty, a random slave will be used
    #   as a failover candidate.
    def initialize(zk, options = {})
      @zk = zk
      @options = options
    end

    # Performs a manual failover by ensuring the znode exists and then
    # writing the candidate to it.
    #
    # @return [Object] whatever the ZooKeeper client's #set returns
    def perform
      create_path
      @zk.set(ZNODE_PATH, candidate)
    end

    private

    # Builds the value written to the znode.
    #
    # @return [String] ANY_SLAVE when no options were given, otherwise
    #   "host:port", or just "host" when no port was supplied
    def candidate
      return ANY_SLAVE if @options.empty?

      host, port = @options.values_at(:host, :port)
      # Without a port, emit the bare host rather than "host:". A trailing
      # colon would later be parsed as an empty port string, which Integer()
      # rejects.
      port.nil? ? host.to_s : "#{host}:#{port}"
    end

    # Creates the znode path used for coordinating manual failovers.
    def create_path
      @zk.create(ZNODE_PATH)
    rescue ZK::Exceptions::NodeExists
      # best effort: the znode is already there, which is all we need
    end
  end
end
@@ -0,0 +1,190 @@
1
module RedisFailover
  # Represents a redis node (master or slave). Instances of this class
  # are used by the NodeManager and NodeWatcher to manipulate real redis
  # servers.
  #
  # Every public query or command goes through #perform_operation, which
  # builds a fresh client, bounds the call with a timeout and converts any
  # failure into NodeUnavailableError.
  class Node
    include Util

    # Maximum amount of time (in seconds) given for any redis operation to
    # complete. If a redis operation doesn't complete in the allotted time,
    # a NodeUnavailableError will be raised.
    MAX_OP_WAIT_TIME = 5

    # @return [String] the redis server host
    attr_reader :host

    # @return [Integer] the redis server port
    attr_reader :port

    # Creates a new instance.
    #
    # @param [Hash] options the options used to create the node
    # @option options [String] :host the host of the redis server
    # @option options [String] :port the port of the redis server
    #   (defaults to 6379)
    # @option options [String] :password the optional redis password
    # @raise [InvalidNodeError] if the :host key is absent
    def initialize(options = {})
      @host = options.fetch(:host) { raise InvalidNodeError, 'missing host' }
      @port = Integer(options[:port] || 6379)
      @password = options[:password]
    end

    # @return [Boolean] true if this node is a master, false otherwise
    def master?
      role == 'master'
    end

    # @return [Boolean] true if this node is a slave, false otherwise
    def slave?
      !master?
    end

    # Determines if this node is a slave of the given master.
    #
    # @param [Node] master the master to check
    # @return [Boolean] true if slave of master, false otherwise
    def slave_of?(master)
      current_master == master
    end

    # Determines current master of this slave.
    #
    # @return [Node, nil] the node representing the master of this slave,
    #   or nil when this node does not report the 'slave' role
    def current_master
      info = fetch_info
      return unless info[:role] == 'slave'
      Node.new(:host => info[:master_host], :port => info[:master_port].to_i)
    end

    # Waits until something interesting happens. If the connection
    # with this node dies, the blpop call will raise an error. If
    # the blpop call returns without error, then this will be due to
    # a graceful shutdown signaled by #wakeup or a timeout.
    def wait
      perform_operation do |redis|
        # The blpop timeout is kept below MAX_OP_WAIT_TIME so that an idle
        # wait returns on its own before the operation timeout fires.
        redis.blpop(wait_key, MAX_OP_WAIT_TIME - 3)
        redis.del(wait_key)
      end
    end

    # Wakes up this node by pushing a value to its internal
    # queue used by #wait.
    def wakeup
      perform_operation do |redis|
        redis.lpush(wait_key, '1')
      end
    end

    # Makes this node a slave of the given node. No-op when this node is
    # already a slave of that node.
    #
    # @param [Node] node the node of which to become a slave
    def make_slave!(node)
      perform_operation do |redis|
        unless slave_of?(node)
          redis.slaveof(node.host, node.port)
          logger.info("#{self} is now a slave of #{node}")
          wakeup
        end
      end
    end

    # Makes this node a master node. No-op when it already is one.
    def make_master!
      perform_operation do |redis|
        unless master?
          redis.slaveof('no', 'one')
          logger.info("#{self} is now master")
          wakeup
        end
      end
    end

    # @return [String] an inspect string for this node
    def inspect
      "<RedisFailover::Node #{self}>"
    end

    # @return [String] a friendly string for this node
    def to_s
      "#{@host}:#{@port}"
    end

    # Determines if this node is equal to another node. Two nodes are equal
    # when they share host and port; the password is not compared.
    #
    # @param [Node] other the other node to compare
    # @return [Boolean] true if equal, false otherwise
    def ==(other)
      return false unless other.is_a?(Node)
      return true if equal?(other)
      [host, port] == [other.host, other.port]
    end
    alias_method :eql?, :==

    # @return [Integer] a hash value for this node, derived from the same
    #   host/port pair that #== compares
    def hash
      to_s.hash
    end

    # Fetches information/stats for this node.
    #
    # @return [Hash] the info for this node, with symbolized keys
    # @raise [NodeUnavailableError] if the node cannot be reached
    def fetch_info
      perform_operation do |redis|
        symbolize_keys(redis.info)
      end
    end
    alias_method :ping, :fetch_info

    # @return [Boolean] determines if this node prohibits stale reads
    def prohibits_stale_reads?
      perform_operation do |redis|
        redis.config('get', 'slave-serve-stale-data').last == 'no'
      end
    end

    # @return [Boolean] determines if this node is syncing with its master
    def syncing_with_master?
      # fetch_info already runs inside its own guarded operation, so no
      # additional perform_operation wrapper (and client) is needed here.
      fetch_info[:master_sync_in_progress] == '1'
    end

    private

    # @return [String] the current role for this node
    def role
      fetch_info[:role]
    end

    # @return [String] the name of the wait queue for this node; generated
    #   once per instance and then memoized
    def wait_key
      @wait_key ||= "_redis_failover_#{SecureRandom.hex(32)}"
    end

    # @return [Redis] a new redis client instance for this node
    def new_client
      Redis.new(:host => @host, :password => @password, :port => @port)
    end

    # Safely performs a redis operation within a given timeout window.
    # A new client is created for the operation and disconnected afterwards.
    #
    # @yield [Redis] the redis client to use for the operation
    # @return [Object] the value returned by the block
    # @raise [NodeUnavailableError] if node is currently unreachable
    def perform_operation
      redis = nil
      Timeout.timeout(MAX_OP_WAIT_TIME) do
        redis = new_client
        yield redis
      end
    rescue
      # Any failure (including the timeout) is reported uniformly as an
      # unavailable node, attributed to the caller's stack.
      raise NodeUnavailableError, self, caller
    ensure
      if redis
        begin
          redis.client.disconnect
        rescue
          raise NodeUnavailableError, self, caller
        end
      end
    end
  end
end
@@ -0,0 +1,352 @@
1
module RedisFailover
  # NodeManager manages a list of redis nodes. Upon startup, the NodeManager
  # will discover the current redis master and slaves. Each redis node is
  # monitored by a NodeWatcher instance. The NodeWatchers periodically
  # report the current state of the redis node it's watching to the
  # NodeManager via an asynchronous queue. The NodeManager processes the
  # state reports and reacts appropriately by handling stale/dead nodes,
  # and promoting a new redis master if it sees fit to do so.
  class NodeManager
    include Util

    # Name for the znode that handles exclusive locking between multiple
    # Node Manager processes. Whoever holds the lock will be considered
    # the "master" Node Manager, and will be responsible for monitoring
    # the redis nodes. When a Node Manager that holds the lock disappears
    # or fails, another Node Manager process will grab the lock and
    # become the master.
    LOCK_PATH = 'master_node_manager'.freeze

    # Number of seconds to wait before retrying bootstrap process.
    TIMEOUT = 5

    # Creates a new instance.
    #
    # @param [Hash] options the options used to initialize the manager
    # @option options [String] :zkservers comma-separated ZK host:port pairs
    # @option options [String] :chroot optional suffix appended to :zkservers
    #   when building the ZooKeeper connection string
    # @option options [String] :znode_path znode path override for redis nodes
    # @option options [String] :password password for redis nodes
    # @option options [Array<String>] :nodes the nodes to manage
    # @option options [String] :max_failures the max failures for a node
    #   (defaults to 3)
    def initialize(options)
      logger.info("Redis Node Manager v#{VERSION} starting (#{RUBY_DESCRIPTION})")
      @options = options
      @znode = @options[:znode_path] || Util::DEFAULT_ZNODE_PATH
      @manual_znode = ManualFailover::ZNODE_PATH
      @mutex = Mutex.new
    end

    # Starts the node manager.
    #
    # @note This method does not return until the manager terminates.
    def start
      @queue = Queue.new
      @leader = false
      setup_zk
      logger.info('Waiting to become master Node Manager ...')
      @zk.with_lock(LOCK_PATH) do
        @leader = true
        logger.info('Acquired master Node Manager lock')
        discover_nodes
        initialize_path
        spawn_watchers
        handle_state_reports
      end
    rescue ZK::Exceptions::InterruptedSession => ex
      # A broken ZK session is treated as recoverable: tear everything
      # down, pause, and run the whole bootstrap again.
      logger.error("ZK error while attempting to manage nodes: #{ex.inspect}")
      logger.error(ex.backtrace.join("\n"))
      shutdown
      sleep(TIMEOUT)
      retry
    end

    # Notifies the manager of a state change. Used primarily by
    # {RedisFailover::NodeWatcher} to inform the manager of watched node states.
    #
    # @param [Node] node the node
    # @param [Symbol] state the state
    def notify_state(node, state)
      @queue << [node, state]
    end

    # Performs a graceful shutdown of the manager. Safe to call before
    # #start has run, in which case there is nothing to tear down.
    def shutdown
      if @queue
        @queue.clear
        # nil is the sentinel that ends the loop in #handle_state_reports
        @queue << nil
      end
      @watchers.each(&:shutdown) if @watchers
      @zk.close! if @zk
    end

    private

    # Configures the ZooKeeper client and the watch on the manual failover
    # znode.
    def setup_zk
      @zk.close! if @zk
      @zk = ZK.new("#{@options[:zkservers]}#{@options[:chroot] || ''}")

      @zk.register(@manual_znode) do |event|
        @mutex.synchronize do
          begin
            # ManualFailover creates the znode and then sets its data, so a
            # request against a fresh znode arrives as "created" first.
            if event.node_created? || event.node_changed?
              schedule_manual_failover
            end
          ensure
            # Watches fire once; always re-arm so that later requests are
            # still delivered, even if this one was ignored (for example
            # because this manager is not the leader yet).
            @zk.stat(@manual_znode, :watch => true)
          end
        end
      end

      @zk.on_connected { @zk.stat(@manual_znode, :watch => true) }
      @zk.stat(@manual_znode, :watch => true)
    end

    # Handles periodic state reports from {RedisFailover::NodeWatcher} instances.
    # Runs until a nil report is popped from the queue (see #shutdown).
    def handle_state_reports
      while state_report = @queue.pop
        begin
          node, state = state_report
          case state
          when :unavailable then handle_unavailable(node)
          when :available then handle_available(node)
          when :syncing then handle_syncing(node)
          when :manual_failover then handle_manual_failover(node)
          else raise InvalidNodeStateError.new(node, state)
          end

          # flush current state
          write_state
        rescue ZK::Exceptions::InterruptedSession
          # fail hard if this is a ZK connection-related error
          raise
        rescue => ex
          # any other failure is logged and the loop keeps running
          logger.error("Error handling #{state_report.inspect}: #{ex.inspect}")
          logger.error(ex.backtrace.join("\n"))
        end
      end
    end

    # Handles an unavailable node.
    #
    # @param [Node] node the unavailable node
    def handle_unavailable(node)
      # no-op if we already know about this node
      return if @unavailable.include?(node)
      logger.info("Handling unavailable node: #{node}")

      @unavailable << node
      # find a new master if this node was a master
      if node == @master
        logger.info("Demoting currently unavailable master #{node}.")
        promote_new_master
      else
        @slaves.delete(node)
      end
    end

    # Handles an available node.
    #
    # @param [Node] node the available node
    def handle_available(node)
      reconcile(node)

      # no-op if we already know about this node
      return if @master == node || @slaves.include?(node)
      logger.info("Handling available node: #{node}")

      if @master
        # master already exists, make a slave
        node.make_slave!(@master)
        @slaves << node
      else
        # no master exists, make this the new master
        promote_new_master(node)
      end

      @unavailable.delete(node)
    end

    # Handles a node that is currently syncing.
    #
    # @param [Node] node the syncing node
    def handle_syncing(node)
      reconcile(node)

      if node.syncing_with_master? && node.prohibits_stale_reads?
        logger.info("Node #{node} not ready yet, still syncing with master.")
        force_unavailable_slave(node)
        return
      end

      # otherwise, we can use this node
      handle_available(node)
    end

    # Handles a manual failover request to the given node.
    #
    # @param [Node] node the candidate node for failover
    def handle_manual_failover(node)
      # no-op if node to be failed over is already master
      return if @master == node
      logger.info("Handling manual failover")

      # make current master a slave, and promote new master; there may be
      # no current master, in which case nothing must be added to the slaves
      @slaves << @master if @master
      @slaves.delete(node)
      promote_new_master(node)
    end

    # Promotes a new master.
    #
    # @param [Node] node the optional node to promote
    # @note if no node is specified, the last entry of the slave list
    #   is used
    def promote_new_master(node = nil)
      delete_path
      @master = nil

      # make a specific node or slave the new master
      candidate = node || @slaves.pop
      unless candidate
        logger.error('Failed to promote a new master, no candidate available.')
        return
      end

      redirect_slaves_to(candidate)
      candidate.make_master!
      @master = candidate

      create_path
      write_state
      logger.info("Successfully promoted #{candidate} to master.")
    end

    # Discovers the current master and slave nodes.
    #
    # @raise [NoMasterError] if none of the configured nodes is a master
    def discover_nodes
      @unavailable = []
      nodes = @options[:nodes].map { |opts| Node.new(opts) }.uniq
      raise NoMasterError unless @master = find_master(nodes)
      @slaves = nodes - [@master]
      logger.info("Managing master (#{@master}) and slaves" +
        " (#{@slaves.map(&:to_s).join(', ')})")

      # ensure that slaves are correctly pointing to this master
      redirect_slaves_to(@master)
    end

    # Spawns the {RedisFailover::NodeWatcher} instances for each managed node.
    def spawn_watchers
      @watchers = [@master, @slaves, @unavailable].flatten.map do |node|
        NodeWatcher.new(self, node, @options[:max_failures] || 3)
      end
      @watchers.each(&:watch)
    end

    # Searches for the master node.
    #
    # @param [Array<Node>] nodes the nodes to search
    # @return [Node] the found master node, nil if not found
    def find_master(nodes)
      nodes.find do |node|
        begin
          node.master?
        rescue NodeUnavailableError
          # an unreachable node simply cannot be the master we pick
          false
        end
      end
    end

    # Redirects all slaves to the specified node.
    #
    # @param [Node] node the node to which slaves are redirected
    def redirect_slaves_to(node)
      # iterate over a copy: force_unavailable_slave mutates @slaves
      @slaves.dup.each do |slave|
        begin
          slave.make_slave!(node)
        rescue NodeUnavailableError
          logger.info("Failed to redirect unreachable slave #{slave} to #{node}")
          force_unavailable_slave(slave)
        end
      end
    end

    # Forces a slave to be marked as unavailable.
    #
    # @param [Node] node the node to force as unavailable
    def force_unavailable_slave(node)
      @slaves.delete(node)
      @unavailable << node unless @unavailable.include?(node)
    end

    # It's possible that a newly available node may have been restarted
    # and completely lost its dynamically set run-time role by the node
    # manager. This method ensures that the node resumes its role as
    # determined by the manager.
    #
    # @param [Node] node the node to reconcile
    def reconcile(node)
      return if @master == node && node.master?
      return if @master && node.slave_of?(@master)

      logger.info("Reconciling node #{node}")
      if @master == node && !node.master?
        # we think the node is a master, but the node doesn't
        node.make_master!
        return
      end

      # verify that node is a slave for the current master
      if @master && !node.slave_of?(@master)
        node.make_slave!(@master)
      end
    end

    # @return [Hash] the set of current nodes grouped by category
    def current_nodes
      {
        :master => @master ? @master.to_s : nil,
        :slaves => @slaves.map(&:to_s),
        :unavailable => @unavailable.map(&:to_s)
      }
    end

    # Deletes the znode path containing the redis nodes.
    def delete_path
      @zk.delete(@znode)
      logger.info("Deleted ZooKeeper node #{@znode}")
    rescue ZK::Exceptions::NoNode => ex
      logger.info("Tried to delete missing znode: #{ex.inspect}")
    end

    # Creates the znode path containing the redis nodes.
    def create_path
      unless @zk.exists?(@znode)
        @zk.create(@znode, encode(current_nodes), :ephemeral => true)
        logger.info("Created ZooKeeper node #{@znode}")
      end
    rescue ZK::Exceptions::NodeExists
      # best effort
    end

    # Initializes the znode path containing the redis nodes.
    def initialize_path
      create_path
      write_state
    end

    # Writes the current redis nodes state to the znode path.
    def write_state
      create_path
      @zk.set(@znode, encode(current_nodes))
    end

    # Schedules a manual failover to a redis node. Only the manager that
    # currently holds the leader lock acts on the request.
    def schedule_manual_failover
      return unless @leader
      new_master = @zk.get(@manual_znode, :watch => true).first
      # ManualFailover creates the znode without data before writing the
      # candidate, so an empty payload means "no request yet".
      return if new_master.nil? || new_master.empty?
      logger.info("Received manual failover request for: #{new_master}")

      node = if new_master == ManualFailover::ANY_SLAVE
        @slaves.sample
      else
        host, port = new_master.split(':', 2)
        # tolerate "host:" by falling back to Node's default port
        port = nil if port && port.empty?
        Node.new(:host => host, :port => port, :password => @options[:password])
      end
      notify_state(node, :manual_failover) if node
    end
  end
end
+ end