nogara-redis_failover 0.8.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,47 @@
module RedisFailover
  # Root of the RedisFailover exception hierarchy. Every error class in
  # this module descends from it, so callers can rescue this one class to
  # catch any of them.
  class Error < StandardError; end

  # Signals that a node was specified incorrectly.
  class InvalidNodeError < Error; end

  # Signals that a node changed to an invalid/unknown state.
  class InvalidNodeStateError < Error
    # @param [#to_s] node the node whose state changed
    # @param [#to_s] state the state that was rejected
    def initialize(node, state)
      message = "Invalid state change `#{state}` for node #{node}"
      super(message)
    end
  end

  # Signals that a node is unavailable (i.e., unreachable via network).
  class NodeUnavailableError < Error
    # @param [#to_s] node the node that could not be reached
    def initialize(node)
      message = "Node: #{node}"
      super(message)
    end
  end

  # Signals that no master is currently available.
  class NoMasterError < Error; end

  # Signals that no slave is currently available.
  class NoSlaveError < Error; end

  # Signals that a redis server no longer has the role the client
  # previously assumed it had.
  class InvalidNodeRoleError < Error
    # @param [#to_s] node the node whose role changed
    # @param [#to_s] assumed the role the client believed the node had
    # @param [#to_s] actual the role the node reports now
    def initialize(node, assumed, actual)
      message = "Invalid role detected for node #{node}, client thought " \
                "it was a #{assumed}, but it's now a #{actual}"
      super(message)
    end
  end

  # Signals that an unsupported redis operation was attempted.
  class UnsupportedOperationError < Error
    # @param [#to_s] operation the name of the rejected operation
    def initialize(operation)
      message = "Operation `#{operation}` is currently unsupported"
      super(message)
    end
  end
end
@@ -0,0 +1,40 @@
module RedisFailover
  # Requests a manual failover by writing the desired new master into a
  # dedicated ZooKeeper znode.
  class ManualFailover
    # Znode used to communicate manual failover requests.
    ZNODE_PATH = '/redis_failover_manual'.freeze

    # Marker value meaning "promote any slave".
    ANY_SLAVE = "ANY_SLAVE".freeze

    # Builds a manual failover request.
    #
    # @param [ZK] zk the ZooKeeper client
    # @param [Hash] options the options used for manual failover
    # @option options [String] :host the host of the failover candidate
    # @option options [String] :port the port of the failover candidate
    # @note
    #   When options is empty, ANY_SLAVE is written instead of a
    #   specific host:port pair.
    def initialize(zk, options = {})
      @zk = zk
      @options = options
    end

    # Publishes the failover request: makes sure the znode exists, then
    # stores the requested target in it.
    #
    # @return [Object] whatever the ZooKeeper client's #set returns
    def perform
      create_path
      @zk.set(ZNODE_PATH, failover_target)
    end

    private

    # @return [String] ANY_SLAVE when no options were given, otherwise
    #   the "host:port" string built from the options
    def failover_target
      return ANY_SLAVE if @options.empty?

      "#{@options[:host]}:#{@options[:port]}"
    end

    # Creates the coordination znode. An already existing znode is not an
    # error; that case is deliberately ignored.
    def create_path
      begin
        @zk.create(ZNODE_PATH)
      rescue ZK::Exceptions::NodeExists
        # best effort
      end
    end
  end
end
@@ -0,0 +1,190 @@
module RedisFailover
  # Represents a redis node (master or slave). Instances of this class
  # are used by the NodeManager and NodeWatcher to manipulate real redis
  # servers.
  #
  # Every redis operation opens a fresh client connection, runs under a
  # timeout, and disconnects afterwards (see #perform_operation).
  class Node
    # NOTE(review): `logger` and `symbolize_keys` used below are not defined
    # in this class; presumably they come from Util.
    include Util

    # Maximum amount of time (in seconds) given for any redis operation to
    # complete. If a redis operation doesn't complete in the allotted time,
    # a NodeUnavailableError will be raised.
    MAX_OP_WAIT_TIME = 5

    # @return [String] the redis server host
    attr_reader :host

    # @return [Integer] the redis server port
    attr_reader :port

    # Creates a new instance.
    #
    # @param [Hash] options the options used to create the node
    # @option options [String] :host the host of the redis server
    # @option options [String] :port the port of the redis server
    #   (defaults to 6379 when absent)
    # @option options [String] :password the password passed to the redis
    #   client
    # @raise [InvalidNodeError] if the :host key is missing
    def initialize(options = {})
      @host = options.fetch(:host) { raise InvalidNodeError, 'missing host'}
      @port = Integer(options[:port] || 6379)
      @password = options[:password]
    end

    # @return [Boolean] true if this node is a master, false otherwise
    def master?
      role == 'master'
    end

    # @return [Boolean] true if this node is a slave, false otherwise
    def slave?
      !master?
    end

    # Determines if this node is a slave of the given master.
    #
    # @param [Node] master the master to check
    # @return [Boolean] true if slave of master, false otherwise
    def slave_of?(master)
      current_master == master
    end

    # Determines current master of this slave.
    #
    # @return [Node, nil] the node representing the master of this slave,
    #   or nil when this node does not report the 'slave' role
    def current_master
      info = fetch_info
      return unless info[:role] == 'slave'
      Node.new(:host => info[:master_host], :port => info[:master_port].to_i)
    end

    # Waits until something interesting happens. If the connection
    # with this node dies, the blpop call will raise an error. If
    # the blpop call returns without error, then this will be due to
    # a graceful shutdown signaled by #wakeup or a timeout.
    def wait
      perform_operation do |redis|
        # The blpop timeout is kept below MAX_OP_WAIT_TIME so that it
        # expires before the surrounding operation timeout does.
        redis.blpop(wait_key, MAX_OP_WAIT_TIME - 3)
        redis.del(wait_key)
      end
    end

    # Wakes up this node by pushing a value to its internal
    # queue used by #wait.
    def wakeup
      perform_operation do |redis|
        redis.lpush(wait_key, '1')
      end
    end

    # Makes this node a slave of the given node. Does nothing when this
    # node already reports the given node as its master.
    #
    # @param [Node] node the node of which to become a slave
    def make_slave!(node)
      perform_operation do |redis|
        unless slave_of?(node)
          redis.slaveof(node.host, node.port)
          logger.info("#{self} is now a slave of #{node}")
          wakeup
        end
      end
    end

    # Makes this node a master node. Does nothing when this node already
    # reports the master role.
    def make_master!
      perform_operation do |redis|
        unless master?
          redis.slaveof('no', 'one')
          logger.info("#{self} is now master")
          wakeup
        end
      end
    end

    # @return [String] an inspect string for this node
    def inspect
      "<RedisFailover::Node #{to_s}>"
    end

    # @return [String] a friendly string for this node ("host:port")
    def to_s
      "#{@host}:#{@port}"
    end

    # Determines if this node is equal to another node. Two nodes are
    # equal when their host and port match; the password is not compared.
    #
    # @param [Node] other the other node to compare
    # @return [Boolean] true if equal, false otherwise
    def ==(other)
      return false unless Node === other
      return true if self.equal?(other)
      [host, port] == [other.host, other.port]
    end
    alias_method :eql?, :==

    # @return [Integer] a hash value for this node, derived from #to_s so
    #   that it agrees with #== / #eql?
    def hash
      to_s.hash
    end

    # Fetches information/stats for this node.
    #
    # @return [Hash] the info for this node, with keys converted by
    #   symbolize_keys
    # @raise [NodeUnavailableError] if the node cannot be reached
    def fetch_info
      perform_operation do |redis|
        symbolize_keys(redis.info)
      end
    end
    alias_method :ping, :fetch_info

    # @return [Boolean] determines if this node prohibits stale reads
    def prohibits_stale_reads?
      perform_operation do |redis|
        redis.config('get', 'slave-serve-stale-data').last == 'no'
      end
    end

    # @return [Boolean] determines if this node is syncing with its master
    def syncing_with_master?
      # fetch_info already runs inside perform_operation; wrapping it in a
      # second perform_operation only opened an extra, unused connection
      # under a nested timeout.
      fetch_info[:master_sync_in_progress] == '1'
    end

    private

    # @return [String] the current role for this node
    def role
      fetch_info[:role]
    end

    # @return [String] the name of the wait queue for this node; generated
    #   randomly once per Node instance and then memoized
    def wait_key
      @wait_key ||= "_redis_failover_#{SecureRandom.hex(32)}"
    end

    # @return [Redis] a new redis client instance for this node
    def new_client
      Redis.new(:host => @host, :password => @password, :port => @port)
    end

    # Safely performs a redis operation within a given timeout window.
    # A new client is created for the operation and disconnected when it
    # finishes, whether it succeeded or not.
    #
    # @yield [Redis] the redis client to use for the operation
    # @return [Object] the value returned by the block
    # @raise [NodeUnavailableError] if node is currently unreachable; any
    #   error caught by the bare rescue below (StandardError and its
    #   subclasses) raised while connecting, running the block or
    #   disconnecting is reported this way
    def perform_operation
      redis = nil
      Timeout.timeout(MAX_OP_WAIT_TIME) do
        redis = new_client
        yield redis
      end
    rescue
      raise NodeUnavailableError, self, caller
    ensure
      if redis
        begin
          redis.client.disconnect
        rescue
          raise NodeUnavailableError, self, caller
        end
      end
    end
  end
end
@@ -0,0 +1,352 @@
module RedisFailover
  # NodeManager manages a list of redis nodes. Upon startup, the NodeManager
  # will discover the current redis master and slaves. Each redis node is
  # monitored by a NodeWatcher instance. The NodeWatchers periodically
  # report the current state of the redis node it's watching to the
  # NodeManager via an asynchronous queue. The NodeManager processes the
  # state reports and reacts appropriately by handling stale/dead nodes,
  # and promoting a new redis master if it sees fit to do so.
  class NodeManager
    # NOTE(review): `logger` and `encode` used below are not defined in this
    # class; presumably they come from Util.
    include Util

    # Name for the znode that handles exclusive locking between multiple
    # Node Manager processes. Whoever holds the lock will be considered
    # the "master" Node Manager, and will be responsible for monitoring
    # the redis nodes. When a Node Manager that holds the lock disappears
    # or fails, another Node Manager process will grab the lock and
    # become the master.
    LOCK_PATH = 'master_node_manager'.freeze

    # Number of seconds to wait before retrying bootstrap process.
    TIMEOUT = 5

    # Creates a new instance.
    #
    # @param [Hash] options the options used to initialize the manager
    # @option options [String] :zkservers comma-separated ZK host:port pairs
    # @option options [String] :znode_path znode path override for redis nodes
    # @option options [String] :password password for redis nodes
    # @option options [Array<String>] :nodes the nodes to manage
    # @option options [String] :max_failures the max failures for a node
    def initialize(options)
      logger.info("Redis Node Manager v#{VERSION} starting (#{RUBY_DESCRIPTION})")
      @options = options
      @znode = @options[:znode_path] || Util::DEFAULT_ZNODE_PATH
      @manual_znode = ManualFailover::ZNODE_PATH
      @mutex = Mutex.new
    end

    # Starts the node manager: connects to ZooKeeper, waits for the manager
    # lock, discovers the nodes, spawns the watchers and then processes
    # state reports. On an interrupted ZooKeeper session the manager shuts
    # down, sleeps TIMEOUT seconds and starts over.
    #
    # @note This method does not return until the manager terminates.
    def start
      @queue = Queue.new
      @leader = false
      setup_zk
      logger.info('Waiting to become master Node Manager ...')
      @zk.with_lock(LOCK_PATH) do
        @leader = true
        logger.info('Acquired master Node Manager lock')
        discover_nodes
        initialize_path
        spawn_watchers
        handle_state_reports
      end
    rescue ZK::Exceptions::InterruptedSession => ex
      logger.error("ZK error while attempting to manage nodes: #{ex.inspect}")
      logger.error(ex.backtrace.join("\n"))
      shutdown
      sleep(TIMEOUT)
      retry
    end

    # Notifies the manager of a state change. Used primarily by
    # {RedisFailover::NodeWatcher} to inform the manager of watched node states.
    #
    # @param [Node] node the node
    # @param [Symbol] state the state
    def notify_state(node, state)
      @queue << [node, state]
    end

    # Performs a graceful shutdown of the manager. Safe to call before
    # #start has run: every collaborator is only touched if it exists.
    def shutdown
      if @queue
        @queue.clear
        # a nil report terminates the loop in #handle_state_reports
        @queue << nil
      end
      @watchers.each(&:shutdown) if @watchers
      @zk.close! if @zk
    end

    private

    # Configures the ZooKeeper client and watches the manual failover znode.
    def setup_zk
      @zk.close! if @zk
      @zk = ZK.new("#{@options[:zkservers]}#{@options[:chroot] || ''}")

      @zk.register(@manual_znode) do |event|
        @mutex.synchronize do
          begin
            # ManualFailover creates the znode before writing to it, so a
            # request may first surface as a "created" event.
            if event.node_created? || event.node_changed?
              schedule_manual_failover
            end
          ensure
            # ZooKeeper watches fire once; re-arm on every event (including
            # ignored ones and those seen while not leader) so later
            # requests are not missed.
            @zk.stat(@manual_znode, :watch => true)
          end
        end
      end

      @zk.on_connected { @zk.stat(@manual_znode, :watch => true) }
      @zk.stat(@manual_znode, :watch => true)
    end

    # Handles periodic state reports from {RedisFailover::NodeWatcher} instances.
    # Loops until a nil report is popped from the queue (see #shutdown).
    def handle_state_reports
      while state_report = @queue.pop
        begin
          node, state = state_report
          case state
          when :unavailable then handle_unavailable(node)
          when :available then handle_available(node)
          when :syncing then handle_syncing(node)
          when :manual_failover then handle_manual_failover(node)
          else raise InvalidNodeStateError.new(node, state)
          end

          # flush current state
          write_state
        rescue ZK::Exceptions::InterruptedSession
          # fail hard if this is a ZK connection-related error
          raise
        rescue => ex
          # any other failure is logged and the loop keeps running
          logger.error("Error handling #{state_report.inspect}: #{ex.inspect}")
          logger.error(ex.backtrace.join("\n"))
        end
      end
    end

    # Handles an unavailable node.
    #
    # @param [Node] node the unavailable node
    def handle_unavailable(node)
      # no-op if we already know about this node
      return if @unavailable.include?(node)
      logger.info("Handling unavailable node: #{node}")

      @unavailable << node
      # find a new master if this node was a master
      if node == @master
        logger.info("Demoting currently unavailable master #{node}.")
        promote_new_master
      else
        @slaves.delete(node)
      end
    end

    # Handles an available node.
    #
    # @param [Node] node the available node
    def handle_available(node)
      reconcile(node)

      # no-op if we already know about this node
      return if @master == node || @slaves.include?(node)
      logger.info("Handling available node: #{node}")

      if @master
        # master already exists, make a slave
        node.make_slave!(@master)
        @slaves << node
      else
        # no master exists, make this the new master
        promote_new_master(node)
      end

      @unavailable.delete(node)
    end

    # Handles a node that is currently syncing.
    #
    # @param [Node] node the syncing node
    def handle_syncing(node)
      reconcile(node)

      if node.syncing_with_master? && node.prohibits_stale_reads?
        logger.info("Node #{node} not ready yet, still syncing with master.")
        force_unavailable_slave(node)
        return
      end

      # otherwise, we can use this node
      handle_available(node)
    end

    # Handles a manual failover request to the given node.
    #
    # @param [Node] node the candidate node for failover
    def handle_manual_failover(node)
      # no-op if node to be failed over is already master
      return if @master == node
      logger.info("Handling manual failover")

      # A request for an explicit host:port arrives as a freshly built Node.
      # Prefer the equal instance that is already managed so the manager
      # keeps working with the object created at discovery time
      # (Node#wakeup targets a key generated per Node instance).
      candidate = @slaves.find { |slave| slave == node } || node

      # make current master a slave, and promote new master; when there is
      # no master right now there is nothing to demote
      @slaves << @master if @master
      @slaves.delete(candidate)
      promote_new_master(candidate)
    end

    # Promotes a new master.
    #
    # @param [Node] node the optional node to promote
    # @note if no node is specified, a slave is taken from the slave list
    def promote_new_master(node = nil)
      delete_path
      @master = nil

      # make a specific node or slave the new master
      candidate = node || @slaves.pop
      unless candidate
        logger.error('Failed to promote a new master, no candidate available.')
        return
      end

      redirect_slaves_to(candidate)
      candidate.make_master!
      @master = candidate

      create_path
      write_state
      logger.info("Successfully promoted #{candidate} to master.")
    end

    # Discovers the current master and slave nodes.
    #
    # @raise [NoMasterError] if none of the configured nodes is a master
    def discover_nodes
      @unavailable = []
      nodes = @options[:nodes].map { |opts| Node.new(opts) }.uniq
      raise NoMasterError unless @master = find_master(nodes)
      @slaves = nodes - [@master]
      logger.info("Managing master (#{@master}) and slaves" +
        " (#{@slaves.map(&:to_s).join(', ')})")

      # ensure that slaves are correctly pointing to this master
      redirect_slaves_to(@master)
    end

    # Spawns the {RedisFailover::NodeWatcher} instances for each managed node.
    def spawn_watchers
      @watchers = [@master, @slaves, @unavailable].flatten.map do |node|
        NodeWatcher.new(self, node, @options[:max_failures] || 3)
      end
      @watchers.each(&:watch)
    end

    # Searches for the master node. Unreachable nodes are skipped.
    #
    # @param [Array<Node>] nodes the nodes to search
    # @return [Node] the found master node, nil if not found
    def find_master(nodes)
      nodes.find do |node|
        begin
          node.master?
        rescue NodeUnavailableError
          false
        end
      end
    end

    # Redirects all slaves to the specified node. Slaves that cannot be
    # reached are moved to the unavailable list.
    #
    # @param [Node] node the node to which slaves are redirected
    def redirect_slaves_to(node)
      # iterate over a copy: force_unavailable_slave mutates @slaves
      @slaves.dup.each do |slave|
        begin
          slave.make_slave!(node)
        rescue NodeUnavailableError
          logger.info("Failed to redirect unreachable slave #{slave} to #{node}")
          force_unavailable_slave(slave)
        end
      end
    end

    # Forces a slave to be marked as unavailable.
    #
    # @param [Node] node the node to force as unavailable
    def force_unavailable_slave(node)
      @slaves.delete(node)
      @unavailable << node unless @unavailable.include?(node)
    end

    # It's possible that a newly available node may have been restarted
    # and completely lost its dynamically set run-time role by the node
    # manager. This method ensures that the node resumes its role as
    # determined by the manager.
    #
    # @param [Node] node the node to reconcile
    def reconcile(node)
      return if @master == node && node.master?
      return if @master && node.slave_of?(@master)

      logger.info("Reconciling node #{node}")
      if @master == node && !node.master?
        # we think the node is a master, but the node doesn't
        node.make_master!
        return
      end

      # verify that node is a slave for the current master
      if @master && !node.slave_of?(@master)
        node.make_slave!(@master)
      end
    end

    # @return [Hash] the set of current nodes grouped by category
    def current_nodes
      {
        :master => @master ? @master.to_s : nil,
        :slaves => @slaves.map(&:to_s),
        :unavailable => @unavailable.map(&:to_s)
      }
    end

    # Deletes the znode path containing the redis nodes. A missing znode
    # is logged and otherwise ignored.
    def delete_path
      @zk.delete(@znode)
      logger.info("Deleted ZooKeeper node #{@znode}")
    rescue ZK::Exceptions::NoNode => ex
      logger.info("Tried to delete missing znode: #{ex.inspect}")
    end

    # Creates the (ephemeral) znode path containing the redis nodes, unless
    # it already exists.
    def create_path
      unless @zk.exists?(@znode)
        @zk.create(@znode, encode(current_nodes), :ephemeral => true)
        logger.info("Created ZooKeeper node #{@znode}")
      end
    rescue ZK::Exceptions::NodeExists
      # best effort
    end

    # Initializes the znode path containing the redis nodes.
    def initialize_path
      create_path
      write_state
    end

    # Writes the current redis nodes state to the znode path.
    def write_state
      create_path
      @zk.set(@znode, encode(current_nodes))
    end

    # Schedules a manual failover to a redis node. Only the manager that
    # holds the lock reacts; blank or malformed requests are logged and
    # ignored.
    def schedule_manual_failover
      return unless @leader
      new_master = @zk.get(@manual_znode, :watch => true).first
      logger.info("Received manual failover request for: #{new_master}")

      node = manual_failover_candidate(new_master)
      notify_state(node, :manual_failover) if node
    end

    # Resolves the content of the manual failover znode to a node.
    #
    # @param [String, nil] new_master the data read from the znode
    # @return [Node, nil] the candidate, or nil when the request is blank,
    #   malformed, or no slave is available
    def manual_failover_candidate(new_master)
      # The znode exists but carries no target yet (e.g. it was just created).
      if new_master.nil? || new_master.empty?
        logger.info('Ignoring manual failover request without a target')
        return
      end

      if new_master == ManualFailover::ANY_SLAVE
        # @slaves is still nil if the event arrives before discovery finished
        return (@slaves || []).sample
      end

      host, port = new_master.split(':', 2)
      if host.nil? || host.empty?
        logger.error("Ignoring manual failover request without a host: #{new_master.inspect}")
        return
      end
      Node.new(:host => host, :port => port, :password => @options[:password])
    rescue InvalidNodeError, ArgumentError, TypeError => ex
      # e.g. a port that Integer() cannot parse
      logger.error("Ignoring invalid manual failover request #{new_master.inspect}: #{ex.inspect}")
      nil
    end
  end
end