spbtv_redis_failover 1.0.2.1
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.travis.yml +7 -0
- data/.yardopts +6 -0
- data/Changes.md +191 -0
- data/Gemfile +2 -0
- data/LICENSE +22 -0
- data/README.md +240 -0
- data/Rakefile +9 -0
- data/bin/redis_node_manager +7 -0
- data/examples/config.yml +17 -0
- data/examples/multiple_environments_config.yml +15 -0
- data/lib/redis_failover.rb +25 -0
- data/lib/redis_failover/cli.rb +142 -0
- data/lib/redis_failover/client.rb +517 -0
- data/lib/redis_failover/errors.rb +54 -0
- data/lib/redis_failover/failover_strategy.rb +25 -0
- data/lib/redis_failover/failover_strategy/latency.rb +21 -0
- data/lib/redis_failover/manual_failover.rb +52 -0
- data/lib/redis_failover/node.rb +190 -0
- data/lib/redis_failover/node_manager.rb +741 -0
- data/lib/redis_failover/node_snapshot.rb +81 -0
- data/lib/redis_failover/node_strategy.rb +34 -0
- data/lib/redis_failover/node_strategy/consensus.rb +18 -0
- data/lib/redis_failover/node_strategy/majority.rb +18 -0
- data/lib/redis_failover/node_strategy/single.rb +17 -0
- data/lib/redis_failover/node_watcher.rb +83 -0
- data/lib/redis_failover/runner.rb +27 -0
- data/lib/redis_failover/util.rb +137 -0
- data/lib/redis_failover/version.rb +3 -0
- data/misc/redis_failover.png +0 -0
- data/spbtv_redis_failover.gemspec +26 -0
- data/spec/cli_spec.rb +75 -0
- data/spec/client_spec.rb +153 -0
- data/spec/failover_strategy/latency_spec.rb +41 -0
- data/spec/failover_strategy_spec.rb +17 -0
- data/spec/node_manager_spec.rb +136 -0
- data/spec/node_snapshot_spec.rb +30 -0
- data/spec/node_spec.rb +84 -0
- data/spec/node_strategy/consensus_spec.rb +30 -0
- data/spec/node_strategy/majority_spec.rb +22 -0
- data/spec/node_strategy/single_spec.rb +22 -0
- data/spec/node_strategy_spec.rb +22 -0
- data/spec/node_watcher_spec.rb +58 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/support/config/multiple_environments.yml +15 -0
- data/spec/support/config/multiple_environments_with_chroot.yml +17 -0
- data/spec/support/config/single_environment.yml +7 -0
- data/spec/support/config/single_environment_with_chroot.yml +8 -0
- data/spec/support/node_manager_stub.rb +87 -0
- data/spec/support/redis_stub.rb +105 -0
- data/spec/util_spec.rb +21 -0
- metadata +207 -0
data/lib/redis_failover/errors.rb
@@ -0,0 +1,54 @@
module RedisFailover
  # Base class for all RedisFailover errors.
  class Error < StandardError
  end

  # Raised when a node is specified incorrectly.
  class InvalidNodeError < Error
  end

  # Raised when a node changes to an invalid/unknown state.
  class InvalidNodeStateError < Error
    def initialize(node, state)
      super("Invalid state change `#{state}` for node #{node}")
    end
  end

  # Raised when a node is unavailable (i.e., unreachable via network).
  class NodeUnavailableError < Error
    def initialize(node)
      super("Node: #{node}")
    end
  end

  # Raised when no master is currently available.
  class NoMasterError < Error
  end

  # Raised when more than one master is found on startup.
  class MultipleMastersError < Error
    def initialize(nodes)
      super("Multiple nodes with master role: #{nodes.map(&:to_s)}")
    end
  end

  # Raised when no slave is currently available.
  class NoSlaveError < Error
  end

  # Raised when a redis server is no longer using the same role
  # as previously assumed.
  class InvalidNodeRoleError < Error
    def initialize(node, assumed, actual)
      super("Invalid role detected for node #{node}, client thought " +
            "it was a #{assumed}, but it's now a #{actual}")
    end
  end

  # Raised when an unsupported redis operation is performed.
  class UnsupportedOperationError < Error
    def initialize(operation)
      super("Operation `#{operation}` is currently unsupported")
    end
  end
end
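Every exception above inherits from RedisFailover::Error, so callers can rescue the whole family with the base class. A minimal sketch (the host and port are illustrative; Node#ping is defined in node.rb further down):

require 'redis_failover'

begin
  node = RedisFailover::Node.new(:host => 'redis-1', :port => 6379)
  node.ping
rescue RedisFailover::NodeUnavailableError => ex
  # the specific subclass: redis did not answer within the operation timeout
  puts "node unreachable: #{ex.message}"
rescue RedisFailover::Error => ex
  # catch-all for any other redis_failover error
  puts "redis_failover error: #{ex.message}"
end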
data/lib/redis_failover/failover_strategy.rb
@@ -0,0 +1,25 @@
module RedisFailover
  # Base class for strategies that determine which node is used during failover.
  class FailoverStrategy
    include Util

    # Loads a strategy based on the given name.
    #
    # @param [String, Symbol] name the strategy name
    # @return [Object] a new strategy instance
    def self.for(name)
      require "redis_failover/failover_strategy/#{name.downcase}"
      const_get(name.capitalize).new
    rescue LoadError, NameError
      raise "Failed to find failover strategy: #{name}"
    end

    # Returns a candidate node as determined by this strategy.
    #
    # @param [Hash<Node, NodeSnapshot>] snapshots the node snapshots
    # @return [Node] the candidate node or nil if one couldn't be found
    def find_candidate(snapshots)
      raise NotImplementedError
    end
  end
end
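FailoverStrategy.for resolves a strategy by requiring redis_failover/failover_strategy/&lt;name&gt; and instantiating the capitalized constant, so the name :latency maps to the Latency subclass shipped below. A short sketch of that lookup:

require 'redis_failover'

strategy = RedisFailover::FailoverStrategy.for(:latency)
strategy.class # => RedisFailover::FailoverStrategy::Latency

# An unknown name raises instead of silently falling back:
RedisFailover::FailoverStrategy.for(:bogus)
# => RuntimeError: Failed to find failover strategy: bogus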
data/lib/redis_failover/failover_strategy/latency.rb
@@ -0,0 +1,21 @@
module RedisFailover
  class FailoverStrategy
    # Failover strategy that selects an available node that is both seen by all
    # node managers and has the lowest reported health check latency.
    class Latency < FailoverStrategy
      # @see RedisFailover::FailoverStrategy#find_candidate
      def find_candidate(snapshots)
        candidates = {}
        snapshots.each do |node, snapshot|
          if snapshot.all_available?
            candidates[node] = snapshot.avg_latency
          end
        end

        if candidate = candidates.min_by(&:last)
          candidate.first
        end
      end
    end
  end
end
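The strategy only considers nodes whose snapshot reports visibility from every node manager, then picks the one with the lowest average latency. A rough sketch with stand-in snapshot objects (in real use the keys are Node instances and the values NodeSnapshot instances from node_snapshot.rb; strings and a Struct are used here purely for illustration):

require 'redis_failover'

FakeSnapshot = Struct.new(:available, :avg_latency) do
  def all_available?
    available
  end
end

snapshots = {
  'redis-1:6379' => FakeSnapshot.new(true, 0.010),
  'redis-2:6379' => FakeSnapshot.new(true, 0.002),
  'redis-3:6379' => FakeSnapshot.new(false, 0.001) # not seen by all managers
}

RedisFailover::FailoverStrategy.for(:latency).find_candidate(snapshots)
# => "redis-2:6379" (lowest average latency among fully visible nodes)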
data/lib/redis_failover/manual_failover.rb
@@ -0,0 +1,52 @@
module RedisFailover
  # Provides manual failover support to a new master.
  class ManualFailover
    # Path for manual failover communication.
    ZNODE_PATH = 'manual_failover'.freeze

    # Denotes that any slave can be used as a candidate for promotion.
    ANY_SLAVE = "ANY_SLAVE".freeze

    def self.path(root_znode)
      "#{root_znode}/#{ZNODE_PATH}"
    end

    # Creates a new instance.
    #
    # @param [ZK] zk the ZooKeeper client
    # @param [ZNode] root_znode the root ZK node
    # @param [Hash] options the options used for manual failover
    # @option options [String] :host the host of the failover candidate
    # @option options [String] :port the port of the failover candidate
    # @note
    #   If options is empty, a random slave will be used
    #   as a failover candidate.
    def initialize(zk, root_znode, options = {})
      @zk = zk
      @root_znode = root_znode
      @options = options

      unless @options.empty?
        port = Integer(@options[:port]) rescue nil
        raise ArgumentError, ':host not properly specified' if @options[:host].to_s.empty?
        raise ArgumentError, ':port not properly specified' if port.nil?
      end
    end

    # Performs a manual failover.
    def perform
      create_path
      node = @options.empty? ? ANY_SLAVE : "#{@options[:host]}:#{@options[:port]}"
      @zk.set(self.class.path(@root_znode), node)
    end

    private

    # Creates the znode path used for coordinating manual failovers.
    def create_path
      @zk.create(self.class.path(@root_znode))
    rescue ZK::Exceptions::NodeExists
      # best effort
    end
  end
end
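Operationally, this class just writes the requested candidate (or ANY_SLAVE) into the manual_failover znode, where the master node manager picks it up. A hedged sketch of driving it directly; the ZooKeeper connection string, root znode, and addresses are example values:

require 'zk'
require 'redis_failover'

zk = ZK.new('zk1:2181,zk2:2181,zk3:2181')

# promote a specific slave
RedisFailover::ManualFailover.new(zk, '/redis_failover',
  :host => '10.0.0.12', :port => 6379).perform

# or let the configured failover strategy pick any available slave
RedisFailover::ManualFailover.new(zk, '/redis_failover').perform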
data/lib/redis_failover/node.rb
@@ -0,0 +1,190 @@
module RedisFailover
  # Represents a redis node (master or slave). Instances of this class
  # are used by the NodeManager and NodeWatcher to manipulate real redis
  # servers.
  class Node
    include Util

    # Maximum amount of time given for any redis operation to complete.
    # If a redis operation doesn't complete in the allotted time, a
    # NodeUnavailableError will be raised.
    MAX_OP_WAIT_TIME = 5

    # @return [String] the redis server host
    attr_reader :host

    # @return [Integer] the redis server port
    attr_reader :port

    # Creates a new instance.
    #
    # @param [Hash] options the options used to create the node
    # @option options [String] :host the host of the redis server
    # @option options [String] :port the port of the redis server
    def initialize(options = {})
      @host = options[:host]
      raise InvalidNodeError, 'missing host' if @host.to_s.empty?
      @port = Integer(options[:port] || 6379)
      @password = options[:password]
    end

    # @return [Boolean] true if this node is a master, false otherwise
    def master?
      role == 'master'
    end

    # @return [Boolean] true if this node is a slave, false otherwise
    def slave?
      !master?
    end

    # Determines if this node is a slave of the given master.
    #
    # @param [Node] master the master to check
    # @return [Boolean] true if slave of master, false otherwise
    def slave_of?(master)
      current_master == master
    end

    # Determines current master of this slave.
    #
    # @return [Node] the node representing the master of this slave
    def current_master
      info = fetch_info
      return unless info[:role] == 'slave'
      Node.new(:host => info[:master_host], :port => info[:master_port].to_i)
    end

    # Waits until something interesting happens. If the connection
    # with this node dies, the blpop call will raise an error. If
    # the blpop call returns without error, then this will be due to
    # a graceful shutdown signaled by #wakeup or a timeout.
    def wait
      perform_operation do |redis|
        redis.blpop(wait_key, MAX_OP_WAIT_TIME - 3)
        redis.del(wait_key)
      end
    end

    # Wakes up this node by pushing a value to its internal
    # queue used by #wait.
    def wakeup
      perform_operation do |redis|
        redis.lpush(wait_key, '1')
      end
    end

    # Makes this node a slave of the given node.
    #
    # @param [Node] node the node of which to become a slave
    def make_slave!(node)
      perform_operation do |redis|
        unless slave_of?(node)
          redis.slaveof(node.host, node.port)
          logger.info("#{self} is now a slave of #{node}")
          wakeup
        end
      end
    end

    # Makes this node a master node.
    def make_master!
      perform_operation do |redis|
        unless master?
          redis.slaveof('no', 'one')
          logger.info("#{self} is now master")
          wakeup
        end
      end
    end

    # @return [String] an inspect string for this node
    def inspect
      "<RedisFailover::Node #{to_s}>"
    end

    # @return [String] a friendly string for this node
    def to_s
      "#{@host}:#{@port}"
    end

    # Determines if this node is equal to another node.
    #
    # @param [Node] other the other node to compare
    # @return [Boolean] true if equal, false otherwise
    def ==(other)
      return false unless Node === other
      return true if self.equal?(other)
      [host, port] == [other.host, other.port]
    end
    alias_method :eql?, :==

    # @return [Integer] a hash value for this node
    def hash
      to_s.hash
    end

    # Fetches information/stats for this node.
    #
    # @return [Hash] the info for this node
    def fetch_info
      perform_operation do |redis|
        symbolize_keys(redis.info)
      end
    end
    alias_method :ping, :fetch_info

    # @return [Boolean] determines if this node prohibits stale reads
    def prohibits_stale_reads?
      perform_operation do |redis|
        redis.config('get', 'slave-serve-stale-data').last == 'no'
      end
    end

    # @return [Boolean] determines if this node is syncing with its master
    def syncing_with_master?
      perform_operation do |redis|
        fetch_info[:master_sync_in_progress] == '1'
      end
    end

    private

    # @return [String] the current role for this node
    def role
      fetch_info[:role]
    end

    # @return [String] the name of the wait queue for this node
    def wait_key
      @wait_key ||= "_redis_failover_#{SecureRandom.hex(32)}"
    end

    # @return [Redis] a new redis client instance for this node
    def new_client
      Redis.new(:host => @host, :password => @password, :port => @port)
    end

    # Safely performs a redis operation within a given timeout window.
    #
    # @yield [Redis] the redis client to use for the operation
    # @raise [NodeUnavailableError] if node is currently unreachable
    def perform_operation
      redis = nil
      Timeout.timeout(MAX_OP_WAIT_TIME) do
        redis = new_client
        yield redis
      end
    rescue Exception => ex
      raise NodeUnavailableError, "#{ex.class}: #{ex.message}", ex.backtrace
    ensure
      if redis
        begin
          redis.client.disconnect
        rescue Exception => ex
          raise NodeUnavailableError, "#{ex.class}: #{ex.message}", ex.backtrace
        end
      end
    end
  end
end
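Node is a thin wrapper that opens a short-lived Redis connection per operation and enforces MAX_OP_WAIT_TIME around it. Normally the node manager drives these calls, but a direct sketch looks like this (hostnames are illustrative):

require 'redis_failover'

master = RedisFailover::Node.new(:host => 'redis-1.example.com', :port => 6379)
slave  = RedisFailover::Node.new(:host => 'redis-2.example.com', :port => 6379)

master.make_master!        # issues SLAVEOF NO ONE
slave.make_slave!(master)  # issues SLAVEOF redis-1.example.com 6379
slave.slave_of?(master)    # => true once the role change has been applied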
data/lib/redis_failover/node_manager.rb
@@ -0,0 +1,741 @@
module RedisFailover
  # NodeManager manages a list of redis nodes. Upon startup, the NodeManager
  # will discover the current redis master and slaves. Each redis node is
  # monitored by a NodeWatcher instance. The NodeWatchers periodically
  # report the current state of the redis node it's watching to the
  # NodeManager. The NodeManager processes the state reports and reacts
  # appropriately by handling stale/dead nodes, and promoting a new redis master
  # if it sees fit to do so.
  class NodeManager
    include Util

    # Number of seconds to wait before retrying bootstrap process.
    TIMEOUT = 5
    # Number of seconds for checking node snapshots.
    CHECK_INTERVAL = 5
    # Number of max attempts to promote a master before releasing master lock.
    MAX_PROMOTION_ATTEMPTS = 3
    # Latency threshold for recording node state.
    LATENCY_THRESHOLD = 0.5

    # Errors that can happen during the node discovery process.
    NODE_DISCOVERY_ERRORS = [
      InvalidNodeRoleError,
      NodeUnavailableError,
      NoMasterError,
      MultipleMastersError
    ].freeze

    # Creates a new instance.
    #
    # @param [Hash] options the options used to initialize the manager
    # @option options [String] :zkservers comma-separated ZK host:port pairs
    # @option options [String] :znode_path znode path override for redis nodes
    # @option options [String] :password password for redis nodes
    # @option options [Array<String>] :nodes the nodes to manage
    # @option options [String] :max_failures the max failures for a node
    def initialize(options)
      logger.info("Redis Node Manager v#{VERSION} starting (#{RUBY_DESCRIPTION})")
      @options = options
      @required_node_managers = options.fetch(:required_node_managers, 1)
      @root_znode = options.fetch(:znode_path, Util::DEFAULT_ROOT_ZNODE_PATH)
      @node_strategy = NodeStrategy.for(options.fetch(:node_strategy, :majority))
      @failover_strategy = FailoverStrategy.for(options.fetch(:failover_strategy, :latency))
      @nodes = Array(@options[:nodes]).map { |opts| Node.new(opts) }.uniq
      @master_manager = false
      @master_promotion_attempts = 0
      @sufficient_node_managers = false
      @lock = Monitor.new
      @shutdown = false
    end

    # Starts the node manager.
    #
    # @note This method does not return until the manager terminates.
    def start
      return unless running?
      setup_zk
      spawn_watchers
      wait_until_master
    rescue *ZK_ERRORS => ex
      logger.error("ZK error while attempting to manage nodes: #{ex.inspect}")
      reset
      sleep(TIMEOUT)
      retry
    rescue NoMasterError
      logger.error("Failed to promote a new master after #{MAX_PROMOTION_ATTEMPTS} attempts.")
      reset
      sleep(TIMEOUT)
      retry
    end

    # Notifies the manager of a state change. Used primarily by
    # {RedisFailover::NodeWatcher} to inform the manager of watched node states.
    #
    # @param [Node] node the node
    # @param [Symbol] state the state
    # @param [Integer] latency an optional latency
    def notify_state(node, state, latency = nil)
      @lock.synchronize do
        if running?
          update_current_state(node, state, latency)
        end
      end
    rescue => ex
      logger.error("Error handling state report #{[node, state].inspect}: #{ex.inspect}")
      logger.error(ex.backtrace.join("\n"))
    end

    # Performs a reset of the manager.
    def reset
      @master_manager = false
      @master_promotion_attempts = 0
      @watchers.each(&:shutdown) if @watchers
    end

    # Initiates a graceful shutdown.
    def shutdown
      logger.info('Shutting down ...')
      @lock.synchronize do
        @shutdown = true
      end

      reset
      exit
    end

    private

    # Configures the ZooKeeper client.
    def setup_zk
      unless @zk
        @zk = ZK.new("#{@options[:zkservers]}#{@options[:chroot] || ''}")
        @zk.register(manual_failover_path) do |event|
          handle_manual_failover_update(event)
        end
        @zk.on_connected { @zk.stat(manual_failover_path, :watch => true) }
      end

      create_path(@root_znode)
      create_path(current_state_root)
      @zk.stat(manual_failover_path, :watch => true)
    end

    # Handles an unavailable node.
    #
    # @param [Node] node the unavailable node
    # @param [Hash<Node, NodeSnapshot>] snapshots the current set of snapshots
    def handle_unavailable(node, snapshots)
      # no-op if we already know about this node
      return if @unavailable.include?(node)
      logger.info("Handling unavailable node: #{node}")

      @unavailable << node
      # find a new master if this node was a master
      if node == @master
        logger.info("Demoting currently unavailable master #{node}.")
        promote_new_master(snapshots)
      else
        @slaves.delete(node)
      end
    end

    # Handles an available node.
    #
    # @param [Node] node the available node
    # @param [Hash<Node, NodeSnapshot>] snapshots the current set of snapshots
    def handle_available(node, snapshots)
      reconcile(node)

      # no-op if we already know about this node
      return if @master == node || (@master && @slaves.include?(node))
      logger.info("Handling available node: #{node}")

      if @master
        # master already exists, make a slave
        node.make_slave!(@master)
        @slaves << node
      else
        # no master exists, make this the new master
        promote_new_master(snapshots, node)
      end

      @unavailable.delete(node)
    end

    # Handles a node that is currently syncing.
    #
    # @param [Node] node the syncing node
    # @param [Hash<Node, NodeSnapshot>] snapshots the current set of snapshots
    def handle_syncing(node, snapshots)
      reconcile(node)

      if node.syncing_with_master? && node.prohibits_stale_reads?
        logger.info("Node #{node} not ready yet, still syncing with master.")
        force_unavailable_slave(node)
      else
        # otherwise, we can use this node
        handle_available(node, snapshots)
      end
    end

    # Handles a manual failover request to the given node.
    #
    # @param [Node] node the candidate node for failover
    # @param [Hash<Node, NodeSnapshot>] snapshots the current set of snapshots
    def handle_manual_failover(node, snapshots)
      # no-op if node to be failed over is already master
      return if @master == node
      logger.info("Handling manual failover")

      # ensure we can talk to the node
      node.ping

      # make current master a slave, and promote new master
      @slaves << @master if @master
      @slaves.delete(node)
      promote_new_master(snapshots, node)
    end

    # Promotes a new master.
    #
    # @param [Hash<Node, NodeSnapshot>] snapshots the current set of snapshots
    # @param [Node] node the optional node to promote
    def promote_new_master(snapshots, node = nil)
      delete_path(redis_nodes_path)
      @master = nil

      # make a specific node or selected candidate the new master
      candidate = node || failover_strategy_candidate(snapshots)

      if candidate.nil?
        logger.error('Failed to promote a new master, no candidate available.')
      else
        @slaves.delete(candidate)
        @unavailable.delete(candidate)
        redirect_slaves_to(candidate)
        candidate.make_master!
        @master = candidate
        write_current_redis_nodes
        @master_promotion_attempts = 0
        logger.info("Successfully promoted #{candidate} to master.")
      end
    end

    # Discovers the current master and slave nodes.
    # @return [Boolean] true if nodes successfully discovered, false otherwise
    def discover_nodes
      @lock.synchronize do
        return unless running?
        @slaves, @unavailable = [], []
        if @master = find_existing_master
          logger.info("Using master #{@master} from existing znode config.")
        elsif @master = guess_master(@nodes)
          logger.info("Guessed master #{@master} from known redis nodes.")
        end
        @slaves = @nodes - [@master]
        logger.info("Managing master (#{@master}) and slaves #{stringify_nodes(@slaves)}")
      end
    rescue *NODE_DISCOVERY_ERRORS => ex
      msg = <<-MSG.gsub(/\s+/, ' ')
        Failed to discover master node: #{ex.inspect}
        In order to ensure a safe startup, redis_failover requires that all redis
        nodes be accessible, and only a single node indicating that it's the master.
        In order to fix this, you can perform a manual failover via redis_failover,
        or manually fix the individual redis servers. This discovery process will
        retry in #{TIMEOUT}s.
      MSG
      logger.warn(msg)
      sleep(TIMEOUT)
      retry
    end

    # Seeds the initial node master from an existing znode config.
    def find_existing_master
      if data = @zk.get(redis_nodes_path).first
        nodes = symbolize_keys(decode(data))
        master = node_from(nodes[:master])
        logger.info("Master from existing znode config: #{master || 'none'}")
        # Check for case where a node previously thought to be the master was
        # somehow manually reconfigured to be a slave outside of the node manager's
        # control.
        begin
          if master && master.slave?
            raise InvalidNodeRoleError.new(master, :master, :slave)
          end
        rescue RedisFailover::NodeUnavailableError => ex
          logger.warn("Failed to check whether existing master has invalid role: #{ex.inspect}")
        end

        master
      end
    rescue ZK::Exceptions::NoNode
      # blank slate, no last known master
      nil
    end

    # Creates a Node instance from a string.
    #
    # @param [String] node_string a string representation of a node (e.g., host:port)
    # @return [Node] the Node representation
    def node_from(node_string)
      return if node_string.nil?
      host, port = node_string.split(':', 2)
      Node.new(:host => host, :port => port, :password => @options[:password])
    end

    # Spawns the {RedisFailover::NodeWatcher} instances for each managed node.
    def spawn_watchers
      @zk.delete(current_state_path, :ignore => :no_node)
      @monitored_available, @monitored_unavailable = {}, []
      @watchers = @nodes.map do |node|
        NodeWatcher.new(self, node, @options.fetch(:max_failures, 3))
      end
      @watchers.each(&:watch)
      logger.info("Monitoring redis nodes at #{stringify_nodes(@nodes)}")
    end

    # Searches for the master node.
    #
    # @param [Array<Node>] nodes the nodes to search
    # @return [Node] the found master node, nil if not found
    def guess_master(nodes)
      master_nodes = nodes.select { |node| node.master? }
      raise NoMasterError if master_nodes.empty?
      raise MultipleMastersError.new(master_nodes) if master_nodes.size > 1
      master_nodes.first
    end

    # Redirects all slaves to the specified node.
    #
    # @param [Node] node the node to which slaves are redirected
    def redirect_slaves_to(node)
      @slaves.dup.each do |slave|
        begin
          slave.make_slave!(node)
        rescue NodeUnavailableError
          logger.info("Failed to redirect unreachable slave #{slave} to #{node}")
          force_unavailable_slave(slave)
        end
      end
    end

    # Forces a slave to be marked as unavailable.
    #
    # @param [Node] node the node to force as unavailable
    def force_unavailable_slave(node)
      @slaves.delete(node)
      @unavailable << node unless @unavailable.include?(node)
    end

    # It's possible that a newly available node may have been restarted
    # and completely lost its dynamically set run-time role by the node
    # manager. This method ensures that the node resumes its role as
    # determined by the manager.
    #
    # @param [Node] node the node to reconcile
    def reconcile(node)
      return if @master == node && node.master?
      return if @master && node.slave_of?(@master)

      logger.info("Reconciling node #{node}")
      if @master == node && !node.master?
        # we think the node is a master, but the node doesn't
        node.make_master!
        return
      end

      # verify that node is a slave for the current master
      if @master && !node.slave_of?(@master)
        node.make_slave!(@master)
      end
    end

    # @return [Hash] the set of current nodes grouped by category
    def current_nodes
      {
        :master => @master ? @master.to_s : nil,
        :slaves => @slaves.map(&:to_s),
        :unavailable => @unavailable.map(&:to_s)
      }
    end

    # @return [Hash] the set of currently available/unavailable nodes as
    # seen by this node manager instance
    def node_availability_state
      {
        :available => Hash[@monitored_available.map { |k, v| [k.to_s, v] }],
        :unavailable => @monitored_unavailable.map(&:to_s)
      }
    end

    # Deletes the znode path containing the redis nodes.
    #
    # @param [String] path the znode path to delete
    def delete_path(path)
      @zk.delete(path)
      logger.info("Deleted ZK node #{path}")
    rescue ZK::Exceptions::NoNode => ex
      logger.info("Tried to delete missing znode: #{ex.inspect}")
    end

    # Creates a znode path.
    #
    # @param [String] path the znode path to create
    # @param [Hash] options the options used to create the path
    # @option options [String] :initial_value an initial value for the znode
    # @option options [Boolean] :ephemeral true if node is ephemeral, false otherwise
    def create_path(path, options = {})
      unless @zk.exists?(path)
        @zk.create(path,
          options[:initial_value],
          :ephemeral => options.fetch(:ephemeral, false))
        logger.info("Created ZK node #{path}")
      end
    rescue ZK::Exceptions::NodeExists
      # best effort
    end

    # Writes state to a particular znode path.
    #
    # @param [String] path the znode path that should be written to
    # @param [String] value the value to write to the znode
    # @param [Hash] options the default options to be used when creating the node
    # @note the path will be created if it doesn't exist
    def write_state(path, value, options = {})
      create_path(path, options.merge(:initial_value => value))
      @zk.set(path, value)
    end

    # Handles a manual failover znode update.
    #
    # @param [ZK::Event] event the ZK event to handle
    def handle_manual_failover_update(event)
      if event.node_created? || event.node_changed?
        perform_manual_failover
      end
    rescue => ex
      logger.error("Error scheduling a manual failover: #{ex.inspect}")
      logger.error(ex.backtrace.join("\n"))
    ensure
      @zk.stat(manual_failover_path, :watch => true)
    end

    # Produces a FQDN id for this Node Manager.
    #
    # @return [String] the FQDN for this Node Manager
    def manager_id
      @manager_id ||= [
        Socket.gethostbyname(Socket.gethostname)[0],
        Process.pid
      ].join('-')
    end

    # Writes the current master list of redis nodes. This method is only invoked
    # if this node manager instance is the master/primary manager.
    def write_current_redis_nodes
      write_state(redis_nodes_path, encode(current_nodes))
    end

    # Writes the current monitored list of redis nodes. This method is always
    # invoked by all running node managers.
    def write_current_monitored_state
      write_state(current_state_path, encode(node_availability_state), :ephemeral => true)
    end

    # @return [String] root path for current node manager state
    def current_state_root
      "#{@root_znode}/manager_node_state"
    end

    # @return [String] the znode path for this node manager's view
    # of available nodes
    def current_state_path
      "#{current_state_root}/#{manager_id}"
    end

    # @return [String] the znode path for the master redis nodes config
    def redis_nodes_path
      "#{@root_znode}/nodes"
    end

    # @return [String] root path for current node manager lock
    def current_lock_path
      "#{@root_znode}/master_redis_node_manager_lock"
    end

    # @return [String] the znode path used for performing manual failovers
    def manual_failover_path
      ManualFailover.path(@root_znode)
    end

    # @return [Boolean] true if this node manager is the master, false otherwise
    def master_manager?
      @master_manager
    end

    # Used to update the master node manager state. These states are only handled if
    # this node manager instance is serving as the master manager.
    #
    # @param [Node] node the node to handle
    # @param [Hash<Node, NodeSnapshot>] snapshots the current set of snapshots
    def update_master_state(node, snapshots)
      state = @node_strategy.determine_state(node, snapshots)
      case state
      when :unavailable
        handle_unavailable(node, snapshots)
      when :available
        if node.syncing_with_master?
          handle_syncing(node, snapshots)
        else
          handle_available(node, snapshots)
        end
      else
        raise InvalidNodeStateError.new(node, state)
      end
    rescue *ZK_ERRORS
      # fail hard if this is a ZK connection-related error
      raise
    rescue => ex
      logger.error("Error handling state report for #{[node, state].inspect}: #{ex.inspect}")
    end

    # Updates the current view of the world for this particular node
    # manager instance. All node managers write this state regardless
    # of whether they are the master manager or not.
    #
    # @param [Node] node the node to handle
    # @param [Symbol] state the node state
    # @param [Integer] latency an optional latency
    def update_current_state(node, state, latency = nil)
      old_unavailable = @monitored_unavailable.dup
      old_available = @monitored_available.dup

      case state
      when :unavailable
        unless @monitored_unavailable.include?(node)
          @monitored_unavailable << node
          @monitored_available.delete(node)
          write_current_monitored_state
        end
      when :available
        last_latency = @monitored_available[node]
        if last_latency.nil? || (latency - last_latency) > LATENCY_THRESHOLD
          @monitored_available[node] = latency
          @monitored_unavailable.delete(node)
          write_current_monitored_state
        end
      else
        raise InvalidNodeStateError.new(node, state)
      end
    rescue => ex
      # if an error occurs, make sure that we rollback to the old state
      @monitored_unavailable = old_unavailable
      @monitored_available = old_available
      raise
    end

    # Fetches each currently running node manager's view of the
    # world in terms of which nodes they think are available/unavailable.
    #
    # @return [Hash<String, Array>] a hash of node manager to host states
    def fetch_node_manager_states
      states = {}
      @zk.children(current_state_root).each do |child|
        full_path = "#{current_state_root}/#{child}"
        begin
          states[child] = symbolize_keys(decode(@zk.get(full_path).first))
        rescue ZK::Exceptions::NoNode
          # ignore, this is an edge case that can happen when a node manager
          # process dies while fetching its state
        rescue => ex
          logger.error("Failed to fetch states for #{full_path}: #{ex.inspect}")
        end
      end
      states
    end

    # Builds current snapshots of nodes across all running node managers.
    #
    # @return [Hash<Node, NodeSnapshot>] the snapshots for all nodes
    def current_node_snapshots
      nodes = {}
      snapshots = Hash.new { |h, k| h[k] = NodeSnapshot.new(k) }
      fetch_node_manager_states.each do |node_manager, states|
        available, unavailable = states.values_at(:available, :unavailable)
        available.each do |node_string, latency|
          node = nodes[node_string] ||= node_from(node_string)
          snapshots[node].viewable_by(node_manager, latency)
        end
        unavailable.each do |node_string|
          node = nodes[node_string] ||= node_from(node_string)
          snapshots[node].unviewable_by(node_manager)
        end
      end

      snapshots
    end

    # Waits until this node manager becomes the master.
    def wait_until_master
      logger.info('Waiting to become master Node Manager ...')

      with_lock do
        @master_manager = true
        logger.info('Acquired master Node Manager lock.')
        logger.info("Configured node strategy #{@node_strategy.class}")
        logger.info("Configured failover strategy #{@failover_strategy.class}")
        logger.info("Required Node Managers to make a decision: #{@required_node_managers}")
        manage_nodes
      end
    end

    # Manages the redis nodes by periodically processing snapshots.
    def manage_nodes
      # Re-discover nodes, since the state of the world may have been changed
      # by the time we've become the primary node manager.
      discover_nodes

      # ensure that slaves are correctly pointing to this master
      redirect_slaves_to(@master)

      # Periodically update master config state.
      while running? && master_manager?
        @zk_lock.assert!
        sleep(CHECK_INTERVAL)

        @lock.synchronize do
          snapshots = current_node_snapshots
          if ensure_sufficient_node_managers(snapshots)
            snapshots.each_key do |node|
              update_master_state(node, snapshots)
            end

            # flush current master state
            write_current_redis_nodes

            # check if we've exhausted our attempts to promote a master
            unless @master
              @master_promotion_attempts += 1
              raise NoMasterError if @master_promotion_attempts > MAX_PROMOTION_ATTEMPTS
            end
          end
        end
      end
    end

    # Creates a Node instance from a string.
    #
    # @param [String] node_string a string representation of a node (e.g., host:port)
    # @return [Node] the Node representation
    def node_from(node_string)
      return if node_string.nil?
      host, port = node_string.split(':', 2)
      Node.new(:host => host, :port => port, :password => @options[:password])
    end

    # Executes a block wrapped in a ZK exclusive lock.
    def with_lock
      @zk_lock ||= @zk.locker(current_lock_path)

      begin
        @zk_lock.lock!(true)
      rescue Exception
        # handle shutdown case
        running? ? raise : return
      end

      if running?
        @zk_lock.assert!
        yield
      end
    ensure
      if @zk_lock
        begin
          @zk_lock.unlock!
        rescue => ex
          logger.warn("Failed to release lock: #{ex.inspect}")
        end
      end
    end

    # Perform a manual failover to a redis node.
    def perform_manual_failover
      @lock.synchronize do
        return unless running? && @master_manager && @zk_lock
        @zk_lock.assert!
        new_master = @zk.get(manual_failover_path, :watch => true).first
        return unless new_master && new_master.size > 0
        logger.info("Received manual failover request for: #{new_master}")
        logger.info("Current nodes: #{current_nodes.inspect}")
        snapshots = current_node_snapshots

        node = if new_master == ManualFailover::ANY_SLAVE
          failover_strategy_candidate(snapshots)
        else
          node_from(new_master)
        end

        if node
          handle_manual_failover(node, snapshots)
        else
          logger.error('Failed to perform manual failover, no candidate found.')
        end
      end
    rescue => ex
      logger.error("Error handling manual failover: #{ex.inspect}")
      logger.error(ex.backtrace.join("\n"))
    ensure
      @zk.stat(manual_failover_path, :watch => true)
    end

    # @return [Boolean] true if running, false otherwise
    def running?
      @lock.synchronize { !@shutdown }
    end

    # @return [String] a stringified version of redis nodes
    def stringify_nodes(nodes)
      "(#{nodes.map(&:to_s).join(', ')})"
    end

    # Determines if each snapshot has a sufficient number of node managers.
    #
    # @param [Hash<Node, Snapshot>] snapshots the current snapshots
    # @return [Boolean] true if sufficient, false otherwise
    def ensure_sufficient_node_managers(snapshots)
      currently_sufficient = true
      snapshots.each do |node, snapshot|
        node_managers = snapshot.node_managers
        if node_managers.size < @required_node_managers
          logger.error("Not enough Node Managers in snapshot for node #{node}. " +
            "Required: #{@required_node_managers}, " +
            "Available: #{node_managers.size} #{node_managers}")
          currently_sufficient = false
        end
      end

      if currently_sufficient && !@sufficient_node_managers
        logger.info("Required Node Managers are visible: #{@required_node_managers}")
      end

      @sufficient_node_managers = currently_sufficient
      @sufficient_node_managers
    end

    # Invokes the configured failover strategy.
    #
    # @param [Hash<Node, NodeSnapshot>] snapshots the node snapshots
    # @return [Node] a failover candidate
    def failover_strategy_candidate(snapshots)
      # only include nodes that this master Node Manager can see
      filtered_snapshots = snapshots.select do |node, snapshot|
        snapshot.viewable_by?(manager_id)
      end

      logger.info('Attempting to find candidate from snapshots:')
      logger.info("\n" + filtered_snapshots.values.join("\n"))
      @failover_strategy.find_candidate(filtered_snapshots)
    end
  end
end
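Everything above is normally wired together by the bundled redis_node_manager executable, which builds the options hash from a YAML config (see data/examples/config.yml). A minimal sketch of doing the same thing directly; all hostnames and values are illustrative:

require 'redis_failover'

options = {
  :zkservers => 'zk1:2181,zk2:2181,zk3:2181',
  :nodes => [
    { :host => 'redis-1', :port => 6379 },
    { :host => 'redis-2', :port => 6379 }
  ],
  :max_failures => 3,
  :node_strategy => :majority,
  :failover_strategy => :latency,
  :required_node_managers => 2
}

# Blocks until shutdown; run one of these per node manager host.
RedisFailover::NodeManager.new(options).start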