evinrude 0.0.1
- checksums.yaml +7 -0
- data/.editorconfig +23 -0
- data/.gitignore +6 -0
- data/.yardopts +1 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/CONTRIBUTING.md +10 -0
- data/LICENCE +674 -0
- data/README.md +410 -0
- data/evinrude.gemspec +42 -0
- data/lib/evinrude.rb +1233 -0
- data/lib/evinrude/backoff.rb +19 -0
- data/lib/evinrude/cluster_configuration.rb +162 -0
- data/lib/evinrude/config_change_queue_entry.rb +19 -0
- data/lib/evinrude/config_change_queue_entry/add_node.rb +13 -0
- data/lib/evinrude/config_change_queue_entry/remove_node.rb +14 -0
- data/lib/evinrude/freedom_patches/range.rb +5 -0
- data/lib/evinrude/log.rb +102 -0
- data/lib/evinrude/log_entries.rb +3 -0
- data/lib/evinrude/log_entry.rb +13 -0
- data/lib/evinrude/log_entry/cluster_configuration.rb +15 -0
- data/lib/evinrude/log_entry/null.rb +6 -0
- data/lib/evinrude/log_entry/state_machine_command.rb +13 -0
- data/lib/evinrude/logging_helpers.rb +40 -0
- data/lib/evinrude/message.rb +19 -0
- data/lib/evinrude/message/append_entries_reply.rb +13 -0
- data/lib/evinrude/message/append_entries_request.rb +18 -0
- data/lib/evinrude/message/command_reply.rb +13 -0
- data/lib/evinrude/message/command_request.rb +18 -0
- data/lib/evinrude/message/install_snapshot_reply.rb +13 -0
- data/lib/evinrude/message/install_snapshot_request.rb +18 -0
- data/lib/evinrude/message/join_reply.rb +13 -0
- data/lib/evinrude/message/join_request.rb +18 -0
- data/lib/evinrude/message/node_removal_reply.rb +13 -0
- data/lib/evinrude/message/node_removal_request.rb +18 -0
- data/lib/evinrude/message/read_reply.rb +13 -0
- data/lib/evinrude/message/read_request.rb +18 -0
- data/lib/evinrude/message/vote_reply.rb +13 -0
- data/lib/evinrude/message/vote_request.rb +18 -0
- data/lib/evinrude/messages.rb +14 -0
- data/lib/evinrude/metrics.rb +50 -0
- data/lib/evinrude/network.rb +69 -0
- data/lib/evinrude/network/connection.rb +144 -0
- data/lib/evinrude/network/protocol.rb +69 -0
- data/lib/evinrude/node_info.rb +35 -0
- data/lib/evinrude/peer.rb +50 -0
- data/lib/evinrude/resolver.rb +96 -0
- data/lib/evinrude/snapshot.rb +9 -0
- data/lib/evinrude/state_machine.rb +15 -0
- data/lib/evinrude/state_machine/register.rb +25 -0
- data/smoke_tests/001_single_node_cluster.rb +20 -0
- data/smoke_tests/002_three_node_cluster.rb +43 -0
- data/smoke_tests/003_spill.rb +25 -0
- data/smoke_tests/004_stale_read.rb +67 -0
- data/smoke_tests/005_sleepy_master.rb +28 -0
- data/smoke_tests/006_join_via_follower.rb +26 -0
- data/smoke_tests/007_snapshot_madness.rb +97 -0
- data/smoke_tests/008_downsizing.rb +43 -0
- data/smoke_tests/009_disaster_recovery.rb +46 -0
- data/smoke_tests/999_final_smoke_test.rb +279 -0
- data/smoke_tests/run +22 -0
- data/smoke_tests/smoke_test_helper.rb +199 -0
- metadata +318 -0
data/lib/evinrude.rb
ADDED
@@ -0,0 +1,1233 @@
require "async"
require "async/dns"
require "fiber"
require "logger"
require "pathname"
require "securerandom"
require "tempfile"

require_relative "./evinrude/logging_helpers"

class Evinrude
  include Evinrude::LoggingHelpers

  class Error < StandardError; end

  class NoLeaderError < Error; end

  class NodeExpiredError < Error; end

  attr_reader :node_name

  def initialize(join_hints: [], shared_keys:, state_machine: Evinrude::StateMachine::Register,
                 logger: Logger.new("/dev/null"), node_name: nil, storage_dir: nil,
                 heartbeat_interval: 0.25, heartbeat_timeout: 1..2,
                 listen: {}, advertise: {}, metrics_registry: Prometheus::Client::Registry.new)
    @join_hints, @keys, @logger, @heartbeat_interval, @heartbeat_timeout = join_hints, shared_keys, logger, heartbeat_interval, heartbeat_timeout

    @metrics = Evinrude::Metrics.new(metrics_registry)

    @listen, @advertise = listen, advertise
    @listen[:address] ||= "::"
    @listen[:port] ||= 0

    if storage_dir
      @storage_dir = Pathname.new(storage_dir)
    end

    snapshot = if @storage_dir
      if !@storage_dir.exist?
        @storage_dir.mkdir
      end

      if !@storage_dir.directory?
        raise ArgumentError, "Storage directory #{@storage_dir} isn't *actually* a directory"
      end

      snapshot_file = @storage_dir.join("snapshot.yaml")

      if snapshot_file.exist?
        @metrics.snapshot_file_size.set(snapshot_file.stat.size)
        YAML.load_file(snapshot_file)
      end
    end

    @state_machine_class = state_machine

    if snapshot
      @node_name = snapshot.node_name
      @state_machine = @state_machine_class.new(snapshot: snapshot.state)
      @last_command_ids = snapshot.last_command_ids
    else
      @node_name = node_name || SecureRandom.uuid
      @state_machine = @state_machine_class.new
      @last_command_ids = {}
    end

    @sm_mutex = Mutex.new

    if snapshot
      @config = snapshot.cluster_config
      @config_index = snapshot.cluster_config_index
      @config.metrics = @metrics
      @config.logger = logger
    else
      @config = Evinrude::ClusterConfiguration.new(logger: logger, metrics: @metrics)
      @config_index = 0
    end

    @last_append = Time.at(0)
    @current_term = 0
    @voted_for = nil
    @mode = :init

    @metrics.term.set(0)

    if snapshot
      logger.debug(logloc) { "Configuring log from snapshot; snapshot_last_term=#{snapshot.last_term} snapshot_last_index=#{snapshot.last_index}" }
      @log = Evinrude::Log.new(snapshot_last_term: snapshot.last_term, snapshot_last_index: snapshot.last_index, logger: logger)
    else
      @log = Evinrude::Log.new(logger: logger)
    end

    if snapshot
      logger.debug(logloc) { "Setting commit_index to #{snapshot.last_index} from snapshot" }
      @commit_index = snapshot.last_index
    else
      @commit_index = 0
    end

    @metrics.commit_index.set(@commit_index)

    @peers = Hash.new do |h, k|
      backoff = Evinrude::Backoff.new

      peer_conn = @network.connect(address: k.address, port: k.port)

      h[k] = Peer.new(metrics: @metrics, conn: peer_conn, node_info: k, next_index: @log.last_index + 1)
    end

    @config_change_queue = []
    @config_change_request_in_progress = nil
    @cc_sem = Async::Semaphore.new
  end

  def command(s)
    @metrics.command_execution.measure do
      Async(logger: logger) do |task|
        command_id = SecureRandom.uuid

        loop do
          reply = rpc_to_leader(Message::CommandRequest.new(command: s, id: command_id, node_name: @node_name), task)

          if reply.success
            break true
          end
        end
      end.result
    end
  end

  def state
    @metrics.read_state.measure do
      Async(logger: logger) do |task|
        loop do
          state_object = nil
          commit_index = nil

          @sm_mutex.synchronize do
            # Disturbingly, this appears to be one of the best available ways
            # to make a guaranteed deep copy of an arbitrary object
            state_object = YAML.load(@state_machine.current_state.to_yaml)
            commit_index = @commit_index
          end

          logger.debug(logloc) { "(in #{@node_name}) Checking if #{state_object.inspect} at commit_index=#{commit_index} is the most up-to-date state" }

          reply = rpc_to_leader(Evinrude::Message::ReadRequest.new(commit_index: commit_index), task)

          if reply.success
            break state_object
          end
        end
      end.result
    end
  end

  def run
    logger.info(logloc) { "Evinrude node #{@node_name} starting up" }

    @metrics.start_time.set(Time.now.to_f)

    if @storage_dir
      @metrics.log_loaded_from_disk.set(1)
      load_log_from_disk
    else
      @metrics.log_loaded_from_disk.set(0)
    end

    Async do |task| #(logger: logger) do |task|
      @async_task = task
      @network = Network.new(keys: @keys, logger: logger, metrics: @metrics, listen: @listen, advertise: @advertise).start

      logger.info(logloc) { "Node #{@node_name} listening on #{address}:#{port}" }

      @metrics.info.set(1, labels: { node_name: @node_name, listen_address: @network.listen_address, listen_port: @network.listen_port, advertise_address: address, advertise_port: port })

      task.async { process_rpc_requests }

      join_or_create_cluster
    end.return
  rescue => ex
    log_exception(ex) { "Fatal error" }
    raise
  end

  def remove_node(node_info, unsafe: false)
    if unsafe
      logger.warn(logloc) { "Unsafely removing node #{node_info.inspect} from the local configuration" }

      @config.remove_node(node_info, force: true)
    else
      @metrics.remove_node.measure do
        Async(logger: logger) do |task|
          loop do
            logger.debug(logloc) { "(in #{@node_name}) Requesting removal of #{node_info.inspect}" }

            reply = rpc_to_leader(Evinrude::Message::NodeRemovalRequest.new(node_info: node_info, unsafe: unsafe), task)

            if reply.success
              break true
            end
          end
        end.result
      end
    end
  end

  def address
    @network&.advertised_address
  end

  def port
    @network&.advertised_port
  end

  def nodes
    @config.nodes
  end

  def leader?
    @mode == :leader
  end

  def follower?
    @mode == :follower
  end

  def candidate?
    @mode == :candidate
  end

  def init?
    @mode == :init
  end

  def expired?
    !!(!leader? && @heartbeat_timeout_time && @heartbeat_timeout_time < Time.now)
  end

  def node_info
    if @network.nil?
      raise RuntimeError, "Cannot determine node info until the network is up"
    end

    @node_info ||= Evinrude::NodeInfo.new(address: address, port: port, name: @node_name)
  end

  private

  def load_log_from_disk
    log_file = @storage_dir.join("log.yaml")

    if log_file.exist?
      logger.debug(logloc) { "Loading log entries from #{log_file}" }
      @metrics.log_file_size.set(log_file.stat.size)

      # Temporarily unsetting @storage_dir prevents the calls we make from
      # writing all the log entries straight back to disk again
      tmp_storage_dir, @storage_dir = @storage_dir, nil

      begin
        log_file.open do |fd|
          YAML.load_stream(fd) do |entry|
            unless entry.is_a?(Hash)
              logger.fatal(logloc) { "SHENANIGAN ALERT: persisted log entry #{entry} is not a hash!" }
              exit 42
            end

            m, args = entry.to_a.first

            unless %i{process_log_entry commit_entries_to}.include?(m)
              logger.fatal(logloc) { "SHENANIGAN ALERT: log includes unexpected operation #{m.inspect}(*#{args.inspect})!!!" }
              exit 42
            end

            logger.debug(logloc) { "Running #{m}(#{args.inspect}) from disk log" }

            self.__send__(m, *args)
          end
        end
      ensure
        @storage_dir = tmp_storage_dir
      end

      logger.debug(logloc) { "Completed log read" }
    end
  end

  def rpc_to_leader(msg, task)
    backoff = Evinrude::Backoff.new
    reply = nil
    command_id = SecureRandom.uuid

    logger.debug(logloc) { "(in #{@node_name}) Sending message #{msg.inspect} to cluster leader" }

    loop do
      until leader? || follower? || expired?
        logger.debug(logloc) { "(in #{@node_name}) Waiting until we're in the cluster before sending RPC to leader" }
        task.sleep 0.5
      end

      begin
        remote = reply&.leader_info || @leader_info

        if remote.nil?
          raise NoLeaderError, "No leader could be discerned for the cluster at present"
        end

        conn = @network.connect(address: remote.address, port: remote.port)

        reply = task.with_timeout(5) do |t|
          conn.rpc(msg)
        end

        if reply.nil?
          logger.debug(logloc) { "(in #{@node_name}) RPC to leader #{remote.inspect} timed out" }
        elsif reply.leader_info
          logger.debug(logloc) { "(in #{@node_name}) Redirected to #{reply.leader_info.inspect}" }
          # No need to wait for the backoff time here
          next
        else
          logger.debug(logloc) { "(in #{@node_name}) RPC to leader returned #{reply.inspect}" }
          return reply
        end

        task.sleep backoff.wait_time
      rescue Evinrude::Error, Async::TimeoutError, Async::Wrapper::Cancelled, SystemCallError, IOError => ex
        @metrics.rpc_exception.increment(labels: { target: "#{remote.address}:#{remote.port}", node_name: remote.name, class: ex.class.to_s })
        log_exception(ex) { "(in #{@node_name}) RPC to leader raised exception" }
        conn&.close
        reply = nil

        if expired?
          raise NodeExpiredError, "This node instance is not active in the cluster (mode=#{@mode})"
        end

        task.sleep backoff.wait_time
      end
    end
  end

  def become_leader
    reset_peers

    logger.info(logloc) { "Assuming leadership of the cluster" }

    @mode = :leader

    @leader_info = node_info
    @commands_in_progress = {}

    @cc_sem.acquire do
      @config_change_queue = []
      @config_change_request_in_progress = nil
    end

    @async_task.async do |subtask|
      while leader?
        subtask.sleep @heartbeat_interval

        if leader?
          logger.debug(logloc) { "Triggering periodic AE heartbeat" }
          issue_append_entries_to_cluster
        end
      end
    end

    propose_log_entry(
      LogEntry::Null.new(term: @current_term) do
        logger.debug(logloc) { "Null log entry to mark start-of-term replicated" }
      end
    )

    @metrics.state.set(3)
  end

  def become_follower
    reset_peers

    logger.info(logloc) { "Becoming follower" }

    @mode = :follower

    @heartbeat_timeout_time = Time.now + @heartbeat_timeout.rand

    @async_task.async do |subtask|
      while follower?
        logger.debug(logloc) { "#{@heartbeat_timeout_time - Time.now}s until heartbeat timer expires" }

        subtask.sleep [0.01, @heartbeat_timeout_time - Time.now].max

        if follower? && @heartbeat_timeout_time < Time.now
          logger.info(logloc) { "Heartbeat timeout expired; triggering election" }
          trigger_election
        end
      end
    end

    @metrics.state.set(2)
  end

  def become_candidate
    reset_peers

    logger.info(logloc) { "Becoming a candidate" }

    @mode = :candidate

    @async_task.async do |subtask|
      election_timeout = @heartbeat_timeout.rand
      logger.debug(logloc) { "Waiting #{election_timeout}s for election to complete" }
      subtask.sleep election_timeout

      if candidate?
        logger.info(logloc) { "Election timeout expired without a leader being elected; triggering a new election" }
        trigger_election
      end
    end

    @metrics.state.set(1)
  end

  def reset_peers
    @peers.values.each { |f| f.conn.close }
    @peers.clear
    @metrics.clear_peer_metrics
  end

  def new_term(n)
    logger.debug(logloc) { "Setting up for term #{n}" }
    @current_term = n
    @voted_for = nil

    @metrics.term.set(@current_term)
  end

  def persist_to_disk(e)
    if @storage_dir
      file = @storage_dir.join("log.yaml")

      if file.exist? && file.stat.size > 1024 * 1024
        logger.debug(logloc) { "Log is getting a bit big; time for a new snapshot, methinks" }
        take_snapshot
      end

      logger.debug(logloc) { "Persisting #{e.inspect} to #{file}" }
      file.open("a") do |fd|
        logger.debug(logloc) { "Doin' the write thing" }
        fd.puts e.to_yaml
        fd.fdatasync
      end

      @metrics.log_entries_persisted.increment
      @metrics.log_file_size.set(file.stat.size)
    end
  end

  def propose_log_entry(entry)
    unless leader?
      logger.error(logloc) { with_backtrace("propose_log_entry called while not leader!") }
      return
    end

    @log.append(entry)
    persist_to_disk(process_log_entry: [entry, @log.last_index])

    logger.debug(logloc) { "Proposing #{entry.inspect} as ##{@log.last_index}" }

    if @config.nodes.length == 1
      # Flyin' solo! Means we can skip all that inconvenient AppendEntries stuff,
      # but we still need to do what needs to be done after the entry has been
      # "replicated everywhere" (ie "here")
      check_for_new_replication_majority
    else
      issue_append_entries_to_cluster
    end
  end

  def issue_append_entries_to_cluster(&blk)
    nodes.each do |n|
      next if n == node_info

      @async_task.async do
        begin
          issue_append_entries(@peers[n], &blk)
        rescue Evinrude::Log::SnapshottedEntryError
          issue_snapshot(@peers[n])
        rescue => ex
          log_exception(ex) { "Failed to issue AppendEntries to #{n.inspect}" }
        end
      end
    end
  end

  def issue_append_entries(follower)
    logger.debug(logloc) { "Issuing AppendEntries to #{follower.node_info.inspect}" }
    entries = @log.entries_from(follower.next_index)
    prev_index = [follower.next_index - 1, @log.last_index].min
    prev_entry = @log[prev_index]

    logger.debug(logloc) { "Previous log entry (##{prev_index}) is #{prev_entry.inspect}" }

    reply = follower.rpc(
      Message::AppendEntriesRequest.new(
        term: @current_term,
        leader_info: node_info,
        leader_commit: @commit_index,
        prev_log_index: prev_index,
        prev_log_term: prev_entry.term,
        entries: entries,
      )
    )

    if leader?
      if reply.nil?
        logger.debug(logloc) { "AppendEntriesRequest to #{follower.node_info.inspect} was not answered. C'est la vie." }
        follower.conn.close
        @peers.delete(follower.node_info)
      elsif block_given?
        yield reply, follower.node_info
      elsif reply.term > @current_term
        logger.debug(logloc) { "Received term from #{follower.node_info.inspect} greater than our own. Demotion required!" }
        new_term(reply.term)
        become_follower
      elsif reply.success
        logger.debug(logloc) { "Successful AppendEntriesReply received from #{follower.node_info.inspect}" }
        follower.successful_append(prev_index + entries.length)
        check_for_new_replication_majority
      else
        logger.debug(logloc) { "AppendEntries to #{follower.node_info.inspect} failed; retrying after next_index decrement" }
        if reply.last_index && reply.last_index < follower.next_index - 1
          follower.failed_append(reply.last_index)
        else
          follower.failed_append
        end
        if follower.next_index <= @log.snapshot_last_index
          issue_snapshot(follower)
        else
          issue_append_entries(follower)
        end
      end
    else
      logger.debug(logloc) { "Ignoring AppendEntriesReply received when we're not leader" }
    end
  end

  def check_for_new_replication_majority
    new_commits = false

    ((@commit_index + 1)..@log.last_index).each do |idx|
      present_nodes = @peers.values.select { |f| f.match_index >= idx }.map(&:node_info) + [node_info]

      logger.debug(logloc) { "Checking for replication majority on ##{idx} (present: #{present_nodes.inspect})" }
      if @config.quorum_met?(present_nodes)
        logger.debug(logloc) { "Log index #{idx} has met majority" }
        @metrics.replication_majority.set(idx)

        entry = @log[idx]

        case entry
        when LogEntry::ClusterConfiguration
          logger.debug(logloc) { "Newly majoritied (majoritised?) log entry is a ClusterConfig; @config_index=#{@config_index}" }

          # Dealing with potentially out-of-date cluster configurations is
          # absofuckinglutely mind-bending. As near as I can tell, however,
          # since the leader by definition has all of the log entries, it
          # also has the latest and greatest config live and in concert,
          # so we can make some assumptions about future log entries on
          # that basis.
          if idx == @config_index
            logger.debug(logloc) { "Replication of current config #{@config.inspect} complete" }
            if @config.transitioning?
              logger.debug(logloc) { "Proposing post-joint config" }
              @config.joint_configuration_replicated
              propose_log_entry(LogEntry::ClusterConfiguration.new(term: @current_term, config: @config))
              @config_index = @log.last_index
            else
              # Transition complete; time to let the requestor know they're good
              # to go
              logger.debug(logloc) { "Post-joint config replicated; config change saga completed" }
              @config_index = @log.last_index

              @cc_sem.acquire do
                if @config_change_request_in_progress
                  logger.debug(logloc) { "Letting #{@config_change_request_in_progress.node_info.inspect} know their config change request was successful" }

                  # This is technically only necessary for certain config changes
                  # (like when a node changes address/port but keeps the same
                  # name) but there's no harm in doing it all the time.
                  if @peers.key?(@config_change_request_in_progress.node_info)
                    @peers[@config_change_request_in_progress.node_info].conn.close
                    @peers.delete(@config_change_request_in_progress.node_info)
                  end

                  @config_change_request_in_progress.send_successful_reply
                  @config_change_request_in_progress = nil
                else
                  logger.debug(logloc) { "Nobody to send a successful config change reply to; oh well" }
                end
              end

              process_config_change_queue
            end
          else
            logger.debug(logloc) { "Quorum met on out-of-date config #{entry.config.inspect}; ignoring" }
          end
        when LogEntry::StateMachineCommand
          @sm_mutex.synchronize do
            logger.debug(logloc) { "Applying state machine command #{entry.command} (id #{entry.id})" }
            @state_machine.process_command(entry.command)
            if conn = @commands_in_progress.delete(entry.id)
              logger.debug(logloc) { "Letting the client know their command is cooked" }
              conn.send_reply(Message::CommandReply.new(success: true))
            else
              logger.debug(logloc) { "No client around to notify of command application; they'll figure it out eventually" }
            end
          end
        end

        @commit_index = idx
        @metrics.commit_index.set(@commit_index)
        persist_to_disk(commit_entries_to: [idx])
        new_commits = true
      else
        logger.debug(logloc) { "Replication majority not yet met on ##{idx}. Better luck next time." }
      end
    end

    if new_commits
      # We want to get the good word out to everyone as soon as possible that
      # there's new log entries that can be committed.
      issue_append_entries_to_cluster
    end
  end

  def take_snapshot
    return unless @storage_dir

    snapshot = @sm_mutex.synchronize do
      Evinrude::Snapshot.new(node_name: @node_name, state: @state_machine.snapshot, cluster_config: @config, cluster_config_index: @config_index, last_term: @log.last_entry_term, last_index: @log.last_index, last_command_ids: @last_command_ids)
    end

    Tempfile.open("snapshot", @storage_dir) do |f|
      logger.debug(logloc) { "Writing snapshot data to #{f.path}" }
      f.write(snapshot.to_yaml)
      f.fdatasync
      f.close
      File.rename(f.path, @storage_dir.join("snapshot.yaml"))
      File.open(@storage_dir) { |d| d.fsync }
    end

    @metrics.snapshot_file_size.set(@storage_dir.join("snapshot.yaml").stat.size)

    begin
      logger.debug(logloc) { "Deleting now-stale log.yaml" }
      File.unlink(File.join(@storage_dir, "log.yaml"))
    rescue Errno::ENOENT
      # Yes, this is in fact exactly what we're trying to achieve
    end

    @metrics.log_file_size.set(0)
  end

  def issue_snapshot(follower)
    msg = @sm_mutex.synchronize do
      Message::InstallSnapshotRequest.new(term: @current_term, leader_info: @leader_info, last_included_index: @commit_index, last_included_term: @log[@commit_index].term, data: @state_machine.snapshot)
    end

    reply = follower.rpc(msg)

    if reply.term > @current_term
      new_term(reply.term)
    else
      follower.successful_append(@commit_index)
    end
  end

  def async_resolver
    @async_resolver ||= Evinrude::Resolver.new
  end

  def expand_join_hints
    return [] if @join_hints.nil?

    # Where's Enumerable.amap when you need it?
    sem = Async::Semaphore.new

    [].tap do |r|
      @join_hints.each do |jh|
        Async(logger: logger) do |t|
          if jh.is_a?(String)
            async_resolver.getresources(jh).each do |srv|
              t.async do
                async_resolver.getaddresses(srv.target.to_s).each do |addr|
                  sem.acquire { r << { address: addr, port: srv.port } }
                end
              end
            end
          elsif jh.is_a?(Hash) || jh.is_a?(NodeInfo)
            begin
              IPAddr.new(jh[:address])
              # It's an IP address already; excellent
              sem.acquire { r << jh }
            rescue ArgumentError
              # It's a hostname(ish); resolve it and keep the hint's own port
              async_resolver.getaddresses(jh[:address]).each do |addr|
                sem.acquire { r << { address: addr, port: jh[:port] } }
              end
            end
          else
            raise ArgumentError, "Invalid join hint entry: #{jh.inspect}"
          end
        end.result
      end
    end
  end

  def join_targets
    expand_join_hints + @config.nodes.reject { |n| n.name == node_info.name }
  end

  def join_or_create_cluster
    if @join_hints.nil? && join_targets.empty?
      logger.info(logloc) { "No hints of an existing cluster found; configuring for standalone mode" }
      new_term(1)

      @config.add_node(node_info)
      @config.joint_configuration_replicated

      become_leader

      propose_log_entry(LogEntry::ClusterConfiguration.new(term: @current_term, config: @config))

      take_snapshot
    else
      logger.info(logloc) { "Joining existing cluster" }
      join_cluster_via(join_targets)

      # Taking a snapshot immediately after joining allows us to capture an
      # up-to-date config, as well as our node name, in case of accidents.
      take_snapshot
    end
  end

  def join_cluster_via(targets)
    connected = false

    logger.debug(logloc) { "Attempting to join cluster via targets #{targets.inspect}" }

    # I call this algorithm "happy joinballs".
    #
    # I will not be taking questions at this time.
    conn_tasks = targets.map do |t|
      @async_task.async do |subtask|
        logger.debug(logloc) { "Initiating happy joinballs connection to #{t[:address]}:#{t[:port]}" }

        begin
          conn = @network.connect(address: t[:address], port: t[:port])
        rescue StandardError => ex
          logger.warn(logloc) { "Failed to connect to #{t[:address]}:#{t[:port]}: #{ex.class} (#{ex.message})" }
          if targets.length == 1
            logger.warn(logloc) { "Cluster leader not responsive; restarting join attempt" }
            join_or_create_cluster
          end

          next
        end

        # If we get here, we have won the happy joinballs race
        conn_tasks.each do |ct|
          next if ct == Async::Task.current

          ct.stop
        end

        logger.debug(logloc) { "Sending a join request to #{conn.peer_info}" }
        reply = subtask.with_timeout(5) do |t|
          conn.rpc(Message::JoinRequest.new(node_info: node_info))
        rescue Async::TimeoutError
          nil
        end

        if reply&.success
          logger.info(logloc) { "Joined cluster; #{reply.inspect}" }
          become_follower
        elsif reply&.leader_info
          logger.debug(logloc) { "Redirected to leader #{reply.leader_info.inspect}" }
          join_cluster_via([reply.leader_info])
        else
          logger.error(logloc) { "Cluster join via #{t.inspect} failed: #{reply.nil? ? "RPC timeout" : reply.inspect}" }
          # Obviously that target is busticated, so we'll retry without it.
          # The problem is that the busticated target might have been a
          # leader we were erroneously redirected to; in that case, the
          # targets list will have only one node, and we'll need to go
          # back to joinballing everyone. Hopefully by now the cluster
          # will have agreed on a *live* leader for us to join via.
          if targets.length == 1
            join_cluster_via(join_targets - [t])
          else
            join_cluster_via(targets - [t])
          end
        end
      end
    end

    conn_tasks.each(&:wait)
  end

  def process_rpc_requests
    logger.debug(logloc) { "Commencing to process RPC requests" }
    @network.each_message do |msg, conn|
      @metrics.messages_received.increment(labels: { type: msg.class.to_s.split("::").last })

      logger.debug(logloc) { "Received #{msg} from #{conn.peer_info}" }
      reply = case msg
      when Message::AppendEntriesRequest
        process_append_entries_request(msg, conn)
      when Message::CommandRequest
        process_command_request(msg, conn)
      when Message::JoinRequest
        process_join_request(msg, conn)
      when Message::NodeRemovalRequest
        process_node_removal_request(msg, conn)
      when Message::ReadRequest
        process_read_request(msg, conn)
      when Message::VoteRequest
        process_vote_request(msg, conn)
      when Message::InstallSnapshotRequest
        process_install_snapshot_request(msg, conn)
      else
        logger.warn(logloc) { "Unexpected #{msg.class.to_s.split("::").last} received from #{conn.peer_info}" }
        nil
      end

      if reply
        logger.debug(logloc) { "Sending reply #{reply.inspect} to #{conn.peer_info}" }
        conn.send_reply(reply)
      else
        logger.warn(logloc) { "No immediate reply to #{msg.inspect} from #{conn.peer_info}" }
      end
    end
  end

  def process_join_request(msg, conn)
    logger.debug(logloc) { "Join request #{msg.inspect} received from #{conn.peer_info}" }

    if follower?
      logger.debug(logloc) { "Not leader; redirecting" }
      Message::JoinReply.new(success: false, leader_info: @leader_info)
    elsif leader?
      logger.debug(logloc) { "Queueing join request" }
      @config_change_queue << ConfigChangeQueueEntry::AddNode.new(msg, conn)

      if @config_change_queue.length == 1 && @config_change_request_in_progress.nil?
        logger.debug(logloc) { "Triggering new config change queue cascade" }
        process_config_change_queue
      end

      # No immediate reply; will be sent once the join is completed
      nil
    else
      logger.debug(logloc) { "Ignoring join request from #{msg.node_info} because not leader or follower" }
      nil
    end
  end

  def process_node_removal_request(msg, conn)
    logger.debug(logloc) { "Node removal request #{msg.inspect} received from #{conn.peer_info}" }

    if follower?
      logger.debug(logloc) { "Not leader; redirecting" }
      Message::NodeRemovalReply.new(success: false, leader_info: @leader_info)
    elsif leader?
      logger.debug(logloc) { "Queueing node removal request" }
      @config_change_queue << ConfigChangeQueueEntry::RemoveNode.new(msg, conn)

      if @config_change_queue.length == 1
        logger.debug(logloc) { "Triggering new config change queue cascade" }
        process_config_change_queue
      end

      # No immediate reply; will be sent once the join is completed
      nil
    else
      logger.debug(logloc) { "Ignoring node removal request from #{msg.node_info} because not leader or follower" }
      nil
    end
  end

  def process_config_change_queue
    if @config_change_queue.empty?
      logger.debug(logloc) { "No more entries in the config change queue" }
      return
    end

    if @config_change_request_in_progress
      logger.error(logloc) { "Change queue processing requested while change request in progress!" }
      return
    end

    @config_change_request_in_progress = @config_change_queue.shift
    logger.debug(logloc) { "Processing config change queue entry #{@config_change_request_in_progress.inspect}" }

    unless leader?
      @cc_sem.acquire do
        @config_change_request_in_progress.send_redirect_reply(@leader_info)
        @config_change_request_in_progress = nil
      end
      process_config_change_queue
      return
    end

    case @config_change_request_in_progress
    when ConfigChangeQueueEntry::AddNode
      if @config.nodes.include?(@config_change_request_in_progress.node_info)
        # "Dude, you're *already* part of the cluster! Duuuuuuuuuuuuuuude!"
        @cc_sem.acquire do
          @config_change_request_in_progress.send_successful_reply
          @config_change_request_in_progress = nil
        end
        process_config_change_queue
      else
        logger.debug(logloc) { "Transitioning configuration to add #{@config_change_request_in_progress.node_info.inspect}" }

        @config.add_node(@config_change_request_in_progress.node_info)
        propose_log_entry(LogEntry::ClusterConfiguration.new(term: @current_term, config: @config))
        @config_index = @log.last_index
      end
    when ConfigChangeQueueEntry::RemoveNode
      if !@config.nodes.include?(@config_change_request_in_progress.node_info)
        @cc_sem.acquire do
          @config_change_request_in_progress.send_successful_reply
          @config_change_request_in_progress = nil
        end
        process_config_change_queue
      else
        logger.debug(logloc) { "Transitioning configuration to remove #{@config_change_request_in_progress.node_info.inspect}" }

        @config.remove_node(@config_change_request_in_progress.node_info)
        propose_log_entry(LogEntry::ClusterConfiguration.new(term: @current_term, config: @config))
        @config_index = @log.last_index
      end
    else
      logger.error(logloc) { "Unsupported change request type #{@config_change_request_in_progress.class}; this really shouldn't ever happen, bug report welcome" }
      logger.debug(logloc) { "Unsupported change request was #{@config_change_request_in_progress.inspect}" }
      @config_change_request_in_progress = nil
      process_config_change_queue
    end
  end

  def process_append_entries_request(msg, conn)
    logger.debug(logloc) { "Processing append_entries request #{msg.inspect} from #{conn.peer_info}" }

    if msg.term < @current_term
      logger.debug(logloc) { "AppendEntries request term less than our current term #{@current_term}" }
      Message::AppendEntriesReply.new(success: false, term: @current_term)
    else
      @last_append = Time.now

      if !@log.has_entry?(msg.prev_log_index)
        logger.debug(logloc) { "We don't have log entry prev_log_index=#{msg.prev_log_index}; asking for more entries" }
        Message::AppendEntriesReply.new(success: false, term: @current_term, last_index: @log.last_index)
      elsif @log.snapshotted_entry?(msg.prev_log_index + 1)
        logger.error(logloc) { "Got AppendEntriesRequest with a prev_log_index=#{msg.prev_log_index} that's buried in the snapshot" }
        # Closing the connection to the leader will cause it to recycle the
        # follower state, which will reset it to start sending us AppendEntries
        # from the most recent entry.
        conn.close
      elsif msg.prev_log_term != @log.entry_term(msg.prev_log_index)
        logger.debug(logloc) { "AppendEntries log fork; msg.prev_log_index=#{msg.prev_log_index} msg.prev_log_term=#{msg.prev_log_term} @log.entry_term(msg.prev_log_index=#{@log.entry_term(msg.prev_log_index)} @log.last_index=#{@log.last_index}" }
        @log.truncate_to(msg.prev_log_index - 1)
        Message::AppendEntriesReply.new(success: false, term: @current_term)
      else
        @leader_info = msg.leader_info

        if msg.term > @current_term || (candidate? && msg.term == @current_term)
          logger.debug(logloc) { "Received term-updating AppendEntries; msg.term=#{msg.term} @current_term=#{@current_term} node_info.mode=#{node_info.instance_variable_get(:@mode).inspect}" }
          new_term(msg.term)
          become_follower
        end

        @heartbeat_timeout_time = Time.now + @heartbeat_timeout.rand

        msg.entries.each.with_index do |new_entry, i|
          idx = msg.prev_log_index + i + 1 # Dratted 1-index addressing
          process_log_entry(new_entry, idx)
        end

        new_commit_point = [@log.last_index, msg.leader_commit].min

        if new_commit_point > @commit_index
          commit_entries_to(new_commit_point)
        end

        Message::AppendEntriesReply.new(success: true, term: @current_term)
      end
    end
  end

  def process_log_entry(entry, log_index)
    logger.debug(logloc) { "Processing #{entry.inspect} at log index #{log_index}" }

    existing_entry = @log[log_index]

    if existing_entry.nil?
      @log.append(entry)

      persist_to_disk(process_log_entry: [entry, log_index])

      # Configuration changes take place immediately, not after consensus;
      # raft.pdf p11, "a server always uses the latest configuration in its
      # log, regardless of whether the entry is committed".
      if LogEntry::ClusterConfiguration === entry
        logger.debug(logloc) { "Using new configuration from log entry ##{log_index}" }
        @config = entry.config
        @config_index = log_index
      end
    elsif existing_entry.term != entry.term
      logger.debug(logloc) { "Discovered fork at #{log_index} (existing_entry=#{existing_entry.inspect} new_entry=#{entry.inspect}); discarding our remaining log entries" }
      @log.truncate_to(log_index - 1)
    else
      logger.debug(logloc) { "Already got log entry ##{log_index}; skipping" }
    end
  end

  def commit_entries_to(idx)
    ((@commit_index + 1)..idx).each do |i|
      @sm_mutex.synchronize do
        logger.debug(logloc) { "Committing log entry ##{i}" }

        if LogEntry::StateMachineCommand === @log[i]
          logger.debug(logloc) { "Applying state machine command #{@log[i].command}" }
          @state_machine.process_command(@log[i].command)
          @last_command_ids[@log[i].node_name] = @log[i].id
        else
          logger.debug(logloc) { "Entry ##{i} is a #{@log[i].class}; no commit action necessary" }
        end

        @commit_index = i
        @metrics.commit_index.set(i)
      end
    end

    persist_to_disk(commit_entries_to: [idx])
  end

  def process_command_request(msg, conn)
    logger.debug(logloc) { "Command request #{msg.inspect} received from #{conn.peer_info}" }

    if follower?
      Message::CommandReply.new(success: false, leader_info: @leader_info)
    elsif leader?
      if @last_command_ids[msg.node_name] == msg.id
        Message::CommandReply.new(success: true)
      else
        logger.debug(logloc) { "Noting that #{msg.id} is a command in progress" }
        @commands_in_progress[msg.id] = conn
        propose_log_entry(LogEntry::StateMachineCommand.new(term: @current_term, command: msg.command, id: msg.id, node_name: msg.node_name))

        # Deferred reply to log entry commit will occur after replication is complete
        nil
      end
    else
      Message::CommandReply.new(success: false)
    end
  end

  def process_vote_request(msg, conn)
    if Time.now - @last_append < @heartbeat_timeout.first
      # Avoid rogue servers disrupting the cluster by calling votes
      # just because they can.
      logger.debug(logloc) { "Ignoring vote request from scurvy rogue #{msg.candidate_info}" }
      return nil
    end

    if msg.term > @current_term
      new_term(msg.term)
    end

    if msg.term == @current_term &&
       (@voted_for.nil? || @voted_for == msg.candidate_info) &&
       ((msg.last_log_index >= @log.last_index && msg.last_log_term == @log.last_entry_term) || msg.last_log_term > @log.last_entry_term)
      @voted_for = msg.candidate_info
      become_follower
      logger.debug(logloc) { "Voted for #{msg.candidate_info.inspect} for term #{msg.term} leader" }
      Message::VoteReply.new(term: @current_term, vote_granted: true)
    else
      logger.debug(logloc) { "Rejected #{msg.candidate_info.inspect} for term #{msg.term} leader; @current_term=#{@current_term} @voted_for=#{@voted_for.inspect} msg.last_log_index=#{msg.last_log_index} @log.last_index=#{@log.last_index} msg.last_log_term=#{msg.last_log_term} @log.last_entry_term=#{@log.last_entry_term}" }
      Message::VoteReply.new(term: @current_term, vote_granted: false)
    end
  end

  def process_read_request(msg, conn)
    if !leader?
      Message::ReadReply.new(success: false, leader_info: @leader_info)
    elsif @commit_index > msg.commit_index
      # We already *know* this is never going to succeed, may as well save ourselves
      # the hassle
      logger.debug(logloc) { "ReadRequest is for an out-of-date commit_index; nopeing out" }
      Message::ReadReply.new(success: false)
    elsif @config.nodes.length == 1
      # Flyin' solo!
      if @commit_index == msg.commit_index
        Message::ReadReply.new(success: true)
      else
        Message::ReadReply.new(success: false)
      end
    else
      responders = [node_info]

      issue_append_entries_to_cluster do |reply, node_info|
        # responders will be set to nil when quorum has been met, so all remaining
        # AE replies can be quietly ignored
        next if responders.nil?

        if reply.success
          responders << node_info
          logger.debug(logloc) { "Checking if #{responders.inspect} meets read request quorum" }
          if @config.quorum_met?(responders)
            logger.debug(logloc) { "Have met read request quorum; reply sent" }
            if @commit_index == msg.commit_index
              conn.send_reply(Message::ReadReply.new(success: true))
            else
              conn.send_reply(Message::ReadReply.new(success: false))
            end
            responders = nil
          else
            logger.debug(logloc) { "Not yet met read request quorum" }
          end
        end
      end

      # Deferred reply
      nil
    end
  end

  def process_install_snapshot_request(msg, conn)
    if msg.term < @current_term
      conn.send_reply(Message::InstallSnapshotReply.new(term: @current_term))
      return
    end

    @sm_mutex.synchronize do
      @state_machine = @state_machine_class.new(snapshot: msg.data)
      @log.new_snapshot(msg.last_included_term, msg.last_included_index)
      @commit_index = msg.last_included_index
    end

    conn.send_reply(Message::InstallSnapshotReply.new(term: @current_term))
  end

  def trigger_election
    new_term(@current_term + 1)
    logger.debug(logloc) { "Initiating election for term #{@current_term}" }
    become_candidate

    if @config.nodes.length == 1
      # Flyin' solo!
      logger.debug(logloc) { "No need for an election, as we're in single-node mode" }
      become_leader
    else
      election_term = @current_term
      electors = [node_info]
      @voted_for = node_info

      logger.debug(logloc) { "Canvassing the electorate" }
      @config.nodes.each do |n|
        next if n == node_info

        @async_task.async do
          logger.debug(logloc) { "Sending vote request to #{n.inspect}" }
          begin
            reply = @peers[n].rpc(Message::VoteRequest.new(term: election_term, candidate_info: node_info, last_log_index: @log.last_index, last_log_term: @log.last_entry_term))
          rescue => ex
            log_exception(ex) { "Failed to send vote to #{n.inspect}" }
            if @peers.key?(n)
              @peers[n].conn.close
              @peers.delete(n)
            end
            next
          end

          if electors.nil?
            # No need to process a vote if we're not running an election at the moment
            next
          end

          unless candidate?
            logger.debug(logloc) { "Received ballot from #{n.inspect}: #{reply.inspect} while in #{@mode} mode" }
            next
          end

          logger.debug(logloc) { "Processing vote #{reply.inspect} from #{n.inspect}" }
          if reply.nil?
            logger.debug(logloc) { "Received no reply to vote from #{n.inspect}" }
          elsif reply.term > @current_term
            logger.debug(logloc) { "Received higher term from #{n.inspect}; canceling election" }
            new_term(reply.term)
            become_follower
            electors = nil
          elsif reply.vote_granted
            logger.debug(logloc) { "Received the vote of #{n.inspect}" }
            electors << n

            logger.debug(logloc) { "Got #{electors.length} votes so far" }

            if @config.quorum_met?(electors)
              become_leader
              electors = nil
            end
          end
        end
      end
    end
  end
end

require_relative "./evinrude/backoff"
require_relative "./evinrude/config_change_queue_entry/add_node"
require_relative "./evinrude/config_change_queue_entry/remove_node"
require_relative "./evinrude/cluster_configuration"
require_relative "./evinrude/freedom_patches/range"
require_relative "./evinrude/log"
require_relative "./evinrude/log_entries"
require_relative "./evinrude/messages"
require_relative "./evinrude/metrics"
require_relative "./evinrude/network"
require_relative "./evinrude/node_info"
require_relative "./evinrude/peer"
require_relative "./evinrude/snapshot"
require_relative "./evinrude/state_machine/register"
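For orientation, here is a minimal usage sketch based on the public API defined in evinrude.rb above (Evinrude.new, #run, #command and #state). The shared key, join address, port and storage path are placeholders for illustration only, not values taken from the gem; data/README.md is the authoritative usage documentation.

  require "evinrude"

  node = Evinrude.new(
    shared_keys: ["some-shared-secret"],                   # placeholder secret
    join_hints: [{ address: "192.0.2.1", port: 2976 }],    # omit to bootstrap a new single-node cluster
    storage_dir: "/var/lib/evinrude",                      # optional; enables snapshots and log persistence
  )

  # #run starts the network and consensus machinery and blocks, so run it in its own thread
  Thread.new { node.run }

  node.command("some change to replicate")  # routed to the cluster leader and replicated
  node.state                                # quorum-checked read of the state machine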