evinrude 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.editorconfig +23 -0
- data/.gitignore +6 -0
- data/.yardopts +1 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/CONTRIBUTING.md +10 -0
- data/LICENCE +674 -0
- data/README.md +410 -0
- data/evinrude.gemspec +42 -0
- data/lib/evinrude.rb +1233 -0
- data/lib/evinrude/backoff.rb +19 -0
- data/lib/evinrude/cluster_configuration.rb +162 -0
- data/lib/evinrude/config_change_queue_entry.rb +19 -0
- data/lib/evinrude/config_change_queue_entry/add_node.rb +13 -0
- data/lib/evinrude/config_change_queue_entry/remove_node.rb +14 -0
- data/lib/evinrude/freedom_patches/range.rb +5 -0
- data/lib/evinrude/log.rb +102 -0
- data/lib/evinrude/log_entries.rb +3 -0
- data/lib/evinrude/log_entry.rb +13 -0
- data/lib/evinrude/log_entry/cluster_configuration.rb +15 -0
- data/lib/evinrude/log_entry/null.rb +6 -0
- data/lib/evinrude/log_entry/state_machine_command.rb +13 -0
- data/lib/evinrude/logging_helpers.rb +40 -0
- data/lib/evinrude/message.rb +19 -0
- data/lib/evinrude/message/append_entries_reply.rb +13 -0
- data/lib/evinrude/message/append_entries_request.rb +18 -0
- data/lib/evinrude/message/command_reply.rb +13 -0
- data/lib/evinrude/message/command_request.rb +18 -0
- data/lib/evinrude/message/install_snapshot_reply.rb +13 -0
- data/lib/evinrude/message/install_snapshot_request.rb +18 -0
- data/lib/evinrude/message/join_reply.rb +13 -0
- data/lib/evinrude/message/join_request.rb +18 -0
- data/lib/evinrude/message/node_removal_reply.rb +13 -0
- data/lib/evinrude/message/node_removal_request.rb +18 -0
- data/lib/evinrude/message/read_reply.rb +13 -0
- data/lib/evinrude/message/read_request.rb +18 -0
- data/lib/evinrude/message/vote_reply.rb +13 -0
- data/lib/evinrude/message/vote_request.rb +18 -0
- data/lib/evinrude/messages.rb +14 -0
- data/lib/evinrude/metrics.rb +50 -0
- data/lib/evinrude/network.rb +69 -0
- data/lib/evinrude/network/connection.rb +144 -0
- data/lib/evinrude/network/protocol.rb +69 -0
- data/lib/evinrude/node_info.rb +35 -0
- data/lib/evinrude/peer.rb +50 -0
- data/lib/evinrude/resolver.rb +96 -0
- data/lib/evinrude/snapshot.rb +9 -0
- data/lib/evinrude/state_machine.rb +15 -0
- data/lib/evinrude/state_machine/register.rb +25 -0
- data/smoke_tests/001_single_node_cluster.rb +20 -0
- data/smoke_tests/002_three_node_cluster.rb +43 -0
- data/smoke_tests/003_spill.rb +25 -0
- data/smoke_tests/004_stale_read.rb +67 -0
- data/smoke_tests/005_sleepy_master.rb +28 -0
- data/smoke_tests/006_join_via_follower.rb +26 -0
- data/smoke_tests/007_snapshot_madness.rb +97 -0
- data/smoke_tests/008_downsizing.rb +43 -0
- data/smoke_tests/009_disaster_recovery.rb +46 -0
- data/smoke_tests/999_final_smoke_test.rb +279 -0
- data/smoke_tests/run +22 -0
- data/smoke_tests/smoke_test_helper.rb +199 -0
- metadata +318 -0
data/lib/evinrude.rb
ADDED
@@ -0,0 +1,1233 @@
+require "async"
+require "async/dns"
+require "fiber"
+require "logger"
+require "pathname"
+require "securerandom"
+require "tempfile"
+
+require_relative "./evinrude/logging_helpers"
+
+class Evinrude
+  include Evinrude::LoggingHelpers
+
+  class Error < StandardError; end
+
+  class NoLeaderError < Error; end
+
+  class NodeExpiredError < Error; end
+
+  attr_reader :node_name
+
+  def initialize(join_hints: [], shared_keys:, state_machine: Evinrude::StateMachine::Register,
+                 logger: Logger.new("/dev/null"), node_name: nil, storage_dir: nil,
+                 heartbeat_interval: 0.25, heartbeat_timeout: 1..2,
+                 listen: {}, advertise: {}, metrics_registry: Prometheus::Client::Registry.new)
+    @join_hints, @keys, @logger, @heartbeat_interval, @heartbeat_timeout = join_hints, shared_keys, logger, heartbeat_interval, heartbeat_timeout
+
+    @metrics = Evinrude::Metrics.new(metrics_registry)
+
+    @listen, @advertise = listen, advertise
+    @listen[:address] ||= "::"
+    @listen[:port] ||= 0
+
+    if storage_dir
+      @storage_dir = Pathname.new(storage_dir)
+    end
+
+    snapshot = if @storage_dir
+      if !@storage_dir.exist?
+        @storage_dir.mkdir
+      end
+
+      if !@storage_dir.directory?
+        raise ArgumentError, "Storage directory #{@storage_dir} isn't *actually* a directory"
+      end
+
+      snapshot_file = @storage_dir.join("snapshot.yaml")
+
+      if snapshot_file.exist?
+        @metrics.snapshot_file_size.set(snapshot_file.stat.size)
+        YAML.load_file(snapshot_file)
+      end
+    end
+
+    @state_machine_class = state_machine
+
+    if snapshot
+      @node_name = snapshot.node_name
+      @state_machine = @state_machine_class.new(snapshot: snapshot.state)
+      @last_command_ids = snapshot.last_command_ids
+    else
+      @node_name = node_name || SecureRandom.uuid
+      @state_machine = @state_machine_class.new
+      @last_command_ids = {}
+    end
+
+    @sm_mutex = Mutex.new
+
+    if snapshot
+      @config = snapshot.cluster_config
+      @config_index = snapshot.cluster_config_index
+      @config.metrics = @metrics
+      @config.logger = logger
+    else
+      @config = Evinrude::ClusterConfiguration.new(logger: logger, metrics: @metrics)
+      @config_index = 0
+    end
+
+    @last_append = Time.at(0)
+    @current_term = 0
+    @voted_for = nil
+    @mode = :init
+
+    @metrics.term.set(0)
+
+    if snapshot
+      logger.debug(logloc) { "Configuring log from snapshot; snapshot_last_term=#{snapshot.last_term} snapshot_last_index=#{snapshot.last_index}" }
+      @log = Evinrude::Log.new(snapshot_last_term: snapshot.last_term, snapshot_last_index: snapshot.last_index, logger: logger)
+    else
+      @log = Evinrude::Log.new(logger: logger)
+    end
+
+    if snapshot
+      logger.debug(logloc) { "Setting commit_index to #{snapshot.last_index} from snapshot" }
+      @commit_index = snapshot.last_index
+    else
+      @commit_index = 0
+    end
+
+    @metrics.commit_index.set(@commit_index)
+
+    @peers = Hash.new do |h, k|
+      backoff = Evinrude::Backoff.new
+
+      peer_conn = @network.connect(address: k.address, port: k.port)
+
+      h[k] = Peer.new(metrics: @metrics, conn: peer_conn, node_info: k, next_index: @log.last_index + 1)
+    end
+
+    @config_change_queue = []
+    @config_change_request_in_progress = nil
+    @cc_sem = Async::Semaphore.new
+  end
+
+  def command(s)
+    @metrics.command_execution.measure do
+      Async(logger: logger) do |task|
+        command_id = SecureRandom.uuid
+
+        loop do
+          reply = rpc_to_leader(Message::CommandRequest.new(command: s, id: command_id, node_name: @node_name), task)
+
+          if reply.success
+            break true
+          end
+        end
+      end.result
+    end
+  end
+
+  def state
+    @metrics.read_state.measure do
+      Async(logger: logger) do |task|
+        loop do
+          state_object = nil
+          commit_index = nil
+
+          @sm_mutex.synchronize do
+            # Disturbingly, this appears to be one of the best available ways
+            # to make a guaranteed deep copy of an arbitrary object
+            state_object = YAML.load(@state_machine.current_state.to_yaml)
+            commit_index = @commit_index
+          end
+
+          logger.debug(logloc) { "(in #{@node_name}) Checking if #{state_object.inspect} at commit_index=#{commit_index} is the most up-to-date state" }
+
+          reply = rpc_to_leader(Evinrude::Message::ReadRequest.new(commit_index: commit_index), task)
+
+          if reply.success
+            break state_object
+          end
+        end
+      end.result
+    end
+  end
+
+  def run
+    logger.info(logloc) { "Evinrude node #{@node_name} starting up" }
+
+    @metrics.start_time.set(Time.now.to_f)
+
+    if @storage_dir
+      @metrics.log_loaded_from_disk.set(1)
+      load_log_from_disk
+    else
+      @metrics.log_loaded_from_disk.set(0)
+    end
+
+    Async do |task| #(logger: logger) do |task|
+      @async_task = task
+      @network = Network.new(keys: @keys, logger: logger, metrics: @metrics, listen: @listen, advertise: @advertise).start
+
+      logger.info(logloc) { "Node #{@node_name} listening on #{address}:#{port}" }
+
+      @metrics.info.set(1, labels: { node_name: @node_name, listen_address: @network.listen_address, listen_port: @network.listen_port, advertise_address: address, advertise_port: port })
+
+      task.async { process_rpc_requests }
+
+      join_or_create_cluster
+    end.return
+  rescue => ex
+    log_exception(ex) { "Fatal error" }
+    raise
+  end
+
+  def remove_node(node_info, unsafe: false)
+    if unsafe
+      logger.warn(logloc) { "Unsafely removing node #{node_info.inspect} from the local configuration" }
+
+      @config.remove_node(node_info, force: true)
+    else
+      @metrics.remove_node.measure do
+        Async(logger: logger) do |task|
+          loop do
+            logger.debug(logloc) { "(in #{@node_name}) Requesting removal of #{node_info.inspect}" }
+
+            reply = rpc_to_leader(Evinrude::Message::NodeRemovalRequest.new(node_info: node_info, unsafe: unsafe), task)
+
+            if reply.success
+              break true
+            end
+          end
+        end.result
+      end
+    end
+  end
+
+  def address
+    @network&.advertised_address
+  end
+
+  def port
+    @network&.advertised_port
+  end
+
+  def nodes
+    @config.nodes
+  end
+
+  def leader?
+    @mode == :leader
+  end
+
+  def follower?
+    @mode == :follower
+  end
+
+  def candidate?
+    @mode == :candidate
+  end
+
+  def init?
+    @mode == :init
+  end
+
+  def expired?
+    !!(!leader? && @heartbeat_timeout_time && @heartbeat_timeout_time < Time.now)
+  end
+
+  def node_info
+    if @network.nil?
+      raise RuntimeError, "Cannot determine node info until the network is up"
+    end
+
+    @node_info ||= Evinrude::NodeInfo.new(address: address, port: port, name: @node_name)
+  end
+
+  private
+
+  def load_log_from_disk
+    log_file = @storage_dir.join("log.yaml")
+
+    if log_file.exist?
+      logger.debug(logloc) { "Loading log entries from #{log_file}" }
+      @metrics.log_file_size.set(log_file.stat.size)
+
+      # Temporarily unsetting @storage_dir prevents the calls we make from
+      # writing all the log entries straight back to disk again
+      tmp_storage_dir, @storage_dir = @storage_dir, nil
+
+      begin
+        log_file.open do |fd|
+          YAML.load_stream(fd) do |entry|
+            unless entry.is_a?(Hash)
+              logger.fatal(logloc) { "SHENANIGAN ALERT: persisted log entry #{entry} is not a hash!" }
+              exit 42
+            end
+
+            m, args = entry.to_a.first
+
+            unless %i{process_log_entry commit_entries_to}.include?(m)
+              logger.fatal(logloc) { "SHENANIGAN ALERT: log includes unexpected operation #{m.inspect}(*#{args.inspect})!!!" }
+              exit 42
+            end
+
+            logger.debug(logloc) { "Running #{m}(#{args.inspect}) from disk log" }
+
+            self.__send__(m, *args)
+          end
+        end
+      ensure
+        @storage_dir = tmp_storage_dir
+      end
+
+      logger.debug(logloc) { "Completed log read" }
+    end
+  end
+
+  def rpc_to_leader(msg, task)
+    backoff = Evinrude::Backoff.new
+    reply = nil
+    command_id = SecureRandom.uuid
+
+    logger.debug(logloc) { "(in #{@node_name}) Sending message #{msg.inspect} to cluster leader" }
+
+    loop do
+      until leader? || follower? || expired?
+        logger.debug(logloc) { "(in #{@node_name}) Waiting until we're in the cluster before sending RPC to leader" }
+        task.sleep 0.5
+      end
+
+      begin
+        remote = reply&.leader_info || @leader_info
+
+        if remote.nil?
+          raise NoLeaderError, "No leader could be discerned for the cluster at present"
+        end
+
+        conn = @network.connect(address: remote.address, port: remote.port)
+
+        reply = task.with_timeout(5) do |t|
+          conn.rpc(msg)
+        end
+
+        if reply.nil?
+          logger.debug(logloc) { "(in #{@node_name}) RPC to leader #{remote.inspect} timed out" }
+        elsif reply.leader_info
+          logger.debug(logloc) { "(in #{@node_name}) Redirected to #{reply.leader_info.inspect}" }
+          # No need to wait for the backoff time here
+          next
+        else
+          logger.debug(logloc) { "(in #{@node_name}) RPC to leader returned #{reply.inspect}" }
+          return reply
+        end
+
+        task.sleep backoff.wait_time
+      rescue Evinrude::Error, Async::TimeoutError, Async::Wrapper::Cancelled, SystemCallError, IOError => ex
+        @metrics.rpc_exception.increment(labels: { target: "#{remote.address}:#{remote.port}", node_name: remote.name, class: ex.class.to_s })
+        log_exception(ex) { "(in #{@node_name}) RPC to leader raised exception" }
+        conn&.close
+        reply = nil
+
+        if expired?
+          raise NodeExpiredError, "This node instance is not active in the cluster (mode=#{@mode})"
+        end
+
+        task.sleep backoff.wait_time
+      end
+    end
+  end
+
+  def become_leader
+    reset_peers
+
+    logger.info(logloc) { "Assuming leadership of the cluster" }
+
+    @mode = :leader
+
+    @leader_info = node_info
+    @commands_in_progress = {}
+
+    @cc_sem.acquire do
+      @config_change_queue = []
+      @config_change_request_in_progress = nil
+    end
+
+    @async_task.async do |subtask|
+      while leader?
+        subtask.sleep @heartbeat_interval
+
+        if leader?
+          logger.debug(logloc) { "Triggering periodic AE heartbeat" }
+          issue_append_entries_to_cluster
+        end
+      end
+    end
+
+    propose_log_entry(
+      LogEntry::Null.new(term: @current_term) do
+        logger.debug(logloc) { "Null log entry to mark start-of-term replicated" }
+      end
+    )
+
+    @metrics.state.set(3)
+  end
+
+  def become_follower
+    reset_peers
+
+    logger.info(logloc) { "Becoming follower" }
+
+    @mode = :follower
+
+    @heartbeat_timeout_time = Time.now + @heartbeat_timeout.rand
+
+    @async_task.async do |subtask|
+      while follower?
+        logger.debug(logloc) { "#{@heartbeat_timeout_time - Time.now}s until heartbeat timer expires" }
+
+        subtask.sleep [0.01, @heartbeat_timeout_time - Time.now].max
+
+        if follower? && @heartbeat_timeout_time < Time.now
+          logger.info(logloc) { "Heartbeat timeout expired; triggering election" }
+          trigger_election
+        end
+      end
+    end
+
+    @metrics.state.set(2)
+  end
+
+  def become_candidate
+    reset_peers
+
+    logger.info(logloc) { "Becoming a candidate" }
+
+    @mode = :candidate
+
+    @async_task.async do |subtask|
+      election_timeout = @heartbeat_timeout.rand
+      logger.debug(logloc) { "Waiting #{election_timeout}s for election to complete" }
+      subtask.sleep election_timeout
+
+      if candidate?
+        logger.info(logloc) { "Election timeout expired without a leader being elected; triggering a new election" }
+        trigger_election
+      end
+    end
+
+    @metrics.state.set(1)
+  end
+
+  def reset_peers
+    @peers.values.each { |f| f.conn.close }
+    @peers.clear
+    @metrics.clear_peer_metrics
+  end
+
+  def new_term(n)
+    logger.debug(logloc) { "Setting up for term #{n}" }
+    @current_term = n
+    @voted_for = nil
+
+    @metrics.term.set(@current_term)
+  end
+
+  def persist_to_disk(e)
+    if @storage_dir
+      file = @storage_dir.join("log.yaml")
+
+      if file.exist? && file.stat.size > 1024 * 1024
+        logger.debug(logloc) { "Log is getting a bit big; time for a new snapshot, methinks" }
+        take_snapshot
+      end
+
+      logger.debug(logloc) { "Persisting #{e.inspect} to #{file}" }
+      file.open("a") do |fd|
+        logger.debug(logloc) { "Doin' the write thing" }
+        fd.puts e.to_yaml
+        fd.fdatasync
+      end
+
+      @metrics.log_entries_persisted.increment
+      @metrics.log_file_size.set(file.stat.size)
+    end
+  end
+
+  def propose_log_entry(entry)
+    unless leader?
+      logger.error(logloc) { with_backtrace("propose_log_entry called while not leader!") }
+      return
+    end
+
+    @log.append(entry)
+    persist_to_disk(process_log_entry: [entry, @log.last_index])
+
+    logger.debug(logloc) { "Proposing #{entry.inspect} as ##{@log.last_index}" }
+
+    if @config.nodes.length == 1
+      # Flyin' solo! Means we can skip all that inconvenient AppendEntries stuff,
+      # but we still need to do what needs to be done after the entry has been
+      # "replicated everywhere" (ie "here")
+      check_for_new_replication_majority
+    else
+      issue_append_entries_to_cluster
+    end
+  end
+
+  def issue_append_entries_to_cluster(&blk)
+    nodes.each do |n|
+      next if n == node_info
+
+      @async_task.async do
+        begin
+          issue_append_entries(@peers[n], &blk)
+        rescue Evinrude::Log::SnapshottedEntryError
+          issue_snapshot(@peers[n])
+        rescue => ex
+          log_exception(ex) { "Failed to issue AppendEntries to #{n.inspect}" }
+        end
+      end
+    end
+  end
+
+  def issue_append_entries(follower)
+    logger.debug(logloc) { "Issuing AppendEntries to #{follower.node_info.inspect}" }
+    entries = @log.entries_from(follower.next_index)
+    prev_index = [follower.next_index - 1, @log.last_index].min
+    prev_entry = @log[prev_index]
+
+    logger.debug(logloc) { "Previous log entry (##{prev_index}) is #{prev_entry.inspect}" }
+
+    reply = follower.rpc(
+      Message::AppendEntriesRequest.new(
+        term: @current_term,
+        leader_info: node_info,
+        leader_commit: @commit_index,
+        prev_log_index: prev_index,
+        prev_log_term: prev_entry.term,
+        entries: entries,
+      )
+    )
+
+    if leader?
+      if reply.nil?
+        logger.debug(logloc) { "AppendEntriesRequest to #{follower.node_info.inspect} was not answered. C'est la vie." }
+        follower.conn.close
+        @peers.delete(follower.node_info)
+      elsif block_given?
+        yield reply, follower.node_info
+      elsif reply.term > @current_term
+        logger.debug(logloc) { "Received term from #{follower.node_info.inspect} greater than our own. Demotion required!" }
+        new_term(reply.term)
+        become_follower
+      elsif reply.success
+        logger.debug(logloc) { "Successful AppendEntriesReply received from #{follower.node_info.inspect}" }
+        follower.successful_append(prev_index + entries.length)
+        check_for_new_replication_majority
+      else
+        logger.debug(logloc) { "AppendEntries to #{follower.node_info.inspect} failed; retrying after next_index decrement" }
+        if reply.last_index && reply.last_index < follower.next_index - 1
+          follower.failed_append(reply.last_index)
+        else
+          follower.failed_append
+        end
+        if follower.next_index <= @log.snapshot_last_index
+          issue_snapshot(follower)
+        else
+          issue_append_entries(follower)
+        end
+      end
+    else
+      logger.debug(logloc) { "Ignoring AppendEntriesReply received when we're not leader" }
+    end
+  end
+
+  def check_for_new_replication_majority
+    new_commits = false
+
+    ((@commit_index + 1)..@log.last_index).each do |idx|
+      present_nodes = @peers.values.select { |f| f.match_index >= idx }.map(&:node_info) + [node_info]
+
+      logger.debug(logloc) { "Checking for replication majority on ##{idx} (present: #{present_nodes.inspect})" }
+      if @config.quorum_met?(present_nodes)
+        logger.debug(logloc) { "Log index #{idx} has met majority" }
+        @metrics.replication_majority.set(idx)
+
+        entry = @log[idx]
+
+        case entry
+        when LogEntry::ClusterConfiguration
+          logger.debug(logloc) { "Newly majoritied (majoritised?) log entry is a ClusterConfig; @config_index=#{@config_index}" }
+
+          # Dealing with potentially out-of-date cluster configurations is
+          # absofuckinglutely mind-bending. As near as I can tell, however,
+          # since the leader by definition has all of the log entries, it
+          # also has the latest and greatest config live and in concert,
+          # so we can make some assumptions about future log entries on
+          # that basis.
+          if idx == @config_index
+            logger.debug(logloc) { "Replication of current config #{@config.inspect} complete" }
+            if @config.transitioning?
+              logger.debug(logloc) { "Proposing post-joint config" }
+              @config.joint_configuration_replicated
+              propose_log_entry(LogEntry::ClusterConfiguration.new(term: @current_term, config: @config))
+              @config_index = @log.last_index
+            else
+              # Transition complete; time to let the requestor know they're good
+              # to go
+              logger.debug(logloc) { "Post-joint config replicated; config change saga completed" }
+              @config_index = @log.last_index
+
+              @cc_sem.acquire do
+                if @config_change_request_in_progress
+                  logger.debug(logloc) { "Letting #{@config_change_request_in_progress.node_info.inspect} know their config change request was successful" }
+
+                  # This is technically only necessary for certain config changes
+                  # (like when a node changes address/port but keeps the same
+                  # name) but there's no harm in doing it all the time.
+                  if @peers.key?(@config_change_request_in_progress.node_info)
+                    @peers[@config_change_request_in_progress.node_info].conn.close
+                    @peers.delete(@config_change_request_in_progress.node_info)
+                  end
+
+                  @config_change_request_in_progress.send_successful_reply
+                  @config_change_request_in_progress = nil
+                else
+                  logger.debug(logloc) { "Nobody to send a successful config change reply to; oh well" }
+                end
+              end
+
+              process_config_change_queue
+            end
+          else
+            logger.debug(logloc) { "Quorum met on out-of-date config #{entry.config.inspect}; ignoring" }
+          end
+        when LogEntry::StateMachineCommand
+          @sm_mutex.synchronize do
+            logger.debug(logloc) { "Applying state machine command #{entry.command} (id #{entry.id})" }
+            @state_machine.process_command(entry.command)
+            if conn = @commands_in_progress.delete(entry.id)
+              logger.debug(logloc) { "Letting the client know their command is cooked" }
+              conn.send_reply(Message::CommandReply.new(success: true))
+            else
+              logger.debug(logloc) { "No client around to notify of command application; they'll figure it out eventually" }
+            end
+          end
+        end
+
+        @commit_index = idx
+        @metrics.commit_index.set(@commit_index)
+        persist_to_disk(commit_entries_to: [idx])
+        new_commits = true
+      else
+        logger.debug(logloc) { "Replication majority not yet met on ##{idx}. Better luck next time." }
+      end
+    end
+
+    if new_commits
+      # We want to get the good word out to everyone as soon as possible that
+      # there's new log entries that can be committed.
+      issue_append_entries_to_cluster
+    end
+  end
+
+  def take_snapshot
+    return unless @storage_dir
+
+    snapshot = @sm_mutex.synchronize do
+      Evinrude::Snapshot.new(node_name: @node_name, state: @state_machine.snapshot, cluster_config: @config, cluster_config_index: @config_index, last_term: @log.last_entry_term, last_index: @log.last_index, last_command_ids: @last_command_ids)
+    end
+
+    Tempfile.open("snapshot", @storage_dir) do |f|
+      logger.debug(logloc) { "Writing snapshot data to #{f.path}" }
+      f.write(snapshot.to_yaml)
+      f.fdatasync
+      f.close
+      File.rename(f.path, @storage_dir.join("snapshot.yaml"))
+      File.open(@storage_dir) { |d| d.fsync }
+    end
+
+    @metrics.snapshot_file_size.set(@storage_dir.join("snapshot.yaml").stat.size)
+
+    begin
+      logger.debug(logloc) { "Deleting now-stale log.yaml" }
+      File.unlink(File.join(@storage_dir, "log.yaml"))
+    rescue Errno::ENOENT
+      # Yes, this is in fact exactly what we're trying to achieve
+    end
+
+    @metrics.log_file_size.set(0)
+  end
+
+  def issue_snapshot(follower)
+    msg = @sm_mutex.synchronize do
+      Message::InstallSnapshotRequest.new(term: @current_term, leader_info: @leader_info, last_included_index: @commit_index, last_included_term: @log[@commit_index].term, data: @state_machine.snapshot)
+    end
+
+    reply = follower.rpc(msg)
+
+    if reply.term > @current_term
+      new_term(reply.term)
+    else
+      follower.successful_append(@commit_index)
+    end
+  end
+
+  def async_resolver
+    @async_resolver ||= Evinrude::Resolver.new
+  end
+
+  def expand_join_hints
+    return [] if @join_hints.nil?
+
+    # Where's Enumerable.amap when you need it?
+    sem = Async::Semaphore.new
+
+    [].tap do |r|
+      @join_hints.each do |jh|
+        Async(logger: logger) do |t|
+          if jh.is_a?(String)
+            async_resolver.getresources(jh).each do |srv|
+              t.async do
+                async_resolver.getaddresses(srv.target.to_s).each do |addr|
+                  sem.acquire { r << { address: addr, port: srv.port } }
+                end
+              end
+            end
+          elsif jh.is_a?(Hash) || jh.is_a?(NodeInfo)
+            begin
+              IPAddr.new(jh[:address])
+              # It's an IP address already; excellent
+              sem.acquire { r << jh }
+            rescue ArgumentError
+              # It's a hostname(ish)
+              async_resolver.getaddresses(jh[:address]).each do |addr|
+                sem.acquire { r << { address: addr, port: srv.port } }
+              end
+            end
+          else
+            raise ArgumentError, "Invalid join hint entry: #{jh.inspect}"
+          end
+        end.result
+      end
+    end
+  end
+
+  def join_targets
+    expand_join_hints + @config.nodes.reject { |n| n.name == node_info.name }
+  end
+
+  def join_or_create_cluster
+    if @join_hints.nil? && join_targets.empty?
+      logger.info(logloc) { "No hints of an existing cluster found; configuring for standalone mode" }
+      new_term(1)
+
+      @config.add_node(node_info)
+      @config.joint_configuration_replicated
+
+      become_leader
+
+      propose_log_entry(LogEntry::ClusterConfiguration.new(term: @current_term, config: @config))
+
+      take_snapshot
+    else
+      logger.info(logloc) { "Joining existing cluster" }
+      join_cluster_via(join_targets)
+
+      # Taking a snapshot immediately after joining allows us to capture an
+      # up-to-date config, as well as our node name, in case of accidents.
+      take_snapshot
+    end
+  end
+
+  def join_cluster_via(targets)
+    connected = false
+
+    logger.debug(logloc) { "Attempting to join cluster via targets #{targets.inspect}" }
+
+    # I call this algorithm "happy joinballs".
+    #
+    # I will not be taking questions at this time.
+    conn_tasks = targets.map do |t|
+      @async_task.async do |subtask|
+        logger.debug(logloc) { "Initiating happy joinballs connection to #{t[:address]}:#{t[:port]}" }
+
+        begin
+          conn = @network.connect(address: t[:address], port: t[:port])
+        rescue StandardError => ex
+          logger.warn(logloc) { "Failed to connect to #{t[:address]}:#{t[:port]}: #{ex.class} (#{ex.message})" }
+          if targets.length == 1
+            logger.warn(logloc) { "Cluster leader not responsive; restarting join attempt" }
+            join_or_create_cluster
+          end
+
+          next
+        end
+
+        # If we get here, we have won the happy joinballs race
+        conn_tasks.each do |ct|
+          next if ct == Async::Task.current
+
+          ct.stop
+        end
+
+        logger.debug(logloc) { "Sending a join request to #{conn.peer_info}" }
+        reply = subtask.with_timeout(5) do |t|
+          conn.rpc(Message::JoinRequest.new(node_info: node_info))
+        rescue Async::TimeoutError
+          nil
+        end
+
+        if reply&.success
+          logger.info(logloc) { "Joined cluster; #{reply.inspect}" }
+          become_follower
+        elsif reply&.leader_info
+          logger.debug(logloc) { "Redirected to leader #{reply.leader_info.inspect}" }
+          join_cluster_via([reply.leader_info])
+        else
+          logger.error(logloc) { "Cluster join via #{t.inspect} failed: #{reply.nil? ? "RPC timeout" : reply.inspect}" }
+          # Obviously that target is busticated, so we'll retry without it.
+          # The problem is that the busticated target might have been a
+          # leader we were erroneously redirected to; in that case, the
+          # targets list will have only one node, and we'll need to go
+          # back to joinballing everyone. Hopefully by now the cluster
+          # will have agreed on a *live* leader for us to join via.
+          if targets.length == 1
+            join_cluster_via(join_targets - [t])
+          else
+            join_cluster_via(targets - [t])
+          end
+        end
+      end
+    end
+
+    conn_tasks.each(&:wait)
+  end
+
+  def process_rpc_requests
+    logger.debug(logloc) { "Commencing to process RPC requests" }
+    @network.each_message do |msg, conn|
+      @metrics.messages_received.increment(labels: { type: msg.class.to_s.split("::").last })
+
+      logger.debug(logloc) { "Received #{msg} from #{conn.peer_info}" }
+      reply = case msg
+      when Message::AppendEntriesRequest
+        process_append_entries_request(msg, conn)
+      when Message::CommandRequest
+        process_command_request(msg, conn)
+      when Message::JoinRequest
+        process_join_request(msg, conn)
+      when Message::NodeRemovalRequest
+        process_node_removal_request(msg, conn)
+      when Message::ReadRequest
+        process_read_request(msg, conn)
+      when Message::VoteRequest
+        process_vote_request(msg, conn)
+      when Message::InstallSnapshotRequest
+        process_install_snapshot_request(msg, conn)
+      else
+        logger.warn(logloc) { "Unexpected #{msg.class.to_s.split("::").last} received from #{conn.peer_info}" }
+        nil
+      end
+
+      if reply
+        logger.debug(logloc) { "Sending reply #{reply.inspect} to #{conn.peer_info}" }
+        conn.send_reply(reply)
+      else
+        logger.warn(logloc) { "No immediate reply to #{msg.inspect} from #{conn.peer_info}" }
+      end
+    end
+  end
+
+  def process_join_request(msg, conn)
+    logger.debug(logloc) { "Join request #{msg.inspect} received from #{conn.peer_info}" }
+
+    if follower?
+      logger.debug(logloc) { "Not leader; redirecting" }
+      Message::JoinReply.new(success: false, leader_info: @leader_info)
+    elsif leader?
+      logger.debug(logloc) { "Queueing join request" }
+      @config_change_queue << ConfigChangeQueueEntry::AddNode.new(msg, conn)
+
+      if @config_change_queue.length == 1 && @config_change_request_in_progress.nil?
+        logger.debug(logloc) { "Triggering new config change queue cascade" }
+        process_config_change_queue
+      end
+
+      # No immediate reply; will be sent once the join is completed
+      nil
+    else
+      logger.debug(logloc) { "Ignoring join request from #{msg.node_info} because not leader or follower" }
+      nil
+    end
+  end
+
+  def process_node_removal_request(msg, conn)
+    logger.debug(logloc) { "Node removal request #{msg.inspect} received from #{conn.peer_info}" }
+
+    if follower?
+      logger.debug(logloc) { "Not leader; redirecting" }
+      Message::NodeRemovalReply.new(success: false, leader_info: @leader_info)
+    elsif leader?
+      logger.debug(logloc) { "Queueing node removal request" }
+      @config_change_queue << ConfigChangeQueueEntry::RemoveNode.new(msg, conn)
+
+      if @config_change_queue.length == 1
+        logger.debug(logloc) { "Triggering new config change queue cascade" }
+        process_config_change_queue
+      end
+
+      # No immediate reply; will be sent once the join is completed
+      nil
+    else
+      logger.debug(logloc) { "Ignoring node removal request from #{msg.node_info} because not leader or follower" }
+      nil
+    end
+  end
+
+  def process_config_change_queue
+    if @config_change_queue.empty?
+      logger.debug(logloc) { "No more entries in the config change queue" }
+      return
+    end
+
+    if @config_change_request_in_progress
+      logger.error(logloc) { "Change queue processing requested while change request in progress!" }
+      return
+    end
+
+    @config_change_request_in_progress = @config_change_queue.shift
+    logger.debug(logloc) { "Processing config change queue entry #{@config_change_request_in_progress.inspect}" }
+
+    unless leader?
+      @cc_sem.acquire do
+        @config_change_request_in_progress.send_redirect_reply(@leader_info)
+        @config_change_request_in_progress = nil
+      end
+      process_config_change_queue
+      return
+    end
+
+    case @config_change_request_in_progress
+    when ConfigChangeQueueEntry::AddNode
+      if @config.nodes.include?(@config_change_request_in_progress.node_info)
+        # "Dude, you're *already* part of the cluster! Duuuuuuuuuuuuuuude!"
+        @cc_sem.acquire do
+          @config_change_request_in_progress.send_successful_reply
+          @config_change_request_in_progress = nil
+        end
+        process_config_change_queue
+      else
+        logger.debug(logloc) { "Transitioning configuration to add #{@config_change_request_in_progress.node_info.inspect}" }
+
+        @config.add_node(@config_change_request_in_progress.node_info)
+        propose_log_entry(LogEntry::ClusterConfiguration.new(term: @current_term, config: @config))
+        @config_index = @log.last_index
+      end
+    when ConfigChangeQueueEntry::RemoveNode
+      if !@config.nodes.include?(@config_change_request_in_progress.node_info)
+        @cc_sem.acquire do
+          @config_change_request_in_progress.send_successful_reply
+          @config_change_request_in_progress = nil
+        end
+        process_config_change_queue
+      else
+        logger.debug(logloc) { "Transitioning configuration to remove #{@config_change_request_in_progress.node_info.inspect}" }
+
+        @config.remove_node(@config_change_request_in_progress.node_info)
+        propose_log_entry(LogEntry::ClusterConfiguration.new(term: @current_term, config: @config))
+        @config_index = @log.last_index
+      end
+    else
+      logger.error(logloc) { "Unsupported change request type #{@config_change_request_in_progress.class}; this really shouldn't ever happen, bug report welcome" }
+      logger.debug(logloc) { "Unsupported change request was #{@config_change_request_in_progress.inspect}" }
+      @config_change_request_in_progress = nil
+      process_config_change_queue
+    end
+  end
+
+  def process_append_entries_request(msg, conn)
+    logger.debug(logloc) { "Processing append_entries request #{msg.inspect} from #{conn.peer_info}" }
+
+    if msg.term < @current_term
+      logger.debug(logloc) { "AppendEntries request term less than our current term #{@current_term}" }
+      Message::AppendEntriesReply.new(success: false, term: @current_term)
+    else
+      @last_append = Time.now
+
+      if !@log.has_entry?(msg.prev_log_index)
+        logger.debug(logloc) { "We don't have log entry prev_log_index=#{msg.prev_log_index}; asking for more entries" }
+        Message::AppendEntriesReply.new(success: false, term: @current_term, last_index: @log.last_index)
+      elsif @log.snapshotted_entry?(msg.prev_log_index + 1)
+        logger.error(logloc) { "Got AppendEntriesRequest with a prev_log_index=#{msg.prev_log_index} that's buried in the snapshot" }
+        # Closing the connection to the leader will cause it to recycle the
+        # follower state, which will reset it to start sending us AppendEntries
+        # from the most recent entry.
+        conn.close
+      elsif msg.prev_log_term != @log.entry_term(msg.prev_log_index)
+        logger.debug(logloc) { "AppendEntries log fork; msg.prev_log_index=#{msg.prev_log_index} msg.prev_log_term=#{msg.prev_log_term} @log.entry_term(msg.prev_log_index=#{@log.entry_term(msg.prev_log_index)} @log.last_index=#{@log.last_index}" }
+        @log.truncate_to(msg.prev_log_index - 1)
+        Message::AppendEntriesReply.new(success: false, term: @current_term)
+      else
+        @leader_info = msg.leader_info
+
+        if msg.term > @current_term || (candidate? && msg.term == @current_term)
+          logger.debug(logloc) { "Received term-updating AppendEntries; msg.term=#{msg.term} @current_term=#{@current_term} node_info.mode=#{node_info.instance_variable_get(:@mode).inspect}" }
+          new_term(msg.term)
+          become_follower
+        end
+
+        @heartbeat_timeout_time = Time.now + @heartbeat_timeout.rand
+
+        msg.entries.each.with_index do |new_entry, i|
+          idx = msg.prev_log_index + i + 1 # Dratted 1-index addressing
+          process_log_entry(new_entry, idx)
+        end
+
+        new_commit_point = [@log.last_index, msg.leader_commit].min
+
+        if new_commit_point > @commit_index
+          commit_entries_to(new_commit_point)
+        end
+
+        Message::AppendEntriesReply.new(success: true, term: @current_term)
+      end
+    end
+  end
+
+  def process_log_entry(entry, log_index)
+    logger.debug(logloc) { "Processing #{entry.inspect} at log index #{log_index}" }
+
+    existing_entry = @log[log_index]
+
+    if existing_entry.nil?
+      @log.append(entry)
+
+      persist_to_disk(process_log_entry: [entry, log_index])
+
+      # Configuration changes take place immediately, not after consensus;
+      # raft.pdf p11, "a server always uses the latest configuration in its
+      # log, regardless of whether the entry is committed".
+      if LogEntry::ClusterConfiguration === entry
+        logger.debug(logloc) { "Using new configuration from log entry ##{log_index}" }
+        @config = entry.config
+        @config_index = log_index
+      end
+    elsif existing_entry.term != entry.term
+      logger.debug(logloc) { "Discovered fork at #{log_index} (existing_entry=#{existing_entry.inspect} new_entry=#{entry.inspect}); discarding our remaining log entries" }
+      @log.truncate_to(log_index - 1)
+    else
+      logger.debug(logloc) { "Already got log entry ##{log_index}; skipping" }
+    end
+
+  end
+
+  def commit_entries_to(idx)
+    ((@commit_index + 1)..idx).each do |i|
+      @sm_mutex.synchronize do
+        logger.debug(logloc) { "Committing log entry ##{i}" }
+
+        if LogEntry::StateMachineCommand === @log[i]
+          logger.debug(logloc) { "Applying state machine command #{@log[i].command}" }
+          @state_machine.process_command(@log[i].command)
+          @last_command_ids[@log[i].node_name] = @log[i].id
+        else
+          logger.debug(logloc) { "Entry ##{i} is a #{@log[i].class}; no commit action necessary" }
+        end
+
+        @commit_index = i
+        @metrics.commit_index.set(i)
+      end
+    end
+
+    persist_to_disk(commit_entries_to: [idx])
+  end
+
+  def process_command_request(msg, conn)
+    logger.debug(logloc) { "Command request #{msg.inspect} received from #{conn.peer_info}" }
+
+    if follower?
+      Message::CommandReply.new(success: false, leader_info: @leader_info)
+    elsif leader?
+      if @last_command_ids[msg.node_name] == msg.id
+        Message::CommandReply.new(success: true)
+      else
+        logger.debug(logloc) { "Noting that #{msg.id} is a command in progress" }
+        @commands_in_progress[msg.id] = conn
+        propose_log_entry(LogEntry::StateMachineCommand.new(term: @current_term, command: msg.command, id: msg.id, node_name: msg.node_name))
+
+        # Deferred reply to log entry commit will occur after replication is complete
+        nil
+      end
+    else
+      Message::CommandReply.new(success: false)
+    end
+  end
+
+  def process_vote_request(msg, conn)
+    if Time.now - @last_append < @heartbeat_timeout.first
+      # Avoid rogue servers disrupting the cluster by calling votes
+      # just because they can.
+      logger.debug(logloc) { "Ignoring vote request from scurvy rogue #{msg.candidate_info}" }
+      return nil
+    end
+
+    if msg.term > @current_term
+      new_term(msg.term)
+    end
+
+    if msg.term == @current_term &&
+       (@voted_for.nil? || @voted_for == msg.candidate_info) &&
+       ((msg.last_log_index >= @log.last_index && msg.last_log_term == @log.last_entry_term) || msg.last_log_term > @log.last_entry_term)
+      @voted_for = msg.candidate_info
+      become_follower
+      logger.debug(logloc) { "Voted for #{msg.candidate_info.inspect} for term #{msg.term} leader" }
+      Message::VoteReply.new(term: @current_term, vote_granted: true)
+    else
+      logger.debug(logloc) { "Rejected #{msg.candidate_info.inspect} for term #{msg.term} leader; @current_term=#{@current_term} @voted_for=#{@voted_for.inspect} msg.last_log_index=#{msg.last_log_index} @log.last_index=#{@log.last_index} msg.last_log_term=#{msg.last_log_term} @log.last_entry_term=#{@log.last_entry_term}" }
+      Message::VoteReply.new(term: @current_term, vote_granted: false)
+    end
+  end
+
+  def process_read_request(msg, conn)
+    if !leader?
+      Message::ReadReply.new(success: false, leader_info: @leader_info)
+    elsif @commit_index > msg.commit_index
+      # We already *know* this is never going to succeed, may as well save ourselves
+      # the hassle
+      logger.debug(logloc) { "ReadRequest is for an out-of-date commit_index; nopeing out" }
+      Message::ReadReply.new(success: false)
+    elsif @config.nodes.length == 1
+      # Flyin' solo!
+      if @commit_index == msg.commit_index
+        Message::ReadReply.new(success: true)
+      else
+        Message::ReadReply.new(success: false)
+      end
+    else
+      responders = [node_info]
+
+      issue_append_entries_to_cluster do |reply, node_info|
+        # responders will be set to nil when quorum has been met, so all remaining
+        # AE replies can be quietly ignored
+        next if responders.nil?
+
+        if reply.success
+          responders << node_info
+          logger.debug(logloc) { "Checking if #{responders.inspect} meets read request quorum" }
+          if @config.quorum_met?(responders)
+            logger.debug(logloc) { "Have met read request quorum; reply sent" }
+            if @commit_index == msg.commit_index
+              conn.send_reply(Message::ReadReply.new(success: true))
+            else
+              conn.send_reply(Message::ReadReply.new(success: false))
+            end
+            responders = nil
+          else
+            logger.debug(logloc) { "Not yet met read request quorum" }
+          end
+        end
+      end
+
+      # Deferred reply
+      nil
+    end
+  end
+
+  def process_install_snapshot_request(msg, conn)
+    if msg.term < @current_term
+      conn.send_reply(Message::InstallSnapshotReply.new(term: @current_term))
+      return
+    end
+
+    @sm_mutex.synchronize do
+      @state_machine = @state_machine_class.new(snapshot: msg.data)
+      @log.new_snapshot(msg.last_included_term, msg.last_included_index)
+      @commit_index = msg.last_included_index
+    end
+
+    conn.send_reply(Message::InstallSnapshotReply.new(term: @current_term))
+  end
+
+  def trigger_election
+    new_term(@current_term + 1)
+    logger.debug(logloc) { "Initiating election for term #{@current_term}" }
+    become_candidate
+
+    if @config.nodes.length == 1
+      # Flyin' solo!
+      logger.debug(logloc) { "No need for an election, as we're in single-node mode" }
+      become_leader
+    else
+      election_term = @current_term
+      electors = [node_info]
+      @voted_for = node_info
+
+      logger.debug(logloc) { "Canvassing the electorate" }
+      @config.nodes.each do |n|
+        next if n == node_info
+
+        @async_task.async do
+          logger.debug(logloc) { "Sending vote request to #{n.inspect}" }
+          begin
+            reply = @peers[n].rpc(Message::VoteRequest.new(term: election_term, candidate_info: node_info, last_log_index: @log.last_index, last_log_term: @log.last_entry_term))
+          rescue => ex
+            log_exception(ex) { "Failed to send vote to #{n.inspect}" }
+            if @peers.key?(n)
+              @peers[n].conn.close
+              @peers.delete(n)
+            end
+            next
+          end
+
+          if electors.nil?
+            # No need to process a vote if we're not running an election at the moment
+            next
+          end
+
+          unless candidate?
+            logger.debug(logloc) { "Received ballot from #{n.inspect}: #{reply.inspect} while in #{@mode} mode" }
+            next
+          end
+
+          logger.debug(logloc) { "Processing vote #{reply.inspect} from #{n.inspect}" }
+          if reply.nil?
+            logger.debug(logloc) { "Received no reply to vote from #{n.inspect}" }
+          elsif reply.term > @current_term
+            logger.debug(logloc) { "Received higher term from #{n.inspect}; canceling election" }
+            new_term(reply.term)
+            become_follower
+            electors = nil
+          elsif reply.vote_granted
+            logger.debug(logloc) { "Received the vote of #{n.inspect}" }
+            electors << n
+
+            logger.debug(logloc) { "Got #{electors.length} votes so far" }
+
+            if @config.quorum_met?(electors)
+              become_leader
+              electors = nil
+            end
+          end
+        end
+      end
+    end
+  end
+end
+
+require_relative "./evinrude/backoff"
+require_relative "./evinrude/config_change_queue_entry/add_node"
+require_relative "./evinrude/config_change_queue_entry/remove_node"
+require_relative "./evinrude/cluster_configuration"
+require_relative "./evinrude/freedom_patches/range"
+require_relative "./evinrude/log"
+require_relative "./evinrude/log_entries"
+require_relative "./evinrude/messages"
+require_relative "./evinrude/metrics"
+require_relative "./evinrude/network"
+require_relative "./evinrude/node_info"
+require_relative "./evinrude/peer"
+require_relative "./evinrude/snapshot"
+require_relative "./evinrude/state_machine/register"