evinrude 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. checksums.yaml +7 -0
  2. data/.editorconfig +23 -0
  3. data/.gitignore +6 -0
  4. data/.yardopts +1 -0
  5. data/CODE_OF_CONDUCT.md +49 -0
  6. data/CONTRIBUTING.md +10 -0
  7. data/LICENCE +674 -0
  8. data/README.md +410 -0
  9. data/evinrude.gemspec +42 -0
  10. data/lib/evinrude.rb +1233 -0
  11. data/lib/evinrude/backoff.rb +19 -0
  12. data/lib/evinrude/cluster_configuration.rb +162 -0
  13. data/lib/evinrude/config_change_queue_entry.rb +19 -0
  14. data/lib/evinrude/config_change_queue_entry/add_node.rb +13 -0
  15. data/lib/evinrude/config_change_queue_entry/remove_node.rb +14 -0
  16. data/lib/evinrude/freedom_patches/range.rb +5 -0
  17. data/lib/evinrude/log.rb +102 -0
  18. data/lib/evinrude/log_entries.rb +3 -0
  19. data/lib/evinrude/log_entry.rb +13 -0
  20. data/lib/evinrude/log_entry/cluster_configuration.rb +15 -0
  21. data/lib/evinrude/log_entry/null.rb +6 -0
  22. data/lib/evinrude/log_entry/state_machine_command.rb +13 -0
  23. data/lib/evinrude/logging_helpers.rb +40 -0
  24. data/lib/evinrude/message.rb +19 -0
  25. data/lib/evinrude/message/append_entries_reply.rb +13 -0
  26. data/lib/evinrude/message/append_entries_request.rb +18 -0
  27. data/lib/evinrude/message/command_reply.rb +13 -0
  28. data/lib/evinrude/message/command_request.rb +18 -0
  29. data/lib/evinrude/message/install_snapshot_reply.rb +13 -0
  30. data/lib/evinrude/message/install_snapshot_request.rb +18 -0
  31. data/lib/evinrude/message/join_reply.rb +13 -0
  32. data/lib/evinrude/message/join_request.rb +18 -0
  33. data/lib/evinrude/message/node_removal_reply.rb +13 -0
  34. data/lib/evinrude/message/node_removal_request.rb +18 -0
  35. data/lib/evinrude/message/read_reply.rb +13 -0
  36. data/lib/evinrude/message/read_request.rb +18 -0
  37. data/lib/evinrude/message/vote_reply.rb +13 -0
  38. data/lib/evinrude/message/vote_request.rb +18 -0
  39. data/lib/evinrude/messages.rb +14 -0
  40. data/lib/evinrude/metrics.rb +50 -0
  41. data/lib/evinrude/network.rb +69 -0
  42. data/lib/evinrude/network/connection.rb +144 -0
  43. data/lib/evinrude/network/protocol.rb +69 -0
  44. data/lib/evinrude/node_info.rb +35 -0
  45. data/lib/evinrude/peer.rb +50 -0
  46. data/lib/evinrude/resolver.rb +96 -0
  47. data/lib/evinrude/snapshot.rb +9 -0
  48. data/lib/evinrude/state_machine.rb +15 -0
  49. data/lib/evinrude/state_machine/register.rb +25 -0
  50. data/smoke_tests/001_single_node_cluster.rb +20 -0
  51. data/smoke_tests/002_three_node_cluster.rb +43 -0
  52. data/smoke_tests/003_spill.rb +25 -0
  53. data/smoke_tests/004_stale_read.rb +67 -0
  54. data/smoke_tests/005_sleepy_master.rb +28 -0
  55. data/smoke_tests/006_join_via_follower.rb +26 -0
  56. data/smoke_tests/007_snapshot_madness.rb +97 -0
  57. data/smoke_tests/008_downsizing.rb +43 -0
  58. data/smoke_tests/009_disaster_recovery.rb +46 -0
  59. data/smoke_tests/999_final_smoke_test.rb +279 -0
  60. data/smoke_tests/run +22 -0
  61. data/smoke_tests/smoke_test_helper.rb +199 -0
  62. metadata +318 -0
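
The hunks below cover the smoke tests (items 50-61 in the list above). They all lean on helpers from data/smoke_tests/smoke_test_helper.rb, which is not reproduced in this diff view: spawn_nodes boots a number of Evinrude instances on background threads, and each is wrapped in a ClusterNode whose .c accessor returns the Evinrude instance and .t the thread running it. As a rough sketch of the shape those helpers appear to take (the internals, the "C#{i}" naming, and Evinrude#run are assumptions inferred from how the tests use them, not the gem's actual code):

  # Hypothetical reconstruction of the smoke_test_helper.rb wrapper.
  class ClusterNode
    attr_reader :c, :t  # .c => the Evinrude instance, .t => its thread

    def initialize(**opts)
      @c = Evinrude.new(**opts)
      @t = Thread.new { @c.run }  # assumption: Evinrude#run blocks, serving the node
    end
  end

  def spawn_nodes(n, **opts)
    n.times.map do |i|
      name = "C#{i + 1}"
      # assumption: a storage_base_dir option would be mapped to a per-node
      # storage_dir (e.g. storage_base_dir/C1) before being passed along
      node = ClusterNode.new(node_name: name, shared_keys: ["s3kr1t"], logger: default_logger, **opts)
      node.t.name = name
      node
    end
  end

The C1, C2, ... naming is visible elsewhere in the diff: 007 reads $tmpdir.join("C1", "log.yaml"), and the final smoke test parses thread names with /\AC(\d+)(?:-(\d+))?\z/.
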
data/smoke_tests/005_sleepy_master.rb
@@ -0,0 +1,28 @@
+ #!/usr/bin/env ruby
+
+ require_relative "./smoke_test_helper"
+
+ require "evinrude"
+ require "logger"
+
+ Thread.current.name = "MT"
+
+ nodes = spawn_nodes(5)
+
+ wait_for_stability(nodes)
+
+ leader = nodes.find { |n| n.c.leader? }
+ followers = nodes.select { |n| n.c.follower? }
+
+ leader.c.singleton_class.prepend(FaultInjector)
+ leader.c.pause!(:issue_append_entries)
+
+ until new_leader = followers.find { |n| n.c.leader? }
+   leader.t.join(1)
+ end
+
+ leader.c.unpause!(:issue_append_entries)
+
+ leader.t.join(1)
+
+ assert leader.c.follower?
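
005_sleepy_master stalls the leader's append-entries traffic via a FaultInjector module from the test helper, then checks that the cluster elects a new leader and that the stalled node demotes itself once released. FaultInjector's definition is not in this hunk; given how pause!/unpause! are called above, a plausible sketch (everything beyond those two method names is invented for illustration):

  # Hypothetical FaultInjector: prepended onto one node's singleton class so
  # that a named method can be made to block until unpaused.
  module FaultInjector
    def pause!(name)
      (@paused ||= {})[name] = true
    end

    def unpause!(name)
      (@paused ||= {})[name] = false
    end

    # Intercept the paused method and spin until it is released.
    def issue_append_entries(*args)
      sleep 0.1 while (@paused || {})[:issue_append_entries]
      super
    end
  end

Because the module is prepended onto the singleton class, its issue_append_entries shadows the real one only on the chosen leader, and super falls through to the original implementation once unpaused.
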
data/smoke_tests/006_join_via_follower.rb
@@ -0,0 +1,26 @@
+ #!/usr/bin/env ruby
+
+ require_relative "./smoke_test_helper"
+
+ require "evinrude"
+
+ Thread.current.name = "MT"
+
+ nodes = spawn_nodes(3)
+
+ wait_for_stability(nodes)
+ wait_for_consensus(nodes)
+
+ follower = nodes.find { |n| n.c.follower? }
+
+ newbie = ClusterNode.new(join_hints: [{ address: follower.c.address, port: follower.c.port }], shared_keys: ["s3kr1t"], logger: default_logger)
+ newbie.t.name = "NW"
+
+ newbie.t.join(0.1)
+
+ until newbie.c.follower?
+   newbie.t.join(0.1)
+   logger.info(logloc) { "Waiting for newbie to become a follower" }
+ end
+
+ assert newbie.c.follower?, "Newbie has joined the cluster"
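
wait_for_stability and wait_for_consensus also live in smoke_test_helper.rb. Judging from how the tests use them, the first plausibly polls until the cluster has settled on exactly one leader, and the second until every node reports the same state; a minimal approximation, with all internals assumed:

  # Hypothetical polling helpers; the real implementations may differ.
  def wait_for_stability(nodes)
    until nodes.count { |n| n.c.leader? } == 1 &&
          nodes.all? { |n| n.c.leader? || n.c.follower? }
      sleep 0.5
    end
  end

  def wait_for_consensus(nodes)
    sleep 0.5 until nodes.map { |n| n.c.state }.uniq.length == 1
  end
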
data/smoke_tests/007_snapshot_madness.rb
@@ -0,0 +1,97 @@
+ #!/usr/bin/env ruby
+
+ require_relative "./smoke_test_helper"
+
+ require "evinrude"
+
+ Thread.current.name = "MT"
+
+ nodes = spawn_nodes(3, storage_base_dir: $tmpdir.to_s)
+
+ wait_for_stability(nodes)
+ wait_for_consensus(nodes)
+
+ nodes.each.with_index { |n, i| n.c.command(n.t.name) }
+
+ wait_for_consensus(nodes)
+
+ first_state = nodes.first.c.state
+
+ nodes.each do |n|
+   assert_equal YAML.load_stream($tmpdir.join(n.t.name, "log.yaml").read).last[:commit_entries_to], [n.c.instance_variable_get(:@commit_index)], "#{n.t.name} logged commit_entries_to"
+ end
+
+ nodes.each { |n| n.c.__send__(:take_snapshot) }
+
+ nodes.each do |n|
+   snap = YAML.load_file($tmpdir.join(n.t.name, "snapshot.yaml"))
+
+   assert_equal snap.state, first_state, "#{n.t.name} state"
+ end
+
+ nodes.each.with_index { |n, i| n.c.command(n.t.name) }
+
+ wait_for_consensus(nodes)
+
+ mid_state = nodes.first.c.state
+
+ victim = nodes.find { |n| n.t.name == mid_state }
+ nodes.delete(victim)
+
+ victim.t.kill
+
+ assert_equal YAML.load_stream($tmpdir.join(victim.t.name, "log.yaml").read).last[:commit_entries_to], [nodes.first.c.instance_variable_get(:@commit_index)], "Victim #{victim.t.name} had up-to-date commit_index"
+
+ nodes.each.with_index { |n, i| n.c.command(n.t.name) }
+
+ wait_for_consensus(nodes)
+
+ new_state = nodes.first.c.state
+
+ assert new_state != mid_state, "Check that we've got a new state"
+
+ lazarus = ClusterNode.new(shared_keys: ["s3kr1t"], logger: default_logger, storage_dir: $tmpdir.join(victim.t.name).to_s)
+ lazarus.t.name = "LZ"
+
+ nodes << lazarus
+
+ wait_for_consensus(nodes)
+
+ assert_equal lazarus.c.state, new_state, "Ensure lazarus has synced up"
+
+ # Phatten the log file
+ log_file = $tmpdir.join("C1", "log.yaml")
+ until log_file.size > 500_000
+   lazarus.c.command("logloglog!" * 1000)
+ end
+
+ # Now keep phattening the log file until it gets tiny again -- indicating
+ # a snapshot was taken -- or it gets stupidly huge
+ until log_file.size < 500_000 || log_file.size > 2_000_000
+   lazarus.c.command("xyzzy!" * rand(1000))
+ end
+
+ assert log_file.size < 32_000_000, "Log file didn't get truncated; was a snapshot even taken?"
+
+ # Can a new node, starting completely from scratch, load from a master's snapshot?
+ newbie = ClusterNode.new(shared_keys: ["s3kr1t"], logger: default_logger, storage_dir: $tmpdir.join("newbie").to_s, node_name: "NB", join_hints: [{ address: lazarus.c.address, port: lazarus.c.port }])
+ newbie.t.name = "NB"
+
+ nodes << newbie
+
+ wait_for_consensus(nodes)
+
+ assert_equal newbie.c.state, lazarus.c.state, "Ensure newbie is synced"
+
+ # As a final test, let's make sure the in-memory log trimming is working OK
+ nodes.each { |n| n.t.kill }
+
+ log = nodes.first.c.instance_variable_get(:@log)
+
+ 2000.times do |i|
+   log.append(Evinrude::LogEntry::Null.new(term: i + 1))
+ end
+
+ assert log.instance_variable_get(:@entries).length < 1001, "Log entries length kept under control"
+ assert_equal log.last_entry_term, 2000, "All log entries are recorded"
+ assert_equal log.instance_variable_get(:@snapshot_last_term), log.instance_variable_get(:@entries).first.term - 1, "The snapshot->live log entries transition is correct"
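
The last three assertions pin down the in-memory compaction invariant: after 2000 single-term appends the live window holds at most 1000 entries, no appended term is lost, and the snapshot's last term abuts the first live entry. In the usual Raft log-compaction arithmetic, that invariant works out like this (a worked sketch of the index mapping, not Evinrude's actual internals):

  # Sketch: with entries 1..2000 appended at term == index, trimming to a
  # 1000-entry window means the snapshot absorbs entries 1..1000, so:
  snapshot_last_index = 1000
  snapshot_last_term  = 1000                    # term of the last absorbed entry
  first_live_term     = snapshot_last_term + 1  # == entries.first.term
  # and the entry with global index i lives at entries[i - snapshot_last_index - 1]
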
data/smoke_tests/008_downsizing.rb
@@ -0,0 +1,43 @@
+ #!/usr/bin/env ruby
+
+ require_relative "./smoke_test_helper"
+
+ require "evinrude"
+
+ Thread.current.name = "MT"
+
+ nodes = spawn_nodes(5)
+
+ wait_for_stability(nodes)
+ wait_for_consensus(nodes)
+
+ nodes.first.c.command("famous five")
+
+ nodes.each { |n| assert_equal n.c.state, "famous five", n.t.name }
+
+ 2.times do
+   victim = nodes.last
+
+   info = Evinrude::NodeInfo.new(address: victim.c.address, port: victim.c.port, name: victim.c.node_name)
+
+   default_logger.info(logloc) { "Removing node #{victim.t.name}" }
+   victim.c.remove_node(info)
+   victim.t.kill
+
+   nodes.delete(victim)
+ end
+
+ # We should still have a working cluster at the moment, regardless
+ nodes.first.c.command("now we are three")
+
+ nodes.each { |n| assert_equal n.c.state, "now we are three", n.t.name }
+
+ victim = nodes.first
+ default_logger.info(logloc) { "Crashing node #{victim.t.name}" }
+ victim.t.kill
+ nodes.delete(victim)
+
+ # If the earlier removals went OK, we should *still* have a working cluster
+ nodes.first.c.command("two is tops")
+
+ nodes.each { |n| assert_equal n.c.state, "two is tops", n.t.name }
data/smoke_tests/009_disaster_recovery.rb
@@ -0,0 +1,46 @@
+ #!/usr/bin/env ruby
+
+ require_relative "./smoke_test_helper"
+
+ require "evinrude"
+
+ Thread.current.name = "MT"
+
+ nodes = spawn_nodes(5)
+
+ wait_for_stability(nodes)
+ wait_for_consensus(nodes)
+
+ nodes.first.c.command("famous five")
+
+ nodes.each { |n| assert_equal n.c.state, "famous five", n.t.name }
+
+ # For maximum confusion, we make sure that we crash the master
+ victim = nodes.find { |n| n.c.leader? }
+ default_logger.info(logloc) { "Crashing node #{victim.t.name}" }
+ victim.t.kill
+ nodes.delete(victim)
+
+ crashed_nodes_info = []
+
+ # And take out another two nodes, to make sure we've lost consensus, but we'll
+ # keep a note of their identity for later consumption
+ 2.times do
+   victim = nodes.last
+
+   crashed_nodes_info << victim.c.node_info
+   default_logger.info(logloc) { "Crashing node #{victim.t.name}" }
+   victim.t.kill
+
+   nodes.delete(victim)
+ end
+
+ crashed_nodes_info.each do |ni|
+   nodes.first.c.remove_node(ni, unsafe: true)
+ end
+
+ # This should work... eventually
+
+ nodes.last.c.command("survival of the fittest")
+
+ nodes.each { |n| assert_equal n.c.state, "survival of the fittest", n.t.name }
data/smoke_tests/999_final_smoke_test.rb
@@ -0,0 +1,279 @@
+ require_relative "./smoke_test_helper"
+
+ Thread.current.name = "FST"
+
+ class FinalSmokeTest
+   attr_reader :op_log, :m
+
+   def initialize(seed:, interval:, op_count:, logger:, edn_file: File.open("/dev/null"))
+     @seed, @interval, @op_count, @logger, @edn_file = seed, interval, op_count, logger, edn_file
+
+     @rng = Random.new(@seed)
+     @op_log = []
+     @olm = Mutex.new
+     @m = Mutex.new
+     @ednm = Mutex.new
+   end
+
+   def run(nodes)
+     threads = []
+
+     @edn_file.write("[")
+
+     @op_count.times do |i|
+       start = Time.now
+
+       op = select_op
+
+       o = op.new(nodes: nodes, rng: @rng, logger: @logger, op_id: i, fst: self)
+
+       threads << Thread.new do
+         Thread.current.name = "O#{i}"
+
+         o.apply
+       end
+
+       sleep [Time.now - start + @interval.rand, 0].max
+     end
+
+     threads.each(&:join)
+
+     print "..."
+
+ =begin
+     until threads.empty?
+       threads.select!(&:alive?)
+
+       threads.each do |th|
+         th.join(1)
+         if th.status
+           puts
+           puts
+           puts "(#{th.name})"
+           puts th.backtrace
+           th.kill
+         end
+       end
+     end
+ =end
+   ensure
+     @edn_file.puts("]")
+   end
+
+   def <<(v)
+     @olm.synchronize { @op_log << v }
+   end
+
+   def edn(e)
+     @ednm.synchronize do
+       pid = if e[:process] =~ /\AC(\d+)(?:-(\d+))?\z/
+         r = $1.to_i
+         m = $2.to_i
+         m * 5 + r
+       else
+         raise "FFS"
+       end
+
+       @edn_file.puts "{:process #{pid}, :type #{e[:type].inspect}, :f #{e[:f].inspect}, :value #{e[:value].inspect}}"
+       @edn_file.fsync
+     end
+   end
+
+
+   private
+
+   def select_op
+     n = @rng.rand(op_weight_count)
+
+     Op.const_get(Op.constants.sort.find { |c| n -= Op.const_get(c)::WEIGHT; n <= 0 })
+   end
+
+   def op_weight_count
+     @op_weight_count ||= Op.constants.inject(0) { |a, c| a + Op.const_get(c)::WEIGHT }
+   end
+
+   module EvinrudeHooks
+     def initialize(*_)
+       super
+     end
+   end
+
+   class Op
+     include Evinrude::LoggingHelpers
+
+     def initialize(nodes:, rng:, logger:, op_id:, fst:)
+       @nodes, @rng, @logger, @op_id, @fst = nodes, rng, logger, op_id, fst
+     end
+
+     private
+
+     def random_node
+       @fst.m.synchronize { @nodes.sort[@rng.rand(@nodes.length)] }
+     end
+
+     class ExecCommand < Op
+       WEIGHT = 20
+
+       def apply
+         node = random_node
+
+         node.m.synchronize do
+           logger.info(logloc) { "Sending command #{@op_id} to node #{node.t.name}" }
+           print ">"
+
+           begin
+             return if node.crashed?
+             @fst.edn(process: node.t.name, type: :invoke, f: :write, value: @op_id, index: @op_id * 2)
+             node.c.command(@op_id.to_s)
+             @fst.edn(process: node.t.name, type: :ok, f: :write, value: @op_id, index: @op_id * 2 + 1)
+           rescue => ex
+             log_exception(ex) { "Sending command #{@op_id} to node #{node.t.name}" }
+             @fst.edn(process: node.t.name, type: :info, f: :write, value: @op_id, index: @op_id * 2 + 1)
+             node.crashed!
+           end
+
+           logger.info(logloc) { "ExecCommand complete" }
+         end
+         # { id: @op_id, op: :write, value: @op_id.to_s, node: node.c.node_name, start_time: start.to_f, duration: Time.now - start }
+       end
+     end
+
+     class ReadState < Op
+       WEIGHT = 20
+
+       def apply
+         node = random_node
+
+         node.m.synchronize do
+           logger.info(logloc) { "Reading state from node #{node.t.name}" }
+           print "<"
+
+           begin
+             return if node.crashed?
+             @fst.edn(process: node.t.name, type: :invoke, f: :read, value: nil, index: @op_id * 2)
+             v = node.c.state
+             @fst.edn(process: node.t.name, type: :ok, f: :read, value: v == "" ? nil : v.to_i, index: @op_id * 2 + 1)
+           rescue => ex
+             log_exception(ex) { "Reading state from node #{node.t.name}" }
+             @fst.edn(process: node.t.name, type: :fail, f: :read, value: nil, index: @op_id * 2 + 1)
+             node.crashed!
+           end
+
+           logger.info(logloc) { "ReadState complete" }
+         end
+         # { id: @op_id, op: :read, value: v, node: node.c.node_name, start_time: start.to_f, duration: Time.now - start }
+       end
+     end
+
+     class CrashNode < Op
+       WEIGHT = 1
+
+       def apply
+         victim = random_node
+
+         @fst.m.synchronize do
+           print "!"
+
+           @nodes.delete(victim)
+
+           begin
+             victim.m.synchronize do
+               node_name, rev = victim.t.name.split("-", 2)
+               rev = rev.to_i + 1
+
+               start = Time.now
+
+               @logger.info(logloc) { "Crashing node #{node_name}" }
+               victim.crashed!
+               victim.t.kill
+
+               @fst.edn(process: victim.t.name, type: :info, f: :crash, index: @op_id * 2)
+
+               if @rng.rand > 0.9
+                 @logger.info(logloc) { "Removing existing node state" }
+                 $tmpdir.join(node_name, "snapshot.yaml").unlink rescue nil
+                 $tmpdir.join(node_name, "log.yaml").unlink rescue nil
+               end
+
+               @logger.info(logloc) { "Spawning replacement #{node_name}" }
+
+               until leader = @nodes.find { |n| n.c.leader? }
+                 @logger.info(logloc) { "Waiting until the cluster has a leader" }
+                 sleep 0.5
+               end
+
+               lazarus = ClusterNode.new(node_name: node_name, shared_keys: ["s3kr1t"], storage_dir: $tmpdir.join(node_name).to_s, logger: default_logger, join_hints: leader.c.nodes)
+               lazarus.t.name = "#{node_name}-#{rev}"
+
+               @nodes << lazarus
+
+               until lazarus.c.leader? || lazarus.c.follower?
+                 @logger.info(logloc) { "Waiting for replacement #{node_name} to join the cluster" }
+                 sleep 0.5
+               end
+             rescue => ex
+               log_exception(ex) { "Crashing node #{node_name.inspect}" }
+             end
+           end
+         end
+
+         # { id: @op_id, op: :crash, node: victim.c.node_name, start_time: start.to_f }
+       end
+     end
+   end
+
+   class Checker
+     def initialize(log)
+       @log = log
+     end
+
+     def valid?
+       valid_values = []
+       final_value = [nil, nil]
+       final_at = nil
+
+       @log.sort_by { |e| e[:start_time] }.each do |entry|
+         if entry[:op] == :write
+         end
+       end
+     end
+   end
+
+   module ClusterNodeAdditions
+     attr_reader :m
+
+     def initialize(*_)
+       @m = Mutex.new
+
+       super
+     end
+   end
+ end
+
+ ClusterNode.prepend(FinalSmokeTest::ClusterNodeAdditions)
+
+ Evinrude.prepend(FinalSmokeTest::EvinrudeHooks)
+
+ seed = (ENV["FST_SEED"] || rand(65536)).to_i
+ op_count = (ENV["FST_OPS"] || 1000).to_i
+
+ puts "Commencing Evinrude Final Smoke Test, seed=#{seed}, op_count=#{op_count}"
+
+ st = FinalSmokeTest.new(seed: seed, op_count: op_count, logger: default_logger, interval: 0.05..0.2, edn_file: $tmpdir.join("fst.edn").open("w"))
+
+ print "Spawning nodes..."
+ nodes = spawn_nodes(5, storage_base_dir: $tmpdir.to_s)
+ puts " done."
+
+ print "Waiting for stability..."
+ wait_for_stability(nodes)
+ puts " done."
+
+ print "Running ops..."
+ st.run(nodes)
+
+ #$tmpdir.join("fst.log").write(st.op_log.to_yaml)
+
+ #assert FinalSmokeTest::Checker.new(nodes).valid?, "FST oplog validity"
+
+ puts " done!"