evinrude 0.0.1

Files changed (62)
  1. checksums.yaml +7 -0
  2. data/.editorconfig +23 -0
  3. data/.gitignore +6 -0
  4. data/.yardopts +1 -0
  5. data/CODE_OF_CONDUCT.md +49 -0
  6. data/CONTRIBUTING.md +10 -0
  7. data/LICENCE +674 -0
  8. data/README.md +410 -0
  9. data/evinrude.gemspec +42 -0
  10. data/lib/evinrude.rb +1233 -0
  11. data/lib/evinrude/backoff.rb +19 -0
  12. data/lib/evinrude/cluster_configuration.rb +162 -0
  13. data/lib/evinrude/config_change_queue_entry.rb +19 -0
  14. data/lib/evinrude/config_change_queue_entry/add_node.rb +13 -0
  15. data/lib/evinrude/config_change_queue_entry/remove_node.rb +14 -0
  16. data/lib/evinrude/freedom_patches/range.rb +5 -0
  17. data/lib/evinrude/log.rb +102 -0
  18. data/lib/evinrude/log_entries.rb +3 -0
  19. data/lib/evinrude/log_entry.rb +13 -0
  20. data/lib/evinrude/log_entry/cluster_configuration.rb +15 -0
  21. data/lib/evinrude/log_entry/null.rb +6 -0
  22. data/lib/evinrude/log_entry/state_machine_command.rb +13 -0
  23. data/lib/evinrude/logging_helpers.rb +40 -0
  24. data/lib/evinrude/message.rb +19 -0
  25. data/lib/evinrude/message/append_entries_reply.rb +13 -0
  26. data/lib/evinrude/message/append_entries_request.rb +18 -0
  27. data/lib/evinrude/message/command_reply.rb +13 -0
  28. data/lib/evinrude/message/command_request.rb +18 -0
  29. data/lib/evinrude/message/install_snapshot_reply.rb +13 -0
  30. data/lib/evinrude/message/install_snapshot_request.rb +18 -0
  31. data/lib/evinrude/message/join_reply.rb +13 -0
  32. data/lib/evinrude/message/join_request.rb +18 -0
  33. data/lib/evinrude/message/node_removal_reply.rb +13 -0
  34. data/lib/evinrude/message/node_removal_request.rb +18 -0
  35. data/lib/evinrude/message/read_reply.rb +13 -0
  36. data/lib/evinrude/message/read_request.rb +18 -0
  37. data/lib/evinrude/message/vote_reply.rb +13 -0
  38. data/lib/evinrude/message/vote_request.rb +18 -0
  39. data/lib/evinrude/messages.rb +14 -0
  40. data/lib/evinrude/metrics.rb +50 -0
  41. data/lib/evinrude/network.rb +69 -0
  42. data/lib/evinrude/network/connection.rb +144 -0
  43. data/lib/evinrude/network/protocol.rb +69 -0
  44. data/lib/evinrude/node_info.rb +35 -0
  45. data/lib/evinrude/peer.rb +50 -0
  46. data/lib/evinrude/resolver.rb +96 -0
  47. data/lib/evinrude/snapshot.rb +9 -0
  48. data/lib/evinrude/state_machine.rb +15 -0
  49. data/lib/evinrude/state_machine/register.rb +25 -0
  50. data/smoke_tests/001_single_node_cluster.rb +20 -0
  51. data/smoke_tests/002_three_node_cluster.rb +43 -0
  52. data/smoke_tests/003_spill.rb +25 -0
  53. data/smoke_tests/004_stale_read.rb +67 -0
  54. data/smoke_tests/005_sleepy_master.rb +28 -0
  55. data/smoke_tests/006_join_via_follower.rb +26 -0
  56. data/smoke_tests/007_snapshot_madness.rb +97 -0
  57. data/smoke_tests/008_downsizing.rb +43 -0
  58. data/smoke_tests/009_disaster_recovery.rb +46 -0
  59. data/smoke_tests/999_final_smoke_test.rb +279 -0
  60. data/smoke_tests/run +22 -0
  61. data/smoke_tests/smoke_test_helper.rb +199 -0
  62. metadata +318 -0
data/smoke_tests/005_sleepy_master.rb
@@ -0,0 +1,28 @@
+ #!/usr/bin/env ruby
+
+ require_relative "./smoke_test_helper"
+
+ require "evinrude"
+ require "logger"
+
+ Thread.current.name = "MT"
+
+ nodes = spawn_nodes(5)
+
+ wait_for_stability(nodes)
+
+ leader = nodes.find { |n| n.c.leader? }
+ followers = nodes.select { |n| n.c.follower? }
+
+ leader.c.singleton_class.prepend(FaultInjector)
+ leader.c.pause!(:issue_append_entries)
+
+ until new_leader = followers.find { |n| n.c.leader? }
+   leader.t.join(1)
+ end
+
+ leader.c.unpause!(:issue_append_entries)
+
+ leader.t.join(1)
+
+ assert leader.c.follower?
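This test leans on FaultInjector, which lives in smoke_test_helper.rb rather than in this hunk: it is prepended onto the leader's singleton class so a named method can be suspended and later resumed. Pausing issue_append_entries starves the followers of heartbeats, one of them wins an election at a higher term, and once unpaused the deposed leader discovers the new term and steps down to follower. A minimal sketch of such a prepend-based injector (names and internals assumed, not Evinrude's actual helper):

    module FaultInjector
      # Calls to a paused method are silently dropped; `super` still reaches
      # the real implementation because this module is prepended.
      def pause!(method_name)
        (@fi_paused ||= []) << method_name
      end

      def unpause!(method_name)
        (@fi_paused ||= []).delete(method_name)
      end

      def issue_append_entries(*args)
        return if (@fi_paused ||= []).include?(:issue_append_entries)
        super
      end
    end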
data/smoke_tests/006_join_via_follower.rb
@@ -0,0 +1,26 @@
+ #!/usr/bin/env ruby
+
+ require_relative "./smoke_test_helper"
+
+ require "evinrude"
+
+ Thread.current.name = "MT"
+
+ nodes = spawn_nodes(3)
+
+ wait_for_stability(nodes)
+ wait_for_consensus(nodes)
+
+ follower = nodes.find { |n| n.c.follower? }
+
+ newbie = ClusterNode.new(join_hints: [{ address: follower.c.address, port: follower.c.port }], shared_keys: ["s3kr1t"], logger: default_logger)
+ newbie.t.name = "NW"
+
+ newbie.t.join(0.1)
+
+ until newbie.c.follower?
+   newbie.t.join(0.1)
+   logger.info(logloc) { "Waiting for newbie to become a follower" }
+ end
+
+ assert newbie.c.follower?, "Newbie has joined the cluster"
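The point of this test is that the newbie's only join hint is a follower, so the join request has to be forwarded or redirected to the leader rather than handled where it lands. The ClusterNode helper (also from smoke_test_helper.rb, not shown in this diff) pairs an Evinrude instance (.c) with the thread running it (.t); a plausible reconstruction, assuming Evinrude.new accepts the keyword arguments these tests pass through:

    require "evinrude"

    # Hypothetical stand-in for the ClusterNode helper used by these tests.
    class ClusterNode
      attr_reader :c, :t

      def initialize(**opts)
        @c = Evinrude.new(**opts)    # join_hints:, shared_keys:, logger:, ...
        @t = Thread.new { @c.run }   # consensus loop runs on its own thread
      end
    end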
data/smoke_tests/007_snapshot_madness.rb
@@ -0,0 +1,97 @@
+ #!/usr/bin/env ruby
+
+ require_relative "./smoke_test_helper"
+
+ require "evinrude"
+
+ Thread.current.name = "MT"
+
+ nodes = spawn_nodes(3, storage_base_dir: $tmpdir.to_s)
+
+ wait_for_stability(nodes)
+ wait_for_consensus(nodes)
+
+ nodes.each.with_index { |n, i| n.c.command(n.t.name) }
+
+ wait_for_consensus(nodes)
+
+ first_state = nodes.first.c.state
+
+ nodes.each do |n|
+   assert_equal YAML.load_stream($tmpdir.join(n.t.name, "log.yaml").read).last[:commit_entries_to], [n.c.instance_variable_get(:@commit_index)], "#{n.t.name} logged commit_entries_to"
+ end
+
+ nodes.each { |n| n.c.__send__(:take_snapshot) }
+
+ nodes.each do |n|
+   snap = YAML.load_file($tmpdir.join(n.t.name, "snapshot.yaml"))
+
+   assert_equal snap.state, first_state, "#{n.t.name} state"
+ end
+
+ nodes.each.with_index { |n, i| n.c.command(n.t.name) }
+
+ wait_for_consensus(nodes)
+
+ mid_state = nodes.first.c.state
+
+ victim = nodes.find { |n| n.t.name == mid_state }
+ nodes.delete(victim)
+
+ victim.t.kill
+
+ assert_equal YAML.load_stream($tmpdir.join(victim.t.name, "log.yaml").read).last[:commit_entries_to], [nodes.first.c.instance_variable_get(:@commit_index)], "Victim #{victim.t.name} had up-to-date commit_index"
+
+ nodes.each.with_index { |n, i| n.c.command(n.t.name) }
+
+ wait_for_consensus(nodes)
+
+ new_state = nodes.first.c.state
+
+ assert new_state != mid_state, "Check that we've got a new state"
+
+ lazarus = ClusterNode.new(shared_keys: ["s3kr1t"], logger: default_logger, storage_dir: $tmpdir.join(victim.t.name).to_s)
+ lazarus.t.name = "LZ"
+
+ nodes << lazarus
+
+ wait_for_consensus(nodes)
+
+ assert_equal lazarus.c.state, new_state, "Ensure lazarus has synced up"
+
+ # Phatten the log file
+ log_file = $tmpdir.join("C1", "log.yaml")
+ until log_file.size > 500_000
+   lazarus.c.command("logloglog!" * 1000)
+ end
+
+ # Now keep phattening the log file until it gets tiny again -- indicating
+ # a snapshot was taken -- or it gets stupidly huge
+ until log_file.size < 500_000 || log_file.size > 2_000_000
+   lazarus.c.command("xyzzy!" * rand(1000))
+ end
+
+ assert log_file.size < 32_000_000, "Log file didn't get truncated; was a snapshot even taken?"
+
+ # Can a new node, starting completely from scratch, load from a master's snapshot?
+ newbie = ClusterNode.new(shared_keys: ["s3kr1t"], logger: default_logger, storage_dir: $tmpdir.join("newbie").to_s, node_name: "NB", join_hints: [{ address: lazarus.c.address, port: lazarus.c.port }])
+ newbie.t.name = "NB"
+
+ nodes << newbie
+
+ wait_for_consensus(nodes)
+
+ assert_equal newbie.c.state, lazarus.c.state, "Ensure newbie is synced"
+
+ # As a final test, let's make sure the in-memory log trimming is working OK
+ nodes.each { |n| n.t.kill }
+
+ log = nodes.first.c.instance_variable_get(:@log)
+
+ 2000.times do |i|
+   log.append(Evinrude::LogEntry::Null.new(term: i + 1))
+ end
+
+ assert log.instance_variable_get(:@entries).length < 1001, "Log entries length kept under control"
+ assert_equal log.last_entry_term, 2000, "All log entries are recorded"
+ assert_equal log.instance_variable_get(:@snapshot_last_term), log.instance_variable_get(:@entries).first.term - 1, "The snapshot->live log entries transition is correct"
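The closing assertions pin down the in-memory trimming contract: after 2000 appends the live window holds at most 1000 entries, nothing is lost (last_entry_term is still 2000), and the snapshot bookkeeping dovetails exactly with the first live entry. Because each Null entry here carries a distinct term, a toy log with illustrative names (not Evinrude's internals) reproduces the same invariant:

    Entry = Struct.new(:term, keyword_init: true)

    class TrimmedLog
      MAX_LIVE_ENTRIES = 1000

      attr_reader :snapshot_last_term

      def initialize
        @entries = []
        @snapshot_last_term = 0
      end

      def append(entry)
        @entries << entry
        # Fold the oldest entry into the snapshot marker once the window is full.
        @snapshot_last_term = @entries.shift.term while @entries.length > MAX_LIVE_ENTRIES
      end

      def last_entry_term
        @entries.last.term
      end
    end

    log = TrimmedLog.new
    2000.times { |i| log.append(Entry.new(term: i + 1)) }
    log.snapshot_last_term  # => 1000, i.e. first live term - 1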
data/smoke_tests/008_downsizing.rb
@@ -0,0 +1,43 @@
+ #!/usr/bin/env ruby
+
+ require_relative "./smoke_test_helper"
+
+ require "evinrude"
+
+ Thread.current.name = "MT"
+
+ nodes = spawn_nodes(5)
+
+ wait_for_stability(nodes)
+ wait_for_consensus(nodes)
+
+ nodes.first.c.command("famous five")
+
+ nodes.each { |n| assert_equal n.c.state, "famous five", n.t.name }
+
+ 2.times do
+   victim = nodes.last
+
+   info = Evinrude::NodeInfo.new(address: victim.c.address, port: victim.c.port, name: victim.c.node_name)
+
+   default_logger.info(logloc) { "Removing node #{victim.t.name}" }
+   victim.c.remove_node(info)
+   victim.t.kill
+
+   nodes.delete(victim)
+ end
+
+ # We should still have a working cluster at the moment, regardless
+ nodes.first.c.command("now we are three")
+
+ nodes.each { |n| assert_equal n.c.state, "now we are three", n.t.name }
+
+ victim = nodes.first
+ default_logger.info(logloc) { "Crashing node #{victim.t.name}" }
+ victim.t.kill
+ nodes.delete(victim)
+
+ # If the earlier removals went OK, we should *still* have a working cluster
+ nodes.first.c.command("two is tops")
+
+ nodes.each { |n| assert_equal n.c.state, "two is tops", n.t.name }
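The reason the final write succeeds with only two nodes left is that remove_node shrinks the configured membership before each kill, so the required majority shrinks with it; only the last node dies unannounced. The arithmetic is standard Raft, not anything Evinrude-specific:

    # A strict majority of the *configured* membership is needed to commit.
    def quorum(cluster_size)
      cluster_size / 2 + 1
    end

    quorum(5)  # => 3  the initial cluster
    quorum(3)  # => 2  after two clean removals; one unannounced crash
               #       still leaves the two survivors a quorum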
data/smoke_tests/009_disaster_recovery.rb
@@ -0,0 +1,46 @@
+ #!/usr/bin/env ruby
+
+ require_relative "./smoke_test_helper"
+
+ require "evinrude"
+
+ Thread.current.name = "MT"
+
+ nodes = spawn_nodes(5)
+
+ wait_for_stability(nodes)
+ wait_for_consensus(nodes)
+
+ nodes.first.c.command("famous five")
+
+ nodes.each { |n| assert_equal n.c.state, "famous five", n.t.name }
+
+ # For maximum confusion, we make sure that we crash the master
+ victim = nodes.find { |n| n.c.leader? }
+ default_logger.info(logloc) { "Crashing node #{victim.t.name}" }
+ victim.t.kill
+ nodes.delete(victim)
+
+ crashed_nodes_info = []
+
+ # And take out another two nodes, to make sure we've lost consensus, but we'll
+ # keep a note of their identity for later consumption
+ 2.times do
+   victim = nodes.last
+
+   crashed_nodes_info << victim.c.node_info
+   default_logger.info(logloc) { "Crashing node #{victim.t.name}" }
+   victim.t.kill
+
+   nodes.delete(victim)
+ end
+
+ crashed_nodes_info.each do |ni|
+   nodes.first.c.remove_node(ni, unsafe: true)
+ end
+
+ # This should work... eventually
+
+ nodes.last.c.command("survival of the fittest")
+
+ nodes.each { |n| assert_equal n.c.state, "survival of the fittest", n.t.name }
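With three of five nodes dead, the two survivors can never reach the quorum of three, so no ordinary configuration change (remove_node included) could ever commit. The unsafe: true escape hatch evidently forces the dead members out of the configuration anyway; once membership is down to the survivors, quorum is two of two and writes commit again. Condensed, under that reading (hypothetical names: survivor stands in for nodes.first.c, dead_info for crashed_nodes_info):

    dead_info.each { |ni| survivor.remove_node(ni, unsafe: true) }

    # Membership is now just the two survivors, so quorum is 2 of 2 and
    # ordinary writes can commit again... eventually.
    survivor.command("survival of the fittest")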
data/smoke_tests/999_final_smoke_test.rb
@@ -0,0 +1,279 @@
+ require_relative "./smoke_test_helper"
+
+ Thread.current.name = "FST"
+
+ class FinalSmokeTest
+   attr_reader :op_log, :m
+
+   def initialize(seed:, interval:, op_count:, logger:, edn_file: File.open("/dev/null"))
+     @seed, @interval, @op_count, @logger, @edn_file = seed, interval, op_count, logger, edn_file
+
+     @rng = Random.new(@seed)
+     @op_log = []
+     @olm = Mutex.new
+     @m = Mutex.new
+     @ednm = Mutex.new
+   end
+
+   def run(nodes)
+     threads = []
+
+     @edn_file.write("[")
+
+     @op_count.times do |i|
+       start = Time.now
+
+       op = select_op
+
+       o = op.new(nodes: nodes, rng: @rng, logger: @logger, op_id: i, fst: self)
+
+       threads << Thread.new do
+         Thread.current.name = "O#{i}"
+
+         o.apply
+       end
+
+       sleep [Time.now - start + @interval.rand, 0].max
+     end
+
+     threads.each(&:join)
+
+     print "..."
+
+ =begin
+     until threads.empty?
+       threads.select!(&:alive?)
+
+       threads.each do |th|
+         th.join(1)
+         if th.status
+           puts
+           puts
+           puts "(#{th.name})"
+           puts th.backtrace
+           th.kill
+         end
+       end
+     end
+ =end
+   ensure
+     @edn_file.puts("]")
+   end
+
+   def <<(v)
+     @olm.synchronize { @op_log << v }
+   end
+
+   def edn(e)
+     @ednm.synchronize do
+       pid = if e[:process] =~ /\AC(\d+)(?:-(\d+))?\z/
+         r = $1.to_i
+         m = $2.to_i
+         m * 5 + r
+       else
+         raise "FFS"
+       end
+
+       @edn_file.puts "{:process #{pid}, :type #{e[:type].inspect}, :f #{e[:f].inspect}, :value #{e[:value].inspect}}"
+       @edn_file.fsync
+     end
+   end
+
+
+   private
+
+   def select_op
+     n = @rng.rand(op_weight_count)
+
+     Op.const_get(Op.constants.sort.find { |c| n -= Op.const_get(c)::WEIGHT; n <= 0 })
+   end
+
+   def op_weight_count
+     @op_weight_count ||= Op.constants.inject(0) { |a, c| a + Op.const_get(c)::WEIGHT }
+   end
+
+   module EvinrudeHooks
+     def initialize(*_)
+       super
+     end
+   end
+
+   class Op
+     include Evinrude::LoggingHelpers
+
+     def initialize(nodes:, rng:, logger:, op_id:, fst:)
+       @nodes, @rng, @logger, @op_id, @fst = nodes, rng, logger, op_id, fst
+     end
+
+     private
+
+     def random_node
+       @fst.m.synchronize { @nodes.sort[@rng.rand(@nodes.length)] }
+     end
+
+     class ExecCommand < Op
+       WEIGHT = 20
+
+       def apply
+         node = random_node
+
+         node.m.synchronize do
+           logger.info(logloc) { "Sending command #{@op_id} to node #{node.t.name}" }
+           print ">"
+
+           begin
+             return if node.crashed?
+             @fst.edn(process: node.t.name, type: :invoke, f: :write, value: @op_id, index: @op_id * 2)
+             node.c.command(@op_id.to_s)
+             @fst.edn(process: node.t.name, type: :ok, f: :write, value: @op_id, index: @op_id * 2 + 1)
+           rescue => ex
+             log_exception(ex) { "Sending command #{@op_id} to node #{node.t.name}" }
+             @fst.edn(process: node.t.name, type: :info, f: :write, value: @op_id, index: @op_id * 2 + 1)
+             node.crashed!
+           end
+
+           logger.info(logloc) { "ExecCommand complete" }
+         end
+         # { id: @op_id, op: :write, value: @op_id.to_s, node: node.c.node_name, start_time: start.to_f, duration: Time.now - start }
+       end
+     end
+
+     class ReadState < Op
+       WEIGHT = 20
+
+       def apply
+         node = random_node
+
+         node.m.synchronize do
+           logger.info(logloc) { "Reading state from node #{node.t.name}" }
+           print "<"
+
+           begin
+             return if node.crashed?
+             @fst.edn(process: node.t.name, type: :invoke, f: :read, value: nil, index: @op_id * 2)
+             v = node.c.state
+             @fst.edn(process: node.t.name, type: :ok, f: :read, value: v == "" ? nil : v.to_i, index: @op_id * 2 + 1)
+           rescue => ex
+             log_exception(ex) { "Reading state from node #{node.t.name}" }
+             @fst.edn(process: node.t.name, type: :fail, f: :read, value: nil, index: @op_id * 2 + 1)
+             node.crashed!
+           end
+
+           logger.info(logloc) { "ReadState complete" }
+         end
+         # { id: @op_id, op: :read, value: v, node: node.c.node_name, start_time: start.to_f, duration: Time.now - start }
+       end
+     end
+
+     class CrashNode < Op
+       WEIGHT = 1
+
+       def apply
+         victim = random_node
+
+         @fst.m.synchronize do
+           print "!"
+
+           @nodes.delete(victim)
+
+           begin
+             victim.m.synchronize do
+               node_name, rev = victim.t.name.split("-", 2)
+               rev = rev.to_i + 1
+
+               start = Time.now
+
+               @logger.info(logloc) { "Crashing node #{node_name}" }
+               victim.crashed!
+               victim.t.kill
+
+               @fst.edn(process: victim.t.name, type: :info, f: :crash, index: @op_id * 2)
+
+               if @rng.rand > 0.9
+                 @logger.info(logloc) { "Removing existing node state" }
+                 $tmpdir.join(node_name, "snapshot.yaml").unlink rescue nil
+                 $tmpdir.join(node_name, "log.yaml").unlink rescue nil
+               end
+
+               @logger.info(logloc) { "Spawning replacement #{node_name}" }
+
+               until leader = @nodes.find { |n| n.c.leader? }
+                 @logger.info(logloc) { "Waiting until the cluster has a leader" }
+                 sleep 0.5
+               end
+
+               lazarus = ClusterNode.new(node_name: node_name, shared_keys: ["s3kr1t"], storage_dir: $tmpdir.join(node_name).to_s, logger: default_logger, join_hints: leader.c.nodes)
+               lazarus.t.name = "#{node_name}-#{rev}"
+
+               @nodes << lazarus
+
+               until lazarus.c.leader? || lazarus.c.follower?
+                 @logger.info(logloc) { "Waiting for replacement #{node_name} to join the cluster" }
+                 sleep 0.5
+               end
+             rescue => ex
+               log_exception(ex) { "Crashing node #{node_name.inspect}" }
+             end
+           end
+         end
+
+         # { id: @op_id, op: :crash, node: victim.c.node_name, start_time: start.to_f }
+       end
+     end
+   end
+
+   class Checker
+     def initialize(log)
+       @log = log
+     end
+
+     def valid?
+       valid_values = []
+       final_value = [nil, nil]
+       final_at = nil
+
+       @log.sort_by { |e| e[:start_time] }.each do |entry|
+         if entry[:op] == :write
+         end
+       end
+     end
+   end
+
+   module ClusterNodeAdditions
+     attr_reader :m
+
+     def initialize(*_)
+       @m = Mutex.new
+
+       super
+     end
+   end
+ end
+
+ ClusterNode.prepend(FinalSmokeTest::ClusterNodeAdditions)
+
+ Evinrude.prepend(FinalSmokeTest::EvinrudeHooks)
+
+ seed = (ENV["FST_SEED"] || rand(65536)).to_i
+ op_count = (ENV["FST_OPS"] || 1000).to_i
+
+ puts "Commencing Evinrude Final Smoke Test, seed=#{seed}, op_count=#{op_count}"
+
+ st = FinalSmokeTest.new(seed: seed, op_count: op_count, logger: default_logger, interval: 0.05..0.2, edn_file: $tmpdir.join("fst.edn").open("w"))
+
+ print "Spawning nodes..."
+ nodes = spawn_nodes(5, storage_base_dir: $tmpdir.to_s)
+ puts " done."
+
+ print "Waiting for stability..."
+ wait_for_stability(nodes)
+ puts " done."
+
+ print "Running ops..."
+ st.run(nodes)
+
+ #$tmpdir.join("fst.log").write(st.op_log.to_yaml)
+
+ #assert FinalSmokeTest::Checker.new(nodes).valid?, "FST oplog validity"
+
+ puts " done!"
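The edn method writes each operation as a Jepsen-style history entry ({:process N, :type :invoke/:ok/:fail/:info, :f :read/:write, :value V}), so a run can be fed to an external linearizability checker such as Knossos. Thread names encode node and incarnation as C<r>-<m>, and the m * 5 + r arithmetic flattens that into a process id that stays unique per incarnation across the five nodes. Extracted for clarity, under those assumptions:

    # The name -> process-id mapping from FinalSmokeTest#edn, in isolation.
    def process_id(name)
      raise "FFS" unless name =~ /\AC(\d+)(?:-(\d+))?\z/
      $2.to_i * 5 + $1.to_i
    end

    process_id("C1")    # => 1   first incarnation of node 1
    process_id("C3-2")  # => 13  third node, second respawn

select_op, meanwhile, does weighted sampling: one random draw over the summed WEIGHT constants, decremented class by class until it goes non-positive, which makes ExecCommand and ReadState each twenty times as likely as CrashNode.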