tigerbeetle-node 0.6.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/README.md +102 -83
  2. package/dist/benchmark.js +102 -100
  3. package/dist/benchmark.js.map +1 -1
  4. package/dist/index.d.ts +82 -82
  5. package/dist/index.js +74 -93
  6. package/dist/index.js.map +1 -1
  7. package/dist/test.js +135 -112
  8. package/dist/test.js.map +1 -1
  9. package/package.json +13 -14
  10. package/scripts/download_node_headers.sh +3 -1
  11. package/src/benchmark.ts +114 -118
  12. package/src/index.ts +102 -111
  13. package/src/node.zig +53 -51
  14. package/src/test.ts +146 -125
  15. package/src/tigerbeetle/scripts/benchmark.bat +46 -46
  16. package/src/tigerbeetle/scripts/benchmark.sh +5 -0
  17. package/src/tigerbeetle/scripts/install_zig.bat +109 -109
  18. package/src/tigerbeetle/scripts/install_zig.sh +7 -3
  19. package/src/tigerbeetle/scripts/vopr.bat +47 -47
  20. package/src/tigerbeetle/src/benchmark.zig +63 -96
  21. package/src/tigerbeetle/src/config.zig +23 -19
  22. package/src/tigerbeetle/src/demo.zig +2 -15
  23. package/src/tigerbeetle/src/demo_01_create_accounts.zig +10 -10
  24. package/src/tigerbeetle/src/demo_03_create_transfers.zig +5 -3
  25. package/src/tigerbeetle/src/{demo_04_create_transfers_two_phase_commit.zig → demo_04_create_pending_transfers.zig} +18 -12
  26. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +37 -0
  27. package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +24 -0
  28. package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +1 -1
  29. package/src/tigerbeetle/src/io/linux.zig +4 -4
  30. package/src/tigerbeetle/src/main.zig +19 -3
  31. package/src/tigerbeetle/src/message_pool.zig +5 -2
  32. package/src/tigerbeetle/src/ring_buffer.zig +48 -3
  33. package/src/tigerbeetle/src/simulator.zig +104 -8
  34. package/src/tigerbeetle/src/state_machine.zig +1813 -816
  35. package/src/tigerbeetle/src/test/cluster.zig +165 -32
  36. package/src/tigerbeetle/src/test/packet_simulator.zig +14 -1
  37. package/src/tigerbeetle/src/test/state_checker.zig +3 -1
  38. package/src/tigerbeetle/src/test/state_machine.zig +8 -7
  39. package/src/tigerbeetle/src/test/storage.zig +99 -40
  40. package/src/tigerbeetle/src/tigerbeetle.zig +103 -98
  41. package/src/tigerbeetle/src/vsr/journal.zig +1387 -459
  42. package/src/tigerbeetle/src/vsr/replica.zig +1204 -417
  43. package/src/tigerbeetle/src/vsr.zig +203 -49
  44. package/src/translate.zig +10 -0
  45. package/.yarn/releases/yarn-berry.cjs +0 -55
  46. package/.yarnrc.yml +0 -1
  47. package/scripts/postinstall.sh +0 -6
  48. package/src/tigerbeetle/src/demo_05_accept_transfers.zig +0 -23
  49. package/src/tigerbeetle/src/demo_06_reject_transfers.zig +0 -17
  50. package/src/tigerbeetle/src/format_test.zig +0 -69
  51. package/yarn.lock +0 -42
@@ -53,29 +53,27 @@ pub const transfers_max = switch (deployment_environment) {
53
53
  else => 1_000_000,
54
54
  };
55
55
 
56
- /// The maximum number of two-phase commits to store in memory:
56
+ /// The maximum number of two-phase transfers to store in memory:
57
57
  /// This impacts the amount of memory allocated at initialization by the server.
58
- pub const commits_max = transfers_max;
59
-
60
- /// The maximum size of the journal file:
61
- /// This is pre-allocated and zeroed for performance when initialized.
62
- /// Writes within this file never extend the filesystem inode size reducing the cost of fdatasync().
63
- /// This enables static allocation of disk space so that appends cannot fail with ENOSPC.
64
- /// This also enables us to detect filesystem inode corruption that would change the journal size.
65
- pub const journal_size_max = switch (deployment_environment) {
66
- .production => 128 * 1024 * 1024 * 1024,
67
- else => 128 * 1024 * 1024,
68
- };
58
+ pub const transfers_pending_max = transfers_max;
69
59
 
70
60
  /// The maximum number of batch entries in the journal file:
71
61
  /// A batch entry may contain many transfers, so this is not a limit on the number of transfers.
72
62
  /// We need this limit to allocate space for copies of batch headers at the start of the journal.
73
63
  /// These header copies enable us to disentangle corruption from crashes and recover accordingly.
74
- pub const journal_headers_max = switch (deployment_environment) {
75
- .production => 1024 * 1024,
76
- else => 16384,
64
+ pub const journal_slot_count = switch (deployment_environment) {
65
+ .production => 1024,
66
+ else => 128,
77
67
  };
78
68
 
69
+ /// The maximum size of the journal file:
70
+ /// This is pre-allocated and zeroed for performance when initialized.
71
+ /// Writes within this file never extend the filesystem inode size reducing the cost of fdatasync().
72
+ /// This enables static allocation of disk space so that appends cannot fail with ENOSPC.
73
+ /// This also enables us to detect filesystem inode corruption that would change the journal size.
74
+ // TODO remove this; just allocate a part of the total storage for the journal
75
+ pub const journal_size_max = journal_slot_count * (128 + message_size_max);
76
+
79
77
  /// The maximum number of connections that can be held open by the server at any time:
80
78
  pub const connections_max = replicas_max + clients_max;
81
79
 
@@ -92,7 +90,7 @@ pub const message_size_max = 1 * 1024 * 1024;
92
90
  /// The maximum number of Viewstamped Replication prepare messages that can be inflight at a time.
93
91
  /// This is immutable once assigned per cluster, as replicas need to know how many operations might
94
92
  /// possibly be uncommitted during a view change, and this must be constant for all replicas.
95
- pub const pipelining_max = clients_max;
93
+ pub const pipeline_max = clients_max;
96
94
 
97
95
  /// The minimum and maximum amount of time in milliseconds to wait before initiating a connection.
98
96
  /// Exponential backoff and jitter are applied within this range.
@@ -224,12 +222,18 @@ pub const clock_synchronization_window_min_ms = 2000;
224
222
  /// If a window expires because of this then it is likely that the clock epoch will also be expired.
225
223
  pub const clock_synchronization_window_max_ms = 20000;
226
224
 
225
+ // TODO Move these to a separate "internal computed constants" file.
226
+ pub const journal_size_headers = journal_slot_count * 128; // 128 == @sizeOf(Header)
227
+ pub const journal_size_prepares = journal_slot_count * message_size_max;
228
+
227
229
  comptime {
228
230
  // vsr.parse_address assumes that config.address/config.port are valid.
229
231
  _ = std.net.Address.parseIp4(address, 0) catch unreachable;
230
232
  _ = @as(u16, port);
231
233
 
232
- // Avoid latency issues from a too-large sndbuf.
233
- assert(tcp_sndbuf_replica <= 4 * 1024 * 1024);
234
- assert(tcp_sndbuf_client <= 4 * 1024 * 1024);
234
+ // Avoid latency issues from setting sndbuf too high:
235
+ assert(tcp_sndbuf_replica <= 16 * 1024 * 1024);
236
+ assert(tcp_sndbuf_client <= 16 * 1024 * 1024);
237
+
238
+ assert(journal_size_max == journal_size_headers + journal_size_prepares);
235
239
  }
@@ -6,11 +6,9 @@ const config = @import("config.zig");
6
6
  const tb = @import("tigerbeetle.zig");
7
7
  const Account = tb.Account;
8
8
  const Transfer = tb.Transfer;
9
- const Commit = tb.Commit;
10
9
 
11
10
  const CreateAccountsResult = tb.CreateAccountsResult;
12
11
  const CreateTransfersResult = tb.CreateTransfersResult;
13
- const CommitTransfersResult = tb.CommitTransfersResult;
14
12
 
15
13
  const IO = @import("io.zig").IO;
16
14
  const MessageBus = @import("message_bus.zig").MessageBusClient;
@@ -33,7 +31,7 @@ pub fn request(
33
31
  ) !void {
34
32
  const allocator = std.heap.page_allocator;
35
33
  const client_id = std.crypto.random.int(u128);
36
- const cluster_id: u32 = 0;
34
+ const cluster_id: u32 = 1;
37
35
  var addresses = [_]std.net.Address{try std.net.Address.parseIp4("127.0.0.1", config.port)};
38
36
 
39
37
  var io = try IO.init(32, 0);
@@ -53,7 +51,7 @@ pub fn request(
53
51
 
54
52
  message_bus.set_on_message(*Client, &client, Client.on_message);
55
53
 
56
- var message = client.get_message();
54
+ const message = client.get_message();
57
55
  defer client.unref(message);
58
56
 
59
57
  const body = std.mem.asBytes(&batch);
@@ -117,17 +115,6 @@ pub fn on_create_transfers(
117
115
  print_results(CreateTransfersResult, results);
118
116
  }
119
117
 
120
- pub fn on_commit_transfers(
121
- user_data: u128,
122
- operation: StateMachine.Operation,
123
- results: Client.Error![]const u8,
124
- ) void {
125
- _ = user_data;
126
- _ = operation;
127
-
128
- print_results(CommitTransfersResult, results);
129
- }
130
-
131
118
  fn print_results(comptime Results: type, results: Client.Error![]const u8) void {
132
119
  const body = results catch unreachable;
133
120
  const slice = std.mem.bytesAsSlice(Results, body);
@@ -9,25 +9,25 @@ pub fn main() !void {
9
9
  .id = 1,
10
10
  .user_data = 0,
11
11
  .reserved = [_]u8{0} ** 48,
12
- .unit = 710, // Let's use the ISO-4217 Code Number for ZAR
12
+ .ledger = 710, // Let's use the ISO-4217 Code Number for ZAR
13
13
  .code = 1000, // A chart of accounts code to describe this as a clearing account.
14
14
  .flags = .{ .debits_must_not_exceed_credits = true },
15
- .debits_reserved = 0,
16
- .debits_accepted = 0,
17
- .credits_reserved = 0,
18
- .credits_accepted = 10000, // Let's start with some liquidity.
15
+ .debits_pending = 0,
16
+ .debits_posted = 0,
17
+ .credits_pending = 0,
18
+ .credits_posted = 10000, // Let's start with some liquidity.
19
19
  },
20
20
  Account{
21
21
  .id = 2,
22
22
  .user_data = 0,
23
23
  .reserved = [_]u8{0} ** 48,
24
- .unit = 710, // Let's use the ISO-4217 Code Number for ZAR
24
+ .ledger = 710, // Let's use the ISO-4217 Code Number for ZAR
25
25
  .code = 2000, // A chart of accounts code to describe this as a payable account.
26
26
  .flags = .{},
27
- .debits_reserved = 0,
28
- .debits_accepted = 0,
29
- .credits_reserved = 0,
30
- .credits_accepted = 0,
27
+ .debits_pending = 0,
28
+ .debits_posted = 0,
29
+ .credits_pending = 0,
30
+ .credits_posted = 0,
31
31
  },
32
32
  };
33
33
 
@@ -6,13 +6,15 @@ const Transfer = tb.Transfer;
6
6
  pub fn main() !void {
7
7
  const transfers = [_]Transfer{
8
8
  Transfer{
9
- .id = 1000,
9
+ .id = 1,
10
10
  .debit_account_id = 1,
11
11
  .credit_account_id = 2,
12
12
  .user_data = 0,
13
- .reserved = [_]u8{0} ** 32,
13
+ .reserved = 0,
14
+ .pending_id = 0,
14
15
  .timeout = 0,
15
- .code = 0,
16
+ .ledger = 710, // Let's use the ISO-4217 Code Number for ZAR
17
+ .code = 1,
16
18
  .flags = .{},
17
19
  .amount = 1000,
18
20
  },
@@ -12,42 +12,48 @@ pub fn main() !void {
12
12
  .debit_account_id = 1,
13
13
  .credit_account_id = 2,
14
14
  .user_data = 0,
15
- .reserved = [_]u8{0} ** 32,
15
+ .reserved = 0,
16
+ .pending_id = 0,
16
17
  .timeout = std.time.ns_per_hour,
17
- .code = 0,
18
+ .ledger = 710,
19
+ .code = 1,
18
20
  .flags = .{
19
- .two_phase_commit = true,
21
+ .pending = true, // Set this transfer to be two-phase.
20
22
  },
21
- .amount = 9000,
23
+ .amount = 8000,
22
24
  },
23
25
  Transfer{
24
26
  .id = 1002,
25
27
  .debit_account_id = 1,
26
28
  .credit_account_id = 2,
27
29
  .user_data = 0,
28
- .reserved = [_]u8{0} ** 32,
30
+ .reserved = 0,
31
+ .pending_id = 0,
29
32
  .timeout = std.time.ns_per_hour,
30
- .code = 0,
33
+ .ledger = 710,
34
+ .code = 1,
31
35
  .flags = .{
32
- .two_phase_commit = true,
36
+ .pending = true, // Set this transfer to be two-phase.
33
37
  .linked = true, // Link this transfer with the next transfer 1003.
34
38
  },
35
- .amount = 1,
39
+ .amount = 500,
36
40
  },
37
41
  Transfer{
38
42
  .id = 1003,
39
43
  .debit_account_id = 1,
40
44
  .credit_account_id = 2,
41
45
  .user_data = 0,
42
- .reserved = [_]u8{0} ** 32,
46
+ .reserved = 0,
47
+ .pending_id = 0,
43
48
  .timeout = std.time.ns_per_hour,
44
- .code = 0,
49
+ .ledger = 710,
50
+ .code = 1,
45
51
  .flags = .{
46
- .two_phase_commit = true,
52
+ .pending = true, // Set this transfer to be two-phase.
47
53
  // The last transfer in a linked chain has .linked set to false to close the chain.
48
54
  // This transfer will succeed or fail together with transfer 1002 above.
49
55
  },
50
- .amount = 1,
56
+ .amount = 500,
51
57
  },
52
58
  };
53
59
 
@@ -0,0 +1,37 @@
1
+ const tb = @import("tigerbeetle.zig");
2
+ const demo = @import("demo.zig");
3
+
4
+ const Transfer = tb.Transfer;
5
+
6
+ pub fn main() !void {
7
+ const commits = [_]Transfer{
8
+ Transfer{
9
+ .id = 2001,
10
+ .debit_account_id = 1,
11
+ .credit_account_id = 2,
12
+ .user_data = 0,
13
+ .reserved = 0,
14
+ .pending_id = 1001,
15
+ .timeout = 0,
16
+ .ledger = 0,// Honor original Transfer ledger.
17
+ .code = 0,// Honor original Transfer code.
18
+ .flags = .{ .post_pending_transfer = true }, // Post the pending two-phase transfer.
19
+ .amount = 0, // Inherit the amount from the pending transfer.
20
+ },
21
+ Transfer{
22
+ .id = 2002,
23
+ .debit_account_id = 1,
24
+ .credit_account_id = 2,
25
+ .user_data = 0,
26
+ .reserved = 0,
27
+ .pending_id = 1002,
28
+ .timeout = 0,
29
+ .ledger = 0,
30
+ .code = 0,
31
+ .flags = .{ .post_pending_transfer = true }, // Post the pending two-phase transfer.
32
+ .amount = 0, // Inherit the amount from the pending transfer.
33
+ },
34
+ };
35
+
36
+ try demo.request(.create_transfers, commits, demo.on_create_transfers);
37
+ }
@@ -0,0 +1,24 @@
1
+ const tb = @import("tigerbeetle.zig");
2
+ const demo = @import("demo.zig");
3
+
4
+ const Transfer = tb.Transfer;
5
+
6
+ pub fn main() !void {
7
+ const commits = [_]Transfer{
8
+ Transfer{
9
+ .id = 2003,
10
+ .debit_account_id = 1,
11
+ .credit_account_id = 2,
12
+ .user_data = 0,
13
+ .reserved = 0,
14
+ .pending_id = 1003,
15
+ .timeout = 0,
16
+ .ledger = 0,
17
+ .code = 0,
18
+ .flags = .{ .void_pending_transfer = true },
19
+ .amount = 0,
20
+ },
21
+ };
22
+
23
+ try demo.request(.create_transfers, commits, demo.on_create_transfers);
24
+ }
@@ -1,7 +1,7 @@
1
1
  const demo = @import("demo.zig");
2
2
 
3
3
  pub fn main() !void {
4
- const ids = [_]u128{ 1000, 1001, 1002 };
4
+ const ids = [_]u128{ 1, 1001, 1002, 1003, 2001, 2002, 2003 };
5
5
 
6
6
  try demo.request(.lookup_transfers, ids, demo.on_lookup_transfers);
7
7
  }
@@ -986,7 +986,7 @@ pub const IO = struct {
986
986
  /// Detects whether the underlying file system for a given directory fd supports Direct I/O.
987
987
  /// Not all Linux file systems support `O_DIRECT`, e.g. a shared macOS volume.
988
988
  fn fs_supports_direct_io(dir_fd: std.os.fd_t) !bool {
989
- if (!@hasDecl(std.os, "O_DIRECT")) return false;
989
+ if (!@hasDecl(std.os.O, "DIRECT")) return false;
990
990
 
991
991
  const path = "fs_supports_direct_io";
992
992
  const dir = std.fs.Dir{ .fd = dir_fd };
@@ -997,12 +997,12 @@ pub const IO = struct {
997
997
  while (true) {
998
998
  const res = os.system.openat(dir_fd, path, os.O.CLOEXEC | os.O.RDONLY | os.O.DIRECT, 0);
999
999
  switch (os.linux.getErrno(res)) {
1000
- 0 => {
1000
+ .SUCCESS => {
1001
1001
  os.close(@intCast(os.fd_t, res));
1002
1002
  return true;
1003
1003
  },
1004
- os.linux.EINTR => continue,
1005
- os.linux.EINVAL => return false,
1004
+ .INTR => continue,
1005
+ .INVAL => return false,
1006
1006
  else => |err| return os.unexpectedErrno(err),
1007
1007
  }
1008
1008
  }
@@ -53,12 +53,28 @@ fn init(io: *IO, cluster: u32, replica: u8, dir_fd: os.fd_t) !void {
53
53
  assert(filename.len == filename_len);
54
54
 
55
55
  // TODO Expose data file size on the CLI.
56
- _ = try io.open_file(
56
+ const fd = try io.open_file(
57
57
  dir_fd,
58
58
  filename,
59
- config.journal_size_max, // TODO Double-check that we have space for redundant headers.
59
+ config.journal_size_max,
60
60
  true,
61
61
  );
62
+ std.os.close(fd);
63
+
64
+ const file = try (std.fs.Dir{ .fd = dir_fd }).openFile(filename, .{ .write = true });
65
+ defer file.close();
66
+
67
+ {
68
+ const write_size_max = 4 * 1024 * 1024;
69
+ var write: [write_size_max]u8 = undefined;
70
+ var offset: u64 = 0;
71
+ while (true) {
72
+ const write_size = vsr.format_journal(cluster, offset, &write);
73
+ if (write_size == 0) break;
74
+ try file.writeAll(write[0..write_size]);
75
+ offset += write_size;
76
+ }
77
+ }
62
78
 
63
79
  log.info("initialized data file", .{});
64
80
  }
@@ -91,7 +107,7 @@ fn start(
91
107
  allocator,
92
108
  config.accounts_max,
93
109
  config.transfers_max,
94
- config.commits_max,
110
+ config.transfers_pending_max,
95
111
  );
96
112
  var storage = try Storage.init(config.journal_size_max, storage_fd, io);
97
113
  var message_bus = try MessageBus.init(
@@ -25,8 +25,11 @@ pub const messages_max_replica = messages_max: {
25
25
  sum += config.io_depth_read + config.io_depth_write; // Journal I/O
26
26
  sum += config.clients_max; // Replica.client_table
27
27
  sum += 1; // Replica.loopback_queue
28
- sum += config.pipelining_max; // Replica.pipeline
29
- sum += config.replicas_max; // Replica.do_view_change_from_all_replicas quorum (all others are bitsets)
28
+ sum += config.pipeline_max; // Replica.pipeline
29
+ // Replica.do_view_change_from_all_replicas quorum:
30
+ // Replica.recovery_response_quorum is only used for recovery and does not increase the limit.
31
+ // All other quorums are bitsets.
32
+ sum += config.replicas_max;
30
33
  sum += config.connections_max; // Connection.recv_message
31
34
  sum += config.connections_max * config.connection_send_queue_max_replica; // Connection.send_queue
32
35
  sum += 1; // Handle bursts (e.g. Connection.parse_message)
@@ -1,12 +1,12 @@
1
1
  const std = @import("std");
2
2
  const assert = std.debug.assert;
3
3
 
4
- /// A First In, First Out ring buffer holding at most `size` elements.
5
- pub fn RingBuffer(comptime T: type, comptime size: usize) type {
4
+ /// A First In, First Out ring buffer holding at most `count_max` elements.
5
+ pub fn RingBuffer(comptime T: type, comptime count_max: usize) type {
6
6
  return struct {
7
7
  const Self = @This();
8
8
 
9
- buffer: [size]T = undefined,
9
+ buffer: [count_max]T = undefined,
10
10
 
11
11
  /// The index of the slot with the first item, if any.
12
12
  index: usize = 0,
@@ -35,6 +35,15 @@ pub fn RingBuffer(comptime T: type, comptime size: usize) type {
35
35
  return &self.buffer[(self.index + self.count - 1) % self.buffer.len];
36
36
  }
37
37
 
38
+ pub inline fn get_ptr(self: *Self, index: usize) ?*T {
39
+ if (index < self.count) {
40
+ return &self.buffer[(self.index + index) % self.buffer.len];
41
+ } else {
42
+ assert(index < count_max);
43
+ return null;
44
+ }
45
+ }
46
+
38
47
  pub inline fn next_tail(self: Self) ?T {
39
48
  if (self.full()) return null;
40
49
  return self.buffer[(self.index + self.count) % self.buffer.len];
@@ -56,6 +65,10 @@ pub fn RingBuffer(comptime T: type, comptime size: usize) type {
56
65
  self.count += 1;
57
66
  }
58
67
 
68
+ pub inline fn retreat_tail(self: *Self) void {
69
+ self.count -= 1;
70
+ }
71
+
59
72
  /// Returns whether the ring buffer is completely full.
60
73
  pub inline fn full(self: Self) bool {
61
74
  return self.count == self.buffer.len;
@@ -90,6 +103,13 @@ pub fn RingBuffer(comptime T: type, comptime size: usize) type {
90
103
  return result;
91
104
  }
92
105
 
106
+ /// Remove and return the last item, if any.
107
+ pub fn pop_tail(self: *Self) ?T {
108
+ const result = self.tail() orelse return null;
109
+ self.retreat_tail();
110
+ return result;
111
+ }
112
+
93
113
  pub const Iterator = struct {
94
114
  ring: *Self,
95
115
  count: usize = 0,
@@ -210,15 +230,21 @@ test "RingBuffer: push/pop high level interface" {
210
230
 
211
231
  try testing.expect(!fifo.full());
212
232
  try testing.expect(fifo.empty());
233
+ try testing.expectEqual(@as(?*u32, null), fifo.get_ptr(0));
234
+ try testing.expectEqual(@as(?*u32, null), fifo.get_ptr(1));
235
+ try testing.expectEqual(@as(?*u32, null), fifo.get_ptr(2));
213
236
 
214
237
  try fifo.push(1);
215
238
  try testing.expectEqual(@as(?u32, 1), fifo.head());
239
+ try testing.expectEqual(@as(u32, 1), fifo.get_ptr(0).?.*);
240
+ try testing.expectEqual(@as(?*u32, null), fifo.get_ptr(1));
216
241
 
217
242
  try testing.expect(!fifo.full());
218
243
  try testing.expect(!fifo.empty());
219
244
 
220
245
  try fifo.push(2);
221
246
  try testing.expectEqual(@as(?u32, 1), fifo.head());
247
+ try testing.expectEqual(@as(u32, 2), fifo.get_ptr(1).?.*);
222
248
 
223
249
  try fifo.push(3);
224
250
  try testing.expectError(error.NoSpaceLeft, fifo.push(4));
@@ -228,6 +254,9 @@ test "RingBuffer: push/pop high level interface" {
228
254
 
229
255
  try testing.expectEqual(@as(?u32, 1), fifo.head());
230
256
  try testing.expectEqual(@as(?u32, 1), fifo.pop());
257
+ try testing.expectEqual(@as(u32, 2), fifo.get_ptr(0).?.*);
258
+ try testing.expectEqual(@as(u32, 3), fifo.get_ptr(1).?.*);
259
+ try testing.expectEqual(@as(?*u32, null), fifo.get_ptr(2));
231
260
 
232
261
  try testing.expect(!fifo.full());
233
262
  try testing.expect(!fifo.empty());
@@ -242,3 +271,19 @@ test "RingBuffer: push/pop high level interface" {
242
271
  try testing.expect(!fifo.full());
243
272
  try testing.expect(fifo.empty());
244
273
  }
274
+
275
+ test "RingBuffer: pop_tail" {
276
+ var lifo = RingBuffer(u32, 3){};
277
+ try lifo.push(1);
278
+ try lifo.push(2);
279
+ try lifo.push(3);
280
+ try testing.expect(lifo.full());
281
+
282
+ try testing.expectEqual(@as(?u32, 3), lifo.pop_tail());
283
+ try testing.expectEqual(@as(?u32, 1), lifo.head());
284
+ try testing.expectEqual(@as(?u32, 2), lifo.pop_tail());
285
+ try testing.expectEqual(@as(?u32, 1), lifo.head());
286
+ try testing.expectEqual(@as(?u32, 1), lifo.pop_tail());
287
+ try testing.expectEqual(@as(?u32, null), lifo.pop_tail());
288
+ try testing.expect(lifo.empty());
289
+ }
@@ -20,6 +20,8 @@ const output = std.log.scoped(.state_checker);
20
20
  /// This will run much slower but will trace all logic across the cluster.
21
21
  const log_state_transitions_only = builtin.mode != .Debug;
22
22
 
23
+ const log_health = std.log.scoped(.health);
24
+
23
25
  /// You can fine tune your log levels even further (debug/info/notice/warn/err/crit/alert/emerg):
24
26
  pub const log_level: std.log.Level = if (log_state_transitions_only) .info else .debug;
25
27
 
@@ -64,7 +66,6 @@ pub fn main() !void {
64
66
  const node_count = replica_count + client_count;
65
67
 
66
68
  const ticks_max = 100_000_000;
67
- const transitions_max = config.journal_size_max / config.message_size_max;
68
69
  const request_probability = 1 + random.uintLessThan(u8, 99);
69
70
  const idle_on_probability = random.uintLessThan(u8, 20);
70
71
  const idle_off_probability = 10 + random.uintLessThan(u8, 10);
@@ -101,10 +102,16 @@ pub fn main() !void {
101
102
  .read_latency_min = random.uintLessThan(u16, 3),
102
103
  .read_latency_mean = 3 + random.uintLessThan(u16, 10),
103
104
  .write_latency_min = random.uintLessThan(u16, 3),
104
- .write_latency_mean = 3 + random.uintLessThan(u16, 10),
105
+ .write_latency_mean = 3 + random.uintLessThan(u16, 100),
105
106
  .read_fault_probability = random.uintLessThan(u8, 10),
106
107
  .write_fault_probability = random.uintLessThan(u8, 10),
107
108
  },
109
+ .health_options = .{
110
+ .crash_probability = 0.0001,
111
+ .crash_stability = random.uintLessThan(u32, 1_000),
112
+ .restart_probability = 0.01,
113
+ .restart_stability = random.uintLessThan(u32, 1_000),
114
+ },
108
115
  });
109
116
  defer cluster.destroy();
110
117
 
@@ -143,6 +150,10 @@ pub fn main() !void {
143
150
  \\ write_latency_mean={}
144
151
  \\ read_fault_probability={}%
145
152
  \\ write_fault_probability={}%
153
+ \\ crash_probability={d}%
154
+ \\ crash_stability={} ticks
155
+ \\ restart_probability={d}%
156
+ \\ restart_stability={} ticks
146
157
  \\
147
158
  , .{
148
159
  seed,
@@ -169,26 +180,105 @@ pub fn main() !void {
169
180
  cluster.options.storage_options.write_latency_mean,
170
181
  cluster.options.storage_options.read_fault_probability,
171
182
  cluster.options.storage_options.write_fault_probability,
183
+ cluster.options.health_options.crash_probability * 100,
184
+ cluster.options.health_options.crash_stability,
185
+ cluster.options.health_options.restart_probability * 100,
186
+ cluster.options.health_options.restart_stability,
172
187
  });
173
188
 
174
189
  var requests_sent: u64 = 0;
175
190
  var idle = false;
176
191
 
192
+ // The minimum number of healthy replicas required for a crashed replica to be able to recover.
193
+ const replica_normal_min = replicas: {
194
+ if (replica_count == 1) {
195
+ // A cluster of 1 can crash safely (as long as there is no disk corruption) since it
196
+ // does not run the recovery protocol.
197
+ break :replicas 0;
198
+ } else {
199
+ break :replicas cluster.replicas[0].quorum_view_change;
200
+ }
201
+ };
202
+
203
+ // Disable most faults at startup, so that the replicas don't get stuck in recovery mode.
204
+ for (cluster.storages) |*storage, i| {
205
+ storage.faulty = replica_normal_min <= i;
206
+ }
207
+
208
+ // TODO When storage is supported, run more transitions than fit in the journal.
209
+ const transitions_max = config.journal_slot_count / 2;
177
210
  var tick: u64 = 0;
178
211
  while (tick < ticks_max) : (tick += 1) {
179
- for (cluster.storages) |*storage| storage.tick();
212
+ const health_options = &cluster.options.health_options;
213
+ // The maximum number of replicas that can crash, with the cluster still able to recover.
214
+ var crashes = cluster.replica_normal_count() -| replica_normal_min;
215
+
216
+ for (cluster.storages) |*storage, replica| {
217
+ if (cluster.replicas[replica].journal.recovered) {
218
+
219
+ // TODO Remove this workaround when VSR recovery protocol is disabled.
220
+ // When only the minimum number of replicas are healthy (no more crashes allowed),
221
+ // disable storage faults on all healthy replicas.
222
+ //
223
+ // This is a workaround to avoid the deadlock that occurs when (for example) in a
224
+ // cluster of 3 replicas, one is down, another has a corrupt prepare, and the last does
225
+ // not have the prepare. The two healthy replicas can never complete a view change,
226
+ // because two replicas are not enough to nack, and the unhealthy replica cannot
227
+ // complete the VSR recovery protocol either.
228
+ if (cluster.health[replica] == .up and crashes == 0) {
229
+ storage.faulty = false;
230
+ } else {
231
+ // When a journal recovers for the first time, enable its storage faults.
232
+ // Future crashes will recover in the presence of faults.
233
+ storage.faulty = true;
234
+ }
235
+ }
236
+ storage.tick();
237
+ }
180
238
 
181
- for (cluster.replicas) |*replica, i| {
182
- replica.tick();
183
- cluster.state_checker.check_state(@intCast(u8, i));
239
+ for (cluster.replicas) |*replica| {
240
+ switch (cluster.health[replica.replica]) {
241
+ .up => |*ticks| {
242
+ ticks.* -|= 1;
243
+ replica.tick();
244
+ cluster.state_checker.check_state(replica.replica);
245
+
246
+ if (ticks.* != 0) continue;
247
+ if (crashes == 0) continue;
248
+ if (cluster.storages[replica.replica].writes.count() == 0) {
249
+ if (!chance_f64(random, health_options.crash_probability)) continue;
250
+ } else {
251
+ if (!chance_f64(random, health_options.crash_probability * 10.0)) continue;
252
+ }
253
+
254
+ if (!try cluster.crash_replica(replica.replica)) continue;
255
+ log_health.debug("crash replica={}", .{replica.replica});
256
+ crashes -= 1;
257
+ },
258
+ .down => |*ticks| {
259
+ ticks.* -|= 1;
260
+ // Keep ticking the time so that it won't have diverged too far to synchronize
261
+ // when the replica restarts.
262
+ replica.clock.time.tick();
263
+ assert(replica.status == .recovering);
264
+ if (ticks.* == 0 and chance_f64(random, health_options.restart_probability)) {
265
+ cluster.health[replica.replica] = .{ .up = health_options.restart_stability };
266
+ log_health.debug("restart replica={}", .{replica.replica});
267
+ }
268
+ },
269
+ }
184
270
  }
185
271
 
186
- cluster.network.packet_simulator.tick();
272
+ cluster.network.packet_simulator.tick(cluster.health);
187
273
 
188
274
  for (cluster.clients) |*client| client.tick();
189
275
 
190
276
  if (cluster.state_checker.transitions == transitions_max) {
191
- if (cluster.state_checker.convergence()) break;
277
+ if (cluster.state_checker.convergence() and
278
+ cluster.replica_up_count() == replica_count)
279
+ {
280
+ break;
281
+ }
192
282
  continue;
193
283
  } else {
194
284
  assert(cluster.state_checker.transitions < transitions_max);
@@ -222,6 +312,12 @@ fn chance(random: std.rand.Random, p: u8) bool {
222
312
  return random.uintLessThan(u8, 100) < p;
223
313
  }
224
314
 
315
+ /// Returns true, `p` percent of the time, else false.
316
+ fn chance_f64(random: std.rand.Random, p: f64) bool {
317
+ assert(p <= 100.0);
318
+ return random.float(f64) < p;
319
+ }
320
+
225
321
  /// Returns the next argument for the simulator or null (if none available)
226
322
  fn args_next(args: *std.process.ArgIterator, allocator: std.mem.Allocator) ?[:0]const u8 {
227
323
  const err_or_bytes = args.next(allocator) orelse return null;