tigerbeetle-node 0.6.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +102 -83
- package/dist/benchmark.js +102 -100
- package/dist/benchmark.js.map +1 -1
- package/dist/index.d.ts +82 -82
- package/dist/index.js +74 -93
- package/dist/index.js.map +1 -1
- package/dist/test.js +135 -112
- package/dist/test.js.map +1 -1
- package/package.json +13 -14
- package/scripts/download_node_headers.sh +3 -1
- package/src/benchmark.ts +114 -118
- package/src/index.ts +102 -111
- package/src/node.zig +53 -51
- package/src/test.ts +146 -125
- package/src/tigerbeetle/scripts/benchmark.bat +46 -46
- package/src/tigerbeetle/scripts/benchmark.sh +5 -0
- package/src/tigerbeetle/scripts/install_zig.bat +109 -109
- package/src/tigerbeetle/scripts/install_zig.sh +7 -3
- package/src/tigerbeetle/scripts/vopr.bat +47 -47
- package/src/tigerbeetle/src/benchmark.zig +63 -96
- package/src/tigerbeetle/src/config.zig +23 -19
- package/src/tigerbeetle/src/demo.zig +2 -15
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +10 -10
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +5 -3
- package/src/tigerbeetle/src/{demo_04_create_transfers_two_phase_commit.zig → demo_04_create_pending_transfers.zig} +18 -12
- package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +37 -0
- package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +24 -0
- package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +1 -1
- package/src/tigerbeetle/src/io/linux.zig +4 -4
- package/src/tigerbeetle/src/main.zig +19 -3
- package/src/tigerbeetle/src/message_pool.zig +5 -2
- package/src/tigerbeetle/src/ring_buffer.zig +48 -3
- package/src/tigerbeetle/src/simulator.zig +104 -8
- package/src/tigerbeetle/src/state_machine.zig +1813 -816
- package/src/tigerbeetle/src/test/cluster.zig +165 -32
- package/src/tigerbeetle/src/test/packet_simulator.zig +14 -1
- package/src/tigerbeetle/src/test/state_checker.zig +3 -1
- package/src/tigerbeetle/src/test/state_machine.zig +8 -7
- package/src/tigerbeetle/src/test/storage.zig +99 -40
- package/src/tigerbeetle/src/tigerbeetle.zig +103 -98
- package/src/tigerbeetle/src/vsr/journal.zig +1387 -459
- package/src/tigerbeetle/src/vsr/replica.zig +1204 -417
- package/src/tigerbeetle/src/vsr.zig +203 -49
- package/src/translate.zig +10 -0
- package/.yarn/releases/yarn-berry.cjs +0 -55
- package/.yarnrc.yml +0 -1
- package/scripts/postinstall.sh +0 -6
- package/src/tigerbeetle/src/demo_05_accept_transfers.zig +0 -23
- package/src/tigerbeetle/src/demo_06_reject_transfers.zig +0 -17
- package/src/tigerbeetle/src/format_test.zig +0 -69
- package/yarn.lock +0 -42
|
@@ -53,29 +53,27 @@ pub const transfers_max = switch (deployment_environment) {
|
|
|
53
53
|
else => 1_000_000,
|
|
54
54
|
};
|
|
55
55
|
|
|
56
|
-
/// The maximum number of two-phase
|
|
56
|
+
/// The maximum number of two-phase transfers to store in memory:
|
|
57
57
|
/// This impacts the amount of memory allocated at initialization by the server.
|
|
58
|
-
pub const
|
|
59
|
-
|
|
60
|
-
/// The maximum size of the journal file:
|
|
61
|
-
/// This is pre-allocated and zeroed for performance when initialized.
|
|
62
|
-
/// Writes within this file never extend the filesystem inode size reducing the cost of fdatasync().
|
|
63
|
-
/// This enables static allocation of disk space so that appends cannot fail with ENOSPC.
|
|
64
|
-
/// This also enables us to detect filesystem inode corruption that would change the journal size.
|
|
65
|
-
pub const journal_size_max = switch (deployment_environment) {
|
|
66
|
-
.production => 128 * 1024 * 1024 * 1024,
|
|
67
|
-
else => 128 * 1024 * 1024,
|
|
68
|
-
};
|
|
58
|
+
pub const transfers_pending_max = transfers_max;
|
|
69
59
|
|
|
70
60
|
/// The maximum number of batch entries in the journal file:
|
|
71
61
|
/// A batch entry may contain many transfers, so this is not a limit on the number of transfers.
|
|
72
62
|
/// We need this limit to allocate space for copies of batch headers at the start of the journal.
|
|
73
63
|
/// These header copies enable us to disentangle corruption from crashes and recover accordingly.
|
|
74
|
-
pub const
|
|
75
|
-
.production => 1024
|
|
76
|
-
else =>
|
|
64
|
+
pub const journal_slot_count = switch (deployment_environment) {
|
|
65
|
+
.production => 1024,
|
|
66
|
+
else => 128,
|
|
77
67
|
};
|
|
78
68
|
|
|
69
|
+
/// The maximum size of the journal file:
|
|
70
|
+
/// This is pre-allocated and zeroed for performance when initialized.
|
|
71
|
+
/// Writes within this file never extend the filesystem inode size reducing the cost of fdatasync().
|
|
72
|
+
/// This enables static allocation of disk space so that appends cannot fail with ENOSPC.
|
|
73
|
+
/// This also enables us to detect filesystem inode corruption that would change the journal size.
|
|
74
|
+
// TODO remove this; just allocate a part of the total storage for the journal
|
|
75
|
+
pub const journal_size_max = journal_slot_count * (128 + message_size_max);
|
|
76
|
+
|
|
79
77
|
/// The maximum number of connections that can be held open by the server at any time:
|
|
80
78
|
pub const connections_max = replicas_max + clients_max;
|
|
81
79
|
|
|
@@ -92,7 +90,7 @@ pub const message_size_max = 1 * 1024 * 1024;
|
|
|
92
90
|
/// The maximum number of Viewstamped Replication prepare messages that can be inflight at a time.
|
|
93
91
|
/// This is immutable once assigned per cluster, as replicas need to know how many operations might
|
|
94
92
|
/// possibly be uncommitted during a view change, and this must be constant for all replicas.
|
|
95
|
-
pub const
|
|
93
|
+
pub const pipeline_max = clients_max;
|
|
96
94
|
|
|
97
95
|
/// The minimum and maximum amount of time in milliseconds to wait before initiating a connection.
|
|
98
96
|
/// Exponential backoff and jitter are applied within this range.
|
|
@@ -224,12 +222,18 @@ pub const clock_synchronization_window_min_ms = 2000;
|
|
|
224
222
|
/// If a window expires because of this then it is likely that the clock epoch will also be expired.
|
|
225
223
|
pub const clock_synchronization_window_max_ms = 20000;
|
|
226
224
|
|
|
225
|
+
// TODO Move these to a separate "internal computed constants" file.
|
|
226
|
+
pub const journal_size_headers = journal_slot_count * 128; // 128 == @sizeOf(Header)
|
|
227
|
+
pub const journal_size_prepares = journal_slot_count * message_size_max;
|
|
228
|
+
|
|
227
229
|
comptime {
|
|
228
230
|
// vsr.parse_address assumes that config.address/config.port are valid.
|
|
229
231
|
_ = std.net.Address.parseIp4(address, 0) catch unreachable;
|
|
230
232
|
_ = @as(u16, port);
|
|
231
233
|
|
|
232
|
-
// Avoid latency issues from
|
|
233
|
-
assert(tcp_sndbuf_replica <=
|
|
234
|
-
assert(tcp_sndbuf_client <=
|
|
234
|
+
// Avoid latency issues from setting sndbuf too high:
|
|
235
|
+
assert(tcp_sndbuf_replica <= 16 * 1024 * 1024);
|
|
236
|
+
assert(tcp_sndbuf_client <= 16 * 1024 * 1024);
|
|
237
|
+
|
|
238
|
+
assert(journal_size_max == journal_size_headers + journal_size_prepares);
|
|
235
239
|
}
|
|
@@ -6,11 +6,9 @@ const config = @import("config.zig");
|
|
|
6
6
|
const tb = @import("tigerbeetle.zig");
|
|
7
7
|
const Account = tb.Account;
|
|
8
8
|
const Transfer = tb.Transfer;
|
|
9
|
-
const Commit = tb.Commit;
|
|
10
9
|
|
|
11
10
|
const CreateAccountsResult = tb.CreateAccountsResult;
|
|
12
11
|
const CreateTransfersResult = tb.CreateTransfersResult;
|
|
13
|
-
const CommitTransfersResult = tb.CommitTransfersResult;
|
|
14
12
|
|
|
15
13
|
const IO = @import("io.zig").IO;
|
|
16
14
|
const MessageBus = @import("message_bus.zig").MessageBusClient;
|
|
@@ -33,7 +31,7 @@ pub fn request(
|
|
|
33
31
|
) !void {
|
|
34
32
|
const allocator = std.heap.page_allocator;
|
|
35
33
|
const client_id = std.crypto.random.int(u128);
|
|
36
|
-
const cluster_id: u32 =
|
|
34
|
+
const cluster_id: u32 = 1;
|
|
37
35
|
var addresses = [_]std.net.Address{try std.net.Address.parseIp4("127.0.0.1", config.port)};
|
|
38
36
|
|
|
39
37
|
var io = try IO.init(32, 0);
|
|
@@ -53,7 +51,7 @@ pub fn request(
|
|
|
53
51
|
|
|
54
52
|
message_bus.set_on_message(*Client, &client, Client.on_message);
|
|
55
53
|
|
|
56
|
-
|
|
54
|
+
const message = client.get_message();
|
|
57
55
|
defer client.unref(message);
|
|
58
56
|
|
|
59
57
|
const body = std.mem.asBytes(&batch);
|
|
@@ -117,17 +115,6 @@ pub fn on_create_transfers(
|
|
|
117
115
|
print_results(CreateTransfersResult, results);
|
|
118
116
|
}
|
|
119
117
|
|
|
120
|
-
pub fn on_commit_transfers(
|
|
121
|
-
user_data: u128,
|
|
122
|
-
operation: StateMachine.Operation,
|
|
123
|
-
results: Client.Error![]const u8,
|
|
124
|
-
) void {
|
|
125
|
-
_ = user_data;
|
|
126
|
-
_ = operation;
|
|
127
|
-
|
|
128
|
-
print_results(CommitTransfersResult, results);
|
|
129
|
-
}
|
|
130
|
-
|
|
131
118
|
fn print_results(comptime Results: type, results: Client.Error![]const u8) void {
|
|
132
119
|
const body = results catch unreachable;
|
|
133
120
|
const slice = std.mem.bytesAsSlice(Results, body);
|
|
@@ -9,25 +9,25 @@ pub fn main() !void {
|
|
|
9
9
|
.id = 1,
|
|
10
10
|
.user_data = 0,
|
|
11
11
|
.reserved = [_]u8{0} ** 48,
|
|
12
|
-
.
|
|
12
|
+
.ledger = 710, // Let's use the ISO-4217 Code Number for ZAR
|
|
13
13
|
.code = 1000, // A chart of accounts code to describe this as a clearing account.
|
|
14
14
|
.flags = .{ .debits_must_not_exceed_credits = true },
|
|
15
|
-
.
|
|
16
|
-
.
|
|
17
|
-
.
|
|
18
|
-
.
|
|
15
|
+
.debits_pending = 0,
|
|
16
|
+
.debits_posted = 0,
|
|
17
|
+
.credits_pending = 0,
|
|
18
|
+
.credits_posted = 10000, // Let's start with some liquidity.
|
|
19
19
|
},
|
|
20
20
|
Account{
|
|
21
21
|
.id = 2,
|
|
22
22
|
.user_data = 0,
|
|
23
23
|
.reserved = [_]u8{0} ** 48,
|
|
24
|
-
.
|
|
24
|
+
.ledger = 710, // Let's use the ISO-4217 Code Number for ZAR
|
|
25
25
|
.code = 2000, // A chart of accounts code to describe this as a payable account.
|
|
26
26
|
.flags = .{},
|
|
27
|
-
.
|
|
28
|
-
.
|
|
29
|
-
.
|
|
30
|
-
.
|
|
27
|
+
.debits_pending = 0,
|
|
28
|
+
.debits_posted = 0,
|
|
29
|
+
.credits_pending = 0,
|
|
30
|
+
.credits_posted = 0,
|
|
31
31
|
},
|
|
32
32
|
};
|
|
33
33
|
|
|
@@ -6,13 +6,15 @@ const Transfer = tb.Transfer;
|
|
|
6
6
|
pub fn main() !void {
|
|
7
7
|
const transfers = [_]Transfer{
|
|
8
8
|
Transfer{
|
|
9
|
-
.id =
|
|
9
|
+
.id = 1,
|
|
10
10
|
.debit_account_id = 1,
|
|
11
11
|
.credit_account_id = 2,
|
|
12
12
|
.user_data = 0,
|
|
13
|
-
.reserved =
|
|
13
|
+
.reserved = 0,
|
|
14
|
+
.pending_id = 0,
|
|
14
15
|
.timeout = 0,
|
|
15
|
-
.
|
|
16
|
+
.ledger = 710, // Let's use the ISO-4217 Code Number for ZAR
|
|
17
|
+
.code = 1,
|
|
16
18
|
.flags = .{},
|
|
17
19
|
.amount = 1000,
|
|
18
20
|
},
|
|
@@ -12,42 +12,48 @@ pub fn main() !void {
|
|
|
12
12
|
.debit_account_id = 1,
|
|
13
13
|
.credit_account_id = 2,
|
|
14
14
|
.user_data = 0,
|
|
15
|
-
.reserved =
|
|
15
|
+
.reserved = 0,
|
|
16
|
+
.pending_id = 0,
|
|
16
17
|
.timeout = std.time.ns_per_hour,
|
|
17
|
-
.
|
|
18
|
+
.ledger = 710,
|
|
19
|
+
.code = 1,
|
|
18
20
|
.flags = .{
|
|
19
|
-
.
|
|
21
|
+
.pending = true, // Set this transfer to be two-phase.
|
|
20
22
|
},
|
|
21
|
-
.amount =
|
|
23
|
+
.amount = 8000,
|
|
22
24
|
},
|
|
23
25
|
Transfer{
|
|
24
26
|
.id = 1002,
|
|
25
27
|
.debit_account_id = 1,
|
|
26
28
|
.credit_account_id = 2,
|
|
27
29
|
.user_data = 0,
|
|
28
|
-
.reserved =
|
|
30
|
+
.reserved = 0,
|
|
31
|
+
.pending_id = 0,
|
|
29
32
|
.timeout = std.time.ns_per_hour,
|
|
30
|
-
.
|
|
33
|
+
.ledger = 710,
|
|
34
|
+
.code = 1,
|
|
31
35
|
.flags = .{
|
|
32
|
-
.
|
|
36
|
+
.pending = true, // Set this transfer to be two-phase.
|
|
33
37
|
.linked = true, // Link this transfer with the next transfer 1003.
|
|
34
38
|
},
|
|
35
|
-
.amount =
|
|
39
|
+
.amount = 500,
|
|
36
40
|
},
|
|
37
41
|
Transfer{
|
|
38
42
|
.id = 1003,
|
|
39
43
|
.debit_account_id = 1,
|
|
40
44
|
.credit_account_id = 2,
|
|
41
45
|
.user_data = 0,
|
|
42
|
-
.reserved =
|
|
46
|
+
.reserved = 0,
|
|
47
|
+
.pending_id = 0,
|
|
43
48
|
.timeout = std.time.ns_per_hour,
|
|
44
|
-
.
|
|
49
|
+
.ledger = 710,
|
|
50
|
+
.code = 1,
|
|
45
51
|
.flags = .{
|
|
46
|
-
.
|
|
52
|
+
.pending = true, // Set this transfer to be two-phase.
|
|
47
53
|
// The last transfer in a linked chain has .linked set to false to close the chain.
|
|
48
54
|
// This transfer will succeed or fail together with transfer 1002 above.
|
|
49
55
|
},
|
|
50
|
-
.amount =
|
|
56
|
+
.amount = 500,
|
|
51
57
|
},
|
|
52
58
|
};
|
|
53
59
|
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
const tb = @import("tigerbeetle.zig");
|
|
2
|
+
const demo = @import("demo.zig");
|
|
3
|
+
|
|
4
|
+
const Transfer = tb.Transfer;
|
|
5
|
+
|
|
6
|
+
pub fn main() !void {
|
|
7
|
+
const commits = [_]Transfer{
|
|
8
|
+
Transfer{
|
|
9
|
+
.id = 2001,
|
|
10
|
+
.debit_account_id = 1,
|
|
11
|
+
.credit_account_id = 2,
|
|
12
|
+
.user_data = 0,
|
|
13
|
+
.reserved = 0,
|
|
14
|
+
.pending_id = 1001,
|
|
15
|
+
.timeout = 0,
|
|
16
|
+
.ledger = 0,// Honor original Transfer ledger.
|
|
17
|
+
.code = 0,// Honor original Transfer code.
|
|
18
|
+
.flags = .{ .post_pending_transfer = true }, // Post the pending two-phase transfer.
|
|
19
|
+
.amount = 0, // Inherit the amount from the pending transfer.
|
|
20
|
+
},
|
|
21
|
+
Transfer{
|
|
22
|
+
.id = 2002,
|
|
23
|
+
.debit_account_id = 1,
|
|
24
|
+
.credit_account_id = 2,
|
|
25
|
+
.user_data = 0,
|
|
26
|
+
.reserved = 0,
|
|
27
|
+
.pending_id = 1002,
|
|
28
|
+
.timeout = 0,
|
|
29
|
+
.ledger = 0,
|
|
30
|
+
.code = 0,
|
|
31
|
+
.flags = .{ .post_pending_transfer = true }, // Post the pending two-phase transfer.
|
|
32
|
+
.amount = 0, // Inherit the amount from the pending transfer.
|
|
33
|
+
},
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
try demo.request(.create_transfers, commits, demo.on_create_transfers);
|
|
37
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
const tb = @import("tigerbeetle.zig");
|
|
2
|
+
const demo = @import("demo.zig");
|
|
3
|
+
|
|
4
|
+
const Transfer = tb.Transfer;
|
|
5
|
+
|
|
6
|
+
pub fn main() !void {
|
|
7
|
+
const commits = [_]Transfer{
|
|
8
|
+
Transfer{
|
|
9
|
+
.id = 2003,
|
|
10
|
+
.debit_account_id = 1,
|
|
11
|
+
.credit_account_id = 2,
|
|
12
|
+
.user_data = 0,
|
|
13
|
+
.reserved = 0,
|
|
14
|
+
.pending_id = 1003,
|
|
15
|
+
.timeout = 0,
|
|
16
|
+
.ledger = 0,
|
|
17
|
+
.code = 0,
|
|
18
|
+
.flags = .{ .void_pending_transfer = true },
|
|
19
|
+
.amount = 0,
|
|
20
|
+
},
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
try demo.request(.create_transfers, commits, demo.on_create_transfers);
|
|
24
|
+
}
|
|
@@ -986,7 +986,7 @@ pub const IO = struct {
|
|
|
986
986
|
/// Detects whether the underlying file system for a given directory fd supports Direct I/O.
|
|
987
987
|
/// Not all Linux file systems support `O_DIRECT`, e.g. a shared macOS volume.
|
|
988
988
|
fn fs_supports_direct_io(dir_fd: std.os.fd_t) !bool {
|
|
989
|
-
if (!@hasDecl(std.os, "
|
|
989
|
+
if (!@hasDecl(std.os.O, "DIRECT")) return false;
|
|
990
990
|
|
|
991
991
|
const path = "fs_supports_direct_io";
|
|
992
992
|
const dir = std.fs.Dir{ .fd = dir_fd };
|
|
@@ -997,12 +997,12 @@ pub const IO = struct {
|
|
|
997
997
|
while (true) {
|
|
998
998
|
const res = os.system.openat(dir_fd, path, os.O.CLOEXEC | os.O.RDONLY | os.O.DIRECT, 0);
|
|
999
999
|
switch (os.linux.getErrno(res)) {
|
|
1000
|
-
|
|
1000
|
+
.SUCCESS => {
|
|
1001
1001
|
os.close(@intCast(os.fd_t, res));
|
|
1002
1002
|
return true;
|
|
1003
1003
|
},
|
|
1004
|
-
|
|
1005
|
-
|
|
1004
|
+
.INTR => continue,
|
|
1005
|
+
.INVAL => return false,
|
|
1006
1006
|
else => |err| return os.unexpectedErrno(err),
|
|
1007
1007
|
}
|
|
1008
1008
|
}
|
|
@@ -53,12 +53,28 @@ fn init(io: *IO, cluster: u32, replica: u8, dir_fd: os.fd_t) !void {
|
|
|
53
53
|
assert(filename.len == filename_len);
|
|
54
54
|
|
|
55
55
|
// TODO Expose data file size on the CLI.
|
|
56
|
-
|
|
56
|
+
const fd = try io.open_file(
|
|
57
57
|
dir_fd,
|
|
58
58
|
filename,
|
|
59
|
-
config.journal_size_max,
|
|
59
|
+
config.journal_size_max,
|
|
60
60
|
true,
|
|
61
61
|
);
|
|
62
|
+
std.os.close(fd);
|
|
63
|
+
|
|
64
|
+
const file = try (std.fs.Dir{ .fd = dir_fd }).openFile(filename, .{ .write = true });
|
|
65
|
+
defer file.close();
|
|
66
|
+
|
|
67
|
+
{
|
|
68
|
+
const write_size_max = 4 * 1024 * 1024;
|
|
69
|
+
var write: [write_size_max]u8 = undefined;
|
|
70
|
+
var offset: u64 = 0;
|
|
71
|
+
while (true) {
|
|
72
|
+
const write_size = vsr.format_journal(cluster, offset, &write);
|
|
73
|
+
if (write_size == 0) break;
|
|
74
|
+
try file.writeAll(write[0..write_size]);
|
|
75
|
+
offset += write_size;
|
|
76
|
+
}
|
|
77
|
+
}
|
|
62
78
|
|
|
63
79
|
log.info("initialized data file", .{});
|
|
64
80
|
}
|
|
@@ -91,7 +107,7 @@ fn start(
|
|
|
91
107
|
allocator,
|
|
92
108
|
config.accounts_max,
|
|
93
109
|
config.transfers_max,
|
|
94
|
-
config.
|
|
110
|
+
config.transfers_pending_max,
|
|
95
111
|
);
|
|
96
112
|
var storage = try Storage.init(config.journal_size_max, storage_fd, io);
|
|
97
113
|
var message_bus = try MessageBus.init(
|
|
@@ -25,8 +25,11 @@ pub const messages_max_replica = messages_max: {
|
|
|
25
25
|
sum += config.io_depth_read + config.io_depth_write; // Journal I/O
|
|
26
26
|
sum += config.clients_max; // Replica.client_table
|
|
27
27
|
sum += 1; // Replica.loopback_queue
|
|
28
|
-
sum += config.
|
|
29
|
-
|
|
28
|
+
sum += config.pipeline_max; // Replica.pipeline
|
|
29
|
+
// Replica.do_view_change_from_all_replicas quorum:
|
|
30
|
+
// Replica.recovery_response_quorum is only used for recovery and does not increase the limit.
|
|
31
|
+
// All other quorums are bitsets.
|
|
32
|
+
sum += config.replicas_max;
|
|
30
33
|
sum += config.connections_max; // Connection.recv_message
|
|
31
34
|
sum += config.connections_max * config.connection_send_queue_max_replica; // Connection.send_queue
|
|
32
35
|
sum += 1; // Handle bursts (e.g. Connection.parse_message)
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
const std = @import("std");
|
|
2
2
|
const assert = std.debug.assert;
|
|
3
3
|
|
|
4
|
-
/// A First In, First Out ring buffer holding at most `
|
|
5
|
-
pub fn RingBuffer(comptime T: type, comptime
|
|
4
|
+
/// A First In, First Out ring buffer holding at most `count_max` elements.
|
|
5
|
+
pub fn RingBuffer(comptime T: type, comptime count_max: usize) type {
|
|
6
6
|
return struct {
|
|
7
7
|
const Self = @This();
|
|
8
8
|
|
|
9
|
-
buffer: [
|
|
9
|
+
buffer: [count_max]T = undefined,
|
|
10
10
|
|
|
11
11
|
/// The index of the slot with the first item, if any.
|
|
12
12
|
index: usize = 0,
|
|
@@ -35,6 +35,15 @@ pub fn RingBuffer(comptime T: type, comptime size: usize) type {
|
|
|
35
35
|
return &self.buffer[(self.index + self.count - 1) % self.buffer.len];
|
|
36
36
|
}
|
|
37
37
|
|
|
38
|
+
pub inline fn get_ptr(self: *Self, index: usize) ?*T {
|
|
39
|
+
if (index < self.count) {
|
|
40
|
+
return &self.buffer[(self.index + index) % self.buffer.len];
|
|
41
|
+
} else {
|
|
42
|
+
assert(index < count_max);
|
|
43
|
+
return null;
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
38
47
|
pub inline fn next_tail(self: Self) ?T {
|
|
39
48
|
if (self.full()) return null;
|
|
40
49
|
return self.buffer[(self.index + self.count) % self.buffer.len];
|
|
@@ -56,6 +65,10 @@ pub fn RingBuffer(comptime T: type, comptime size: usize) type {
|
|
|
56
65
|
self.count += 1;
|
|
57
66
|
}
|
|
58
67
|
|
|
68
|
+
pub inline fn retreat_tail(self: *Self) void {
|
|
69
|
+
self.count -= 1;
|
|
70
|
+
}
|
|
71
|
+
|
|
59
72
|
/// Returns whether the ring buffer is completely full.
|
|
60
73
|
pub inline fn full(self: Self) bool {
|
|
61
74
|
return self.count == self.buffer.len;
|
|
@@ -90,6 +103,13 @@ pub fn RingBuffer(comptime T: type, comptime size: usize) type {
|
|
|
90
103
|
return result;
|
|
91
104
|
}
|
|
92
105
|
|
|
106
|
+
/// Remove and return the last item, if any.
|
|
107
|
+
pub fn pop_tail(self: *Self) ?T {
|
|
108
|
+
const result = self.tail() orelse return null;
|
|
109
|
+
self.retreat_tail();
|
|
110
|
+
return result;
|
|
111
|
+
}
|
|
112
|
+
|
|
93
113
|
pub const Iterator = struct {
|
|
94
114
|
ring: *Self,
|
|
95
115
|
count: usize = 0,
|
|
@@ -210,15 +230,21 @@ test "RingBuffer: push/pop high level interface" {
|
|
|
210
230
|
|
|
211
231
|
try testing.expect(!fifo.full());
|
|
212
232
|
try testing.expect(fifo.empty());
|
|
233
|
+
try testing.expectEqual(@as(?*u32, null), fifo.get_ptr(0));
|
|
234
|
+
try testing.expectEqual(@as(?*u32, null), fifo.get_ptr(1));
|
|
235
|
+
try testing.expectEqual(@as(?*u32, null), fifo.get_ptr(2));
|
|
213
236
|
|
|
214
237
|
try fifo.push(1);
|
|
215
238
|
try testing.expectEqual(@as(?u32, 1), fifo.head());
|
|
239
|
+
try testing.expectEqual(@as(u32, 1), fifo.get_ptr(0).?.*);
|
|
240
|
+
try testing.expectEqual(@as(?*u32, null), fifo.get_ptr(1));
|
|
216
241
|
|
|
217
242
|
try testing.expect(!fifo.full());
|
|
218
243
|
try testing.expect(!fifo.empty());
|
|
219
244
|
|
|
220
245
|
try fifo.push(2);
|
|
221
246
|
try testing.expectEqual(@as(?u32, 1), fifo.head());
|
|
247
|
+
try testing.expectEqual(@as(u32, 2), fifo.get_ptr(1).?.*);
|
|
222
248
|
|
|
223
249
|
try fifo.push(3);
|
|
224
250
|
try testing.expectError(error.NoSpaceLeft, fifo.push(4));
|
|
@@ -228,6 +254,9 @@ test "RingBuffer: push/pop high level interface" {
|
|
|
228
254
|
|
|
229
255
|
try testing.expectEqual(@as(?u32, 1), fifo.head());
|
|
230
256
|
try testing.expectEqual(@as(?u32, 1), fifo.pop());
|
|
257
|
+
try testing.expectEqual(@as(u32, 2), fifo.get_ptr(0).?.*);
|
|
258
|
+
try testing.expectEqual(@as(u32, 3), fifo.get_ptr(1).?.*);
|
|
259
|
+
try testing.expectEqual(@as(?*u32, null), fifo.get_ptr(2));
|
|
231
260
|
|
|
232
261
|
try testing.expect(!fifo.full());
|
|
233
262
|
try testing.expect(!fifo.empty());
|
|
@@ -242,3 +271,19 @@ test "RingBuffer: push/pop high level interface" {
|
|
|
242
271
|
try testing.expect(!fifo.full());
|
|
243
272
|
try testing.expect(fifo.empty());
|
|
244
273
|
}
|
|
274
|
+
|
|
275
|
+
test "RingBuffer: pop_tail" {
|
|
276
|
+
var lifo = RingBuffer(u32, 3){};
|
|
277
|
+
try lifo.push(1);
|
|
278
|
+
try lifo.push(2);
|
|
279
|
+
try lifo.push(3);
|
|
280
|
+
try testing.expect(lifo.full());
|
|
281
|
+
|
|
282
|
+
try testing.expectEqual(@as(?u32, 3), lifo.pop_tail());
|
|
283
|
+
try testing.expectEqual(@as(?u32, 1), lifo.head());
|
|
284
|
+
try testing.expectEqual(@as(?u32, 2), lifo.pop_tail());
|
|
285
|
+
try testing.expectEqual(@as(?u32, 1), lifo.head());
|
|
286
|
+
try testing.expectEqual(@as(?u32, 1), lifo.pop_tail());
|
|
287
|
+
try testing.expectEqual(@as(?u32, null), lifo.pop_tail());
|
|
288
|
+
try testing.expect(lifo.empty());
|
|
289
|
+
}
|
|
@@ -20,6 +20,8 @@ const output = std.log.scoped(.state_checker);
|
|
|
20
20
|
/// This will run much slower but will trace all logic across the cluster.
|
|
21
21
|
const log_state_transitions_only = builtin.mode != .Debug;
|
|
22
22
|
|
|
23
|
+
const log_health = std.log.scoped(.health);
|
|
24
|
+
|
|
23
25
|
/// You can fine tune your log levels even further (debug/info/notice/warn/err/crit/alert/emerg):
|
|
24
26
|
pub const log_level: std.log.Level = if (log_state_transitions_only) .info else .debug;
|
|
25
27
|
|
|
@@ -64,7 +66,6 @@ pub fn main() !void {
|
|
|
64
66
|
const node_count = replica_count + client_count;
|
|
65
67
|
|
|
66
68
|
const ticks_max = 100_000_000;
|
|
67
|
-
const transitions_max = config.journal_size_max / config.message_size_max;
|
|
68
69
|
const request_probability = 1 + random.uintLessThan(u8, 99);
|
|
69
70
|
const idle_on_probability = random.uintLessThan(u8, 20);
|
|
70
71
|
const idle_off_probability = 10 + random.uintLessThan(u8, 10);
|
|
@@ -101,10 +102,16 @@ pub fn main() !void {
|
|
|
101
102
|
.read_latency_min = random.uintLessThan(u16, 3),
|
|
102
103
|
.read_latency_mean = 3 + random.uintLessThan(u16, 10),
|
|
103
104
|
.write_latency_min = random.uintLessThan(u16, 3),
|
|
104
|
-
.write_latency_mean = 3 + random.uintLessThan(u16,
|
|
105
|
+
.write_latency_mean = 3 + random.uintLessThan(u16, 100),
|
|
105
106
|
.read_fault_probability = random.uintLessThan(u8, 10),
|
|
106
107
|
.write_fault_probability = random.uintLessThan(u8, 10),
|
|
107
108
|
},
|
|
109
|
+
.health_options = .{
|
|
110
|
+
.crash_probability = 0.0001,
|
|
111
|
+
.crash_stability = random.uintLessThan(u32, 1_000),
|
|
112
|
+
.restart_probability = 0.01,
|
|
113
|
+
.restart_stability = random.uintLessThan(u32, 1_000),
|
|
114
|
+
},
|
|
108
115
|
});
|
|
109
116
|
defer cluster.destroy();
|
|
110
117
|
|
|
@@ -143,6 +150,10 @@ pub fn main() !void {
|
|
|
143
150
|
\\ write_latency_mean={}
|
|
144
151
|
\\ read_fault_probability={}%
|
|
145
152
|
\\ write_fault_probability={}%
|
|
153
|
+
\\ crash_probability={d}%
|
|
154
|
+
\\ crash_stability={} ticks
|
|
155
|
+
\\ restart_probability={d}%
|
|
156
|
+
\\ restart_stability={} ticks
|
|
146
157
|
\\
|
|
147
158
|
, .{
|
|
148
159
|
seed,
|
|
@@ -169,26 +180,105 @@ pub fn main() !void {
|
|
|
169
180
|
cluster.options.storage_options.write_latency_mean,
|
|
170
181
|
cluster.options.storage_options.read_fault_probability,
|
|
171
182
|
cluster.options.storage_options.write_fault_probability,
|
|
183
|
+
cluster.options.health_options.crash_probability * 100,
|
|
184
|
+
cluster.options.health_options.crash_stability,
|
|
185
|
+
cluster.options.health_options.restart_probability * 100,
|
|
186
|
+
cluster.options.health_options.restart_stability,
|
|
172
187
|
});
|
|
173
188
|
|
|
174
189
|
var requests_sent: u64 = 0;
|
|
175
190
|
var idle = false;
|
|
176
191
|
|
|
192
|
+
// The minimum number of healthy replicas required for a crashed replica to be able to recover.
|
|
193
|
+
const replica_normal_min = replicas: {
|
|
194
|
+
if (replica_count == 1) {
|
|
195
|
+
// A cluster of 1 can crash safely (as long as there is no disk corruption) since it
|
|
196
|
+
// does not run the recovery protocol.
|
|
197
|
+
break :replicas 0;
|
|
198
|
+
} else {
|
|
199
|
+
break :replicas cluster.replicas[0].quorum_view_change;
|
|
200
|
+
}
|
|
201
|
+
};
|
|
202
|
+
|
|
203
|
+
// Disable most faults at startup, so that the replicas don't get stuck in recovery mode.
|
|
204
|
+
for (cluster.storages) |*storage, i| {
|
|
205
|
+
storage.faulty = replica_normal_min <= i;
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
// TODO When storage is supported, run more transitions than fit in the journal.
|
|
209
|
+
const transitions_max = config.journal_slot_count / 2;
|
|
177
210
|
var tick: u64 = 0;
|
|
178
211
|
while (tick < ticks_max) : (tick += 1) {
|
|
179
|
-
|
|
212
|
+
const health_options = &cluster.options.health_options;
|
|
213
|
+
// The maximum number of replicas that can crash, with the cluster still able to recover.
|
|
214
|
+
var crashes = cluster.replica_normal_count() -| replica_normal_min;
|
|
215
|
+
|
|
216
|
+
for (cluster.storages) |*storage, replica| {
|
|
217
|
+
if (cluster.replicas[replica].journal.recovered) {
|
|
218
|
+
|
|
219
|
+
// TODO Remove this workaround when VSR recovery protocol is disabled.
|
|
220
|
+
// When only the minimum number of replicas are healthy (no more crashes allowed),
|
|
221
|
+
// disable storage faults on all healthy replicas.
|
|
222
|
+
//
|
|
223
|
+
// This is a workaround to avoid the deadlock that occurs when (for example) in a
|
|
224
|
+
// cluster of 3 replicas, one is down, another has a corrupt prepare, and the last does
|
|
225
|
+
// not have the prepare. The two healthy replicas can never complete a view change,
|
|
226
|
+
// because two replicas are not enough to nack, and the unhealthy replica cannot
|
|
227
|
+
// complete the VSR recovery protocol either.
|
|
228
|
+
if (cluster.health[replica] == .up and crashes == 0) {
|
|
229
|
+
storage.faulty = false;
|
|
230
|
+
} else {
|
|
231
|
+
// When a journal recovers for the first time, enable its storage faults.
|
|
232
|
+
// Future crashes will recover in the presence of faults.
|
|
233
|
+
storage.faulty = true;
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
storage.tick();
|
|
237
|
+
}
|
|
180
238
|
|
|
181
|
-
for (cluster.replicas) |*replica
|
|
182
|
-
replica.
|
|
183
|
-
|
|
239
|
+
for (cluster.replicas) |*replica| {
|
|
240
|
+
switch (cluster.health[replica.replica]) {
|
|
241
|
+
.up => |*ticks| {
|
|
242
|
+
ticks.* -|= 1;
|
|
243
|
+
replica.tick();
|
|
244
|
+
cluster.state_checker.check_state(replica.replica);
|
|
245
|
+
|
|
246
|
+
if (ticks.* != 0) continue;
|
|
247
|
+
if (crashes == 0) continue;
|
|
248
|
+
if (cluster.storages[replica.replica].writes.count() == 0) {
|
|
249
|
+
if (!chance_f64(random, health_options.crash_probability)) continue;
|
|
250
|
+
} else {
|
|
251
|
+
if (!chance_f64(random, health_options.crash_probability * 10.0)) continue;
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
if (!try cluster.crash_replica(replica.replica)) continue;
|
|
255
|
+
log_health.debug("crash replica={}", .{replica.replica});
|
|
256
|
+
crashes -= 1;
|
|
257
|
+
},
|
|
258
|
+
.down => |*ticks| {
|
|
259
|
+
ticks.* -|= 1;
|
|
260
|
+
// Keep ticking the time so that it won't have diverged too far to synchronize
|
|
261
|
+
// when the replica restarts.
|
|
262
|
+
replica.clock.time.tick();
|
|
263
|
+
assert(replica.status == .recovering);
|
|
264
|
+
if (ticks.* == 0 and chance_f64(random, health_options.restart_probability)) {
|
|
265
|
+
cluster.health[replica.replica] = .{ .up = health_options.restart_stability };
|
|
266
|
+
log_health.debug("restart replica={}", .{replica.replica});
|
|
267
|
+
}
|
|
268
|
+
},
|
|
269
|
+
}
|
|
184
270
|
}
|
|
185
271
|
|
|
186
|
-
cluster.network.packet_simulator.tick();
|
|
272
|
+
cluster.network.packet_simulator.tick(cluster.health);
|
|
187
273
|
|
|
188
274
|
for (cluster.clients) |*client| client.tick();
|
|
189
275
|
|
|
190
276
|
if (cluster.state_checker.transitions == transitions_max) {
|
|
191
|
-
if (cluster.state_checker.convergence()
|
|
277
|
+
if (cluster.state_checker.convergence() and
|
|
278
|
+
cluster.replica_up_count() == replica_count)
|
|
279
|
+
{
|
|
280
|
+
break;
|
|
281
|
+
}
|
|
192
282
|
continue;
|
|
193
283
|
} else {
|
|
194
284
|
assert(cluster.state_checker.transitions < transitions_max);
|
|
@@ -222,6 +312,12 @@ fn chance(random: std.rand.Random, p: u8) bool {
|
|
|
222
312
|
return random.uintLessThan(u8, 100) < p;
|
|
223
313
|
}
|
|
224
314
|
|
|
315
|
+
/// Returns true with probability `p`, where `p` is a fraction (0.0 = never, 1.0 = always) rather than a percentage, else false.
|
|
316
|
+
fn chance_f64(random: std.rand.Random, p: f64) bool {
|
|
317
|
+
assert(p <= 100.0);
|
|
318
|
+
return random.float(f64) < p;
|
|
319
|
+
}
|
|
320
|
+
|
|
225
321
|
/// Returns the next argument for the simulator or null (if none available)
|
|
226
322
|
fn args_next(args: *std.process.ArgIterator, allocator: std.mem.Allocator) ?[:0]const u8 {
|
|
227
323
|
const err_or_bytes = args.next(allocator) orelse return null;
|