tigerbeetle-node 0.3.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/README.md +21 -7
  2. package/dist/benchmark.js +1 -1
  3. package/dist/benchmark.js.map +1 -1
  4. package/dist/index.d.ts +22 -20
  5. package/dist/index.js +40 -18
  6. package/dist/index.js.map +1 -1
  7. package/dist/test.js +13 -1
  8. package/dist/test.js.map +1 -1
  9. package/package.json +12 -12
  10. package/scripts/postinstall.sh +2 -2
  11. package/src/benchmark.ts +4 -4
  12. package/src/index.ts +35 -9
  13. package/src/node.zig +139 -28
  14. package/src/test.ts +19 -5
  15. package/src/tigerbeetle/scripts/benchmark.sh +10 -3
  16. package/src/tigerbeetle/scripts/install.sh +2 -2
  17. package/src/tigerbeetle/scripts/install_zig.bat +109 -0
  18. package/src/tigerbeetle/scripts/install_zig.sh +21 -4
  19. package/src/tigerbeetle/scripts/vopr.bat +48 -0
  20. package/src/tigerbeetle/scripts/vopr.sh +33 -0
  21. package/src/tigerbeetle/src/benchmark.zig +74 -42
  22. package/src/tigerbeetle/src/cli.zig +136 -83
  23. package/src/tigerbeetle/src/config.zig +80 -26
  24. package/src/tigerbeetle/src/demo.zig +101 -78
  25. package/src/tigerbeetle/src/demo_01_create_accounts.zig +2 -7
  26. package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +2 -7
  27. package/src/tigerbeetle/src/demo_03_create_transfers.zig +2 -7
  28. package/src/tigerbeetle/src/demo_04_create_transfers_two_phase_commit.zig +2 -5
  29. package/src/tigerbeetle/src/demo_05_accept_transfers.zig +2 -7
  30. package/src/tigerbeetle/src/demo_06_reject_transfers.zig +2 -7
  31. package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +8 -0
  32. package/src/tigerbeetle/src/fifo.zig +20 -11
  33. package/src/tigerbeetle/src/io.zig +35 -22
  34. package/src/tigerbeetle/src/io_darwin.zig +701 -0
  35. package/src/tigerbeetle/src/main.zig +72 -25
  36. package/src/tigerbeetle/src/message_bus.zig +379 -456
  37. package/src/tigerbeetle/src/message_pool.zig +3 -3
  38. package/src/tigerbeetle/src/ring_buffer.zig +192 -37
  39. package/src/tigerbeetle/src/simulator.zig +317 -0
  40. package/src/tigerbeetle/src/state_machine.zig +846 -38
  41. package/src/tigerbeetle/src/storage.zig +488 -90
  42. package/src/tigerbeetle/src/test/cluster.zig +221 -0
  43. package/src/tigerbeetle/src/test/message_bus.zig +92 -0
  44. package/src/tigerbeetle/src/test/network.zig +182 -0
  45. package/src/tigerbeetle/src/test/packet_simulator.zig +371 -0
  46. package/src/tigerbeetle/src/test/state_checker.zig +142 -0
  47. package/src/tigerbeetle/src/test/state_machine.zig +71 -0
  48. package/src/tigerbeetle/src/test/storage.zig +375 -0
  49. package/src/tigerbeetle/src/test/time.zig +84 -0
  50. package/src/tigerbeetle/src/tigerbeetle.zig +6 -3
  51. package/src/tigerbeetle/src/time.zig +65 -0
  52. package/src/tigerbeetle/src/unit_tests.zig +14 -0
  53. package/src/tigerbeetle/src/vsr/client.zig +519 -0
  54. package/src/tigerbeetle/src/vsr/clock.zig +829 -0
  55. package/src/tigerbeetle/src/vsr/journal.zig +1368 -0
  56. package/src/tigerbeetle/src/vsr/marzullo.zig +306 -0
  57. package/src/tigerbeetle/src/vsr/replica.zig +4248 -0
  58. package/src/tigerbeetle/src/vsr.zig +601 -0
  59. package/src/tigerbeetle/LICENSE +0 -177
  60. package/src/tigerbeetle/README.md +0 -116
  61. package/src/tigerbeetle/src/client.zig +0 -319
  62. package/src/tigerbeetle/src/concurrent_ranges.zig +0 -162
  63. package/src/tigerbeetle/src/fixed_array_list.zig +0 -53
  64. package/src/tigerbeetle/src/io_async.zig +0 -600
  65. package/src/tigerbeetle/src/journal.zig +0 -567
  66. package/src/tigerbeetle/src/test_client.zig +0 -41
  67. package/src/tigerbeetle/src/test_main.zig +0 -118
  68. package/src/tigerbeetle/src/test_message_bus.zig +0 -132
  69. package/src/tigerbeetle/src/vr/journal.zig +0 -672
  70. package/src/tigerbeetle/src/vr/replica.zig +0 -3061
  71. package/src/tigerbeetle/src/vr.zig +0 -374
@@ -2,167 +2,220 @@ const std = @import("std");
2
2
  const assert = std.debug.assert;
3
3
  const fmt = std.fmt;
4
4
  const mem = std.mem;
5
+ const meta = std.meta;
5
6
  const net = std.net;
6
7
  const os = std.os;
7
8
 
8
9
  const config = @import("config.zig");
9
- const vr = @import("vr.zig");
10
+ const vsr = @import("vsr.zig");
10
11
 
11
12
  const usage = fmt.comptimePrint(
12
- \\Usage: tigerbeetle [options]
13
+ \\Usage:
13
14
  \\
14
- \\ -h, --help
15
+ \\ tigerbeetle [-h | --help]
16
+ \\
17
+ \\ tigerbeetle init [--directory=<path>] --cluster=<integer> --replica=<index>
18
+ \\
19
+ \\ tigerbeetle start [--directory=<path>] --cluster=<integer> --replica=<index> --addresses=<addresses>
20
+ \\
21
+ \\Commands:
22
+ \\
23
+ \\ init Create a new .tigerbeetle data file. Requires the --cluster and
24
+ \\ --replica options. The file will be created in the path set by
25
+ \\ the --directory option if provided. Otherwise, it will be created in
26
+ \\ the default {[default_directory]s}.
27
+ \\
28
+ \\ start Run a TigerBeetle replica as part of the cluster specified by the
29
+ \\ --cluster, --replica, and --addresses options. This requires an
30
+ \\ existing .tigerbeetle data file, either in the default
31
+ \\ {[default_directory]s} or the path set with --directory.
32
+ \\
33
+ \\Options:
34
+ \\
35
+ \\ -h, --help
15
36
  \\ Print this help message and exit.
16
37
  \\
17
- \\Required Configuration Options:
38
+ \\ --directory=<path>
39
+ \\ Set the directory used to store .tigerbeetle data files. If this option is
40
+ \\ omitted, the default {[default_directory]s} will be used.
41
+ \\
42
+ \\ --cluster=<integer>
43
+ \\ Set the cluster ID to the provided 32-bit unsigned integer.
18
44
  \\
19
- \\ --cluster-id=<hex id>
20
- \\ Set the cluster ID to the provided non-zero 128-bit hexadecimal number.
45
+ \\ --replica=<index>
46
+ \\ Set the zero-based index that will be used for this replica process.
47
+ \\ The value of this option will be interpreted as an index into the --addresses array.
21
48
  \\
22
- \\ --replica-addresses=<addresses>
49
+ \\ --addresses=<addresses>
23
50
  \\ Set the addresses of all replicas in the cluster. Accepts a
24
51
  \\ comma-separated list of IPv4 addresses with port numbers.
25
52
  \\ Either the IPv4 address or port number, but not both, may be
26
53
  \\ ommited in which case a default of {[default_address]s} or {[default_port]d}
27
54
  \\ will be used.
28
55
  \\
29
- \\ --replica-index=<index>
30
- \\ Set the address in the array passed to the --replica-addresses option that
31
- \\ will be used for this replica process. The value of this option is
32
- \\ interpreted as a zero-based index into the array.
33
- \\
34
56
  \\Examples:
35
57
  \\
36
- \\ tigerbeetle --cluster-id=1a2b3c --replica-addresses=127.0.0.1:3003,127.0.0.1:3001,127.0.0.1:3002 --replica-index=0
58
+ \\ tigerbeetle init --cluster=0 --replica=0 --directory=/var/lib/tigerbeetle
59
+ \\ tigerbeetle init --cluster=0 --replica=1 --directory=/var/lib/tigerbeetle
60
+ \\ tigerbeetle init --cluster=0 --replica=2 --directory=/var/lib/tigerbeetle
37
61
  \\
38
- \\ tigerbeetle --cluster-id=1a2b3c --replica-addresses=3003,3001,3002 --replica-index=1
62
+ \\ tigerbeetle start --cluster=0 --replica=0 --addresses=127.0.0.1:3003,127.0.0.1:3001,127.0.0.1:3002
63
+ \\ tigerbeetle start --cluster=0 --replica=1 --addresses=3003,3001,3002
64
+ \\ tigerbeetle start --cluster=0 --replica=2 --addresses=3003,3001,3002
39
65
  \\
40
- \\ tigerbeetle --cluster-id=1a2b3c --replica-addresses=192.168.0.1,192.168.0.2,192.168.0.3 --replica-index=2
66
+ \\ tigerbeetle start --cluster=1 --replica=0 --addresses=192.168.0.1,192.168.0.2,192.168.0.3
41
67
  \\
42
68
  , .{
69
+ .default_directory = config.directory,
43
70
  .default_address = config.address,
44
71
  .default_port = config.port,
45
72
  });
46
73
 
47
- pub const Args = struct {
48
- cluster: u128,
49
- configuration: []net.Address,
50
- replica: u16,
74
+ pub const Command = union(enum) {
75
+ init: struct {
76
+ cluster: u32,
77
+ replica: u8,
78
+ dir_fd: os.fd_t,
79
+ },
80
+ start: struct {
81
+ cluster: u32,
82
+ replica: u8,
83
+ addresses: []net.Address,
84
+ dir_fd: os.fd_t,
85
+ },
51
86
  };
52
87
 
53
88
  /// Parse the command line arguments passed to the tigerbeetle binary.
54
89
  /// Exits the program with a non-zero exit code if an error is found.
55
- pub fn parse_args(allocator: *std.mem.Allocator) Args {
90
+ pub fn parse_args(allocator: *std.mem.Allocator) Command {
56
91
  var maybe_cluster: ?[]const u8 = null;
57
- var maybe_configuration: ?[]const u8 = null;
58
92
  var maybe_replica: ?[]const u8 = null;
93
+ var maybe_addresses: ?[]const u8 = null;
94
+ var maybe_directory: ?[:0]const u8 = null;
59
95
 
60
96
  var args = std.process.args();
61
97
  // Skip argv[0] which is the name of this executable
62
98
  _ = args.nextPosix();
99
+
100
+ const raw_command = args.nextPosix() orelse
101
+ fatal("no command provided, expected 'start' or 'init'", .{});
102
+ if (mem.eql(u8, raw_command, "-h") or mem.eql(u8, raw_command, "--help")) {
103
+ std.io.getStdOut().writeAll(usage) catch os.exit(1);
104
+ os.exit(0);
105
+ }
106
+ const command = meta.stringToEnum(meta.Tag(Command), raw_command) orelse
107
+ fatal("unknown command '{s}', expected 'start' or 'init'", .{raw_command});
108
+
63
109
  while (args.nextPosix()) |arg| {
64
- if (mem.startsWith(u8, arg, "--cluster-id")) {
65
- maybe_cluster = parse_flag("--cluster-id", arg);
66
- } else if (mem.startsWith(u8, arg, "--replica-addresses")) {
67
- maybe_configuration = parse_flag("--replica-addresses", arg);
68
- } else if (mem.startsWith(u8, arg, "--replica-index")) {
69
- maybe_replica = parse_flag("--replica-index", arg);
110
+ if (mem.startsWith(u8, arg, "--cluster")) {
111
+ maybe_cluster = parse_flag("--cluster", arg);
112
+ } else if (mem.startsWith(u8, arg, "--replica")) {
113
+ maybe_replica = parse_flag("--replica", arg);
114
+ } else if (mem.startsWith(u8, arg, "--addresses")) {
115
+ maybe_addresses = parse_flag("--addresses", arg);
116
+ } else if (mem.startsWith(u8, arg, "--directory")) {
117
+ maybe_directory = parse_flag("--directory", arg);
70
118
  } else if (mem.eql(u8, arg, "-h") or mem.eql(u8, arg, "--help")) {
71
119
  std.io.getStdOut().writeAll(usage) catch os.exit(1);
72
120
  os.exit(0);
73
121
  } else {
74
- print_error_exit("unexpected argument: '{s}'", .{arg});
122
+ fatal("unexpected argument: '{s}'", .{arg});
75
123
  }
76
124
  }
77
125
 
78
- const raw_cluster = maybe_cluster orelse
79
- print_error_exit("required argument: --cluster-id", .{});
80
- const raw_configuration = maybe_configuration orelse
81
- print_error_exit("required argument: --replica-addresses", .{});
82
- const raw_replica = maybe_replica orelse
83
- print_error_exit("required argument: --replica-index", .{});
126
+ const raw_cluster = maybe_cluster orelse fatal("required argument: --cluster", .{});
127
+ const raw_replica = maybe_replica orelse fatal("required argument: --replica", .{});
84
128
 
85
129
  const cluster = parse_cluster(raw_cluster);
86
- const configuration = parse_configuration(allocator, raw_configuration);
87
- const replica = parse_replica(raw_replica, @intCast(u16, configuration.len));
130
+ const replica = parse_replica(raw_replica);
88
131
 
89
- return .{
90
- .cluster = cluster,
91
- .configuration = configuration,
92
- .replica = replica,
93
- };
132
+ const dir_path = maybe_directory orelse config.directory;
133
+ const dir_fd = os.openZ(dir_path, os.O_CLOEXEC | os.O_RDONLY, 0) catch |err|
134
+ fatal("failed to open directory '{s}': {}", .{ dir_path, err });
135
+
136
+ switch (command) {
137
+ .init => {
138
+ if (maybe_addresses != null) {
139
+ fatal("--addresses: supported only by 'start' command", .{});
140
+ }
141
+
142
+ return .{ .init = .{
143
+ .cluster = cluster,
144
+ .replica = replica,
145
+ .dir_fd = dir_fd,
146
+ } };
147
+ },
148
+ .start => {
149
+ const raw_addresses = maybe_addresses orelse
150
+ fatal("required argument: --addresses", .{});
151
+ const addresses = parse_addresses(allocator, raw_addresses);
152
+
153
+ if (replica >= addresses.len) {
154
+ fatal("--replica: value greater than length of --addresses array", .{});
155
+ }
156
+
157
+ return .{ .start = .{
158
+ .cluster = cluster,
159
+ .replica = replica,
160
+ .addresses = addresses,
161
+ .dir_fd = dir_fd,
162
+ } };
163
+ },
164
+ }
94
165
  }
95
166
 
96
167
  /// Format and print an error message followed by the usage string to stderr,
97
168
  /// then exit with an exit code of 1.
98
- fn print_error_exit(comptime fmt_string: []const u8, args: anytype) noreturn {
169
+ fn fatal(comptime fmt_string: []const u8, args: anytype) noreturn {
99
170
  const stderr = std.io.getStdErr().writer();
100
- stderr.print("error: " ++ fmt_string ++ "\n\n" ++ usage, args) catch {};
171
+ stderr.print("error: " ++ fmt_string ++ "\n", args) catch {};
101
172
  os.exit(1);
102
173
  }
103
174
 
104
175
  /// Parse e.g. `--cluster=1a2b3c` into `1a2b3c` with error handling.
105
- fn parse_flag(comptime flag: []const u8, arg: []const u8) []const u8 {
176
+ fn parse_flag(comptime flag: []const u8, arg: [:0]const u8) [:0]const u8 {
106
177
  const value = arg[flag.len..];
107
178
  if (value.len < 2) {
108
- print_error_exit("{s} argument requires a value", .{flag});
179
+ fatal("{s} argument requires a value", .{flag});
109
180
  }
110
181
  if (value[0] != '=') {
111
- print_error_exit("expected '=' after {s} but found '{c}'", .{ flag, value[0] });
182
+ fatal("expected '=' after {s} but found '{c}'", .{ flag, value[0] });
112
183
  }
113
184
  return value[1..];
114
185
  }
115
186
 
116
- fn parse_cluster(raw_cluster: []const u8) u128 {
117
- const cluster = fmt.parseUnsigned(u128, raw_cluster, 16) catch |err| switch (err) {
118
- error.Overflow => print_error_exit(
119
- \\--cluster-id: value does not fit into a 128-bit unsigned integer
120
- , .{}),
121
- error.InvalidCharacter => print_error_exit(
122
- \\--cluster-id: value contains an invalid character
123
- , .{}),
187
+ fn parse_cluster(raw_cluster: []const u8) u32 {
188
+ const cluster = fmt.parseUnsigned(u32, raw_cluster, 10) catch |err| switch (err) {
189
+ error.Overflow => fatal("--cluster: value exceeds a 32-bit unsigned integer", .{}),
190
+ error.InvalidCharacter => fatal("--cluster: value contains an invalid character", .{}),
124
191
  };
125
- if (cluster == 0) {
126
- print_error_exit("--cluster-id: a value of 0 is not permitted", .{});
127
- }
128
192
  return cluster;
129
193
  }
130
194
 
131
- /// Parse and allocate the configuration returning a slice into that array.
132
- fn parse_configuration(allocator: *std.mem.Allocator, raw_configuration: []const u8) []net.Address {
133
- return vr.parse_configuration(allocator, raw_configuration) catch |err| switch (err) {
134
- error.AddressHasTrailingComma => {
135
- print_error_exit("--replica-addresses: invalid trailing comma", .{});
136
- },
195
+ /// Parse and allocate the addresses returning a slice into that array.
196
+ fn parse_addresses(allocator: *std.mem.Allocator, raw_addresses: []const u8) []net.Address {
197
+ return vsr.parse_addresses(allocator, raw_addresses) catch |err| switch (err) {
198
+ error.AddressHasTrailingComma => fatal("--addresses: invalid trailing comma", .{}),
137
199
  error.AddressLimitExceeded => {
138
- print_error_exit("--replica-addresses: too many addresses, at most {d} are allowed", .{
200
+ fatal("--addresses: too many addresses, at most {d} are allowed", .{
139
201
  config.replicas_max,
140
202
  });
141
203
  },
142
204
  error.AddressHasMoreThanOneColon => {
143
- print_error_exit("--replica-addresses: invalid address with more than one colon", .{});
205
+ fatal("--addresses: invalid address with more than one colon", .{});
144
206
  },
145
- error.PortOverflow => print_error_exit("--replica-addresses: port exceeds 65535", .{}),
146
- error.PortInvalid => print_error_exit("--replica-addresses: invalid port", .{}),
147
- error.AddressInvalid => print_error_exit("--replica-addresses: invalid IPv4 address", .{}),
148
- error.OutOfMemory => print_error_exit("--replica-addresses: out of memory", .{}),
207
+ error.PortOverflow => fatal("--addresses: port exceeds 65535", .{}),
208
+ error.PortInvalid => fatal("--addresses: invalid port", .{}),
209
+ error.AddressInvalid => fatal("--addresses: invalid IPv4 address", .{}),
210
+ error.OutOfMemory => fatal("--addresses: out of memory", .{}),
149
211
  };
150
212
  }
151
213
 
152
- fn parse_replica(raw_replica: []const u8, configuration_len: u16) u16 {
153
- comptime assert(config.replicas_max <= std.math.maxInt(u16));
154
- const replica = fmt.parseUnsigned(u16, raw_replica, 10) catch |err| switch (err) {
155
- error.Overflow => print_error_exit(
156
- \\--replica-index: value greater than length of address array
157
- , .{}),
158
- error.InvalidCharacter => print_error_exit(
159
- \\--replica-index: value contains an invalid character
160
- , .{}),
214
+ fn parse_replica(raw_replica: []const u8) u8 {
215
+ comptime assert(config.replicas_max <= std.math.maxInt(u8));
216
+ const replica = fmt.parseUnsigned(u8, raw_replica, 10) catch |err| switch (err) {
217
+ error.Overflow => fatal("--replica: value exceeds an 8-bit unsigned integer", .{}),
218
+ error.InvalidCharacter => fatal("--replica: value contains an invalid character", .{}),
161
219
  };
162
- if (replica >= configuration_len) {
163
- print_error_exit(
164
- \\--replica-index: value greater than length of address array
165
- , .{});
166
- }
167
220
  return replica;
168
221
  }
@@ -5,28 +5,35 @@ pub const deployment_environment = .development;
5
5
  pub const log_level = 6;
6
6
 
7
7
  /// The maximum number of replicas allowed in a cluster.
8
- pub const replicas_max = 15;
8
+ pub const replicas_max = 6;
9
9
 
10
- /// The minimum number of nodes required to form quorums for leader election or replication:
11
- /// Majority quorums are only required across leader election and replication phases (not within).
12
- /// As per Flexible Paxos, provided quorum_leader_election + quorum_replication > cluster_nodes:
13
- /// 1. you may increase quorum_leader_election above a majority, so that
14
- /// 2. you can decrease quorum_replication below a majority, to optimize the common case.
10
+ /// The maximum number of clients allowed per cluster, where each client has a unique 128-bit ID.
11
+ /// This impacts the amount of memory allocated at initialization by the server.
12
+ /// This determines the size of the VR client table used to cache replies to clients by client ID.
13
+ /// Each client has one entry in the VR client table to store the latest `message_size_max` reply.
14
+ pub const clients_max = 32;
15
+
16
+ /// The minimum number of nodes required to form a quorum for replication:
17
+ /// Majority quorums are only required across view change and replication phases (not within).
18
+ /// As per Flexible Paxos, provided `quorum_replication + quorum_view_change > replicas`:
19
+ /// 1. you may increase `quorum_view_change` above a majority, so that
20
+ /// 2. you can decrease `quorum_replication` below a majority, to optimize the common case.
15
21
  /// This improves latency by reducing the number of nodes required for synchronous replication.
16
22
  /// This reduces redundancy only in the short term, asynchronous replication will still continue.
17
- pub const quorum_leader_election = -1;
18
- pub const quorum_replication = 2;
23
+ /// The size of the replication quorum is limited to the minimum of this value and actual majority.
24
+ /// The size of the view change quorum will then be automatically inferred from quorum_replication.
25
+ pub const quorum_replication_max = 3;
19
26
 
20
- /// The default server port to listen on if not specified in `--replica-addresses`:
27
+ /// The default server port to listen on if not specified in `--addresses`:
21
28
  pub const port = 3001;
22
29
 
23
- /// The default network interface address to listen on if not specified in `--replica-addresses`:
30
+ /// The default network interface address to listen on if not specified in `--addresses`:
24
31
  /// WARNING: Binding to all interfaces with "0.0.0.0" is dangerous and opens the server to anyone.
25
32
  /// Bind to the "127.0.0.1" loopback address to accept local connections as a safe default only.
26
33
  pub const address = "127.0.0.1";
27
34
 
28
- /// Where journal files should be persisted:
29
- pub const data_directory = "/var/lib/tigerbeetle";
35
+ /// Where data files should be persisted by default:
36
+ pub const directory = "/var/lib/tigerbeetle";
30
37
 
31
38
  /// The maximum number of accounts to store in memory:
32
39
  /// This impacts the amount of memory allocated at initialization by the server.
@@ -54,7 +61,7 @@ pub const commits_max = transfers_max;
54
61
  /// This also enables us to detect filesystem inode corruption that would change the journal size.
55
62
  pub const journal_size_max = switch (deployment_environment) {
56
63
  .production => 128 * 1024 * 1024 * 1024,
57
- else => 256 * 1024 * 1024,
64
+ else => 128 * 1024 * 1024,
58
65
  };
59
66
 
60
67
  /// The maximum number of batch entries in the journal file:
@@ -66,33 +73,37 @@ pub const journal_headers_max = switch (deployment_environment) {
66
73
  else => 16384,
67
74
  };
68
75
 
69
- /// The maximum number of connections that can be accepted and held open by the server at any time:
70
- pub const connections_max = 32;
76
+ /// The maximum number of connections that can be held open by the server at any time:
77
+ pub const connections_max = replicas_max + clients_max;
71
78
 
72
79
  /// The maximum size of a message in bytes:
73
80
  /// This is also the limit of all inflight data across multiple pipelined requests per connection.
74
- /// We may have one request of up to 4 MiB inflight or 4 pipelined requests of up to 1 MiB inflight.
81
+ /// We may have one request of up to 2 MiB inflight or 2 pipelined requests of up to 1 MiB inflight.
75
82
  /// This impacts sequential disk write throughput, the larger the buffer the better.
76
- /// 4 MiB is 32,768 transfers, and a reasonable choice for sequential disk write throughput.
83
+ /// 2 MiB is 16,384 transfers, and a reasonable choice for sequential disk write throughput.
77
84
  /// However, this impacts bufferbloat and head-of-line blocking latency for pipelined requests.
78
- /// For a 1 Gbps NIC = 125 MiB/s throughput: 4 MiB / 125 * 1000ms = 32ms for the next request.
79
- /// This also impacts the amount of memory allocated at initialization by the server.
80
- pub const message_size_max = 4 * 1024 * 1024;
85
+ /// For a 1 Gbps NIC = 125 MiB/s throughput: 2 MiB / 125 * 1000ms = 16ms for the next request.
86
+ /// This impacts the amount of memory allocated at initialization by the server.
87
+ pub const message_size_max = 1 * 1024 * 1024;
81
88
 
82
89
  /// The number of full-sized messages allocated at initialization by the message bus.
83
90
  pub const message_bus_messages_max = connections_max * 4;
84
91
  /// The number of header-sized messages allocated at initialization by the message bus.
85
92
  /// These are much smaller/cheaper and we can therefore have many of them.
86
- pub const message_bus_headers_max = connections_max * connection_send_queue_max;
93
+ pub const message_bus_headers_max = connections_max * connection_send_queue_max * 2;
94
+
95
+ /// The maximum number of Viewstamped Replication prepare messages that can be inflight at a time.
96
+ /// This is immutable once assigned per cluster, as replicas need to know how many operations might
97
+ /// possibly be uncommitted during a view change, and this must be constant for all replicas.
98
+ pub const pipelining_max = clients_max;
87
99
 
88
100
  /// The minimum and maximum amount of time in milliseconds to wait before initiating a connection.
89
- /// Exponential backoff and full jitter are applied within this range.
90
- /// For more, see: https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
91
- pub const connection_delay_min = 50;
92
- pub const connection_delay_max = 1000;
101
+ /// Exponential backoff and jitter are applied within this range.
102
+ pub const connection_delay_min_ms = 50;
103
+ pub const connection_delay_max_ms = 1000;
93
104
 
94
105
  /// The maximum number of outgoing messages that may be queued on a connection.
95
- pub const connection_send_queue_max = 3;
106
+ pub const connection_send_queue_max = pipelining_max;
96
107
 
97
108
  /// The maximum number of connections in the kernel's complete connection queue pending an accept():
98
109
  /// If the backlog argument is greater than the value in `/proc/sys/net/core/somaxconn`, then it is
@@ -161,6 +172,49 @@ pub const sector_size = 4096;
161
172
  /// when they were never written to disk.
162
173
  pub const direct_io = true;
163
174
 
175
+ /// The maximum number of concurrent read I/O operations to allow at once.
176
+ pub const io_depth_read = 8;
177
+ /// The maximum number of concurrent write I/O operations to allow at once.
178
+ pub const io_depth_write = 8;
179
+
164
180
  /// The number of milliseconds between each replica tick, the basic unit of time in TigerBeetle.
165
181
  /// Used to regulate heartbeats, retries and timeouts, all specified as multiples of a tick.
166
182
  pub const tick_ms = 10;
183
+
184
+ /// The conservative round-trip time at startup when there is no network knowledge.
185
+ /// Adjusted dynamically thereafter for RTT-sensitive timeouts according to network congestion.
186
+ /// This should be set higher rather than lower to avoid flooding the network at startup.
187
+ pub const rtt_ticks = 300 / tick_ms;
188
+
189
+ /// The multiple of round-trip time for RTT-sensitive timeouts.
190
+ pub const rtt_multiple = 2;
191
+
192
+ /// The min/max bounds of exponential backoff (and jitter) to add to RTT-sensitive timeouts.
193
+ pub const backoff_min_ticks = 100 / tick_ms;
194
+ pub const backoff_max_ticks = 10000 / tick_ms;
195
+
196
+ /// The maximum skew between two clocks to allow when considering them to be in agreement.
197
+ /// The principle is that no two clocks tick exactly alike but some clocks more or less agree.
198
+ /// The maximum skew across the cluster as a whole is this value times the total number of clocks.
199
+ /// The cluster will be unavailable if the majority of clocks are all further than this value apart.
200
+ /// Decreasing this reduces the probability of reaching agreement on synchronized time.
201
+ /// Increasing this reduces the accuracy of synchronized time.
202
+ pub const clock_offset_tolerance_max_ms = 10000;
203
+
204
+ /// The amount of time before the clock's synchronized epoch is expired.
205
+ /// If the epoch is expired before it can be replaced with a new synchronized epoch, then this most
206
+ /// likely indicates either a network partition or else too many clock faults across the cluster.
207
+ /// A new synchronized epoch will be installed as soon as these conditions resolve.
208
+ pub const clock_epoch_max_ms = 60000;
209
+
210
+ /// The amount of time to wait for enough accurate samples before synchronizing the clock.
211
+ /// The more samples we can take per remote clock source, the more accurate our estimation becomes.
212
+ /// This impacts cluster startup time as the leader must first wait for synchronization to complete.
213
+ pub const clock_synchronization_window_min_ms = 2000;
214
+
215
+ /// The amount of time without agreement before the clock window is expired and a new window opened.
216
+ /// This happens where some samples have been collected but not enough to reach agreement.
217
+ /// The quality of samples degrades as they age so at some point we throw them away and start over.
218
+ /// This eliminates the impact of gradual clock drift on our clock offset (clock skew) measurements.
219
+ /// If a window expires because of this then it is likely that the clock epoch will also be expired.
220
+ pub const clock_synchronization_window_max_ms = 20000;