tigerbeetle-node 0.11.12 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. package/README.md +212 -196
  2. package/dist/bin/aarch64-linux-gnu/client.node +0 -0
  3. package/dist/bin/aarch64-linux-musl/client.node +0 -0
  4. package/dist/bin/aarch64-macos/client.node +0 -0
  5. package/dist/bin/x86_64-linux-gnu/client.node +0 -0
  6. package/dist/bin/x86_64-linux-musl/client.node +0 -0
  7. package/dist/bin/x86_64-macos/client.node +0 -0
  8. package/dist/index.js +33 -1
  9. package/dist/index.js.map +1 -1
  10. package/package-lock.json +66 -0
  11. package/package.json +8 -17
  12. package/src/index.ts +56 -1
  13. package/src/node.zig +10 -9
  14. package/dist/.client.node.sha256 +0 -1
  15. package/scripts/build_lib.sh +0 -61
  16. package/scripts/download_node_headers.sh +0 -32
  17. package/src/tigerbeetle/scripts/benchmark.bat +0 -48
  18. package/src/tigerbeetle/scripts/benchmark.sh +0 -66
  19. package/src/tigerbeetle/scripts/confirm_image.sh +0 -44
  20. package/src/tigerbeetle/scripts/fuzz_loop.sh +0 -15
  21. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +0 -7
  22. package/src/tigerbeetle/scripts/install.bat +0 -7
  23. package/src/tigerbeetle/scripts/install.sh +0 -21
  24. package/src/tigerbeetle/scripts/install_zig.bat +0 -113
  25. package/src/tigerbeetle/scripts/install_zig.sh +0 -90
  26. package/src/tigerbeetle/scripts/lint.zig +0 -199
  27. package/src/tigerbeetle/scripts/pre-commit.sh +0 -9
  28. package/src/tigerbeetle/scripts/scripts/benchmark.bat +0 -48
  29. package/src/tigerbeetle/scripts/scripts/benchmark.sh +0 -66
  30. package/src/tigerbeetle/scripts/scripts/confirm_image.sh +0 -44
  31. package/src/tigerbeetle/scripts/scripts/fuzz_loop.sh +0 -15
  32. package/src/tigerbeetle/scripts/scripts/fuzz_unique_errors.sh +0 -7
  33. package/src/tigerbeetle/scripts/scripts/install.bat +0 -7
  34. package/src/tigerbeetle/scripts/scripts/install.sh +0 -21
  35. package/src/tigerbeetle/scripts/scripts/install_zig.bat +0 -113
  36. package/src/tigerbeetle/scripts/scripts/install_zig.sh +0 -90
  37. package/src/tigerbeetle/scripts/scripts/lint.zig +0 -199
  38. package/src/tigerbeetle/scripts/scripts/pre-commit.sh +0 -9
  39. package/src/tigerbeetle/scripts/scripts/shellcheck.sh +0 -5
  40. package/src/tigerbeetle/scripts/scripts/tests_on_alpine.sh +0 -10
  41. package/src/tigerbeetle/scripts/scripts/tests_on_ubuntu.sh +0 -14
  42. package/src/tigerbeetle/scripts/scripts/upgrade_ubuntu_kernel.sh +0 -48
  43. package/src/tigerbeetle/scripts/scripts/validate_docs.sh +0 -23
  44. package/src/tigerbeetle/scripts/scripts/vr_state_enumerate +0 -46
  45. package/src/tigerbeetle/scripts/shellcheck.sh +0 -5
  46. package/src/tigerbeetle/scripts/tests_on_alpine.sh +0 -10
  47. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +0 -14
  48. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -48
  49. package/src/tigerbeetle/scripts/validate_docs.sh +0 -23
  50. package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
  51. package/src/tigerbeetle/src/benchmark.zig +0 -314
  52. package/src/tigerbeetle/src/config.zig +0 -234
  53. package/src/tigerbeetle/src/constants.zig +0 -436
  54. package/src/tigerbeetle/src/ewah.zig +0 -286
  55. package/src/tigerbeetle/src/ewah_benchmark.zig +0 -120
  56. package/src/tigerbeetle/src/ewah_fuzz.zig +0 -130
  57. package/src/tigerbeetle/src/fifo.zig +0 -120
  58. package/src/tigerbeetle/src/io/benchmark.zig +0 -213
  59. package/src/tigerbeetle/src/io/darwin.zig +0 -814
  60. package/src/tigerbeetle/src/io/linux.zig +0 -1062
  61. package/src/tigerbeetle/src/io/test.zig +0 -643
  62. package/src/tigerbeetle/src/io/windows.zig +0 -1183
  63. package/src/tigerbeetle/src/io.zig +0 -34
  64. package/src/tigerbeetle/src/iops.zig +0 -107
  65. package/src/tigerbeetle/src/lsm/README.md +0 -308
  66. package/src/tigerbeetle/src/lsm/binary_search.zig +0 -341
  67. package/src/tigerbeetle/src/lsm/bloom_filter.zig +0 -125
  68. package/src/tigerbeetle/src/lsm/compaction.zig +0 -603
  69. package/src/tigerbeetle/src/lsm/composite_key.zig +0 -77
  70. package/src/tigerbeetle/src/lsm/direction.zig +0 -11
  71. package/src/tigerbeetle/src/lsm/eytzinger.zig +0 -587
  72. package/src/tigerbeetle/src/lsm/eytzinger_benchmark.zig +0 -330
  73. package/src/tigerbeetle/src/lsm/forest.zig +0 -204
  74. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +0 -401
  75. package/src/tigerbeetle/src/lsm/grid.zig +0 -573
  76. package/src/tigerbeetle/src/lsm/groove.zig +0 -972
  77. package/src/tigerbeetle/src/lsm/k_way_merge.zig +0 -474
  78. package/src/tigerbeetle/src/lsm/level_iterator.zig +0 -332
  79. package/src/tigerbeetle/src/lsm/manifest.zig +0 -617
  80. package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -877
  81. package/src/tigerbeetle/src/lsm/manifest_log.zig +0 -789
  82. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +0 -691
  83. package/src/tigerbeetle/src/lsm/merge_iterator.zig +0 -106
  84. package/src/tigerbeetle/src/lsm/node_pool.zig +0 -235
  85. package/src/tigerbeetle/src/lsm/posted_groove.zig +0 -378
  86. package/src/tigerbeetle/src/lsm/segmented_array.zig +0 -1328
  87. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +0 -148
  88. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +0 -9
  89. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +0 -850
  90. package/src/tigerbeetle/src/lsm/table.zig +0 -1031
  91. package/src/tigerbeetle/src/lsm/table_immutable.zig +0 -203
  92. package/src/tigerbeetle/src/lsm/table_iterator.zig +0 -340
  93. package/src/tigerbeetle/src/lsm/table_mutable.zig +0 -220
  94. package/src/tigerbeetle/src/lsm/test.zig +0 -438
  95. package/src/tigerbeetle/src/lsm/tree.zig +0 -1193
  96. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +0 -474
  97. package/src/tigerbeetle/src/message_bus.zig +0 -1012
  98. package/src/tigerbeetle/src/message_pool.zig +0 -156
  99. package/src/tigerbeetle/src/ring_buffer.zig +0 -399
  100. package/src/tigerbeetle/src/simulator.zig +0 -569
  101. package/src/tigerbeetle/src/state_machine/auditor.zig +0 -577
  102. package/src/tigerbeetle/src/state_machine/workload.zig +0 -883
  103. package/src/tigerbeetle/src/state_machine.zig +0 -1881
  104. package/src/tigerbeetle/src/static_allocator.zig +0 -65
  105. package/src/tigerbeetle/src/stdx.zig +0 -162
  106. package/src/tigerbeetle/src/storage.zig +0 -393
  107. package/src/tigerbeetle/src/testing/cluster/message_bus.zig +0 -82
  108. package/src/tigerbeetle/src/testing/cluster/network.zig +0 -237
  109. package/src/tigerbeetle/src/testing/cluster/state_checker.zig +0 -169
  110. package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +0 -202
  111. package/src/tigerbeetle/src/testing/cluster.zig +0 -443
  112. package/src/tigerbeetle/src/testing/fuzz.zig +0 -140
  113. package/src/tigerbeetle/src/testing/hash_log.zig +0 -66
  114. package/src/tigerbeetle/src/testing/id.zig +0 -99
  115. package/src/tigerbeetle/src/testing/packet_simulator.zig +0 -364
  116. package/src/tigerbeetle/src/testing/priority_queue.zig +0 -645
  117. package/src/tigerbeetle/src/testing/reply_sequence.zig +0 -139
  118. package/src/tigerbeetle/src/testing/state_machine.zig +0 -249
  119. package/src/tigerbeetle/src/testing/storage.zig +0 -757
  120. package/src/tigerbeetle/src/testing/table.zig +0 -247
  121. package/src/tigerbeetle/src/testing/time.zig +0 -84
  122. package/src/tigerbeetle/src/tigerbeetle.zig +0 -227
  123. package/src/tigerbeetle/src/time.zig +0 -112
  124. package/src/tigerbeetle/src/tracer.zig +0 -529
  125. package/src/tigerbeetle/src/unit_tests.zig +0 -42
  126. package/src/tigerbeetle/src/vopr.zig +0 -495
  127. package/src/tigerbeetle/src/vsr/README.md +0 -209
  128. package/src/tigerbeetle/src/vsr/client.zig +0 -544
  129. package/src/tigerbeetle/src/vsr/clock.zig +0 -853
  130. package/src/tigerbeetle/src/vsr/journal.zig +0 -2413
  131. package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +0 -111
  132. package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
  133. package/src/tigerbeetle/src/vsr/replica.zig +0 -6381
  134. package/src/tigerbeetle/src/vsr/replica_format.zig +0 -219
  135. package/src/tigerbeetle/src/vsr/superblock.zig +0 -1631
  136. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +0 -256
  137. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +0 -929
  138. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +0 -334
  139. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +0 -390
  140. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +0 -615
  141. package/src/tigerbeetle/src/vsr/superblock_quorums.zig +0 -394
  142. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +0 -314
  143. package/src/tigerbeetle/src/vsr.zig +0 -1352
@@ -1,1352 +0,0 @@
1
- const std = @import("std");
2
- const math = std.math;
3
- const Allocator = std.mem.Allocator;
4
- const assert = std.debug.assert;
5
- const log = std.log.scoped(.vsr);
6
-
7
- // vsr.zig is the root of a zig package, reexport all public APIs.
8
- //
9
- // Note that we don't promise any stability of these interfaces yet.
10
- pub const constants = @import("constants.zig");
11
- pub const io = @import("io.zig");
12
- pub const message_bus = @import("message_bus.zig");
13
- pub const message_pool = @import("message_pool.zig");
14
- pub const state_machine = @import("state_machine.zig");
15
- pub const storage = @import("storage.zig");
16
- pub const tigerbeetle = @import("tigerbeetle.zig");
17
- pub const time = @import("time.zig");
18
- pub const tracer = @import("tracer.zig");
19
- pub const config = @import("config.zig");
20
- pub const stdx = @import("stdx.zig");
21
- pub const superblock = @import("vsr/superblock.zig");
22
- pub const lsm = .{
23
- .tree = @import("lsm/tree.zig"),
24
- .grid = @import("lsm/grid.zig"),
25
- .groove = @import("lsm/groove.zig"),
26
- .forest = @import("lsm/forest.zig"),
27
- .posted_groove = @import("lsm/posted_groove.zig"),
28
- };
29
- pub const testing = .{
30
- .cluster = @import("testing/cluster.zig"),
31
- };
32
-
33
- pub const ReplicaType = @import("vsr/replica.zig").ReplicaType;
34
- pub const format = @import("vsr/replica_format.zig").format;
35
- pub const Status = @import("vsr/replica.zig").Status;
36
- pub const Client = @import("vsr/client.zig").Client;
37
- pub const Clock = @import("vsr/clock.zig").Clock;
38
- pub const JournalType = @import("vsr/journal.zig").JournalType;
39
- pub const SlotRange = @import("vsr/journal.zig").SlotRange;
40
- pub const SuperBlockType = superblock.SuperBlockType;
41
- pub const VSRState = superblock.SuperBlockHeader.VSRState;
42
-
43
- /// The version of our Viewstamped Replication protocol in use, including customizations.
44
- /// For backwards compatibility through breaking changes (e.g. upgrading checksums/ciphers).
45
- pub const Version: u8 = 0;
46
-
47
- pub const ProcessType = enum { replica, client };
48
-
49
- pub const Zone = enum {
50
- superblock,
51
- wal_headers,
52
- wal_prepares,
53
- grid,
54
-
55
- const size_superblock = superblock.superblock_zone_size;
56
- const size_wal_headers = constants.journal_size_headers;
57
- const size_wal_prepares = constants.journal_size_prepares;
58
-
59
- comptime {
60
- for (.{
61
- size_superblock,
62
- size_wal_headers,
63
- size_wal_prepares,
64
- }) |zone_size| {
65
- assert(zone_size % constants.sector_size == 0);
66
- }
67
- }
68
-
69
- pub fn offset(zone: Zone, offset_logical: u64) u64 {
70
- if (zone.size()) |zone_size| {
71
- assert(offset_logical < zone_size);
72
- }
73
-
74
- return offset_logical + switch (zone) {
75
- .superblock => 0,
76
- .wal_headers => size_superblock,
77
- .wal_prepares => size_superblock + size_wal_headers,
78
- .grid => size_superblock + size_wal_headers + size_wal_prepares,
79
- };
80
- }
81
-
82
- pub fn size(zone: Zone) ?u64 {
83
- return switch (zone) {
84
- .superblock => size_superblock,
85
- .wal_headers => size_wal_headers,
86
- .wal_prepares => size_wal_prepares,
87
- .grid => null,
88
- };
89
- }
90
- };
91
-
92
- /// Viewstamped Replication protocol commands:
93
- pub const Command = enum(u8) {
94
- reserved,
95
-
96
- ping,
97
- pong,
98
-
99
- request,
100
- prepare,
101
- prepare_ok,
102
- reply,
103
- commit,
104
-
105
- start_view_change,
106
- do_view_change,
107
- start_view,
108
-
109
- request_start_view,
110
- request_headers,
111
- request_prepare,
112
- headers,
113
- nack_prepare,
114
-
115
- eviction,
116
-
117
- request_block,
118
- block,
119
- };
120
-
121
- /// This type exists to avoid making the Header type dependant on the state
122
- /// machine used, which would cause awkward circular type dependencies.
123
- pub const Operation = enum(u8) {
124
- /// Operations reserved by VR protocol (for all state machines):
125
- /// The value 0 is reserved to prevent a spurious zero from being interpreted as an operation.
126
- reserved = 0,
127
- /// The value 1 is reserved to initialize the cluster.
128
- root = 1,
129
- /// The value 2 is reserved to register a client session with the cluster.
130
- register = 2,
131
-
132
- /// Operations exported by the state machine (all other values are free):
133
- _,
134
-
135
- pub fn from(comptime StateMachine: type, op: StateMachine.Operation) Operation {
136
- check_state_machine_operations(StateMachine.Operation);
137
- return @intToEnum(Operation, @enumToInt(op));
138
- }
139
-
140
- pub fn cast(self: Operation, comptime StateMachine: type) StateMachine.Operation {
141
- check_state_machine_operations(StateMachine.Operation);
142
- return @intToEnum(StateMachine.Operation, @enumToInt(self));
143
- }
144
-
145
- fn check_state_machine_operations(comptime Op: type) void {
146
- if (!@hasField(Op, "reserved") or std.meta.fieldInfo(Op, .reserved).value != 0) {
147
- @compileError("StateMachine.Operation must have a 'reserved' field with value 0");
148
- }
149
- if (!@hasField(Op, "root") or std.meta.fieldInfo(Op, .root).value != 1) {
150
- @compileError("StateMachine.Operation must have a 'root' field with value 1");
151
- }
152
- if (!@hasField(Op, "register") or std.meta.fieldInfo(Op, .register).value != 2) {
153
- @compileError("StateMachine.Operation must have a 'register' field with value 2");
154
- }
155
- }
156
- };
157
-
158
- /// Network message and journal entry header:
159
- /// We reuse the same header for both so that prepare messages from the primary can simply be
160
- /// journalled as is by the backups without requiring any further modification.
161
- pub const Header = extern struct {
162
- comptime {
163
- assert(@sizeOf(Header) == 128);
164
- // Assert that there is no implicit padding in the struct.
165
- assert(@bitSizeOf(Header) == @sizeOf(Header) * 8);
166
- }
167
- /// A checksum covering only the remainder of this header.
168
- /// This allows the header to be trusted without having to recv() or read() the associated body.
169
- /// This checksum is enough to uniquely identify a network message or journal entry.
170
- checksum: u128 = 0,
171
-
172
- /// A checksum covering only the associated body after this header.
173
- checksum_body: u128 = 0,
174
-
175
- /// A backpointer to the previous request or prepare checksum for hash chain verification.
176
- /// This provides a cryptographic guarantee for linearizability:
177
- /// 1. across our distributed log of prepares, and
178
- /// 2. across a client's requests and our replies.
179
- /// This may also be used as the initialization vector for AEAD encryption at rest, provided
180
- /// that the primary ratchets the encryption key every view change to ensure that prepares
181
- /// reordered through a view change never repeat the same IV for the same encryption key.
182
- parent: u128 = 0,
183
-
184
- /// Each client process generates a unique, random and ephemeral client ID at initialization.
185
- /// The client ID identifies connections made by the client to the cluster for the sake of
186
- /// routing messages back to the client.
187
- ///
188
- /// With the client ID in hand, the client then registers a monotonically increasing session
189
- /// number (committed through the cluster) to allow the client's session to be evicted safely
190
- /// from the client table if too many concurrent clients cause the client table to overflow.
191
- /// The monotonically increasing session number prevents duplicate client requests from being
192
- /// replayed.
193
- ///
194
- /// The problem of routing is therefore solved by the 128-bit client ID, and the problem of
195
- /// detecting whether a session has been evicted is solved by the session number.
196
- client: u128 = 0,
197
-
198
- /// The checksum of the message to which this message refers, or a unique recovery nonce.
199
- ///
200
- /// We use this cryptographic context in various ways, for example:
201
- ///
202
- /// * A `request` sets this to the client's session number.
203
- /// * A `prepare` sets this to the checksum of the client's request.
204
- /// * A `prepare_ok` sets this to the checksum of the prepare being acked.
205
- /// * A `commit` sets this to the checksum of the latest committed prepare.
206
- /// * A `request_prepare` sets this to the checksum of the prepare being requested.
207
- /// * A `nack_prepare` sets this to the checksum of the prepare being nacked.
208
- /// * A `recovery` and `recovery_response` sets this to the nonce.
209
- ///
210
- /// This allows for cryptographic guarantees beyond request, op, and commit numbers, which have
211
- /// low entropy and may otherwise collide in the event of any correctness bugs.
212
- context: u128 = 0,
213
-
214
- /// Each request is given a number by the client and later requests must have larger numbers
215
- /// than earlier ones. The request number is used by the replicas to avoid running requests more
216
- /// than once; it is also used by the client to discard duplicate responses to its requests.
217
- /// A client is allowed to have at most one request inflight at a time.
218
- request: u32 = 0,
219
-
220
- /// The cluster number binds intention into the header, so that a client or replica can indicate
221
- /// the cluster it believes it is speaking to, instead of accidentally talking to the wrong
222
- /// cluster (for example, staging vs production).
223
- cluster: u32,
224
-
225
- /// The cluster reconfiguration epoch number (for future use).
226
- epoch: u32 = 0,
227
-
228
- /// Every message sent from one replica to another contains the sending replica's current view.
229
- /// A `u32` allows for a minimum lifetime of 136 years at a rate of one view change per second.
230
- view: u32 = 0,
231
-
232
- /// The op number of the latest prepare that may or may not yet be committed. Uncommitted ops
233
- /// may be replaced by different ops if they do not survive through a view change.
234
- op: u64 = 0,
235
-
236
- /// The commit number of the latest committed prepare. Committed ops are immutable.
237
- ///
238
- /// * A `do_view_change` sets this to `commit_min`, to indicate the sending replica's progress.
239
- /// The sending replica may continue to commit after sending the DVC.
240
- /// * A `start_view` sets this to `commit_max`.
241
- commit: u64 = 0,
242
-
243
- /// This field is used in various ways:
244
- ///
245
- /// * A `prepare` sets this to the primary's state machine `prepare_timestamp`.
246
- /// For `create_accounts` and `create_transfers` this is the batch's highest timestamp.
247
- /// * A `reply` sets this to the corresponding `prepare`'s timestamp.
248
- /// This allows the test workload to verify transfer timeouts.
249
- /// * A `do_view_change` sets this to the latest normal view number.
250
- /// * A `pong` sets this to the sender's wall clock value.
251
- /// * A `request_prepare` sets this to `1` when `context` is set to a checksum, and `0`
252
- /// otherwise.
253
- timestamp: u64 = 0,
254
-
255
- /// The size of the Header structure (always), plus any associated body.
256
- size: u32 = @sizeOf(Header),
257
-
258
- /// The index of the replica in the cluster configuration array that authored this message.
259
- /// This identifies only the ultimate author because messages may be forwarded amongst replicas.
260
- replica: u8 = 0,
261
-
262
- /// The Viewstamped Replication protocol command for this message.
263
- command: Command,
264
-
265
- /// The state machine operation to apply.
266
- operation: Operation = .reserved,
267
-
268
- /// The version of the protocol implementation that originated this message.
269
- version: u8 = Version,
270
-
271
- pub fn calculate_checksum(self: *const Header) u128 {
272
- const checksum_size = @sizeOf(@TypeOf(self.checksum));
273
- assert(checksum_size == 16);
274
- const checksum_value = checksum(std.mem.asBytes(self)[checksum_size..]);
275
- assert(@TypeOf(checksum_value) == @TypeOf(self.checksum));
276
- return checksum_value;
277
- }
278
-
279
- pub fn calculate_checksum_body(self: *const Header, body: []const u8) u128 {
280
- assert(self.size == @sizeOf(Header) + body.len);
281
- const checksum_size = @sizeOf(@TypeOf(self.checksum_body));
282
- assert(checksum_size == 16);
283
- const checksum_value = checksum(body);
284
- assert(@TypeOf(checksum_value) == @TypeOf(self.checksum_body));
285
- return checksum_value;
286
- }
287
-
288
- /// This must be called only after set_checksum_body() so that checksum_body is also covered:
289
- pub fn set_checksum(self: *Header) void {
290
- self.checksum = self.calculate_checksum();
291
- }
292
-
293
- pub fn set_checksum_body(self: *Header, body: []const u8) void {
294
- self.checksum_body = self.calculate_checksum_body(body);
295
- }
296
-
297
- pub fn valid_checksum(self: *const Header) bool {
298
- return self.checksum == self.calculate_checksum();
299
- }
300
-
301
- pub fn valid_checksum_body(self: *const Header, body: []const u8) bool {
302
- return self.checksum_body == self.calculate_checksum_body(body);
303
- }
304
-
305
- /// Returns null if all fields are set correctly according to the command, or else a warning.
306
- /// This does not verify that checksum is valid, and expects that this has already been done.
307
- pub fn invalid(self: *const Header) ?[]const u8 {
308
- if (self.version != Version) return "version != Version";
309
- if (self.size < @sizeOf(Header)) return "size < @sizeOf(Header)";
310
- if (self.epoch != 0) return "epoch != 0";
311
- return switch (self.command) {
312
- .reserved => self.invalid_reserved(),
313
- .ping => self.invalid_ping(),
314
- .pong => self.invalid_pong(),
315
- .request => self.invalid_request(),
316
- .prepare => self.invalid_prepare(),
317
- .prepare_ok => self.invalid_prepare_ok(),
318
- .reply => self.invalid_reply(),
319
- .commit => self.invalid_commit(),
320
- .start_view_change => self.invalid_start_view_change(),
321
- .do_view_change => self.invalid_do_view_change(),
322
- .start_view => self.invalid_start_view(),
323
- .request_start_view => self.invalid_request_start_view(),
324
- .request_headers => self.invalid_request_headers(),
325
- .request_prepare => self.invalid_request_prepare(),
326
- .request_block => null, // TODO
327
- .headers => self.invalid_headers(),
328
- .nack_prepare => self.invalid_nack_prepare(),
329
- .eviction => self.invalid_eviction(),
330
- .block => null, // TODO
331
- };
332
- }
333
-
334
- fn invalid_reserved(self: *const Header) ?[]const u8 {
335
- assert(self.command == .reserved);
336
- if (self.parent != 0) return "parent != 0";
337
- if (self.client != 0) return "client != 0";
338
- if (self.context != 0) return "context != 0";
339
- if (self.request != 0) return "request != 0";
340
- if (self.view != 0) return "view != 0";
341
- if (self.commit != 0) return "commit != 0";
342
- if (self.timestamp != 0) return "timestamp != 0";
343
- if (self.replica != 0) return "replica != 0";
344
- if (self.operation != .reserved) return "operation != .reserved";
345
- return null;
346
- }
347
-
348
- fn invalid_ping(self: *const Header) ?[]const u8 {
349
- assert(self.command == .ping);
350
- if (self.parent != 0) return "parent != 0";
351
- if (self.context != 0) return "context != 0";
352
- if (self.request != 0) return "request != 0";
353
- if (self.commit != 0) return "commit != 0";
354
- if (self.timestamp != 0) return "timestamp != 0";
355
- if (self.view != 0) return "view != 0";
356
- if (self.client != 0) {
357
- if (self.replica != 0) return "replica != 0";
358
- if (self.op != 0) return "op != 0";
359
- }
360
- if (self.operation != .reserved) return "operation != .reserved";
361
- return null;
362
- }
363
-
364
- fn invalid_pong(self: *const Header) ?[]const u8 {
365
- assert(self.command == .pong);
366
- if (self.parent != 0) return "parent != 0";
367
- if (self.client != 0) return "client != 0";
368
- if (self.context != 0) return "context != 0";
369
- if (self.request != 0) return "request != 0";
370
- if (self.commit != 0) return "commit != 0";
371
- if (self.timestamp > 0) {
372
- if (self.view != 0) return "view != 0";
373
- }
374
- if (self.operation != .reserved) return "operation != .reserved";
375
- return null;
376
- }
377
-
378
- fn invalid_request(self: *const Header) ?[]const u8 {
379
- assert(self.command == .request);
380
- if (self.client == 0) return "client == 0";
381
- if (self.op != 0) return "op != 0";
382
- if (self.commit != 0) return "commit != 0";
383
- if (self.timestamp != 0) return "timestamp != 0";
384
- if (self.replica != 0) return "replica != 0";
385
- switch (self.operation) {
386
- .reserved => return "operation == .reserved",
387
- .root => return "operation == .root",
388
- .register => {
389
- // The first request a client makes must be to register with the cluster:
390
- if (self.parent != 0) return "parent != 0";
391
- if (self.context != 0) return "context != 0";
392
- if (self.request != 0) return "request != 0";
393
- // The .register operation carries no payload:
394
- if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
395
- },
396
- else => {
397
- // Thereafter, the client must provide the session number in the context:
398
- // These requests should set `parent` to the `checksum` of the previous reply.
399
- if (self.context == 0) return "context == 0";
400
- if (self.request == 0) return "request == 0";
401
- },
402
- }
403
- return null;
404
- }
405
-
406
- fn invalid_prepare(self: *const Header) ?[]const u8 {
407
- assert(self.command == .prepare);
408
- switch (self.operation) {
409
- .reserved => return "operation == .reserved",
410
- .root => {
411
- if (self.parent != 0) return "root: parent != 0";
412
- if (self.client != 0) return "root: client != 0";
413
- if (self.context != 0) return "root: context != 0";
414
- if (self.request != 0) return "root: request != 0";
415
- if (self.view != 0) return "root: view != 0";
416
- if (self.op != 0) return "root: op != 0";
417
- if (self.commit != 0) return "root: commit != 0";
418
- if (self.timestamp != 0) return "root: timestamp != 0";
419
- if (self.size != @sizeOf(Header)) return "root: size != @sizeOf(Header)";
420
- if (self.replica != 0) return "root: replica != 0";
421
- },
422
- else => {
423
- if (self.client == 0) return "client == 0";
424
- if (self.op == 0) return "op == 0";
425
- if (self.op <= self.commit) return "op <= commit";
426
- if (self.timestamp == 0) return "timestamp == 0";
427
- if (self.operation == .register) {
428
- // Client session numbers are replaced by the reference to the previous prepare.
429
- if (self.request != 0) return "request != 0";
430
- } else {
431
- // Client session numbers are replaced by the reference to the previous prepare.
432
- if (self.request == 0) return "request == 0";
433
- }
434
- },
435
- }
436
- return null;
437
- }
438
-
439
- fn invalid_prepare_ok(self: *const Header) ?[]const u8 {
440
- assert(self.command == .prepare_ok);
441
- if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
442
- switch (self.operation) {
443
- .reserved => return "operation == .reserved",
444
- .root => {
445
- if (self.parent != 0) return "root: parent != 0";
446
- if (self.client != 0) return "root: client != 0";
447
- if (self.context != 0) return "root: context != 0";
448
- if (self.request != 0) return "root: request != 0";
449
- if (self.view != 0) return "root: view != 0";
450
- if (self.op != 0) return "root: op != 0";
451
- if (self.commit != 0) return "root: commit != 0";
452
- if (self.timestamp != 0) return "root: timestamp != 0";
453
- if (self.replica != 0) return "root: replica != 0";
454
- },
455
- else => {
456
- if (self.client == 0) return "client == 0";
457
- if (self.op == 0) return "op == 0";
458
- if (self.op <= self.commit) return "op <= commit";
459
- if (self.operation == .register) {
460
- if (self.request != 0) return "request != 0";
461
- } else {
462
- if (self.request == 0) return "request == 0";
463
- }
464
- },
465
- }
466
- return null;
467
- }
468
-
469
- fn invalid_reply(self: *const Header) ?[]const u8 {
470
- assert(self.command == .reply);
471
- // Initialization within `client.zig` asserts that client `id` is greater than zero:
472
- if (self.client == 0) return "client == 0";
473
- if (self.context != 0) return "context != 0";
474
- if (self.op != self.commit) return "op != commit";
475
- if (self.timestamp == 0) return "timestamp == 0";
476
- if (self.operation == .register) {
477
- // In this context, the commit number is the newly registered session number.
478
- // The `0` commit number is reserved for cluster initialization.
479
- if (self.commit == 0) return "commit == 0";
480
- if (self.request != 0) return "request != 0";
481
- } else {
482
- if (self.commit == 0) return "commit == 0";
483
- if (self.request == 0) return "request == 0";
484
- }
485
- return null;
486
- }
487
-
488
- fn invalid_commit(self: *const Header) ?[]const u8 {
489
- assert(self.command == .commit);
490
- if (self.parent != 0) return "parent != 0";
491
- if (self.client != 0) return "client != 0";
492
- if (self.request != 0) return "request != 0";
493
- if (self.op != 0) return "op != 0";
494
- if (self.timestamp != 0) return "timestamp != 0";
495
- if (self.operation != .reserved) return "operation != .reserved";
496
- return null;
497
- }
498
-
499
- fn invalid_start_view_change(self: *const Header) ?[]const u8 {
500
- assert(self.command == .start_view_change);
501
- if (self.parent != 0) return "parent != 0";
502
- if (self.client != 0) return "client != 0";
503
- if (self.context != 0) return "context != 0";
504
- if (self.request != 0) return "request != 0";
505
- if (self.op != 0) return "op != 0";
506
- if (self.commit != 0) return "commit != 0";
507
- if (self.timestamp != 0) return "timestamp != 0";
508
- if (self.operation != .reserved) return "operation != .reserved";
509
- return null;
510
- }
511
-
512
- fn invalid_do_view_change(self: *const Header) ?[]const u8 {
513
- assert(self.command == .do_view_change);
514
- if (self.parent != 0) return "parent != 0";
515
- if (self.client != 0) return "client != 0";
516
- if (self.context != 0) return "context != 0";
517
- if (self.request != 0) return "request != 0";
518
- if (self.operation != .reserved) return "operation != .reserved";
519
- return null;
520
- }
521
-
522
- fn invalid_start_view(self: *const Header) ?[]const u8 {
523
- assert(self.command == .start_view);
524
- if (self.parent != 0) return "parent != 0";
525
- if (self.client != 0) return "client != 0";
526
- if (self.context != 0) return "context != 0";
527
- if (self.request != 0) return "request != 0";
528
- if (self.timestamp != 0) return "timestamp != 0";
529
- if (self.operation != .reserved) return "operation != .reserved";
530
- return null;
531
- }
532
-
533
- fn invalid_request_start_view(self: *const Header) ?[]const u8 {
534
- assert(self.command == .request_start_view);
535
- if (self.parent != 0) return "parent != 0";
536
- if (self.client != 0) return "client != 0";
537
- if (self.context != 0) return "context != 0";
538
- if (self.request != 0) return "request != 0";
539
- if (self.op != 0) return "op != 0";
540
- if (self.commit != 0) return "commit != 0";
541
- if (self.timestamp != 0) return "timestamp != 0";
542
- if (self.operation != .reserved) return "operation != .reserved";
543
- return null;
544
- }
545
-
546
- fn invalid_request_headers(self: *const Header) ?[]const u8 {
547
- assert(self.command == .request_headers);
548
- if (self.parent != 0) return "parent != 0";
549
- if (self.client != 0) return "client != 0";
550
- if (self.context != 0) return "context != 0";
551
- if (self.request != 0) return "request != 0";
552
- if (self.timestamp != 0) return "timestamp != 0";
553
- if (self.commit > self.op) return "op_min > op_max";
554
- if (self.operation != .reserved) return "operation != .reserved";
555
- return null;
556
- }
557
-
558
- fn invalid_request_prepare(self: *const Header) ?[]const u8 {
559
- assert(self.command == .request_prepare);
560
- if (self.parent != 0) return "parent != 0";
561
- if (self.client != 0) return "client != 0";
562
- if (self.request != 0) return "request != 0";
563
- if (self.commit != 0) return "commit != 0";
564
- switch (self.timestamp) {
565
- 0 => if (self.context != 0) return "context != 0",
566
- 1 => {}, // context is a checksum, which may be 0.
567
- else => return "timestamp > 1",
568
- }
569
- if (self.operation != .reserved) return "operation != .reserved";
570
- return null;
571
- }
572
-
573
- fn invalid_headers(self: *const Header) ?[]const u8 {
574
- assert(self.command == .headers);
575
- if (self.parent != 0) return "parent != 0";
576
- if (self.client != 0) return "client != 0";
577
- if (self.request != 0) return "request != 0";
578
- if (self.op != 0) return "op != 0";
579
- if (self.commit != 0) return "commit != 0";
580
- if (self.timestamp != 0) return "timestamp != 0";
581
- if (self.operation != .reserved) return "operation != .reserved";
582
- return null;
583
- }
584
-
585
- fn invalid_nack_prepare(self: *const Header) ?[]const u8 {
586
- assert(self.command == .nack_prepare);
587
- if (self.parent != 0) return "parent != 0";
588
- if (self.client != 0) return "client != 0";
589
- if (self.request != 0) return "request != 0";
590
- if (self.commit != 0) return "commit != 0";
591
- if (self.timestamp != 0) return "timestamp != 0";
592
- if (self.operation != .reserved) return "operation != .reserved";
593
- return null;
594
- }
595
-
596
- fn invalid_eviction(self: *const Header) ?[]const u8 {
597
- assert(self.command == .eviction);
598
- if (self.parent != 0) return "parent != 0";
599
- if (self.context != 0) return "context != 0";
600
- if (self.request != 0) return "request != 0";
601
- if (self.op != 0) return "op != 0";
602
- if (self.commit != 0) return "commit != 0";
603
- if (self.timestamp != 0) return "timestamp != 0";
604
- if (self.operation != .reserved) return "operation != .reserved";
605
- return null;
606
- }
607
-
608
- /// Returns whether the immediate sender is a replica or client (if this can be determined).
609
- /// Some commands such as .request or .prepare may be forwarded on to other replicas so that
610
- /// Header.replica or Header.client only identifies the ultimate origin, not the latest peer.
611
- pub fn peer_type(self: *const Header) enum { unknown, replica, client } {
612
- switch (self.command) {
613
- .reserved => unreachable,
614
- // These messages cannot always identify the peer as they may be forwarded:
615
- .request => switch (self.operation) {
616
- // However, we do not forward the first .register request sent by a client:
617
- .register => return .client,
618
- else => return .unknown,
619
- },
620
- .prepare => return .unknown,
621
- // These messages identify the peer as either a replica or a client:
622
- // TODO Assert that pong responses from a replica do not echo the pinging client's ID.
623
- .ping, .pong => {
624
- if (self.client > 0) {
625
- assert(self.replica == 0);
626
- return .client;
627
- } else {
628
- return .replica;
629
- }
630
- },
631
- // All other messages identify the peer as a replica:
632
- else => return .replica,
633
- }
634
- }
635
-
636
- pub fn reserved(cluster: u32, slot: u64) Header {
637
- assert(slot < constants.journal_slot_count);
638
-
639
- var header = Header{
640
- .command = .reserved,
641
- .cluster = cluster,
642
- .op = slot,
643
- };
644
- header.set_checksum_body(&[0]u8{});
645
- header.set_checksum();
646
- assert(header.invalid() == null);
647
- return header;
648
- }
649
-
650
- pub fn root_prepare(cluster: u32) Header {
651
- var header = Header{
652
- .cluster = cluster,
653
- .size = @sizeOf(Header),
654
- .command = .prepare,
655
- .operation = .root,
656
- };
657
- header.set_checksum_body(&[0]u8{});
658
- header.set_checksum();
659
- assert(header.invalid() == null);
660
- return header;
661
- }
662
- };
663
-
664
- pub const Timeout = struct {
665
- name: []const u8,
666
- id: u128,
667
- after: u64,
668
- attempts: u8 = 0,
669
- rtt: u64 = constants.rtt_ticks,
670
- rtt_multiple: u8 = constants.rtt_multiple,
671
- ticks: u64 = 0,
672
- ticking: bool = false,
673
-
674
- /// Increments the attempts counter and resets the timeout with exponential backoff and jitter.
675
- /// Allows the attempts counter to wrap from time to time.
676
- /// The overflow period is kept short to surface any related bugs sooner rather than later.
677
- /// We do not saturate the counter as this would cause round-robin retries to get stuck.
678
- pub fn backoff(self: *Timeout, random: std.rand.Random) void {
679
- assert(self.ticking);
680
-
681
- self.ticks = 0;
682
- self.attempts +%= 1;
683
-
684
- log.debug("{}: {s} backing off", .{ self.id, self.name });
685
- self.set_after_for_rtt_and_attempts(random);
686
- }
687
-
688
- /// It's important to check that when fired() is acted on that the timeout is stopped/started,
689
- /// otherwise further ticks around the event loop may trigger a thundering herd of messages.
690
- pub fn fired(self: *Timeout) bool {
691
- if (self.ticking and self.ticks >= self.after) {
692
- log.debug("{}: {s} fired", .{ self.id, self.name });
693
- if (self.ticks > self.after) {
694
- log.err("{}: {s} is firing every tick", .{ self.id, self.name });
695
- @panic("timeout was not reset correctly");
696
- }
697
- return true;
698
- } else {
699
- return false;
700
- }
701
- }
702
-
703
- pub fn reset(self: *Timeout) void {
704
- self.attempts = 0;
705
- self.ticks = 0;
706
- assert(self.ticking);
707
- // TODO Use self.prng to adjust for rtt and attempts.
708
- log.debug("{}: {s} reset", .{ self.id, self.name });
709
- }
710
-
711
- /// Sets the value of `after` as a function of `rtt` and `attempts`.
712
- /// Adds exponential backoff and jitter.
713
- /// May be called only after a timeout has been stopped or reset, to prevent backward jumps.
714
- pub fn set_after_for_rtt_and_attempts(self: *Timeout, random: std.rand.Random) void {
715
- // If `after` is reduced by this function to less than `ticks`, then `fired()` will panic:
716
- assert(self.ticks == 0);
717
- assert(self.rtt > 0);
718
-
719
- const after = (self.rtt * self.rtt_multiple) + exponential_backoff_with_jitter(
720
- random,
721
- constants.backoff_min_ticks,
722
- constants.backoff_max_ticks,
723
- self.attempts,
724
- );
725
-
726
- // TODO Clamp `after` to min/max tick bounds for timeout.
727
-
728
- log.debug("{}: {s} after={}..{} (rtt={} min={} max={} attempts={})", .{
729
- self.id,
730
- self.name,
731
- self.after,
732
- after,
733
- self.rtt,
734
- constants.backoff_min_ticks,
735
- constants.backoff_max_ticks,
736
- self.attempts,
737
- });
738
-
739
- self.after = after;
740
- assert(self.after > 0);
741
- }
742
-
743
- pub fn set_rtt(self: *Timeout, rtt_ticks: u64) void {
744
- assert(self.rtt > 0);
745
- assert(rtt_ticks > 0);
746
-
747
- log.debug("{}: {s} rtt={}..{}", .{
748
- self.id,
749
- self.name,
750
- self.rtt,
751
- rtt_ticks,
752
- });
753
-
754
- self.rtt = rtt_ticks;
755
- }
756
-
757
- pub fn start(self: *Timeout) void {
758
- self.attempts = 0;
759
- self.ticks = 0;
760
- self.ticking = true;
761
- // TODO Use self.prng to adjust for rtt and attempts.
762
- log.debug("{}: {s} started", .{ self.id, self.name });
763
- }
764
-
765
- pub fn stop(self: *Timeout) void {
766
- self.attempts = 0;
767
- self.ticks = 0;
768
- self.ticking = false;
769
- log.debug("{}: {s} stopped", .{ self.id, self.name });
770
- }
771
-
772
- pub fn tick(self: *Timeout) void {
773
- if (self.ticking) self.ticks += 1;
774
- }
775
- };
776
-
777
- /// Calculates exponential backoff with jitter to prevent cascading failure due to thundering herds.
778
- pub fn exponential_backoff_with_jitter(
779
- random: std.rand.Random,
780
- min: u64,
781
- max: u64,
782
- attempt: u64,
783
- ) u64 {
784
- const range = max - min;
785
- assert(range > 0);
786
-
787
- // Do not use `@truncate(u6, attempt)` since that only discards the high bits:
788
- // We want a saturating exponent here instead.
789
- const exponent = @intCast(u6, std.math.min(std.math.maxInt(u6), attempt));
790
-
791
- // A "1" shifted left gives any power of two:
792
- // 1<<0 = 1, 1<<1 = 2, 1<<2 = 4, 1<<3 = 8
793
- const power = std.math.shlExact(u128, 1, exponent) catch unreachable; // Do not truncate.
794
-
795
- // Ensure that `backoff` is calculated correctly when min is 0, taking `std.math.max(1, min)`.
796
- // Otherwise, the final result will always be 0. This was an actual bug we encountered.
797
- const min_non_zero = std.math.max(1, min);
798
- assert(min_non_zero > 0);
799
- assert(power > 0);
800
-
801
- // Calculate the capped exponential backoff component, `min(range, min * 2 ^ attempt)`:
802
- const backoff = std.math.min(range, min_non_zero * power);
803
- const jitter = random.uintAtMostBiased(u64, backoff);
804
-
805
- const result = @intCast(u64, min + jitter);
806
- assert(result >= min);
807
- assert(result <= max);
808
-
809
- return result;
810
- }
811
-
812
- test "exponential_backoff_with_jitter" {
813
- var prng = std.rand.DefaultPrng.init(0);
814
- const random = prng.random();
815
-
816
- const attempts = 1000;
817
- const max: u64 = std.math.maxInt(u64);
818
- const min = max - attempts;
819
-
820
- var attempt = max - attempts;
821
- while (attempt < max) : (attempt += 1) {
822
- const ebwj = exponential_backoff_with_jitter(random, min, max, attempt);
823
- try std.testing.expect(ebwj >= min);
824
- try std.testing.expect(ebwj <= max);
825
- }
826
- }
827
-
828
- /// Returns an array containing the remote or local addresses of each of the 2f + 1 replicas:
829
- /// Unlike the VRR paper, we do not sort the array but leave the order explicitly to the user.
830
- /// There are several advantages to this:
831
- /// * The operator may deploy a cluster with proximity in mind since replication follows order.
832
- /// * A replica's IP address may be changed without reconfiguration.
833
- /// This does require that the user specify the same order to all replicas.
834
- /// The caller owns the memory of the returned slice of addresses.
835
- pub fn parse_addresses(allocator: std.mem.Allocator, raw: []const u8, address_limit: usize) ![]std.net.Address {
836
- const address_count = std.mem.count(u8, raw, ",") + 1;
837
- if (address_count > address_limit) return error.AddressLimitExceeded;
838
-
839
- const addresses = try allocator.alloc(std.net.Address, address_count);
840
- errdefer allocator.free(addresses);
841
-
842
- var index: usize = 0;
843
- var comma_iterator = std.mem.split(u8, raw, ",");
844
- while (comma_iterator.next()) |raw_address| : (index += 1) {
845
- assert(index < address_limit);
846
- if (raw_address.len == 0) return error.AddressHasTrailingComma;
847
- addresses[index] = try parse_address(raw_address);
848
- }
849
- assert(index == address_count);
850
-
851
- return addresses;
852
- }
853
-
854
- pub fn parse_address(raw: []const u8) !std.net.Address {
855
- var colon_iterator = std.mem.split(u8, raw, ":");
856
- // The split iterator will always return non-null once, even if the delimiter is not found:
857
- const raw_ipv4 = colon_iterator.next().?;
858
-
859
- if (colon_iterator.next()) |raw_port| {
860
- if (colon_iterator.next() != null) return error.AddressHasMoreThanOneColon;
861
-
862
- const port = std.fmt.parseUnsigned(u16, raw_port, 10) catch |err| switch (err) {
863
- error.Overflow => return error.PortOverflow,
864
- error.InvalidCharacter => return error.PortInvalid,
865
- };
866
- return std.net.Address.parseIp4(raw_ipv4, port) catch {
867
- return error.AddressInvalid;
868
- };
869
- } else {
870
- // There was no colon in the address so there are now two cases:
871
- // 1. an IPv4 address with the default port, or
872
- // 2. a port with the default IPv4 address.
873
-
874
- // Let's try parsing as a port first:
875
- if (std.fmt.parseUnsigned(u16, raw, 10)) |port| {
876
- return std.net.Address.parseIp4(constants.address, port) catch unreachable;
877
- } else |err| switch (err) {
878
- error.Overflow => return error.PortOverflow,
879
- error.InvalidCharacter => {
880
- // Something was not a digit, let's try parsing as an IPv4 instead:
881
- return std.net.Address.parseIp4(raw, constants.port) catch {
882
- return error.AddressInvalid;
883
- };
884
- },
885
- }
886
- }
887
- }
888
-
889
- test "parse_addresses" {
890
- const vectors_positive = &[_]struct {
891
- raw: []const u8,
892
- addresses: []const std.net.Address,
893
- }{
894
- .{
895
- // Test the minimum/maximum address/port.
896
- .raw = "1.2.3.4:567,0.0.0.0:0,255.255.255.255:65535",
897
- .addresses = &[3]std.net.Address{
898
- std.net.Address.initIp4([_]u8{ 1, 2, 3, 4 }, 567),
899
- std.net.Address.initIp4([_]u8{ 0, 0, 0, 0 }, 0),
900
- std.net.Address.initIp4([_]u8{ 255, 255, 255, 255 }, 65535),
901
- },
902
- },
903
- .{
904
- // Addresses are not reordered.
905
- .raw = "3.4.5.6:7777,200.3.4.5:6666,1.2.3.4:5555",
906
- .addresses = &[3]std.net.Address{
907
- std.net.Address.initIp4([_]u8{ 3, 4, 5, 6 }, 7777),
908
- std.net.Address.initIp4([_]u8{ 200, 3, 4, 5 }, 6666),
909
- std.net.Address.initIp4([_]u8{ 1, 2, 3, 4 }, 5555),
910
- },
911
- },
912
- .{
913
- // Test default address and port.
914
- .raw = "1.2.3.4:5,4321,2.3.4.5",
915
- .addresses = &[3]std.net.Address{
916
- std.net.Address.initIp4([_]u8{ 1, 2, 3, 4 }, 5),
917
- try std.net.Address.parseIp4(constants.address, 4321),
918
- std.net.Address.initIp4([_]u8{ 2, 3, 4, 5 }, constants.port),
919
- },
920
- },
921
- .{
922
- // Test addresses less than address_limit.
923
- .raw = "1.2.3.4:5,4321",
924
- .addresses = &[2]std.net.Address{
925
- std.net.Address.initIp4([_]u8{ 1, 2, 3, 4 }, 5),
926
- try std.net.Address.parseIp4(constants.address, 4321),
927
- },
928
- },
929
- };
930
-
931
- const vectors_negative = &[_]struct {
932
- raw: []const u8,
933
- err: anyerror![]std.net.Address,
934
- }{
935
- .{ .raw = "", .err = error.AddressHasTrailingComma },
936
- .{ .raw = "1.2.3.4:5,2.3.4.5:6,4.5.6.7:8", .err = error.AddressLimitExceeded },
937
- .{ .raw = "1.2.3.4:7777,", .err = error.AddressHasTrailingComma },
938
- .{ .raw = "1.2.3.4:7777,2.3.4.5::8888", .err = error.AddressHasMoreThanOneColon },
939
- .{ .raw = "1.2.3.4:5,A", .err = error.AddressInvalid }, // default port
940
- .{ .raw = "1.2.3.4:5,2.a.4.5", .err = error.AddressInvalid }, // default port
941
- .{ .raw = "1.2.3.4:5,2.a.4.5:6", .err = error.AddressInvalid }, // specified port
942
- .{ .raw = "1.2.3.4:5,2.3.4.5:", .err = error.PortInvalid },
943
- .{ .raw = "1.2.3.4:5,2.3.4.5:A", .err = error.PortInvalid },
944
- .{ .raw = "1.2.3.4:5,65536", .err = error.PortOverflow }, // default address
945
- .{ .raw = "1.2.3.4:5,2.3.4.5:65536", .err = error.PortOverflow },
946
- };
947
-
948
- for (vectors_positive) |vector| {
949
- const addresses_actual = try parse_addresses(std.testing.allocator, vector.raw, 3);
950
- defer std.testing.allocator.free(addresses_actual);
951
-
952
- try std.testing.expectEqual(addresses_actual.len, vector.addresses.len);
953
- for (vector.addresses) |address_expect, i| {
954
- const address_actual = addresses_actual[i];
955
- try std.testing.expectEqual(address_expect.in.sa.family, address_actual.in.sa.family);
956
- try std.testing.expectEqual(address_expect.in.sa.port, address_actual.in.sa.port);
957
- try std.testing.expectEqual(address_expect.in.sa.addr, address_actual.in.sa.addr);
958
- try std.testing.expectEqual(address_expect.in.sa.zero, address_actual.in.sa.zero);
959
- }
960
- }
961
-
962
- for (vectors_negative) |vector| {
963
- try std.testing.expectEqual(vector.err, parse_addresses(std.testing.allocator, vector.raw, 2));
964
- }
965
- }
966
-
967
- pub fn sector_floor(offset: u64) u64 {
968
- const sectors = math.divFloor(u64, offset, constants.sector_size) catch unreachable;
969
- return sectors * constants.sector_size;
970
- }
971
-
972
- pub fn sector_ceil(offset: u64) u64 {
973
- const sectors = math.divCeil(u64, offset, constants.sector_size) catch unreachable;
974
- return sectors * constants.sector_size;
975
- }
976
-
977
- pub fn checksum(source: []const u8) u128 {
978
- var target: [32]u8 = undefined;
979
- std.crypto.hash.Blake3.hash(source, target[0..], .{});
980
- return @bitCast(u128, target[0..@sizeOf(u128)].*);
981
- }
982
-
983
- pub fn quorums(replica_count: u8) struct {
984
- replication: u8,
985
- view_change: u8,
986
- } {
987
- const majority = @divFloor(replica_count, 2) + 1;
988
- assert(majority <= replica_count);
989
-
990
- assert(constants.quorum_replication_max >= 2);
991
- const quorum_replication = std.math.min(constants.quorum_replication_max, majority);
992
- assert(quorum_replication >= 2 or quorum_replication == replica_count);
993
-
994
- const quorum_view_change = std.math.max(
995
- replica_count - quorum_replication + 1,
996
- majority,
997
- );
998
- // The view change quorum may be more expensive to make the replication quorum cheaper.
999
- // The insight is that the replication phase is by far more common than the view change.
1000
- // This trade-off allows us to optimize for the common case.
1001
- // See the comments in `constants.zig` for further explanation.
1002
- assert(quorum_view_change >= majority);
1003
-
1004
- return .{
1005
- .replication = quorum_replication,
1006
- .view_change = quorum_view_change,
1007
- };
1008
- }
1009
-
1010
- pub const Headers = struct {
1011
- pub const Array = std.BoundedArray(Header, constants.view_change_headers_max);
1012
- /// The SuperBlock's persisted VSR headers.
1013
- /// One of the following:
1014
- ///
1015
- /// - SV headers (consecutive chain)
1016
- /// - DVC headers (disjoint chain)
1017
- pub const ViewChangeSlice = ViewChangeHeadersSlice;
1018
- pub const ViewChangeArray = ViewChangeHeadersArray;
1019
- };
1020
-
1021
- const ViewChangeHeadersSlice = struct {
1022
- /// Headers are ordered from high-to-low op.
1023
- slice: []const Header,
1024
-
1025
- pub fn init(slice: []const Header) ViewChangeHeadersSlice {
1026
- ViewChangeHeadersSlice.verify(slice);
1027
-
1028
- return .{ .slice = slice };
1029
- }
1030
-
1031
- pub fn verify(slice: []const Header) void {
1032
- assert(slice.len > 0);
1033
- assert(slice.len <= constants.view_change_headers_max);
1034
-
1035
- var child: ?*const Header = null;
1036
- for (slice) |*header| {
1037
- assert(header.valid_checksum());
1038
- assert(header.command == .prepare);
1039
-
1040
- if (child) |child_header| {
1041
- assert(header.op < child_header.op);
1042
- assert(header.view <= child_header.view);
1043
- assert((header.op + 1 == child_header.op) ==
1044
- (header.checksum == child_header.parent));
1045
- assert(header.timestamp < child_header.timestamp);
1046
- }
1047
- child = header;
1048
- }
1049
- }
1050
-
1051
- const ViewRange = struct {
1052
- min: u32, // inclusive
1053
- max: u32, // inclusive
1054
-
1055
- pub fn contains(range: ViewRange, view: u32) bool {
1056
- return range.min <= view and view <= range.max;
1057
- }
1058
- };
1059
-
1060
- /// Returns the range of possible views (of prepare, not commit) for a message that is part of
1061
- /// the same log_view as these headers.
1062
- ///
1063
- /// - When these are DVC headers for a log_view=V, we must be in view_change status working to
1064
- /// transition to a view beyond V. So we will never prepare anything else as part of view V.
1065
- /// - When these are SV headers for a log_view=V, we can continue to add to them (by preparing
1066
- /// more ops), but those ops will always be part of the log_view. If they were prepared during
1067
- /// a view prior to the log_view, they would already be part of the headers.
1068
- pub fn view_for_op(headers: ViewChangeHeadersSlice, op: u64, log_view: u32) ViewRange {
1069
- const header_newest = &headers.slice[0];
1070
- const header_oldest = &headers.slice[headers.slice.len - 1];
1071
-
1072
- if (op < header_oldest.op) return .{ .min = 0, .max = header_oldest.view };
1073
- if (op > header_newest.op) return .{ .min = log_view, .max = log_view };
1074
-
1075
- for (headers.slice) |*header| {
1076
- if (header.op == op) return .{ .min = header.view, .max = header.view };
1077
- }
1078
-
1079
- for (headers.slice[0 .. headers.slice.len - 1]) |*header_next, header_next_index| {
1080
- const header_prev = headers.slice[header_next_index + 1];
1081
- if (header_prev.op < op and op < header_next.op) {
1082
- return .{ .min = header_prev.view, .max = header_next.view };
1083
- }
1084
- }
1085
- unreachable;
1086
- }
1087
- };
1088
-
1089
- test "Headers.ViewChangeSlice.view_for_op" {
1090
- var headers_array = [_]Header{
1091
- std.mem.zeroInit(Header, .{ .op = 9, .view = 10 }),
1092
- std.mem.zeroInit(Header, .{ .op = 6, .view = 7 }),
1093
- };
1094
-
1095
- const headers = Headers.ViewChangeSlice{ .slice = &headers_array };
1096
- try std.testing.expect(std.meta.eql(headers.view_for_op(11, 12), .{ .min = 12, .max = 12 }));
1097
- try std.testing.expect(std.meta.eql(headers.view_for_op(10, 12), .{ .min = 12, .max = 12 }));
1098
- try std.testing.expect(std.meta.eql(headers.view_for_op(9, 12), .{ .min = 10, .max = 10 }));
1099
- try std.testing.expect(std.meta.eql(headers.view_for_op(8, 12), .{ .min = 7, .max = 10 }));
1100
- try std.testing.expect(std.meta.eql(headers.view_for_op(7, 12), .{ .min = 7, .max = 10 }));
1101
- try std.testing.expect(std.meta.eql(headers.view_for_op(6, 12), .{ .min = 7, .max = 7 }));
1102
- try std.testing.expect(std.meta.eql(headers.view_for_op(5, 12), .{ .min = 0, .max = 7 }));
1103
- try std.testing.expect(std.meta.eql(headers.view_for_op(0, 12), .{ .min = 0, .max = 7 }));
1104
- }
1105
-
1106
- /// The headers of a SV or DVC message.
1107
- const ViewChangeHeadersArray = struct {
1108
- array: Headers.Array,
1109
-
1110
- pub fn root(cluster: u32) ViewChangeHeadersArray {
1111
- var array = Headers.Array{ .buffer = undefined };
1112
- array.appendAssumeCapacity(Header.root_prepare(cluster));
1113
- return ViewChangeHeadersArray.init(array);
1114
- }
1115
-
1116
- fn init(array: Headers.Array) ViewChangeHeadersArray {
1117
- Headers.ViewChangeSlice.verify(array.constSlice());
1118
- return .{ .array = array };
1119
- }
1120
-
1121
- /// This function generates either DVC headers or SV headers:
1122
- /// - When `current.log_view < current.view`, generate headers for a DVC message.
1123
- /// - When `current.log_view = current.view`, generate headers for a SV message.
1124
- ///
1125
- /// Additionally, the current log_view/view/primary state informs the sort of "faults"
1126
- /// (gaps/breaks/etc) that we expect to find in the journal headers (`current.headers`).
1127
- /// For example, backups generating a DVC can safely skip over gaps (if the gap is after the DVC
1128
- /// anchor).
1129
- ///
1130
- /// Primaries and backups both generate DVCs and SVs.
1131
- /// - However, SVs are broadcast only by the primary.
1132
- /// - Backups generate a SV for persisting to the superblock.
1133
- /// (For convenience/symmetry, not correctness).
1134
- ///
1135
- /// DVCs and SVs have different invariants they must abide by.
1136
- /// - Read DVCQuorum's comments to understand DVC invariants.
1137
- /// - SV headers are much simpler: no gaps or breaks, and all uncommitted ops must be included.
1138
- pub fn build(
1139
- results: *ViewChangeHeadersArray,
1140
- options: struct {
1141
- op_checkpoint: u64,
1142
- /// The last view_change_headers_max headers of the journal, starting with the head op
1143
- /// then descending, skipping over all gaps.
1144
- current: struct {
1145
- headers: *const Headers.Array,
1146
- view: u32,
1147
- log_view: u32,
1148
- log_view_primary: bool,
1149
- },
1150
- // The vsr_headers from the working superblock.
1151
- // The durable headers are useful (complementing `current.headers`) because:
1152
- // - They simplify generation of DVCs in the case where we are recovering from a crash,
1153
- // when we were generating the same DVC prior to the crash.
1154
- // - They enable additional verification of header gaps/breaks based on the
1155
- // gap's/break's position relative to the durable headers.
1156
- durable: struct {
1157
- headers: Headers.ViewChangeSlice,
1158
- view: u32,
1159
- log_view: u32,
1160
- log_view_primary: bool,
1161
- },
1162
- },
1163
- ) void {
1164
- defer Headers.ViewChangeSlice.verify(results.array.constSlice());
1165
-
1166
- const headers = &results.array;
1167
- const current = options.current;
1168
- const durable = options.durable;
1169
-
1170
- assert(headers.len == 0);
1171
- assert(durable.headers.slice.len > 0);
1172
- assert(current.headers.len > 0);
1173
- for (current.headers.constSlice()[1..]) |*header, i| {
1174
- assert(current.headers.get(i).op > header.op);
1175
- }
1176
-
1177
- assert(current.view >= durable.view);
1178
- assert(current.log_view >= durable.log_view);
1179
- assert(current.view >= current.log_view);
1180
- assert(durable.view >= durable.log_view);
1181
-
1182
- const op_head_current = current.headers.get(0).op;
1183
- const op_head_durable = durable.headers.slice[0].op;
1184
-
1185
- // The rules for generating DVCs and SVs differ. We use the current view numbers to
1186
- // determine which is being generated:
1187
- // - When `log_view < view`, generate a DVC.
1188
- // - When `log_view = view`, generate a SV.
1189
- const command_current: enum { start_view, do_view_change } =
1190
- if (current.log_view == current.view) .start_view else .do_view_change;
1191
- // Likewise, the durable view numbers identify whether the durable headers were from a past
1192
- // DVC or SV. The durable headers are only useful if they are from the same view as our
1193
- // current headers, though.
1194
- const command_durable: enum { start_view, do_view_change, outdated } = command: {
1195
- if (durable.log_view == current.log_view) {
1196
- if (durable.log_view == durable.view) {
1197
- break :command .start_view;
1198
- } else {
1199
- break :command .do_view_change;
1200
- }
1201
- } else {
1202
- break :command .outdated;
1203
- }
1204
- };
1205
-
1206
- if (command_durable == .do_view_change and command_current == .do_view_change) {
1207
- assert(op_head_durable == op_head_current);
1208
- // Ensure that if we started a DVC before a crash, that we will resume sending the exact
1209
- // same DVC after recovery. (An alternative implementation would be to load the
1210
- // superblock's DVC headers (including gaps) into the journal during Replica.open(), but
1211
- // that is more complicated to implement correctly).
1212
- for (durable.headers.slice) |*header| headers.appendAssumeCapacity(header.*);
1213
- return;
1214
- }
1215
-
1216
- // What is the relationship between two prepares?
1217
- const Chain = enum {
1218
- // The ops are sequential, and the hash-chain is valid.
1219
- chain_sequence,
1220
- // The ops are sequential, and the hash-chain is invalid.
1221
- chain_break,
1222
- // The ops are non-sequential, and belong to the same view.
1223
- // This gap never hides a break.
1224
- chain_view,
1225
- // The ops are non-sequential, and belong to different views.
1226
- // Depending on the replica state, this gap may hide a break.
1227
- chain_gap,
1228
- };
1229
-
1230
- // The DVC anchor: Within the log suffix following the anchor, we have additional
1231
- // guarantees about the state of the log headers which allow us to tolerate certain
1232
- // gaps (by locally guaranteeing that the gap does not hide a break).
1233
- const op_dvc_anchor = std.math.max(
1234
- options.op_checkpoint,
1235
- // +1: We may have a full pipeline, but not yet have performed any repair.
1236
- // In such a case, we want to send those pipeline_prepare_queue_max headers in
1237
- // the DVC, but not the preceding op (which may belong to a different chain).
1238
- // This satisfies the DVC invariant because the first op in the pipeline is
1239
- // "connected" to the canonical chain (via its "parent" checksum).
1240
- 1 + op_head_current -| constants.pipeline_prepare_queue_max,
1241
- );
1242
-
1243
- // Within the "suffix" we can make additional assumptions about gaps/etc.
1244
- // After the suffix, we just add as many extra (valid) headers as we can fit.
1245
- var suffix_done = false;
1246
-
1247
- for (current.headers.constSlice()) |*header, i| {
1248
- const op = header.op;
1249
- const chain = chain: {
1250
- // Always include the head message.
1251
- if (i == 0) break :chain Chain.chain_sequence;
1252
-
1253
- const child = headers.get(i - 1);
1254
- if (child.op == header.op + 1) {
1255
- break :chain if (child.parent == header.checksum) Chain.chain_sequence else Chain.chain_break;
1256
- } else {
1257
- break :chain if (child.view == header.view) Chain.chain_view else Chain.chain_gap;
1258
- }
1259
- };
1260
-
1261
- if (command_current == .start_view) {
1262
- // Primary: Collect headers for a start_view message.
1263
- // Backup: these headers are stored in the superblock's vsr_headers.
1264
- switch (chain) {
1265
- .chain_sequence => {},
1266
- // Gaps are due to either:
1267
- // - entries before checkpoint, which are not repaired, or
1268
- // - backup missed prepares and has not repaired headers. (Immediately after
1269
- // receiving a start_view this is not a concern, but the view_durable_update()
1270
- // may be delayed if another is in progress).
1271
- .chain_view, .chain_gap => {
1272
- assert(op <= options.op_checkpoint or !current.log_view_primary);
1273
- break;
1274
- },
1275
- // Breaks are due to:
1276
- // - entries before checkpoint, which are not repaired
1277
- .chain_break => {
1278
- assert(op <= options.op_checkpoint);
1279
- break;
1280
- },
1281
- }
1282
- } else if (suffix_done) {
1283
- // Add extra headers to the DVC. These are not required for correctness or
1284
- // availability, but including extra (correct) headers minimizes header repair at
1285
- // the new primary.
1286
- switch (chain) {
1287
- .chain_sequence => {},
1288
- .chain_view => {},
1289
- // Outside of the log suffix, repair may not have been finished, so gaps and
1290
- // breaks are possible. Non-same-view gaps may hide breaks.
1291
- .chain_gap => break,
1292
- .chain_break => break,
1293
- }
1294
- } else if (current.log_view_primary and command_durable == .start_view) {
1295
- switch (chain) {
1296
- .chain_sequence => {},
1297
- // Gaps to the right of the (durable) SV originate from:
1298
- // 1. The primary (durable SV: 1,2,3) prepares several ops (4,5,6).
1299
- // 2. However, the WAL writes are reordered such that some later ops (5,6)
1300
- // finish before an earlier op (4).
1301
- // 3. Crash, recover. Start sending a DVC for the next view. Either:
1302
- // - There is a gap in the WAL at op=4, but this is to the right of the
1303
- // durable SV, so it may be safely skipped.
1304
- // - Same as above, except op=4 was a torn write (or bit rot).
1305
- .chain_view, .chain_gap => assert(op + 1 > op_head_durable),
1306
- // Breaks are impossible to the right of the durable SV — journal recovery uses
1307
- // the durable SV to prune bad headers by their view numbers.
1308
- .chain_break => unreachable,
1309
- }
1310
- suffix_done = op <= op_head_durable;
1311
- } else if (current.log_view_primary and command_durable != .start_view) {
1312
- switch (chain) {
1313
- .chain_sequence => {},
1314
- .chain_view => {},
1315
- // The retiring primary may have gap-breaks or breaks in its suffix iff:
1316
- // - it didn't finish repairs before the second view-change, and
1317
- // - some uncommitted ops were truncated during the first view-change.
1318
- // (Truncation "moves" the suffix backwards).
1319
- .chain_gap => break,
1320
- .chain_break => break,
1321
- }
1322
- suffix_done = op <= op_dvc_anchor;
1323
- } else if (!current.log_view_primary and command_durable == .start_view) {
1324
- switch (chain) {
1325
- .chain_sequence => {},
1326
- // Backups load a full suffix of headers from the view's SV message. If there
1327
- // is now a gap in the backup's suffix, this must be due to missed prepares.
1328
- .chain_view, .chain_gap => assert(op + 1 > op_head_durable),
1329
- // Breaks are impossible to the right of the durable SV — journal recovery uses
1330
- // the durable SV to prune bad headers by their view numbers.
1331
- .chain_break => unreachable,
1332
- }
1333
- suffix_done = op <= op_head_durable;
1334
- } else if (!current.log_view_primary and command_durable != .start_view) {
1335
- switch (chain) {
1336
- .chain_sequence => {},
1337
- .chain_view => {},
1338
- // Backups load a full suffix of headers from the view's SV message.
1339
- // That SV isn't durable, but it is part of the journal, so any gaps to its
1340
- // right must be due to missed prepares.
1341
- .chain_gap => {},
1342
- // Breaks are impossible to the right of the ephemeral SV, since the log was
1343
- // truncated when the SV was installed.
1344
- .chain_break => unreachable,
1345
- }
1346
- suffix_done = op <= op_dvc_anchor;
1347
- } else unreachable;
1348
-
1349
- headers.appendAssumeCapacity(header.*);
1350
- }
1351
- }
1352
- };