tigerbeetle-node 0.11.13 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/README.md +5 -10
  2. package/dist/bin/aarch64-linux-gnu/client.node +0 -0
  3. package/dist/bin/aarch64-linux-musl/client.node +0 -0
  4. package/dist/bin/aarch64-macos/client.node +0 -0
  5. package/dist/bin/x86_64-linux-gnu/client.node +0 -0
  6. package/dist/bin/x86_64-linux-musl/client.node +0 -0
  7. package/dist/bin/x86_64-macos/client.node +0 -0
  8. package/dist/index.js +33 -1
  9. package/dist/index.js.map +1 -1
  10. package/package-lock.json +66 -0
  11. package/package.json +6 -16
  12. package/src/index.ts +56 -1
  13. package/src/node.zig +9 -9
  14. package/dist/.client.node.sha256 +0 -1
  15. package/scripts/build_lib.sh +0 -61
  16. package/scripts/download_node_headers.sh +0 -32
  17. package/src/tigerbeetle/scripts/benchmark.bat +0 -55
  18. package/src/tigerbeetle/scripts/benchmark.sh +0 -66
  19. package/src/tigerbeetle/scripts/confirm_image.sh +0 -44
  20. package/src/tigerbeetle/scripts/fail_on_diff.sh +0 -9
  21. package/src/tigerbeetle/scripts/fuzz_loop.sh +0 -15
  22. package/src/tigerbeetle/scripts/fuzz_loop_hash_log.sh +0 -12
  23. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +0 -7
  24. package/src/tigerbeetle/scripts/install.bat +0 -7
  25. package/src/tigerbeetle/scripts/install.sh +0 -21
  26. package/src/tigerbeetle/scripts/install_zig.bat +0 -113
  27. package/src/tigerbeetle/scripts/install_zig.sh +0 -90
  28. package/src/tigerbeetle/scripts/lint.zig +0 -199
  29. package/src/tigerbeetle/scripts/pre-commit.sh +0 -9
  30. package/src/tigerbeetle/scripts/scripts/benchmark.bat +0 -55
  31. package/src/tigerbeetle/scripts/scripts/benchmark.sh +0 -66
  32. package/src/tigerbeetle/scripts/scripts/confirm_image.sh +0 -44
  33. package/src/tigerbeetle/scripts/scripts/fail_on_diff.sh +0 -9
  34. package/src/tigerbeetle/scripts/scripts/fuzz_loop.sh +0 -15
  35. package/src/tigerbeetle/scripts/scripts/fuzz_loop_hash_log.sh +0 -12
  36. package/src/tigerbeetle/scripts/scripts/fuzz_unique_errors.sh +0 -7
  37. package/src/tigerbeetle/scripts/scripts/install.bat +0 -7
  38. package/src/tigerbeetle/scripts/scripts/install.sh +0 -21
  39. package/src/tigerbeetle/scripts/scripts/install_zig.bat +0 -113
  40. package/src/tigerbeetle/scripts/scripts/install_zig.sh +0 -90
  41. package/src/tigerbeetle/scripts/scripts/lint.zig +0 -199
  42. package/src/tigerbeetle/scripts/scripts/pre-commit.sh +0 -9
  43. package/src/tigerbeetle/scripts/scripts/shellcheck.sh +0 -5
  44. package/src/tigerbeetle/scripts/scripts/tests_on_alpine.sh +0 -10
  45. package/src/tigerbeetle/scripts/scripts/tests_on_ubuntu.sh +0 -14
  46. package/src/tigerbeetle/scripts/scripts/upgrade_ubuntu_kernel.sh +0 -48
  47. package/src/tigerbeetle/scripts/scripts/validate_docs.sh +0 -23
  48. package/src/tigerbeetle/scripts/scripts/vr_state_enumerate +0 -46
  49. package/src/tigerbeetle/scripts/shellcheck.sh +0 -5
  50. package/src/tigerbeetle/scripts/tests_on_alpine.sh +0 -10
  51. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +0 -14
  52. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -48
  53. package/src/tigerbeetle/scripts/validate_docs.sh +0 -23
  54. package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
  55. package/src/tigerbeetle/src/benchmark.zig +0 -336
  56. package/src/tigerbeetle/src/config.zig +0 -233
  57. package/src/tigerbeetle/src/constants.zig +0 -428
  58. package/src/tigerbeetle/src/ewah.zig +0 -286
  59. package/src/tigerbeetle/src/ewah_benchmark.zig +0 -120
  60. package/src/tigerbeetle/src/ewah_fuzz.zig +0 -130
  61. package/src/tigerbeetle/src/fifo.zig +0 -120
  62. package/src/tigerbeetle/src/io/benchmark.zig +0 -213
  63. package/src/tigerbeetle/src/io/darwin.zig +0 -814
  64. package/src/tigerbeetle/src/io/linux.zig +0 -1071
  65. package/src/tigerbeetle/src/io/test.zig +0 -643
  66. package/src/tigerbeetle/src/io/windows.zig +0 -1183
  67. package/src/tigerbeetle/src/io.zig +0 -34
  68. package/src/tigerbeetle/src/iops.zig +0 -107
  69. package/src/tigerbeetle/src/lsm/README.md +0 -308
  70. package/src/tigerbeetle/src/lsm/binary_search.zig +0 -341
  71. package/src/tigerbeetle/src/lsm/bloom_filter.zig +0 -125
  72. package/src/tigerbeetle/src/lsm/compaction.zig +0 -603
  73. package/src/tigerbeetle/src/lsm/composite_key.zig +0 -77
  74. package/src/tigerbeetle/src/lsm/direction.zig +0 -11
  75. package/src/tigerbeetle/src/lsm/eytzinger.zig +0 -587
  76. package/src/tigerbeetle/src/lsm/eytzinger_benchmark.zig +0 -330
  77. package/src/tigerbeetle/src/lsm/forest.zig +0 -205
  78. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +0 -450
  79. package/src/tigerbeetle/src/lsm/grid.zig +0 -573
  80. package/src/tigerbeetle/src/lsm/groove.zig +0 -1036
  81. package/src/tigerbeetle/src/lsm/k_way_merge.zig +0 -474
  82. package/src/tigerbeetle/src/lsm/level_iterator.zig +0 -332
  83. package/src/tigerbeetle/src/lsm/manifest.zig +0 -617
  84. package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -878
  85. package/src/tigerbeetle/src/lsm/manifest_log.zig +0 -789
  86. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +0 -691
  87. package/src/tigerbeetle/src/lsm/merge_iterator.zig +0 -106
  88. package/src/tigerbeetle/src/lsm/node_pool.zig +0 -235
  89. package/src/tigerbeetle/src/lsm/posted_groove.zig +0 -381
  90. package/src/tigerbeetle/src/lsm/segmented_array.zig +0 -1329
  91. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +0 -148
  92. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +0 -9
  93. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +0 -850
  94. package/src/tigerbeetle/src/lsm/table.zig +0 -1009
  95. package/src/tigerbeetle/src/lsm/table_immutable.zig +0 -192
  96. package/src/tigerbeetle/src/lsm/table_iterator.zig +0 -340
  97. package/src/tigerbeetle/src/lsm/table_mutable.zig +0 -203
  98. package/src/tigerbeetle/src/lsm/test.zig +0 -439
  99. package/src/tigerbeetle/src/lsm/tree.zig +0 -1169
  100. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +0 -479
  101. package/src/tigerbeetle/src/message_bus.zig +0 -1013
  102. package/src/tigerbeetle/src/message_pool.zig +0 -156
  103. package/src/tigerbeetle/src/ring_buffer.zig +0 -399
  104. package/src/tigerbeetle/src/simulator.zig +0 -580
  105. package/src/tigerbeetle/src/state_machine/auditor.zig +0 -578
  106. package/src/tigerbeetle/src/state_machine/workload.zig +0 -883
  107. package/src/tigerbeetle/src/state_machine.zig +0 -2099
  108. package/src/tigerbeetle/src/static_allocator.zig +0 -65
  109. package/src/tigerbeetle/src/stdx.zig +0 -171
  110. package/src/tigerbeetle/src/storage.zig +0 -393
  111. package/src/tigerbeetle/src/testing/cluster/message_bus.zig +0 -82
  112. package/src/tigerbeetle/src/testing/cluster/network.zig +0 -237
  113. package/src/tigerbeetle/src/testing/cluster/state_checker.zig +0 -169
  114. package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +0 -202
  115. package/src/tigerbeetle/src/testing/cluster.zig +0 -444
  116. package/src/tigerbeetle/src/testing/fuzz.zig +0 -140
  117. package/src/tigerbeetle/src/testing/hash_log.zig +0 -66
  118. package/src/tigerbeetle/src/testing/id.zig +0 -99
  119. package/src/tigerbeetle/src/testing/packet_simulator.zig +0 -374
  120. package/src/tigerbeetle/src/testing/priority_queue.zig +0 -645
  121. package/src/tigerbeetle/src/testing/reply_sequence.zig +0 -139
  122. package/src/tigerbeetle/src/testing/state_machine.zig +0 -250
  123. package/src/tigerbeetle/src/testing/storage.zig +0 -757
  124. package/src/tigerbeetle/src/testing/table.zig +0 -247
  125. package/src/tigerbeetle/src/testing/time.zig +0 -84
  126. package/src/tigerbeetle/src/tigerbeetle.zig +0 -227
  127. package/src/tigerbeetle/src/time.zig +0 -112
  128. package/src/tigerbeetle/src/tracer.zig +0 -529
  129. package/src/tigerbeetle/src/unit_tests.zig +0 -40
  130. package/src/tigerbeetle/src/vopr.zig +0 -495
  131. package/src/tigerbeetle/src/vsr/README.md +0 -209
  132. package/src/tigerbeetle/src/vsr/client.zig +0 -544
  133. package/src/tigerbeetle/src/vsr/clock.zig +0 -855
  134. package/src/tigerbeetle/src/vsr/journal.zig +0 -2415
  135. package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +0 -111
  136. package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
  137. package/src/tigerbeetle/src/vsr/replica.zig +0 -6616
  138. package/src/tigerbeetle/src/vsr/replica_format.zig +0 -219
  139. package/src/tigerbeetle/src/vsr/superblock.zig +0 -1631
  140. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +0 -256
  141. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +0 -929
  142. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +0 -334
  143. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +0 -390
  144. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +0 -615
  145. package/src/tigerbeetle/src/vsr/superblock_quorums.zig +0 -394
  146. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +0 -314
  147. package/src/tigerbeetle/src/vsr.zig +0 -1425
@@ -1,1425 +0,0 @@
1
- const std = @import("std");
2
- const math = std.math;
3
- const Allocator = std.mem.Allocator;
4
- const assert = std.debug.assert;
5
- const log = std.log.scoped(.vsr);
6
-
7
- // vsr.zig is the root of a zig package, reexport all public APIs.
8
- //
9
- // Note that we don't promise any stability of these interfaces yet.
10
- pub const constants = @import("constants.zig");
11
- pub const io = @import("io.zig");
12
- pub const message_bus = @import("message_bus.zig");
13
- pub const message_pool = @import("message_pool.zig");
14
- pub const state_machine = @import("state_machine.zig");
15
- pub const storage = @import("storage.zig");
16
- pub const tigerbeetle = @import("tigerbeetle.zig");
17
- pub const time = @import("time.zig");
18
- pub const tracer = @import("tracer.zig");
19
- pub const config = @import("config.zig");
20
- pub const stdx = @import("stdx.zig");
21
- pub const superblock = @import("vsr/superblock.zig");
22
- pub const lsm = .{
23
- .tree = @import("lsm/tree.zig"),
24
- .grid = @import("lsm/grid.zig"),
25
- .groove = @import("lsm/groove.zig"),
26
- .forest = @import("lsm/forest.zig"),
27
- .posted_groove = @import("lsm/posted_groove.zig"),
28
- };
29
- pub const testing = .{
30
- .cluster = @import("testing/cluster.zig"),
31
- };
32
-
33
- pub const ReplicaType = @import("vsr/replica.zig").ReplicaType;
34
- pub const format = @import("vsr/replica_format.zig").format;
35
- pub const Status = @import("vsr/replica.zig").Status;
36
- pub const Client = @import("vsr/client.zig").Client;
37
- pub const Clock = @import("vsr/clock.zig").Clock;
38
- pub const JournalType = @import("vsr/journal.zig").JournalType;
39
- pub const SlotRange = @import("vsr/journal.zig").SlotRange;
40
- pub const SuperBlockType = superblock.SuperBlockType;
41
- pub const VSRState = superblock.SuperBlockHeader.VSRState;
42
-
43
- /// The version of our Viewstamped Replication protocol in use, including customizations.
44
- /// For backwards compatibility through breaking changes (e.g. upgrading checksums/ciphers).
45
- pub const Version: u8 = 0;
46
-
47
- pub const ProcessType = enum { replica, client };
48
-
49
- pub const Zone = enum {
50
- superblock,
51
- wal_headers,
52
- wal_prepares,
53
- grid,
54
-
55
- const size_superblock = superblock.superblock_zone_size;
56
- const size_wal_headers = constants.journal_size_headers;
57
- const size_wal_prepares = constants.journal_size_prepares;
58
-
59
- comptime {
60
- for (.{
61
- size_superblock,
62
- size_wal_headers,
63
- size_wal_prepares,
64
- }) |zone_size| {
65
- assert(zone_size % constants.sector_size == 0);
66
- }
67
- }
68
-
69
- pub fn offset(zone: Zone, offset_logical: u64) u64 {
70
- if (zone.size()) |zone_size| {
71
- assert(offset_logical < zone_size);
72
- }
73
-
74
- return offset_logical + switch (zone) {
75
- .superblock => 0,
76
- .wal_headers => size_superblock,
77
- .wal_prepares => size_superblock + size_wal_headers,
78
- .grid => size_superblock + size_wal_headers + size_wal_prepares,
79
- };
80
- }
81
-
82
- pub fn size(zone: Zone) ?u64 {
83
- return switch (zone) {
84
- .superblock => size_superblock,
85
- .wal_headers => size_wal_headers,
86
- .wal_prepares => size_wal_prepares,
87
- .grid => null,
88
- };
89
- }
90
- };
91
-
92
- /// Viewstamped Replication protocol commands:
93
- pub const Command = enum(u8) {
94
- reserved,
95
-
96
- ping,
97
- pong,
98
-
99
- ping_client,
100
- pong_client,
101
-
102
- request,
103
- prepare,
104
- prepare_ok,
105
- reply,
106
- commit,
107
-
108
- start_view_change,
109
- do_view_change,
110
- start_view,
111
-
112
- request_start_view,
113
- request_headers,
114
- request_prepare,
115
- headers,
116
- nack_prepare,
117
-
118
- eviction,
119
-
120
- request_block,
121
- block,
122
- };
123
-
124
- /// This type exists to avoid making the Header type dependant on the state
125
- /// machine used, which would cause awkward circular type dependencies.
126
- pub const Operation = enum(u8) {
127
- /// Operations reserved by VR protocol (for all state machines):
128
- /// The value 0 is reserved to prevent a spurious zero from being interpreted as an operation.
129
- reserved = 0,
130
- /// The value 1 is reserved to initialize the cluster.
131
- root = 1,
132
- /// The value 2 is reserved to register a client session with the cluster.
133
- register = 2,
134
-
135
- /// Operations exported by the state machine (all other values are free):
136
- _,
137
-
138
- pub fn from(comptime StateMachine: type, op: StateMachine.Operation) Operation {
139
- check_state_machine_operations(StateMachine.Operation);
140
- return @intToEnum(Operation, @enumToInt(op));
141
- }
142
-
143
- pub fn cast(self: Operation, comptime StateMachine: type) StateMachine.Operation {
144
- check_state_machine_operations(StateMachine.Operation);
145
- return @intToEnum(StateMachine.Operation, @enumToInt(self));
146
- }
147
-
148
- fn check_state_machine_operations(comptime Op: type) void {
149
- if (!@hasField(Op, "reserved") or std.meta.fieldInfo(Op, .reserved).value != 0) {
150
- @compileError("StateMachine.Operation must have a 'reserved' field with value 0");
151
- }
152
- if (!@hasField(Op, "root") or std.meta.fieldInfo(Op, .root).value != 1) {
153
- @compileError("StateMachine.Operation must have a 'root' field with value 1");
154
- }
155
- if (!@hasField(Op, "register") or std.meta.fieldInfo(Op, .register).value != 2) {
156
- @compileError("StateMachine.Operation must have a 'register' field with value 2");
157
- }
158
- }
159
- };
160
-
161
- /// Network message and journal entry header:
162
- /// We reuse the same header for both so that prepare messages from the primary can simply be
163
- /// journalled as is by the backups without requiring any further modification.
164
- pub const Header = extern struct {
165
- const checksum_body_empty = checksum(&.{});
166
-
167
- comptime {
168
- assert(@sizeOf(Header) == 128);
169
- // Assert that there is no implicit padding in the struct.
170
- assert(@bitSizeOf(Header) == @sizeOf(Header) * 8);
171
- }
172
- /// A checksum covering only the remainder of this header.
173
- /// This allows the header to be trusted without having to recv() or read() the associated body.
174
- /// This checksum is enough to uniquely identify a network message or journal entry.
175
- checksum: u128 = 0,
176
-
177
- /// A checksum covering only the associated body after this header.
178
- checksum_body: u128 = 0,
179
-
180
- /// A backpointer to the previous request or prepare checksum for hash chain verification.
181
- /// This provides a cryptographic guarantee for linearizability:
182
- /// 1. across our distributed log of prepares, and
183
- /// 2. across a client's requests and our replies.
184
- /// This may also be used as the initialization vector for AEAD encryption at rest, provided
185
- /// that the primary ratchets the encryption key every view change to ensure that prepares
186
- /// reordered through a view change never repeat the same IV for the same encryption key.
187
- parent: u128 = 0,
188
-
189
- /// Each client process generates a unique, random and ephemeral client ID at initialization.
190
- /// The client ID identifies connections made by the client to the cluster for the sake of
191
- /// routing messages back to the client.
192
- ///
193
- /// With the client ID in hand, the client then registers a monotonically increasing session
194
- /// number (committed through the cluster) to allow the client's session to be evicted safely
195
- /// from the client table if too many concurrent clients cause the client table to overflow.
196
- /// The monotonically increasing session number prevents duplicate client requests from being
197
- /// replayed.
198
- ///
199
- /// The problem of routing is therefore solved by the 128-bit client ID, and the problem of
200
- /// detecting whether a session has been evicted is solved by the session number.
201
- client: u128 = 0,
202
-
203
- /// The checksum of the message to which this message refers, or a unique recovery nonce.
204
- ///
205
- /// We use this cryptographic context in various ways, for example:
206
- ///
207
- /// * A `request` sets this to the client's session number.
208
- /// * A `prepare` sets this to the checksum of the client's request.
209
- /// * A `prepare_ok` sets this to the checksum of the prepare being acked.
210
- /// * A `commit` sets this to the checksum of the latest committed prepare.
211
- /// * A `request_prepare` sets this to the checksum of the prepare being requested.
212
- /// * A `nack_prepare` sets this to the checksum of the prepare being nacked.
213
- /// * A `recovery` and `recovery_response` sets this to the nonce.
214
- ///
215
- /// This allows for cryptographic guarantees beyond request, op, and commit numbers, which have
216
- /// low entropy and may otherwise collide in the event of any correctness bugs.
217
- context: u128 = 0,
218
-
219
- /// Each request is given a number by the client and later requests must have larger numbers
220
- /// than earlier ones. The request number is used by the replicas to avoid running requests more
221
- /// than once; it is also used by the client to discard duplicate responses to its requests.
222
- /// A client is allowed to have at most one request inflight at a time.
223
- request: u32 = 0,
224
-
225
- /// The cluster number binds intention into the header, so that a client or replica can indicate
226
- /// the cluster it believes it is speaking to, instead of accidentally talking to the wrong
227
- /// cluster (for example, staging vs production).
228
- cluster: u32,
229
-
230
- /// The cluster reconfiguration epoch number (for future use).
231
- epoch: u32 = 0,
232
-
233
- /// Every message sent from one replica to another contains the sending replica's current view.
234
- /// A `u32` allows for a minimum lifetime of 136 years at a rate of one view change per second.
235
- view: u32 = 0,
236
-
237
- /// The op number of the latest prepare that may or may not yet be committed. Uncommitted ops
238
- /// may be replaced by different ops if they do not survive through a view change.
239
- op: u64 = 0,
240
-
241
- /// The commit number of the latest committed prepare. Committed ops are immutable.
242
- ///
243
- /// * A `do_view_change` sets this to `commit_min`, to indicate the sending replica's progress.
244
- /// The sending replica may continue to commit after sending the DVC.
245
- /// * A `start_view` sets this to `commit_max`.
246
- commit: u64 = 0,
247
-
248
- /// This field is used in various ways:
249
- ///
250
- /// * A `prepare` sets this to the primary's state machine `prepare_timestamp`.
251
- /// For `create_accounts` and `create_transfers` this is the batch's highest timestamp.
252
- /// * A `reply` sets this to the corresponding `prepare`'s timestamp.
253
- /// This allows the test workload to verify transfer timeouts.
254
- /// * A `do_view_change` sets this to the latest normal view number.
255
- /// * A `pong` sets this to the sender's wall clock value.
256
- /// * A `request_prepare` sets this to `1` when `context` is set to a checksum, and `0`
257
- /// otherwise.
258
- /// * A `commit` message sets this to the replica's monotonic timestamp.
259
- timestamp: u64 = 0,
260
-
261
- /// The size of the Header structure (always), plus any associated body.
262
- size: u32 = @sizeOf(Header),
263
-
264
- /// The index of the replica in the cluster configuration array that authored this message.
265
- /// This identifies only the ultimate author because messages may be forwarded amongst replicas.
266
- replica: u8 = 0,
267
-
268
- /// The Viewstamped Replication protocol command for this message.
269
- command: Command,
270
-
271
- /// The state machine operation to apply.
272
- operation: Operation = .reserved,
273
-
274
- /// The version of the protocol implementation that originated this message.
275
- version: u8 = Version,
276
-
277
- pub fn calculate_checksum(self: *const Header) u128 {
278
- const checksum_size = @sizeOf(@TypeOf(self.checksum));
279
- assert(checksum_size == 16);
280
- const checksum_value = checksum(std.mem.asBytes(self)[checksum_size..]);
281
- assert(@TypeOf(checksum_value) == @TypeOf(self.checksum));
282
- return checksum_value;
283
- }
284
-
285
- pub fn calculate_checksum_body(self: *const Header, body: []const u8) u128 {
286
- assert(self.size == @sizeOf(Header) + body.len);
287
- const checksum_size = @sizeOf(@TypeOf(self.checksum_body));
288
- assert(checksum_size == 16);
289
- const checksum_value = checksum(body);
290
- assert(@TypeOf(checksum_value) == @TypeOf(self.checksum_body));
291
- return checksum_value;
292
- }
293
-
294
- /// This must be called only after set_checksum_body() so that checksum_body is also covered:
295
- pub fn set_checksum(self: *Header) void {
296
- self.checksum = self.calculate_checksum();
297
- }
298
-
299
- pub fn set_checksum_body(self: *Header, body: []const u8) void {
300
- self.checksum_body = self.calculate_checksum_body(body);
301
- }
302
-
303
- pub fn valid_checksum(self: *const Header) bool {
304
- return self.checksum == self.calculate_checksum();
305
- }
306
-
307
- pub fn valid_checksum_body(self: *const Header, body: []const u8) bool {
308
- return self.checksum_body == self.calculate_checksum_body(body);
309
- }
310
-
311
- /// Returns null if all fields are set correctly according to the command, or else a warning.
312
- /// This does not verify that checksum is valid, and expects that this has already been done.
313
- pub fn invalid(self: *const Header) ?[]const u8 {
314
- if (self.version != Version) return "version != Version";
315
- if (self.size < @sizeOf(Header)) return "size < @sizeOf(Header)";
316
- if (self.epoch != 0) return "epoch != 0";
317
- return switch (self.command) {
318
- .reserved => self.invalid_reserved(),
319
- .ping => self.invalid_ping(),
320
- .pong => self.invalid_pong(),
321
- .ping_client => self.invalid_ping_client(),
322
- .pong_client => self.invalid_pong_client(),
323
- .request => self.invalid_request(),
324
- .prepare => self.invalid_prepare(),
325
- .prepare_ok => self.invalid_prepare_ok(),
326
- .reply => self.invalid_reply(),
327
- .commit => self.invalid_commit(),
328
- .start_view_change => self.invalid_start_view_change(),
329
- .do_view_change => self.invalid_do_view_change(),
330
- .start_view => self.invalid_start_view(),
331
- .request_start_view => self.invalid_request_start_view(),
332
- .request_headers => self.invalid_request_headers(),
333
- .request_prepare => self.invalid_request_prepare(),
334
- .request_block => null, // TODO
335
- .headers => self.invalid_headers(),
336
- .nack_prepare => self.invalid_nack_prepare(),
337
- .eviction => self.invalid_eviction(),
338
- .block => null, // TODO
339
- };
340
- }
341
-
342
- fn invalid_reserved(self: *const Header) ?[]const u8 {
343
- assert(self.command == .reserved);
344
- if (self.parent != 0) return "parent != 0";
345
- if (self.client != 0) return "client != 0";
346
- if (self.context != 0) return "context != 0";
347
- if (self.request != 0) return "request != 0";
348
- if (self.view != 0) return "view != 0";
349
- if (self.commit != 0) return "commit != 0";
350
- if (self.timestamp != 0) return "timestamp != 0";
351
- if (self.replica != 0) return "replica != 0";
352
- if (self.operation != .reserved) return "operation != .reserved";
353
- return null;
354
- }
355
-
356
- fn invalid_ping(self: *const Header) ?[]const u8 {
357
- assert(self.command == .ping);
358
- if (self.parent != 0) return "parent != 0";
359
- if (self.client != 0) return "client != 0";
360
- if (self.context != 0) return "context != 0";
361
- if (self.request != 0) return "request != 0";
362
- if (self.view != 0) return "view != 0";
363
- if (self.commit != 0) return "commit != 0";
364
- if (self.timestamp != 0) return "timestamp != 0";
365
- if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
366
- if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
367
- if (self.operation != .reserved) return "operation != .reserved";
368
- return null;
369
- }
370
-
371
- fn invalid_pong(self: *const Header) ?[]const u8 {
372
- assert(self.command == .pong);
373
- if (self.parent != 0) return "parent != 0";
374
- if (self.client != 0) return "client != 0";
375
- if (self.context != 0) return "context != 0";
376
- if (self.request != 0) return "request != 0";
377
- if (self.view != 0) return "view != 0";
378
- if (self.commit != 0) return "commit != 0";
379
- if (self.timestamp == 0) return "timestamp == 0";
380
- if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
381
- if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
382
- if (self.operation != .reserved) return "operation != .reserved";
383
- return null;
384
- }
385
-
386
- fn invalid_ping_client(self: *const Header) ?[]const u8 {
387
- assert(self.command == .ping_client);
388
- if (self.parent != 0) return "parent != 0";
389
- if (self.client == 0) return "client == 0";
390
- if (self.context != 0) return "context != 0";
391
- if (self.request != 0) return "request != 0";
392
- if (self.view != 0) return "view != 0";
393
- if (self.op != 0) return "op != 0";
394
- if (self.commit != 0) return "commit != 0";
395
- if (self.timestamp != 0) return "timestamp != 0";
396
- if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
397
- if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
398
- if (self.replica != 0) return "replica != 0";
399
- if (self.operation != .reserved) return "operation != .reserved";
400
- return null;
401
- }
402
-
403
- fn invalid_pong_client(self: *const Header) ?[]const u8 {
404
- assert(self.command == .pong_client);
405
- if (self.parent != 0) return "parent != 0";
406
- if (self.client != 0) return "client != 0";
407
- if (self.context != 0) return "context != 0";
408
- if (self.request != 0) return "request != 0";
409
- if (self.op != 0) return "op != 0";
410
- if (self.commit != 0) return "commit != 0";
411
- if (self.timestamp != 0) return "timestamp != 0";
412
- if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
413
- if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
414
- if (self.operation != .reserved) return "operation != .reserved";
415
- return null;
416
- }
417
-
418
- fn invalid_request(self: *const Header) ?[]const u8 {
419
- assert(self.command == .request);
420
- if (self.client == 0) return "client == 0";
421
- if (self.op != 0) return "op != 0";
422
- if (self.commit != 0) return "commit != 0";
423
- if (self.timestamp != 0) return "timestamp != 0";
424
- if (self.replica != 0) return "replica != 0";
425
- switch (self.operation) {
426
- .reserved => return "operation == .reserved",
427
- .root => return "operation == .root",
428
- .register => {
429
- // The first request a client makes must be to register with the cluster:
430
- if (self.parent != 0) return "register: parent != 0";
431
- if (self.context != 0) return "register: context != 0";
432
- if (self.request != 0) return "register: request != 0";
433
- // The .register operation carries no payload:
434
- if (self.checksum_body != checksum_body_empty) return "register: checksum_body != expected";
435
- if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
436
- },
437
- else => {
438
- // Thereafter, the client must provide the session number in the context:
439
- // These requests should set `parent` to the `checksum` of the previous reply.
440
- if (self.context == 0) return "context == 0";
441
- if (self.request == 0) return "request == 0";
442
- },
443
- }
444
- return null;
445
- }
446
-
447
- fn invalid_prepare(self: *const Header) ?[]const u8 {
448
- assert(self.command == .prepare);
449
- switch (self.operation) {
450
- .reserved => return "operation == .reserved",
451
- .root => {
452
- if (self.parent != 0) return "root: parent != 0";
453
- if (self.client != 0) return "root: client != 0";
454
- if (self.context != 0) return "root: context != 0";
455
- if (self.request != 0) return "root: request != 0";
456
- if (self.view != 0) return "root: view != 0";
457
- if (self.op != 0) return "root: op != 0";
458
- if (self.commit != 0) return "root: commit != 0";
459
- if (self.timestamp != 0) return "root: timestamp != 0";
460
- if (self.checksum_body != checksum_body_empty) return "root: checksum_body != expected";
461
- if (self.size != @sizeOf(Header)) return "root: size != @sizeOf(Header)";
462
- if (self.replica != 0) return "root: replica != 0";
463
- },
464
- else => {
465
- if (self.client == 0) return "client == 0";
466
- if (self.op == 0) return "op == 0";
467
- if (self.op <= self.commit) return "op <= commit";
468
- if (self.timestamp == 0) return "timestamp == 0";
469
- if (self.operation == .register) {
470
- // Client session numbers are replaced by the reference to the previous prepare.
471
- if (self.request != 0) return "request != 0";
472
- } else {
473
- // Client session numbers are replaced by the reference to the previous prepare.
474
- if (self.request == 0) return "request == 0";
475
- }
476
- },
477
- }
478
- return null;
479
- }
480
-
481
- fn invalid_prepare_ok(self: *const Header) ?[]const u8 {
482
- assert(self.command == .prepare_ok);
483
- if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
484
- if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
485
- switch (self.operation) {
486
- .reserved => return "operation == .reserved",
487
- .root => {
488
- if (self.parent != 0) return "root: parent != 0";
489
- if (self.client != 0) return "root: client != 0";
490
- if (self.context != 0) return "root: context != 0";
491
- if (self.request != 0) return "root: request != 0";
492
- if (self.view != 0) return "root: view != 0";
493
- if (self.op != 0) return "root: op != 0";
494
- if (self.commit != 0) return "root: commit != 0";
495
- if (self.timestamp != 0) return "root: timestamp != 0";
496
- if (self.replica != 0) return "root: replica != 0";
497
- },
498
- else => {
499
- if (self.client == 0) return "client == 0";
500
- if (self.op == 0) return "op == 0";
501
- if (self.op <= self.commit) return "op <= commit";
502
- if (self.timestamp == 0) return "timestamp == 0";
503
- if (self.operation == .register) {
504
- if (self.request != 0) return "request != 0";
505
- } else {
506
- if (self.request == 0) return "request == 0";
507
- }
508
- },
509
- }
510
- return null;
511
- }
512
-
513
/// Validates the fields of a `.reply` header.
/// Returns a short description of the first violated invariant, or `null` if none is violated.
fn invalid_reply(self: *const Header) ?[]const u8 {
    assert(self.command == .reply);

    // Initialization within `client.zig` asserts that client `id` is greater than zero:
    if (self.client == 0) return "client == 0";
    if (self.context != 0) return "context != 0";
    if (self.op != self.commit) return "op != commit";
    if (self.timestamp == 0) return "timestamp == 0";

    // For a `.register` reply the commit number is the newly registered session number.
    // The `0` commit number is reserved for cluster initialization, so it is rejected for
    // every operation.
    if (self.commit == 0) return "commit == 0";

    // Only `.register` may (and must) carry a zero request number.
    const registering = self.operation == .register;
    if (registering and self.request != 0) return "request != 0";
    if (!registering and self.request == 0) return "request == 0";
    return null;
}
531
-
532
/// Validates the fields of a `.commit` header.
/// Returns a short description of the first violated invariant, or `null` if none is violated.
fn invalid_commit(self: *const Header) ?[]const u8 {
    assert(self.command == .commit);

    // Fields that must be zero for this command:
    if (self.parent != 0) return "parent != 0";
    if (self.client != 0) return "client != 0";
    if (self.request != 0) return "request != 0";
    if (self.op != 0) return "op != 0";

    // Unlike most header-only commands, a commit must carry a non-zero timestamp.
    if (self.timestamp == 0) return "timestamp == 0";

    // The message must consist of the header alone.
    if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
    if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";

    if (self.operation != .reserved) return "operation != .reserved";
    return null;
}
544
-
545
/// Validates the fields of a `.start_view_change` header.
/// Returns a short description of the first violated invariant, or `null` if none is violated.
fn invalid_start_view_change(self: *const Header) ?[]const u8 {
    assert(self.command == .start_view_change);

    // Fields that must be zero for this command:
    if (self.parent != 0) return "parent != 0";
    if (self.client != 0) return "client != 0";
    if (self.context != 0) return "context != 0";
    if (self.request != 0) return "request != 0";
    if (self.op != 0) return "op != 0";
    if (self.commit != 0) return "commit != 0";
    if (self.timestamp != 0) return "timestamp != 0";

    // The message must consist of the header alone.
    if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
    if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";

    if (self.operation != .reserved) return "operation != .reserved";
    return null;
}
559
-
560
/// Validates the fields of a `.do_view_change` header.
/// Returns a short description of the first violated invariant, or `null` if none is violated.
/// Body size and checksum are not constrained here.
fn invalid_do_view_change(self: *const Header) ?[]const u8 {
    assert(self.command == .do_view_change);

    // Fields that must be zero for this command:
    if (self.parent != 0) return "parent != 0";
    if (self.client != 0) return "client != 0";
    if (self.context != 0) return "context != 0";
    if (self.request != 0) return "request != 0";

    if (self.operation != .reserved) return "operation != .reserved";
    return null;
}
569
-
570
/// Validates the fields of a `.start_view` header.
/// Returns a short description of the first violated invariant, or `null` if none is violated.
/// Body size and checksum are not constrained here.
fn invalid_start_view(self: *const Header) ?[]const u8 {
    assert(self.command == .start_view);

    // Fields that must be zero for this command:
    if (self.parent != 0) return "parent != 0";
    if (self.client != 0) return "client != 0";
    if (self.context != 0) return "context != 0";
    if (self.request != 0) return "request != 0";
    if (self.timestamp != 0) return "timestamp != 0";

    if (self.operation != .reserved) return "operation != .reserved";
    return null;
}
580
-
581
/// Validates the fields of a `.request_start_view` header.
/// Returns a short description of the first violated invariant, or `null` if none is violated.
fn invalid_request_start_view(self: *const Header) ?[]const u8 {
    assert(self.command == .request_start_view);

    // Fields that must be zero for this command:
    if (self.parent != 0) return "parent != 0";
    if (self.client != 0) return "client != 0";
    if (self.context != 0) return "context != 0";
    if (self.request != 0) return "request != 0";
    if (self.op != 0) return "op != 0";
    if (self.commit != 0) return "commit != 0";
    if (self.timestamp != 0) return "timestamp != 0";

    // The message must consist of the header alone.
    if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
    if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";

    if (self.operation != .reserved) return "operation != .reserved";
    return null;
}
595
-
596
/// Validates the fields of a `.request_headers` header.
/// Returns a short description of the first violated invariant, or `null` if none is violated.
fn invalid_request_headers(self: *const Header) ?[]const u8 {
    assert(self.command == .request_headers);

    // Fields that must be zero for this command:
    if (self.parent != 0) return "parent != 0";
    if (self.client != 0) return "client != 0";
    if (self.context != 0) return "context != 0";
    if (self.request != 0) return "request != 0";

    // Per the error message, `commit` and `op` are read here as the bounds (op_min, op_max)
    // of the requested range, so `commit` may not exceed `op`.
    if (self.commit > self.op) return "op_min > op_max";
    if (self.timestamp != 0) return "timestamp != 0";

    // The message must consist of the header alone.
    if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
    if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";

    if (self.operation != .reserved) return "operation != .reserved";
    return null;
}
609
-
610
/// Validates the fields of a `.request_prepare` header.
/// Returns a short description of the first violated invariant, or `null` if none is violated.
fn invalid_request_prepare(self: *const Header) ?[]const u8 {
    assert(self.command == .request_prepare);

    // Fields that must be zero for this command:
    if (self.parent != 0) return "parent != 0";
    if (self.client != 0) return "client != 0";
    if (self.request != 0) return "request != 0";
    if (self.commit != 0) return "commit != 0";

    // The message must consist of the header alone.
    if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
    if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";

    // `timestamp` acts as a 0/1 flag here:
    //   0: `context` must be zero.
    //   1: `context` is a checksum, which may be 0, so it is left unconstrained.
    if (self.timestamp > 1) return "timestamp > 1";
    if (self.timestamp == 0 and self.context != 0) return "context != 0";

    if (self.operation != .reserved) return "operation != .reserved";
    return null;
}
626
-
627
/// Validates the fields of a `.headers` header.
/// Returns a short description of the first violated invariant, or `null` if none is violated.
/// Body size and checksum are not constrained here.
fn invalid_headers(self: *const Header) ?[]const u8 {
    assert(self.command == .headers);

    // Fields that must be zero for this command:
    if (self.parent != 0) return "parent != 0";
    if (self.client != 0) return "client != 0";
    if (self.request != 0) return "request != 0";
    if (self.op != 0) return "op != 0";
    if (self.commit != 0) return "commit != 0";
    if (self.timestamp != 0) return "timestamp != 0";

    if (self.operation != .reserved) return "operation != .reserved";
    return null;
}
638
-
639
/// Validates the fields of a `.nack_prepare` header.
/// Returns a short description of the first violated invariant, or `null` if none is violated.
fn invalid_nack_prepare(self: *const Header) ?[]const u8 {
    assert(self.command == .nack_prepare);

    // Fields that must be zero for this command:
    if (self.parent != 0) return "parent != 0";
    if (self.client != 0) return "client != 0";
    if (self.request != 0) return "request != 0";
    if (self.commit != 0) return "commit != 0";
    if (self.timestamp != 0) return "timestamp != 0";

    // The message must consist of the header alone.
    if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
    if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";

    if (self.operation != .reserved) return "operation != .reserved";
    return null;
}
651
-
652
/// Validates the fields of an `.eviction` header.
/// Returns a short description of the first violated invariant, or `null` if none is violated.
/// Note that `client` is deliberately not constrained by this check.
fn invalid_eviction(self: *const Header) ?[]const u8 {
    assert(self.command == .eviction);

    // Fields that must be zero for this command:
    if (self.parent != 0) return "parent != 0";
    if (self.context != 0) return "context != 0";
    if (self.request != 0) return "request != 0";
    if (self.op != 0) return "op != 0";
    if (self.commit != 0) return "commit != 0";
    if (self.timestamp != 0) return "timestamp != 0";

    // The message must consist of the header alone.
    if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
    if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";

    if (self.operation != .reserved) return "operation != .reserved";
    return null;
}
665
-
666
/// Classifies the immediate sender of this message as a replica or a client, when that can be
/// determined from the header alone.
/// Commands such as .request or .prepare may be forwarded between replicas, in which case
/// Header.replica / Header.client only name the ultimate origin rather than the latest peer,
/// and `.unknown` is returned.
pub fn peer_type(self: *const Header) enum { unknown, replica, client } {
    const command = self.command;

    if (command == .reserved) unreachable;

    // Requests may be forwarded, so in general they cannot identify the peer.
    // However, we do not forward the first .register request sent by a client:
    if (command == .request) {
        if (self.operation == .register) return .client;
        return .unknown;
    }

    // Prepares may be forwarded as well:
    if (command == .prepare) return .unknown;

    // This message identifies the peer as a client:
    if (command == .ping_client) return .client;

    // All other messages identify the peer as a replica:
    return .replica;
}
685
-
686
/// Builds a checksummed `.reserved` header for `cluster` whose `op` field holds `slot`.
/// `slot` must be less than `constants.journal_slot_count`.
pub fn reserved(cluster: u32, slot: u64) Header {
    assert(slot < constants.journal_slot_count);

    var result = Header{
        .cluster = cluster,
        .command = .reserved,
        .op = slot,
    };

    // The body checksum (of an empty body) must be set before the header checksum.
    result.set_checksum_body(&[0]u8{});
    result.set_checksum();

    assert(result.invalid() == null);
    return result;
}
699
-
700
/// Builds the checksummed header-only `.prepare` with operation `.root` for `cluster`.
/// All fields not set explicitly keep their `Header` defaults.
pub fn root_prepare(cluster: u32) Header {
    var result = Header{
        .command = .prepare,
        .operation = .root,
        .cluster = cluster,
        .size = @sizeOf(Header),
    };

    // The body checksum (of an empty body) must be set before the header checksum.
    result.set_checksum_body(&[0]u8{});
    result.set_checksum();

    assert(result.invalid() == null);
    return result;
}
712
- };
713
-
714
/// A tick-driven timeout: counts `ticks` while `ticking`, fires once `ticks` reaches `after`,
/// and supports retries with exponential backoff and jitter derived from `rtt` and `attempts`.
pub const Timeout = struct {
    /// Human-readable label, used only in log messages.
    name: []const u8,
    /// Identifier printed as the log prefix.
    id: u128,
    /// Number of ticks after which the timeout fires.
    after: u64,
    /// Retry counter; wraps on overflow (see `backoff()`).
    attempts: u8 = 0,
    rtt: u64 = constants.rtt_ticks,
    rtt_multiple: u8 = constants.rtt_multiple,
    /// Ticks elapsed since the timeout was last started, stopped, reset or backed off.
    ticks: u64 = 0,
    ticking: bool = false,

    /// Begins ticking from zero and clears the attempts counter.
    pub fn start(self: *Timeout) void {
        self.ticking = true;
        self.ticks = 0;
        self.attempts = 0;
        // TODO Use self.prng to adjust for rtt and attempts.
        log.debug("{}: {s} started", .{ self.id, self.name });
    }

    /// Stops ticking and clears both the tick and attempts counters.
    pub fn stop(self: *Timeout) void {
        self.ticking = false;
        self.ticks = 0;
        self.attempts = 0;
        log.debug("{}: {s} stopped", .{ self.id, self.name });
    }

    /// Restarts the countdown of a timeout that is already ticking.
    pub fn reset(self: *Timeout) void {
        self.attempts = 0;
        self.ticks = 0;
        assert(self.ticking);
        // TODO Use self.prng to adjust for rtt and attempts.
        log.debug("{}: {s} reset", .{ self.id, self.name });
    }

    /// Advances the tick counter by one, but only while the timeout is ticking.
    pub fn tick(self: *Timeout) void {
        if (!self.ticking) return;
        self.ticks += 1;
    }

    /// It's important to check that when fired() is acted on that the timeout is stopped/started,
    /// otherwise further ticks around the event loop may trigger a thundering herd of messages.
    /// Panics if the timeout is found to have ticked past `after` without being reset.
    pub fn fired(self: *const Timeout) bool {
        if (!self.ticking or self.ticks < self.after) return false;

        log.debug("{}: {s} fired", .{ self.id, self.name });
        if (self.ticks > self.after) {
            log.err("{}: {s} is firing every tick", .{ self.id, self.name });
            @panic("timeout was not reset correctly");
        }
        return true;
    }

    /// Increments the attempts counter and resets the timeout with exponential backoff and jitter.
    /// Allows the attempts counter to wrap from time to time.
    /// The overflow period is kept short to surface any related bugs sooner rather than later.
    /// We do not saturate the counter as this would cause round-robin retries to get stuck.
    pub fn backoff(self: *Timeout, random: std.rand.Random) void {
        assert(self.ticking);

        self.ticks = 0;
        self.attempts +%= 1;

        log.debug("{}: {s} backing off", .{ self.id, self.name });
        self.set_after_for_rtt_and_attempts(random);
    }

    /// Sets the value of `after` as a function of `rtt` and `attempts`.
    /// Adds exponential backoff and jitter.
    /// May be called only after a timeout has been stopped or reset, to prevent backward jumps.
    pub fn set_after_for_rtt_and_attempts(self: *Timeout, random: std.rand.Random) void {
        // If `after` is reduced by this function to less than `ticks`, then `fired()` will panic:
        assert(self.ticks == 0);
        assert(self.rtt > 0);

        const base = self.rtt * self.rtt_multiple;
        const jitter = exponential_backoff_with_jitter(
            random,
            constants.backoff_min_ticks,
            constants.backoff_max_ticks,
            self.attempts,
        );
        const after = base + jitter;

        // TODO Clamp `after` to min/max tick bounds for timeout.

        // Logs the transition from the previous `after` to the newly computed one.
        log.debug("{}: {s} after={}..{} (rtt={} min={} max={} attempts={})", .{
            self.id,
            self.name,
            self.after,
            after,
            self.rtt,
            constants.backoff_min_ticks,
            constants.backoff_max_ticks,
            self.attempts,
        });

        self.after = after;
        assert(self.after > 0);
    }

    /// Replaces the round-trip time estimate (in ticks). Both old and new values must be non-zero.
    pub fn set_rtt(self: *Timeout, rtt_ticks: u64) void {
        assert(self.rtt > 0);
        assert(rtt_ticks > 0);

        log.debug("{}: {s} rtt={}..{}", .{ self.id, self.name, self.rtt, rtt_ticks });

        self.rtt = rtt_ticks;
    }
};
826
-
827
/// Calculates exponential backoff with jitter to prevent cascading failure due to thundering herds.
/// Returns a value in `min..max` (inclusive): `min` plus a random jitter drawn from
/// `0..min(max - min, max(1, min) * 2^attempt)`. Requires `max > min`.
pub fn exponential_backoff_with_jitter(
    random: std.rand.Random,
    min: u64,
    max: u64,
    attempt: u64,
) u64 {
    const range = max - min;
    assert(range > 0);

    // Saturate the exponent at the largest u6 value.
    // Do not use `@truncate(u6, attempt)` since that only discards the high bits.
    const exponent_max = std.math.maxInt(u6);
    const exponent = @intCast(u6, std.math.min(exponent_max, attempt));

    // A "1" shifted left gives any power of two (1<<0 = 1, 1<<1 = 2, 1<<2 = 4, ...).
    // The shift is done in u128 so that it can never truncate.
    const power = std.math.shlExact(u128, 1, exponent) catch unreachable;
    assert(power > 0);

    // When min is 0 the product below would always be 0 (this was an actual bug we encountered),
    // so substitute 1 for a zero minimum.
    const min_non_zero = std.math.max(1, min);
    assert(min_non_zero > 0);

    // The capped exponential backoff component, `min(range, min * 2 ^ attempt)`:
    const backoff = std.math.min(range, min_non_zero * power);
    const jitter = random.uintAtMostBiased(u64, backoff);

    const result = @intCast(u64, min + jitter);
    assert(result >= min);
    assert(result <= max);
    return result;
}
861
-
862
// Exercises the backoff at the top of the u64 range, where overflow bugs would show up:
// min/max sit within 1000 of maxInt(u64) and the attempt counts are equally large.
test "exponential_backoff_with_jitter" {
    var prng = std.rand.DefaultPrng.init(0);
    const random = prng.random();

    const attempts = 1000;
    const max: u64 = std.math.maxInt(u64);
    const min = max - attempts;

    var attempt: u64 = min;
    while (attempt < max) : (attempt += 1) {
        const delay = exponential_backoff_with_jitter(random, min, max, attempt);
        try std.testing.expect(delay >= min);
        try std.testing.expect(delay <= max);
    }
}
877
-
878
/// Returns an array containing the remote or local addresses of each of the 2f + 1 replicas.
/// Unlike the VRR paper, we do not sort the array but leave the order explicitly to the user.
/// There are several advantages to this:
/// * The operator may deploy a cluster with proximity in mind since replication follows order.
/// * A replica's IP address may be changed without reconfiguration.
/// This does require that the user specify the same order to all replicas.
///
/// `raw` is a comma-separated list; each element is parsed by `parse_address`.
/// Fails with `error.AddressLimitExceeded` when more than `address_limit` elements are given and
/// with `error.AddressHasTrailingComma` when any element is empty.
/// The caller owns the memory of the returned slice of addresses.
pub fn parse_addresses(allocator: std.mem.Allocator, raw: []const u8, address_limit: usize) ![]std.net.Address {
    // N commas separate N + 1 elements.
    const address_count = std.mem.count(u8, raw, ",") + 1;
    if (address_count > address_limit) return error.AddressLimitExceeded;

    const addresses = try allocator.alloc(std.net.Address, address_count);
    errdefer allocator.free(addresses);

    var parsed: usize = 0;
    var elements = std.mem.split(u8, raw, ",");
    while (elements.next()) |element| {
        assert(parsed < address_limit);
        if (element.len == 0) return error.AddressHasTrailingComma;
        addresses[parsed] = try parse_address(element);
        parsed += 1;
    }
    assert(parsed == address_count);

    return addresses;
}
903
-
904
/// Parses a single IPv4 address of the form `ip:port`, `ip` or `port`.
/// A missing port defaults to `constants.port`; a missing IP defaults to `constants.address`.
/// Errors: `AddressHasMoreThanOneColon`, `PortOverflow`, `PortInvalid`, `AddressInvalid`.
pub fn parse_address(raw: []const u8) !std.net.Address {
    var parts = std.mem.split(u8, raw, ":");
    // The split iterator will always return non-null once, even if the delimiter is not found:
    const ip_text = parts.next().?;

    if (parts.next()) |port_text| {
        // Explicit `ip:port` form.
        if (parts.next() != null) return error.AddressHasMoreThanOneColon;

        const port = std.fmt.parseUnsigned(u16, port_text, 10) catch |err| switch (err) {
            error.Overflow => return error.PortOverflow,
            error.InvalidCharacter => return error.PortInvalid,
        };
        return std.net.Address.parseIp4(ip_text, port) catch {
            return error.AddressInvalid;
        };
    } else {
        // There was no colon in the address so there are now two cases:
        // 1. an IPv4 address with the default port, or
        // 2. a port with the default IPv4 address.

        // Let's try parsing as a port first:
        if (std.fmt.parseUnsigned(u16, raw, 10)) |port| {
            return std.net.Address.parseIp4(constants.address, port) catch unreachable;
        } else |err| switch (err) {
            error.Overflow => return error.PortOverflow,
            error.InvalidCharacter => {
                // Something was not a digit, let's try parsing as an IPv4 instead:
                return std.net.Address.parseIp4(raw, constants.port) catch {
                    return error.AddressInvalid;
                };
            },
        }
    }
}
938
-
939
// Table-driven test for `parse_addresses`:
// - positive vectors are parsed with a limit of 3 and compared field by field;
// - negative vectors are parsed with a limit of 2 and must yield the listed error.
test "parse_addresses" {
    const vectors_positive = &[_]struct {
        raw: []const u8,
        addresses: []const std.net.Address,
    }{
        .{
            // Test the minimum/maximum address/port.
            .raw = "1.2.3.4:567,0.0.0.0:0,255.255.255.255:65535",
            .addresses = &[3]std.net.Address{
                std.net.Address.initIp4([_]u8{ 1, 2, 3, 4 }, 567),
                std.net.Address.initIp4([_]u8{ 0, 0, 0, 0 }, 0),
                std.net.Address.initIp4([_]u8{ 255, 255, 255, 255 }, 65535),
            },
        },
        .{
            // Addresses are not reordered.
            .raw = "3.4.5.6:7777,200.3.4.5:6666,1.2.3.4:5555",
            .addresses = &[3]std.net.Address{
                std.net.Address.initIp4([_]u8{ 3, 4, 5, 6 }, 7777),
                std.net.Address.initIp4([_]u8{ 200, 3, 4, 5 }, 6666),
                std.net.Address.initIp4([_]u8{ 1, 2, 3, 4 }, 5555),
            },
        },
        .{
            // Test default address and port.
            .raw = "1.2.3.4:5,4321,2.3.4.5",
            .addresses = &[3]std.net.Address{
                std.net.Address.initIp4([_]u8{ 1, 2, 3, 4 }, 5),
                try std.net.Address.parseIp4(constants.address, 4321),
                std.net.Address.initIp4([_]u8{ 2, 3, 4, 5 }, constants.port),
            },
        },
        .{
            // Test addresses less than address_limit.
            .raw = "1.2.3.4:5,4321",
            .addresses = &[2]std.net.Address{
                std.net.Address.initIp4([_]u8{ 1, 2, 3, 4 }, 5),
                try std.net.Address.parseIp4(constants.address, 4321),
            },
        },
    };

    const vectors_negative = &[_]struct {
        raw: []const u8,
        err: anyerror![]std.net.Address,
    }{
        .{ .raw = "", .err = error.AddressHasTrailingComma },
        .{ .raw = "1.2.3.4:5,2.3.4.5:6,4.5.6.7:8", .err = error.AddressLimitExceeded },
        .{ .raw = "1.2.3.4:7777,", .err = error.AddressHasTrailingComma },
        .{ .raw = "1.2.3.4:7777,2.3.4.5::8888", .err = error.AddressHasMoreThanOneColon },
        .{ .raw = "1.2.3.4:5,A", .err = error.AddressInvalid }, // default port
        .{ .raw = "1.2.3.4:5,2.a.4.5", .err = error.AddressInvalid }, // default port
        .{ .raw = "1.2.3.4:5,2.a.4.5:6", .err = error.AddressInvalid }, // specified port
        .{ .raw = "1.2.3.4:5,2.3.4.5:", .err = error.PortInvalid },
        .{ .raw = "1.2.3.4:5,2.3.4.5:A", .err = error.PortInvalid },
        .{ .raw = "1.2.3.4:5,65536", .err = error.PortOverflow }, // default address
        .{ .raw = "1.2.3.4:5,2.3.4.5:65536", .err = error.PortOverflow },
    };

    for (vectors_positive) |vector| {
        const addresses_actual = try parse_addresses(std.testing.allocator, vector.raw, 3);
        defer std.testing.allocator.free(addresses_actual);

        // `expectEqual` takes (expected, actual); keeping that order makes a failure report
        // "expected <vector length>, found <parsed length>" rather than the reverse.
        try std.testing.expectEqual(vector.addresses.len, addresses_actual.len);
        for (vector.addresses) |address_expect, i| {
            const address_actual = addresses_actual[i];
            try std.testing.expectEqual(address_expect.in.sa.family, address_actual.in.sa.family);
            try std.testing.expectEqual(address_expect.in.sa.port, address_actual.in.sa.port);
            try std.testing.expectEqual(address_expect.in.sa.addr, address_actual.in.sa.addr);
            try std.testing.expectEqual(address_expect.in.sa.zero, address_actual.in.sa.zero);
        }
    }

    for (vectors_negative) |vector| {
        try std.testing.expectEqual(vector.err, parse_addresses(std.testing.allocator, vector.raw, 2));
    }
}
1016
-
1017
/// Rounds `offset` down to the nearest multiple of `constants.sector_size`.
pub fn sector_floor(offset: u64) u64 {
    const sector_index = math.divFloor(u64, offset, constants.sector_size) catch unreachable;
    const aligned = sector_index * constants.sector_size;
    return aligned;
}
1021
-
1022
/// Rounds `offset` up to the nearest multiple of `constants.sector_size`.
pub fn sector_ceil(offset: u64) u64 {
    const sector_count = math.divCeil(u64, offset, constants.sector_size) catch unreachable;
    const aligned = sector_count * constants.sector_size;
    return aligned;
}
1026
-
1027
/// Returns a 128-bit checksum of `source`: the first `@sizeOf(u128)` bytes of its 32-byte
/// Blake3 digest, reinterpreted as a `u128`.
pub fn checksum(source: []const u8) u128 {
    @setEvalBranchQuota(4000);

    var digest: [32]u8 = undefined;
    std.crypto.hash.Blake3.hash(source, &digest, .{});

    // Keep only the leading bytes of the digest.
    const prefix_len = @sizeOf(u128);
    const prefix: [prefix_len]u8 = digest[0..prefix_len].*;
    return @bitCast(u128, prefix);
}
1034
-
1035
/// Computes the replication and view-change quorum sizes for a cluster of `replica_count`
/// replicas (`replica_count` must be non-zero). The two quorums are asserted to intersect:
/// `view_change + replication > replica_count`.
pub fn quorums(replica_count: u8) struct {
    replication: u8,
    view_change: u8,
} {
    assert(replica_count > 0);
    assert(constants.quorum_replication_max >= 2);

    const two_replicas = replica_count == 2;

    // For replica_count=2, set quorum_replication=2 even though =1 would intersect.
    // This improves durability of small clusters.
    // Otherwise use half the cluster rounded up, capped at quorum_replication_max.
    const quorum_replication = if (two_replicas) 2 else std.math.min(
        constants.quorum_replication_max,
        stdx.div_ceil(replica_count, 2),
    );
    assert(quorum_replication <= replica_count);
    assert(quorum_replication >= 2 or quorum_replication == replica_count);

    // For replica_count=2, set quorum_view_change=2 even though =1 would intersect.
    // This avoids special cases for a single-replica view-change in Replica.
    //
    // The view change quorum may be more expensive to make the replication quorum cheaper.
    // The insight is that the replication phase is by far more common than the view change.
    // This trade-off allows us to optimize for the common case.
    // See the comments in `constants.zig` for further explanation.
    const quorum_view_change =
        if (two_replicas) 2 else replica_count - quorum_replication + 1;
    assert(quorum_view_change <= replica_count);
    assert(quorum_view_change >= 2 or quorum_view_change == replica_count);
    assert(quorum_view_change >= @divFloor(replica_count, 2) + 1);
    assert(quorum_view_change + quorum_replication > replica_count);

    return .{
        .replication = quorum_replication,
        .view_change = quorum_view_change,
    };
}
1069
-
1070
// Checks `quorums()` for replica counts 1 through 8 against precomputed tables.
// The tables assume `quorum_replication_max == 3`; any other configuration skips the test.
test "quorums" {
    if (constants.quorum_replication_max != 3) return error.SkipZigTest;

    // Index i holds the expected quorum for replica_count = i + 1.
    const expect_replication = [_]u8{ 1, 2, 2, 2, 3, 3, 3, 3 };
    const expect_view_change = [_]u8{ 1, 2, 2, 3, 3, 4, 5, 6 };

    for (expect_replication[0..]) |_, i| {
        const actual = quorums(@intCast(u8, i) + 1);
        // `expectEqual` takes (expected, actual); this order keeps failure messages truthful.
        try std.testing.expectEqual(expect_replication[i], actual.replication);
        try std.testing.expectEqual(expect_view_change[i], actual.view_change);
    }
}
1082
-
1083
/// Namespace grouping the header-collection types used for view changes.
pub const Headers = struct {
    /// A bounded array holding at most `constants.view_change_headers_max` headers.
    pub const Array = std.BoundedArray(Header, constants.view_change_headers_max);

    /// The SuperBlock's persisted VSR headers.
    /// One of the following:
    ///
    /// - SV headers (consecutive chain)
    /// - DVC headers (disjoint chain)
    pub const ViewChangeSlice = ViewChangeHeadersSlice;

    pub const ViewChangeArray = ViewChangeHeadersArray;
};
1093
-
1094
/// A verified, read-only view over the headers of a SV or DVC.
const ViewChangeHeadersSlice = struct {
    /// Headers are ordered from high-to-low op.
    slice: []const Header,

    /// Wraps `slice` after asserting its invariants (see `verify`).
    pub fn init(slice: []const Header) ViewChangeHeadersSlice {
        ViewChangeHeadersSlice.verify(slice);

        return .{ .slice = slice };
    }

    /// Asserts that `slice` is non-empty, within `constants.view_change_headers_max`, and that
    /// every header is a checksummed `.prepare` which is strictly older (lower op, lower
    /// timestamp, no newer view) than the header before it. Adjacent ops must be hash-chained,
    /// and only adjacent ops may be.
    pub fn verify(slice: []const Header) void {
        assert(slice.len > 0);
        assert(slice.len <= constants.view_change_headers_max);

        for (slice) |*header, index| {
            assert(header.valid_checksum());
            assert(header.command == .prepare);

            // The first (newest) header has no child to compare against.
            if (index == 0) continue;

            const child_header = &slice[index - 1];
            assert(header.op < child_header.op);
            assert(header.view <= child_header.view);
            assert((header.op + 1 == child_header.op) ==
                (header.checksum == child_header.parent));
            assert(header.timestamp < child_header.timestamp);
        }
    }

    const ViewRange = struct {
        min: u32, // inclusive
        max: u32, // inclusive

        pub fn contains(range: ViewRange, view: u32) bool {
            if (view < range.min) return false;
            return view <= range.max;
        }
    };

    /// Returns the range of possible views (of prepare, not commit) for a message that is part of
    /// the same log_view as these headers.
    ///
    /// - When these are DVC headers for a log_view=V, we must be in view_change status working to
    ///   transition to a view beyond V. So we will never prepare anything else as part of view V.
    /// - When these are SV headers for a log_view=V, we can continue to add to them (by preparing
    ///   more ops), but those ops will always be part of the log_view. If they were prepared during
    ///   a view prior to the log_view, they would already be part of the headers.
    pub fn view_for_op(headers: ViewChangeHeadersSlice, op: u64, log_view: u32) ViewRange {
        const all = headers.slice;
        const newest = &all[0];
        const oldest = &all[all.len - 1];

        // Outside the span covered by the headers:
        if (op < oldest.op) return .{ .min = 0, .max = oldest.view };
        if (op > newest.op) return .{ .min = log_view, .max = log_view };

        // An exact match pins the view to a single value.
        for (all) |*header| {
            if (header.op == op) return .{ .min = header.view, .max = header.view };
        }

        // Otherwise the op falls in a gap between two neighbouring headers, whose views bound it.
        var next_index: usize = 0;
        while (next_index + 1 < all.len) : (next_index += 1) {
            const header_next = &all[next_index];
            const header_prev = &all[next_index + 1];
            if (header_prev.op < op and op < header_next.op) {
                return .{ .min = header_prev.view, .max = header_next.view };
            }
        }
        unreachable;
    }
};
1161
-
1162
// Two headers (op 9 in view 10, op 6 in view 7) probed with log_view=12 at ops above, on,
// between and below them.
test "Headers.ViewChangeSlice.view_for_op" {
    var headers_array = [_]Header{
        std.mem.zeroInit(Header, .{ .op = 9, .view = 10 }),
        std.mem.zeroInit(Header, .{ .op = 6, .view = 7 }),
    };
    const headers = Headers.ViewChangeSlice{ .slice = &headers_array };

    const expectations = [_]struct { op: u64, min: u32, max: u32 }{
        .{ .op = 11, .min = 12, .max = 12 },
        .{ .op = 10, .min = 12, .max = 12 },
        .{ .op = 9, .min = 10, .max = 10 },
        .{ .op = 8, .min = 7, .max = 10 },
        .{ .op = 7, .min = 7, .max = 10 },
        .{ .op = 6, .min = 7, .max = 7 },
        .{ .op = 5, .min = 0, .max = 7 },
        .{ .op = 0, .min = 0, .max = 7 },
    };

    for (expectations) |expected| {
        const range = headers.view_for_op(expected.op, 12);
        try std.testing.expect(std.meta.eql(range, .{ .min = expected.min, .max = expected.max }));
    }
}
1178
-
1179
- /// The headers of a SV or DVC message.
1180
- const ViewChangeHeadersArray = struct {
1181
- array: Headers.Array,
1182
-
1183
- pub fn root(cluster: u32) ViewChangeHeadersArray {
1184
- var array = Headers.Array{ .buffer = undefined };
1185
- array.appendAssumeCapacity(Header.root_prepare(cluster));
1186
- return ViewChangeHeadersArray.init(array);
1187
- }
1188
-
1189
- fn init(array: Headers.Array) ViewChangeHeadersArray {
1190
- Headers.ViewChangeSlice.verify(array.constSlice());
1191
- return .{ .array = array };
1192
- }
1193
-
1194
- /// This function generates either DVC headers or SV headers:
1195
- /// - When `current.log_view < current.view`, generate headers for a DVC message.
1196
- /// - When `current.log_view = current.view`, generate headers for a SV message.
1197
- ///
1198
- /// Additionally, the current log_view/view/primary state informs the sort of "faults"
1199
- /// (gaps/breaks/etc) that we expect to find in the journal headers (`current.headers`).
1200
- /// For example, backups generating a DVC can safely skip over gaps (if the gap is after the DVC
1201
- /// anchor).
1202
- ///
1203
- /// Primaries and backups both generate DVCs and SVs.
1204
- /// - However, SVs are broadcast only by the primary.
1205
- /// - Backups generate a SV for persisting to the superblock.
1206
- /// (For convenience/symmetry, not correctness).
1207
- ///
1208
- /// DVCs and SVs have different invariants they must abide.
1209
- /// - Read DVCQuorum's comments to understand DVC invariants.
1210
- /// - SV headers are much simpler: no gaps or breaks, and all uncommitted ops must be included.
1211
- pub fn build(
1212
- results: *ViewChangeHeadersArray,
1213
- options: struct {
1214
- op_checkpoint: u64,
1215
- /// The last view_change_headers_max headers of the journal, starting with the head op
1216
- /// then descending, skipping over all gaps.
1217
- current: struct {
1218
- headers: *const Headers.Array,
1219
- view: u32,
1220
- log_view: u32,
1221
- log_view_primary: bool,
1222
- },
1223
- // The vsr_headers from the working superblock.
1224
- // The durable headers are useful (complementing `current.headers`) because:
1225
- // - They simplify generation of DVCs in the case where we are recovering from a crash,
1226
- // when we were generating the same DVC prior to the crash.
1227
- // - They enable additional verification of header gaps/breaks based on the
1228
- // gap's/break's position relative to the durable headers.
1229
- durable: struct {
1230
- headers: Headers.ViewChangeSlice,
1231
- view: u32,
1232
- log_view: u32,
1233
- log_view_primary: bool,
1234
- },
1235
- },
1236
- ) void {
1237
- defer Headers.ViewChangeSlice.verify(results.array.constSlice());
1238
-
1239
- const headers = &results.array;
1240
- const current = options.current;
1241
- const durable = options.durable;
1242
-
1243
- assert(headers.len == 0);
1244
- assert(durable.headers.slice.len > 0);
1245
- assert(current.headers.len > 0);
1246
- for (current.headers.constSlice()[1..]) |*header, i| {
1247
- assert(current.headers.get(i).op > header.op);
1248
- }
1249
-
1250
- assert(current.view >= durable.view);
1251
- assert(current.log_view >= durable.log_view);
1252
- assert(current.view >= current.log_view);
1253
- assert(durable.view >= durable.log_view);
1254
-
1255
- const op_head_current = current.headers.get(0).op;
1256
- const op_head_durable = durable.headers.slice[0].op;
1257
-
1258
- // The rules for generating DVCs and SVs differ. We use the current view numbers to
1259
- // determine which is being generated:
1260
- // - When `log_view < view`, generate a DVC.
1261
- // - When `log_view = view`, generate a SV.
1262
- const command_current: enum { start_view, do_view_change } =
1263
- if (current.log_view == current.view) .start_view else .do_view_change;
1264
- // Likewise, the durable view numbers identify whether the durable headers were from a past
1265
- // DVC or SV. The durable headers are only useful if they are from the same view as our
1266
- // current headers, though.
1267
- const command_durable: enum { start_view, do_view_change, outdated } = command: {
1268
- if (durable.log_view == current.log_view) {
1269
- if (durable.log_view == durable.view) {
1270
- break :command .start_view;
1271
- } else {
1272
- break :command .do_view_change;
1273
- }
1274
- } else {
1275
- break :command .outdated;
1276
- }
1277
- };
1278
-
1279
- if (command_durable == .do_view_change and command_current == .do_view_change) {
1280
- assert(op_head_durable == op_head_current);
1281
- // Ensure that if we started a DVC before a crash, that we will resume sending the exact
1282
- // same DVC after recovery. (An alternative implementation would be to load the
1283
- // superblock's DVC headers (including gaps) into the journal during Replica.open(), but
1284
- // that is more complicated to implement correctly).
1285
- for (durable.headers.slice) |*header| headers.appendAssumeCapacity(header.*);
1286
- return;
1287
- }
1288
-
1289
- // What is the relationship between two prepares?
1290
- const Chain = enum {
1291
- // The ops are sequential, and the hash-chain is valid.
1292
- chain_sequence,
1293
- // The ops are sequential, and the hash-chain is invalid.
1294
- chain_break,
1295
- // The ops are non-sequential, and belong to the same view.
1296
- // This gap never hides a break.
1297
- chain_view,
1298
- // The ops are non-sequential, and belong to different views.
1299
- // Depending on the replica state, this gap may hide a break.
1300
- chain_gap,
1301
- };
1302
-
1303
- // The DVC anchor: Within the log suffix following the anchor, we have additional
1304
- // guarantees about the state of the log headers which allow us to tolerate certain
1305
- // gaps (by locally guaranteeing that the gap does not hide a break).
1306
- const op_dvc_anchor = std.math.max(
1307
- options.op_checkpoint,
1308
- // +1: We may have a full pipeline, but not yet have performed any repair.
1309
- // In such a case, we want to send those pipeline_prepare_queue_max headers in
1310
- // the DVC, but not the preceding op (which may belong to a different chain).
1311
- // This satisfies the DVC invariant because the first op in the pipeline is
1312
- // "connected" to the canonical chain (via its "parent" checksum).
1313
- 1 + op_head_current -| constants.pipeline_prepare_queue_max,
1314
- );
1315
-
1316
- // Within the "suffix" we can make additional assumptions about gaps/etc.
1317
- // After the suffix, we just add as many extra (valid) headers as we can fit.
1318
- var suffix_done = false;
1319
-
1320
- for (current.headers.constSlice()) |*header, i| {
1321
- const op = header.op;
1322
- const chain = chain: {
1323
- // Always include the head message.
1324
- if (i == 0) break :chain Chain.chain_sequence;
1325
-
1326
- const child = headers.get(i - 1);
1327
- if (child.op == header.op + 1) {
1328
- break :chain if (child.parent == header.checksum) Chain.chain_sequence else Chain.chain_break;
1329
- } else {
1330
- break :chain if (child.view == header.view) Chain.chain_view else Chain.chain_gap;
1331
- }
1332
- };
1333
-
1334
- if (command_current == .start_view) {
1335
- // Primary: Collect headers for a start_view message.
1336
- // Backup: these headers are stored in the superblock's vsr_headers.
1337
- switch (chain) {
1338
- .chain_sequence => {},
1339
- // Gaps are due to either:
1340
- // - entries before checkpoint, which are not repaired, or
1341
- // - backup missed prepares and has not repaired headers. (Immediately after
1342
- // receiving a start_view this is not a concern, but the view_durable_update()
1343
- // may be delayed if another is in progress).
1344
- .chain_view, .chain_gap => {
1345
- assert(op <= options.op_checkpoint or !current.log_view_primary);
1346
- break;
1347
- },
1348
- // Breaks are due to:
1349
- // - entries before checkpoint, which are not repaired
1350
- .chain_break => {
1351
- assert(op <= options.op_checkpoint);
1352
- break;
1353
- },
1354
- }
1355
- } else if (suffix_done) {
1356
- // Add extra headers to the DVC. These are not required for correctness or
1357
- // availability, but including extra (correct) headers minimizes header repair at
1358
- // the new primary.
1359
- switch (chain) {
1360
- .chain_sequence => {},
1361
- .chain_view => {},
1362
- // Outside of the log suffix, repair may not have been finished, so gaps and
1363
- // breaks are possible. Non-same-view gaps may hide breaks.
1364
- .chain_gap => break,
1365
- .chain_break => break,
1366
- }
1367
- } else if (current.log_view_primary and command_durable == .start_view) {
1368
- switch (chain) {
1369
- .chain_sequence => {},
1370
- // Gaps to the right of the (durable) SV originate from:
1371
- // 1. The primary (durable SV: 1,2,3) prepares several ops (4,5,6).
1372
- // 2. However, the WAL writes are reordered such that some later ops (5,6)
1373
- // finish before an earlier op (4).
1374
- // 3. Crash, recover. Start sending a DVC for the next view. Either:
1375
- // - There is a gap in the WAL at op=4, but this is to the right of the
1376
- // durable SV, so it may be safely skipped.
1377
- // - Same as above, except op=4 was a torn write (or bit rot).
1378
- .chain_view, .chain_gap => assert(op + 1 > op_head_durable),
1379
- // Breaks are impossible to the right of the durable SV — journal recovery uses
1380
- // the durable SV to prune bad headers by their view numbers.
1381
- .chain_break => unreachable,
1382
- }
1383
- suffix_done = op <= op_head_durable;
1384
- } else if (current.log_view_primary and command_durable != .start_view) {
1385
- switch (chain) {
1386
- .chain_sequence => {},
1387
- .chain_view => {},
1388
- // The retiring primary may have gap-breaks or breaks in its suffix iff:
1389
- // - it didn't finish repairs before the second view-change, and
1390
- // - some uncommitted ops were truncated during the first view-change.
1391
- // (Truncation "moves" the suffix backwards).
1392
- .chain_gap => break,
1393
- .chain_break => break,
1394
- }
1395
- suffix_done = op <= op_dvc_anchor;
1396
- } else if (!current.log_view_primary and command_durable == .start_view) {
1397
- switch (chain) {
1398
- .chain_sequence => {},
1399
- // Backups load a full suffix of headers from the view's SV message. If there
1400
- // is now a gap in the backup's suffix, this must be due to missed prepares.
1401
- .chain_view, .chain_gap => assert(op + 1 > op_head_durable),
1402
- // Breaks are impossible to the right of the durable SV — journal recovery uses
1403
- // the durable SV to prune bad headers by their view numbers.
1404
- .chain_break => unreachable,
1405
- }
1406
- suffix_done = op <= op_head_durable;
1407
- } else if (!current.log_view_primary and command_durable != .start_view) {
1408
- switch (chain) {
1409
- .chain_sequence => {},
1410
- .chain_view => {},
1411
- // Backups load a full suffix of headers from the view's SV message.
1412
- // That SV isn't durable, but it is part of the journal, so any gaps to its
1413
- // right must be due to missed prepares.
1414
- .chain_gap => {},
1415
- // Breaks are impossible to the right of the ephemeral SV, since the log was
1416
- // truncated when the SV was installed.
1417
- .chain_break => unreachable,
1418
- }
1419
- suffix_done = op <= op_dvc_anchor;
1420
- } else unreachable;
1421
-
1422
- headers.appendAssumeCapacity(header.*);
1423
- }
1424
- }
1425
- };