tigerbeetle-node 0.8.0 → 0.10.0

Files changed (99)
  1. package/README.md +47 -47
  2. package/dist/benchmark.js +15 -15
  3. package/dist/benchmark.js.map +1 -1
  4. package/dist/index.d.ts +66 -61
  5. package/dist/index.js +66 -61
  6. package/dist/index.js.map +1 -1
  7. package/dist/test.js +1 -1
  8. package/dist/test.js.map +1 -1
  9. package/package.json +14 -16
  10. package/scripts/download_node_headers.sh +3 -1
  11. package/src/index.ts +5 -0
  12. package/src/node.zig +18 -19
  13. package/src/tigerbeetle/scripts/benchmark.bat +47 -46
  14. package/src/tigerbeetle/scripts/benchmark.sh +25 -10
  15. package/src/tigerbeetle/scripts/install.sh +2 -1
  16. package/src/tigerbeetle/scripts/install_zig.bat +109 -109
  17. package/src/tigerbeetle/scripts/install_zig.sh +18 -18
  18. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +12 -3
  19. package/src/tigerbeetle/scripts/vopr.bat +47 -47
  20. package/src/tigerbeetle/scripts/vopr.sh +5 -5
  21. package/src/tigerbeetle/src/benchmark.zig +17 -9
  22. package/src/tigerbeetle/src/benchmark_array_search.zig +317 -0
  23. package/src/tigerbeetle/src/benchmarks/perf.zig +299 -0
  24. package/src/tigerbeetle/src/c/tb_client/context.zig +103 -0
  25. package/src/tigerbeetle/src/c/tb_client/packet.zig +80 -0
  26. package/src/tigerbeetle/src/c/tb_client/signal.zig +288 -0
  27. package/src/tigerbeetle/src/c/tb_client/thread.zig +329 -0
  28. package/src/tigerbeetle/src/c/tb_client.h +201 -0
  29. package/src/tigerbeetle/src/c/tb_client.zig +101 -0
  30. package/src/tigerbeetle/src/c/test.zig +1 -0
  31. package/src/tigerbeetle/src/cli.zig +142 -83
  32. package/src/tigerbeetle/src/config.zig +136 -23
  33. package/src/tigerbeetle/src/demo.zig +12 -8
  34. package/src/tigerbeetle/src/demo_03_create_transfers.zig +3 -3
  35. package/src/tigerbeetle/src/demo_04_create_pending_transfers.zig +10 -10
  36. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +7 -7
  37. package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +3 -3
  38. package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +1 -1
  39. package/src/tigerbeetle/src/ewah.zig +318 -0
  40. package/src/tigerbeetle/src/ewah_benchmark.zig +121 -0
  41. package/src/tigerbeetle/src/eytzinger_benchmark.zig +317 -0
  42. package/src/tigerbeetle/src/fifo.zig +17 -1
  43. package/src/tigerbeetle/src/io/darwin.zig +12 -10
  44. package/src/tigerbeetle/src/io/linux.zig +25 -9
  45. package/src/tigerbeetle/src/io/windows.zig +13 -9
  46. package/src/tigerbeetle/src/iops.zig +101 -0
  47. package/src/tigerbeetle/src/lsm/binary_search.zig +214 -0
  48. package/src/tigerbeetle/src/lsm/bloom_filter.zig +82 -0
  49. package/src/tigerbeetle/src/lsm/compaction.zig +603 -0
  50. package/src/tigerbeetle/src/lsm/composite_key.zig +75 -0
  51. package/src/tigerbeetle/src/lsm/direction.zig +11 -0
  52. package/src/tigerbeetle/src/lsm/eytzinger.zig +587 -0
  53. package/src/tigerbeetle/src/lsm/forest.zig +630 -0
  54. package/src/tigerbeetle/src/lsm/grid.zig +473 -0
  55. package/src/tigerbeetle/src/lsm/groove.zig +939 -0
  56. package/src/tigerbeetle/src/lsm/k_way_merge.zig +452 -0
  57. package/src/tigerbeetle/src/lsm/level_iterator.zig +296 -0
  58. package/src/tigerbeetle/src/lsm/manifest.zig +680 -0
  59. package/src/tigerbeetle/src/lsm/manifest_level.zig +1169 -0
  60. package/src/tigerbeetle/src/lsm/manifest_log.zig +904 -0
  61. package/src/tigerbeetle/src/lsm/node_pool.zig +231 -0
  62. package/src/tigerbeetle/src/lsm/posted_groove.zig +399 -0
  63. package/src/tigerbeetle/src/lsm/segmented_array.zig +998 -0
  64. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +844 -0
  65. package/src/tigerbeetle/src/lsm/table.zig +932 -0
  66. package/src/tigerbeetle/src/lsm/table_immutable.zig +196 -0
  67. package/src/tigerbeetle/src/lsm/table_iterator.zig +295 -0
  68. package/src/tigerbeetle/src/lsm/table_mutable.zig +123 -0
  69. package/src/tigerbeetle/src/lsm/test.zig +429 -0
  70. package/src/tigerbeetle/src/lsm/tree.zig +1085 -0
  71. package/src/tigerbeetle/src/main.zig +121 -95
  72. package/src/tigerbeetle/src/message_bus.zig +49 -48
  73. package/src/tigerbeetle/src/message_pool.zig +19 -3
  74. package/src/tigerbeetle/src/ring_buffer.zig +172 -31
  75. package/src/tigerbeetle/src/simulator.zig +171 -43
  76. package/src/tigerbeetle/src/state_machine.zig +1026 -599
  77. package/src/tigerbeetle/src/storage.zig +46 -16
  78. package/src/tigerbeetle/src/test/cluster.zig +257 -78
  79. package/src/tigerbeetle/src/test/message_bus.zig +15 -24
  80. package/src/tigerbeetle/src/test/network.zig +26 -17
  81. package/src/tigerbeetle/src/test/packet_simulator.zig +14 -1
  82. package/src/tigerbeetle/src/test/state_checker.zig +10 -6
  83. package/src/tigerbeetle/src/test/state_machine.zig +159 -68
  84. package/src/tigerbeetle/src/test/storage.zig +137 -49
  85. package/src/tigerbeetle/src/tigerbeetle.zig +5 -0
  86. package/src/tigerbeetle/src/unit_tests.zig +8 -0
  87. package/src/tigerbeetle/src/util.zig +51 -0
  88. package/src/tigerbeetle/src/vsr/client.zig +21 -7
  89. package/src/tigerbeetle/src/vsr/journal.zig +1429 -514
  90. package/src/tigerbeetle/src/vsr/replica.zig +1855 -550
  91. package/src/tigerbeetle/src/vsr/superblock.zig +1743 -0
  92. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +258 -0
  93. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +644 -0
  94. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +546 -0
  95. package/src/tigerbeetle/src/vsr.zig +134 -52
  96. package/.yarn/releases/yarn-berry.cjs +0 -55
  97. package/.yarnrc.yml +0 -1
  98. package/scripts/postinstall.sh +0 -6
  99. package/yarn.lock +0 -42
@@ -9,9 +9,78 @@ const config = @import("../config.zig");
9
9
  const Message = @import("../message_pool.zig").MessagePool.Message;
10
10
  const vsr = @import("../vsr.zig");
11
11
  const Header = vsr.Header;
12
+ const IOPS = @import("../iops.zig").IOPS;
12
13
 
13
14
  const log = std.log.scoped(.journal);
14
15
 
16
+ /// There are two contiguous circular buffers on disk in the journal storage zone (`vsr.Zone.wal`).
17
+ ///
18
+ /// In both rings, the `op` for each reserved header is set to the slot index.
19
+ /// This helps WAL recovery detect misdirected reads/writes.
20
+ const Ring = enum {
21
+ /// A circular buffer of prepare message headers.
22
+ headers,
23
+ /// A circular buffer of prepare messages. Each slot is padded to `config.message_size_max`.
24
+ prepares,
25
+ };
26
+
27
+ const headers_per_sector = @divExact(config.sector_size, @sizeOf(Header));
28
+ comptime {
29
+ assert(headers_per_sector > 0);
30
+ }
31
+
32
+ /// A slot is `op % config.journal_slot_count`.
33
+ const Slot = struct { index: u64 };
34
+
35
+ /// An inclusive, non-empty range of slots.
36
+ const SlotRange = struct {
37
+ head: Slot,
38
+ tail: Slot,
39
+
40
+ /// Returns whether this range (inclusive) includes the specified slot.
41
+ ///
42
+ /// Cases (`·`=included, ` `=excluded):
43
+ ///
44
+ /// * `head < tail` → ` head··tail `
45
+ /// * `head > tail` → `··tail head··` (The range wraps around).
46
+ /// * `head = tail` → panic (Caller must handle this case separately).
47
+ fn contains(self: *const SlotRange, slot: Slot) bool {
48
+ // To avoid confusion, the empty range must be checked separately by the caller.
49
+ assert(self.head.index != self.tail.index);
50
+
51
+ if (self.head.index < self.tail.index) {
52
+ return self.head.index <= slot.index and slot.index <= self.tail.index;
53
+ }
54
+ if (self.head.index > self.tail.index) {
55
+ return slot.index <= self.tail.index or self.head.index <= slot.index;
56
+ }
57
+ unreachable;
58
+ }
59
+ };
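As an illustration of the wrap-around cases documented above, here is a minimal standalone sketch (illustrative only, using local copies of `Slot` and `SlotRange` rather than the journal's types) showing how `contains` behaves for a flat and a wrapped range:

const std = @import("std");
const assert = std.debug.assert;

// Local, illustrative copies of the types above.
const Slot = struct { index: u64 };
const SlotRange = struct {
    head: Slot,
    tail: Slot,

    fn contains(self: *const SlotRange, slot: Slot) bool {
        // The empty range (head == tail) must be handled by the caller.
        assert(self.head.index != self.tail.index);
        if (self.head.index < self.tail.index) {
            return self.head.index <= slot.index and slot.index <= self.tail.index;
        } else {
            return slot.index <= self.tail.index or self.head.index <= slot.index;
        }
    }
};

test "SlotRange.contains: flat and wrapped ranges" {
    // head < tail: a plain inclusive range.
    const flat = SlotRange{ .head = .{ .index = 2 }, .tail = .{ .index = 5 } };
    try std.testing.expect(flat.contains(.{ .index = 2 }));
    try std.testing.expect(flat.contains(.{ .index = 5 }));
    try std.testing.expect(!flat.contains(.{ .index = 6 }));

    // head > tail: the range wraps around the end of the ring.
    const wrapped = SlotRange{ .head = .{ .index = 6 }, .tail = .{ .index = 1 } };
    try std.testing.expect(wrapped.contains(.{ .index = 7 }));
    try std.testing.expect(wrapped.contains(.{ .index = 0 }));
    try std.testing.expect(!wrapped.contains(.{ .index = 3 }));
}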
60
+
61
+ const slot_count = config.journal_slot_count;
62
+ const headers_size = config.journal_size_headers;
63
+ const prepares_size = config.journal_size_prepares;
64
+
65
+ pub const write_ahead_log_zone_size = headers_size + prepares_size;
66
+
67
+ comptime {
68
+ assert(slot_count > 0);
69
+ assert(slot_count % 2 == 0);
70
+ assert(slot_count % headers_per_sector == 0);
71
+ assert(slot_count >= headers_per_sector);
72
+ // The length of the prepare pipeline is the upper bound on how many ops can be
73
+ // reordered during a view change. See `recover_prepares_callback()` for more detail.
74
+ assert(slot_count > config.pipeline_max);
75
+
76
+ assert(headers_size > 0);
77
+ assert(headers_size % config.sector_size == 0);
78
+
79
+ assert(prepares_size > 0);
80
+ assert(prepares_size % config.sector_size == 0);
81
+ assert(prepares_size % config.message_size_max == 0);
82
+ }
83
+
15
84
  pub fn Journal(comptime Replica: type, comptime Storage: type) type {
16
85
  return struct {
17
86
  const Self = @This();
@@ -28,7 +97,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
28
97
  };
29
98
 
30
99
  pub const Write = struct {
31
- pub const Trigger = enum { append, repair };
100
+ pub const Trigger = enum { append, repair, pipeline };
32
101
 
33
102
  self: *Self,
34
103
  callback: fn (self: *Replica, wrote: ?*Message, trigger: Trigger) void,
@@ -39,6 +108,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
39
108
  /// True if this Write has acquired a lock on a sector of headers.
40
109
  /// This also means that the Write is currently writing sectors or queuing to do so.
41
110
  header_sector_locked: bool = false,
111
+
42
112
  /// Linked list of Writes waiting to acquire the same header sector as this Write.
43
113
  header_sector_next: ?*Write = null,
44
114
 
@@ -46,18 +116,17 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
46
116
  range: Range,
47
117
 
48
118
  const Sector = *align(config.sector_size) [config.sector_size]u8;
119
+
49
120
  fn header_sector(write: *Self.Write, journal: *Self) Sector {
50
121
  assert(journal.writes.items.len == journal.headers_iops.len);
51
- const i = @divExact(@ptrToInt(write) - @ptrToInt(&journal.writes.items), @sizeOf(Self.Write));
52
- // TODO: the compiler should probably be smart enough to avoid needing this align cast
53
- // as the type of `headers_iops` ensures that each buffer is properly aligned.
122
+ const i = @divExact(
123
+ @ptrToInt(write) - @ptrToInt(&journal.writes.items),
124
+ @sizeOf(Self.Write),
125
+ );
126
+ // TODO The compiler should not need this align cast as the type of `headers_iops`
127
+ // ensures that each buffer is properly aligned.
54
128
  return @alignCast(config.sector_size, &journal.headers_iops[i]);
55
129
  }
56
-
57
- fn header_sector_same(write: *Self.Write, other: *Self.Write) bool {
58
- return write_prepare_header_offset(write.message) ==
59
- write_prepare_header_offset(other.message);
60
- }
61
130
  };
62
131
 
63
132
  /// State that needs to be persisted while waiting for an overlapping
@@ -85,27 +154,37 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
85
154
 
86
155
  storage: *Storage,
87
156
  replica: u8,
88
- size: u64,
89
- size_headers: u64,
90
- size_circular_buffer: u64,
91
157
 
158
+ /// A header is located at `slot == header.op % headers.len`.
159
+ ///
160
+ /// Each slot's `header.command` is either `prepare` or `reserved`.
161
+ /// When the slot's header is `reserved`, the header's `op` is the slot index.
162
+ ///
163
+ /// During recovery, store the (unvalidated) headers of the prepare ring.
164
+ // TODO Use 2 separate header lists: "staging" and "working".
165
+ // When participating in a view change, each replica should only send the headers from its
166
+ // working set that it knows it prepared.
167
+ // This also addresses the problem of redundant headers being written prematurely due to
168
+ // batching (after the first log cycle — for the first log cycle we write an invalid message).
92
169
  headers: []align(config.sector_size) Header,
93
- /// We copy-on-write to these buffers when writing, as in-memory headers may change concurrently.
170
+
171
+ /// Store the redundant headers (unvalidated) during recovery.
172
+ // TODO When "headers" is split into "staging" and "working", reuse one of those instead.
173
+ headers_redundant: []align(config.sector_size) Header,
174
+
175
+ /// We copy-on-write to these buffers, as the in-memory headers may change while writing.
94
176
  /// The buffers belong to the IOP at the corresponding index in IOPS.
95
177
  headers_iops: *align(config.sector_size) [config.io_depth_write][config.sector_size]u8,
96
- /// Apart from the header written with the entry, we also store two redundant copies of each
97
- /// header at different locations on disk, and we alternate between these for each append.
98
- /// This tracks which version (0 or 1) should be written to next:
99
- headers_version: u1 = 0,
100
178
 
101
179
  /// Statically allocated read IO operation context data.
102
180
  reads: IOPS(Read, config.io_depth_read) = .{},
181
+
103
182
  /// Statically allocated write IO operation context data.
104
183
  writes: IOPS(Write, config.io_depth_write) = .{},
105
184
 
106
185
  /// Whether an entry is in memory only and needs to be written or is being written:
107
186
  /// We use this in the same sense as a dirty bit in the kernel page cache.
108
- /// A dirty bit means that we have not yet prepared the entry, or need to repair a faulty entry.
187
+ /// A dirty bit means that we have not prepared the entry, or need to repair a faulty entry.
109
188
  dirty: BitSet,
110
189
 
111
190
  /// Whether an entry was written to disk and this write was subsequently lost due to:
@@ -113,43 +192,66 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
113
192
  /// * a misdirected write (or a misdirected read, we do not distinguish), or else
114
193
  /// * a latent sector error, where the sector can no longer be read.
115
194
  /// A faulty bit means that we prepared and then lost the entry.
116
- /// A faulty bit requires the dirty bit to also be set so that functions need not check both.
195
+ /// A faulty bit requires the dirty bit to also be set so that callers need not check both.
117
196
  /// A faulty bit is used then only to qualify the severity of the dirty bit.
118
197
  faulty: BitSet,
119
198
 
120
- recovered: bool = true,
199
+ /// The checksum of the prepare in the corresponding slot.
200
+ /// This is used to respond to `request_prepare` messages even when the slot is faulty.
201
+ /// For example, the slot may be faulty because the redundant header is faulty.
202
+ ///
203
+ /// The checksum will be missing (`prepare_checksums[i]=0`, `prepare_inhabited[i]=false`) when:
204
+ /// * the message in the slot is reserved,
205
+ /// * the message in the slot is being written, or when
206
+ /// * the message in the slot is corrupt.
207
+ // TODO: `prepare_checksums` and `prepare_inhabited` should be combined into a []?u128,
208
+ // but that type is currently unusable (as of Zig 0.9.1).
209
+ // See: https://github.com/ziglang/zig/issues/9871
210
+ prepare_checksums: []u128,
211
+ /// When prepare_inhabited[i]==false, prepare_checksums[i]==0.
212
+ /// (`undefined` would make more sense than `0`, but `0` allows it to be asserted).
213
+ prepare_inhabited: []bool,
214
+
215
+ recovered: bool = false,
121
216
  recovering: bool = false,
122
217
 
123
- pub fn init(
124
- allocator: Allocator,
125
- storage: *Storage,
126
- replica: u8,
127
- size: u64,
128
- headers_count: u32,
129
- init_prepare: *Header,
130
- ) !Self {
131
- if (@mod(size, config.sector_size) != 0) return error.SizeMustBeAMultipleOfSectorSize;
132
- if (!math.isPowerOfTwo(headers_count)) return error.HeadersCountMustBeAPowerOfTwo;
133
- assert(storage.size == size);
134
-
135
- const headers_per_sector = @divExact(config.sector_size, @sizeOf(Header));
136
- assert(headers_per_sector > 0);
137
- assert(headers_count >= headers_per_sector);
218
+ pub fn init(allocator: Allocator, storage: *Storage, replica: u8) !Self {
219
+ // TODO Fix this assertion:
220
+ // assert(write_ahead_log_zone_size <= storage.size);
138
221
 
139
222
  var headers = try allocator.allocAdvanced(
140
223
  Header,
141
224
  config.sector_size,
142
- headers_count,
225
+ slot_count,
143
226
  .exact,
144
227
  );
145
228
  errdefer allocator.free(headers);
146
- std.mem.set(Header, headers, Header.reserved());
229
+ for (headers) |*header| header.* = undefined;
147
230
 
148
- var dirty = try BitSet.init(allocator, headers.len);
231
+ var headers_redundant = try allocator.allocAdvanced(
232
+ Header,
233
+ config.sector_size,
234
+ slot_count,
235
+ .exact,
236
+ );
237
+ errdefer allocator.free(headers_redundant);
238
+ for (headers_redundant) |*header| header.* = undefined;
239
+
240
+ var dirty = try BitSet.init(allocator, slot_count);
149
241
  errdefer dirty.deinit(allocator);
242
+ for (headers) |_, index| dirty.set(Slot{ .index = index });
150
243
 
151
- var faulty = try BitSet.init(allocator, headers.len);
244
+ var faulty = try BitSet.init(allocator, slot_count);
152
245
  errdefer faulty.deinit(allocator);
246
+ for (headers) |_, index| faulty.set(Slot{ .index = index });
247
+
248
+ var prepare_checksums = try allocator.alloc(u128, slot_count);
249
+ errdefer allocator.free(prepare_checksums);
250
+ std.mem.set(u128, prepare_checksums, 0);
251
+
252
+ var prepare_inhabited = try allocator.alloc(bool, slot_count);
253
+ errdefer allocator.free(prepare_inhabited);
254
+ std.mem.set(bool, prepare_inhabited, false);
153
255
 
154
256
  const headers_iops = (try allocator.allocAdvanced(
155
257
  [config.sector_size]u8,
@@ -159,45 +261,36 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
159
261
  ))[0..config.io_depth_write];
160
262
  errdefer allocator.free(headers_iops);
161
263
 
162
- const header_copies = 2;
163
- const size_headers = headers.len * @sizeOf(Header);
164
- const size_headers_copies = size_headers * header_copies;
165
- if (size_headers_copies >= size) return error.SizeTooSmallForHeadersCount;
166
-
167
- const size_circular_buffer = size - size_headers_copies;
168
- if (size_circular_buffer < 64 * 1024 * 1024) return error.SizeTooSmallForCircularBuffer;
169
-
170
- log.debug("{}: size={} headers_len={} headers={} circular_buffer={}", .{
264
+ log.debug("{}: slot_count={} size={} headers_size={} prepares_size={}", .{
171
265
  replica,
172
- std.fmt.fmtIntSizeBin(size),
173
- headers.len,
174
- std.fmt.fmtIntSizeBin(size_headers),
175
- std.fmt.fmtIntSizeBin(size_circular_buffer),
266
+ slot_count,
267
+ std.fmt.fmtIntSizeBin(write_ahead_log_zone_size),
268
+ std.fmt.fmtIntSizeBin(headers_size),
269
+ std.fmt.fmtIntSizeBin(prepares_size),
176
270
  });
177
271
 
178
272
  var self = Self{
179
273
  .storage = storage,
180
274
  .replica = replica,
181
- .size = size,
182
- .size_headers = size_headers,
183
- .size_circular_buffer = size_circular_buffer,
184
275
  .headers = headers,
276
+ .headers_redundant = headers_redundant,
185
277
  .dirty = dirty,
186
278
  .faulty = faulty,
279
+ .prepare_checksums = prepare_checksums,
280
+ .prepare_inhabited = prepare_inhabited,
187
281
  .headers_iops = headers_iops,
188
282
  };
189
283
 
190
- assert(@mod(self.size_circular_buffer, config.sector_size) == 0);
191
284
  assert(@mod(@ptrToInt(&self.headers[0]), config.sector_size) == 0);
192
- assert(self.dirty.bits.len == self.headers.len);
193
- assert(self.faulty.bits.len == self.headers.len);
285
+ assert(self.dirty.bits.bit_length == slot_count);
286
+ assert(self.faulty.bits.bit_length == slot_count);
287
+ assert(self.dirty.count == slot_count);
288
+ assert(self.faulty.count == slot_count);
289
+ assert(self.prepare_checksums.len == slot_count);
290
+ assert(self.prepare_inhabited.len == slot_count);
194
291
 
195
- // Op 0 is always the cluster initialization op.
196
- // TODO This will change when we implement synchronized incremental snapshots.
197
- assert(init_prepare.valid_checksum());
198
- assert(init_prepare.invalid() == null);
199
- self.headers[0] = init_prepare.*;
200
- self.assert_headers_reserved_from(init_prepare.op + 1);
292
+ for (self.headers) |*h| assert(!h.valid_checksum());
293
+ for (self.headers_redundant) |*h| assert(!h.valid_checksum());
201
294
 
202
295
  return self;
203
296
  }
@@ -208,7 +301,10 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
208
301
  self.dirty.deinit(allocator);
209
302
  self.faulty.deinit(allocator);
210
303
  allocator.free(self.headers);
304
+ allocator.free(self.headers_redundant);
211
305
  allocator.free(self.headers_iops);
306
+ allocator.free(self.prepare_checksums);
307
+ allocator.free(self.prepare_inhabited);
212
308
 
213
309
  {
214
310
  var it = self.reads.iterate();
@@ -220,74 +316,168 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
220
316
  }
221
317
  }
222
318
 
223
- /// Asserts that headers are .reserved (zeroed) from `op_min` (inclusive).
224
- pub fn assert_headers_reserved_from(self: *Self, op_min: u64) void {
225
- // TODO Snapshots
226
- for (self.headers[op_min..]) |header| assert(header.command == .reserved);
319
+ /// Returns whether this is a fresh database WAL; no prepares (except the root) have ever
320
+ /// been written. This determines whether a replica can transition immediately to normal
321
+ /// status, or if it needs to run recovery protocol.
322
+ ///
323
+ /// Called by the replica immediately after WAL recovery completes, but before the replica
324
+ /// issues any I/O from handling messages.
325
+ pub fn is_empty(self: *const Self) bool {
326
+ assert(!self.recovering);
327
+ assert(self.recovered);
328
+ assert(self.writes.executing() == 0);
329
+
330
+ if (!self.headers[0].valid_checksum()) return false;
331
+ if (self.headers[0].operation != .root) return false;
332
+
333
+ const replica = @fieldParentPtr(Replica, "journal", self);
334
+ assert(self.headers[0].checksum == Header.root_prepare(replica.cluster).checksum);
335
+ assert(self.headers[0].checksum == self.prepare_checksums[0]);
336
+ assert(self.prepare_inhabited[0]);
337
+
338
+ // If any message is faulty, we must fall back to VSR recovery protocol (i.e. treat
339
+ // this as a non-empty WAL) since that message may have been a prepare.
340
+ if (self.faulty.count > 0) return false;
341
+
342
+ for (self.headers[1..]) |*header| {
343
+ if (header.command == .prepare) return false;
344
+ }
345
+
346
+ for (self.prepare_inhabited[1..]) |inhabited| {
347
+ if (inhabited) return false;
348
+ }
349
+
350
+ return true;
351
+ }
352
+
353
+ pub fn slot_for_op(_: *const Self, op: u64) Slot {
354
+ return Slot{ .index = op % slot_count };
355
+ }
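Since ops index into the ring modulo `slot_count`, distinct ops can map to the same slot. A tiny sketch, assuming a hypothetical slot count of 8 (the real value comes from `config.journal_slot_count`):

const std = @import("std");

// Hypothetical slot count for illustration; the journal uses config.journal_slot_count.
const slot_count: u64 = 8;

fn slot_index_for_op(op: u64) u64 {
    return op % slot_count;
}

test "ops wrap onto the same slot" {
    // op 3 and op 11 (= 3 + slot_count) share slot 3, which is why header_for_op()
    // may find an entry whose op is older or newer than the one requested.
    try std.testing.expectEqual(@as(u64, 3), slot_index_for_op(3));
    try std.testing.expectEqual(@as(u64, 3), slot_index_for_op(11));
    try std.testing.expectEqual(@as(u64, 0), slot_index_for_op(slot_count));
}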
356
+
357
+ pub fn slot_with_op(self: *const Self, op: u64) ?Slot {
358
+ if (self.header_with_op(op)) |_| {
359
+ return self.slot_for_op(op);
360
+ } else {
361
+ return null;
362
+ }
363
+ }
364
+
365
+ pub fn slot_with_op_and_checksum(self: *const Self, op: u64, checksum: u128) ?Slot {
366
+ if (self.header_with_op_and_checksum(op, checksum)) |_| {
367
+ return self.slot_for_op(op);
368
+ } else {
369
+ return null;
370
+ }
371
+ }
372
+
373
+ pub fn slot_for_header(self: *const Self, header: *const Header) Slot {
374
+ assert(header.command == .prepare);
375
+ return self.slot_for_op(header.op);
376
+ }
377
+
378
+ pub fn slot_with_header(self: *const Self, header: *const Header) ?Slot {
379
+ assert(header.command == .prepare);
380
+ return self.slot_with_op(header.op);
227
381
  }
228
382
 
229
383
  /// Returns any existing entry at the location indicated by header.op.
230
384
  /// This existing entry may have an older or newer op number.
231
- pub fn entry(self: *Self, header: *const Header) ?*const Header {
385
+ pub fn header_for_entry(self: *const Self, header: *const Header) ?*const Header {
232
386
  assert(header.command == .prepare);
233
- return self.entry_for_op(header.op);
387
+ return self.header_for_op(header.op);
234
388
  }
235
389
 
236
- /// We use the op number directly to index into the headers array and locate ops without a scan.
237
- /// Op numbers cycle through the headers array and do not wrap when offsets wrap. The reason for
238
- /// this is to prevent variable offsets from impacting the location of an op. Otherwise, the
239
- /// same op number but for different views could exist at multiple locations in the journal.
240
- pub fn entry_for_op(self: *Self, op: u64) ?*const Header {
390
+ /// We use `op` directly to index into the headers array and locate ops without a scan.
391
+ pub fn header_for_op(self: *const Self, op: u64) ?*const Header {
241
392
  // TODO Snapshots
242
- const existing = &self.headers[op];
243
- if (existing.command == .reserved) return null;
244
- assert(existing.command == .prepare);
245
- return existing;
393
+ const slot = self.slot_for_op(op);
394
+ const existing = &self.headers[slot.index];
395
+ switch (existing.command) {
396
+ .prepare => {
397
+ assert(self.slot_for_op(existing.op).index == slot.index);
398
+ return existing;
399
+ },
400
+ .reserved => {
401
+ assert(existing.op == slot.index);
402
+ return null;
403
+ },
404
+ else => unreachable,
405
+ }
246
406
  }
247
407
 
248
408
  /// Returns the entry at `@mod(op)` location, but only if `entry.op == op`, else `null`.
249
409
  /// Be careful of using this without considering that there may still be an existing op.
250
- pub fn entry_for_op_exact(self: *Self, op: u64) ?*const Header {
251
- if (self.entry_for_op(op)) |existing| {
410
+ pub fn header_with_op(self: *const Self, op: u64) ?*const Header {
411
+ if (self.header_for_op(op)) |existing| {
252
412
  if (existing.op == op) return existing;
253
413
  }
254
414
  return null;
255
415
  }
256
416
 
257
- /// As per `entry_for_op_exact()`, but only if there is an optional checksum match.
258
- pub fn entry_for_op_exact_with_checksum(
259
- self: *Self,
417
+ /// As per `header_with_op()`, but only if there is an optional checksum match.
418
+ pub fn header_with_op_and_checksum(
419
+ self: *const Self,
260
420
  op: u64,
261
421
  checksum: ?u128,
262
422
  ) ?*const Header {
263
- if (self.entry_for_op_exact(op)) |existing| {
423
+ if (self.header_with_op(op)) |existing| {
264
424
  assert(existing.op == op);
265
425
  if (checksum == null or existing.checksum == checksum.?) return existing;
266
426
  }
267
427
  return null;
268
428
  }
269
429
 
270
- pub fn previous_entry(self: *Self, header: *const Header) ?*const Header {
271
- // TODO Snapshots
272
- if (header.op == 0) return null;
273
- return self.entry_for_op(header.op - 1);
430
+ // TODO How should we handle the case where the current header argument is the same as
431
+ // op_checkpoint?
432
+ pub fn previous_entry(self: *const Self, header: *const Header) ?*const Header {
433
+ if (header.op == 0) {
434
+ return null;
435
+ } else {
436
+ return self.header_for_op(header.op - 1);
437
+ }
274
438
  }
275
439
 
276
- pub fn next_entry(self: *Self, header: *const Header) ?*const Header {
277
- // TODO Snapshots
278
- if (header.op + 1 == self.headers.len) return null;
279
- return self.entry_for_op(header.op + 1);
440
+ pub fn next_entry(self: *const Self, header: *const Header) ?*const Header {
441
+ return self.header_for_op(header.op + 1);
280
442
  }
281
443
 
282
- pub fn next_offset(header: *const Header) u64 {
283
- // TODO Snapshots
284
- assert(header.command == .prepare);
285
- return header.offset + vsr.sector_ceil(header.size);
444
+ /// Returns the highest op number prepared, in any slot without reference to the checkpoint.
445
+ pub fn op_maximum(self: *const Self) u64 {
446
+ assert(self.recovered);
447
+
448
+ var op: u64 = 0;
449
+ for (self.headers) |*header| {
450
+ if (header.command == .prepare) {
451
+ if (header.op > op) op = header.op;
452
+ } else {
453
+ assert(header.command == .reserved);
454
+ }
455
+ }
456
+ return op;
457
+ }
458
+
459
+ /// Returns the highest op number prepared, as per `header_ok()` in the untrusted headers.
460
+ fn op_maximum_headers_untrusted(cluster: u32, headers_untrusted: []const Header) u64 {
461
+ var op: u64 = 0;
462
+ for (headers_untrusted) |*header_untrusted, slot_index| {
463
+ const slot = Slot{ .index = slot_index };
464
+ if (header_ok(cluster, slot, header_untrusted)) |header| {
465
+ if (header.command == .prepare) {
466
+ if (header.op > op) op = header.op;
467
+ } else {
468
+ assert(header.command == .reserved);
469
+ }
470
+ }
471
+ }
472
+ return op;
286
473
  }
287
474
 
288
- pub fn has(self: *Self, header: *const Header) bool {
475
+ pub fn has(self: *const Self, header: *const Header) bool {
476
+ assert(self.recovered);
477
+ assert(header.command == .prepare);
289
478
  // TODO Snapshots
290
- const existing = &self.headers[header.op];
479
+ const slot = self.slot_for_op(header.op);
480
+ const existing = &self.headers[slot.index];
291
481
  if (existing.command == .reserved) {
292
482
  return false;
293
483
  } else {
@@ -301,40 +491,49 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
301
491
  }
302
492
  }
303
493
 
304
- pub fn has_clean(self: *Self, header: *const Header) bool {
494
+ pub fn has_clean(self: *const Self, header: *const Header) bool {
305
495
  // TODO Snapshots
306
- return self.has(header) and !self.dirty.bit(header.op);
496
+ if (self.slot_with_op_and_checksum(header.op, header.checksum)) |slot| {
497
+ if (!self.dirty.bit(slot)) {
498
+ assert(self.prepare_inhabited[slot.index]);
499
+ assert(self.prepare_checksums[slot.index] == header.checksum);
500
+ return true;
501
+ }
502
+ }
503
+ return false;
307
504
  }
308
505
 
309
- pub fn has_dirty(self: *Self, header: *const Header) bool {
506
+ pub fn has_dirty(self: *const Self, header: *const Header) bool {
310
507
  // TODO Snapshots
311
- return self.has(header) and self.dirty.bit(header.op);
508
+ return self.has(header) and self.dirty.bit(self.slot_with_header(header).?);
312
509
  }
313
510
 
314
- /// Copies latest headers between `op_min` and `op_max` (both inclusive) as will fit in `dest`.
315
- /// Reverses the order when copying so that latest headers are copied first, which also protects
511
+ /// Copies latest headers between `op_min` and `op_max` (both inclusive) as fit in `dest`.
512
+ /// Reverses the order when copying so that latest headers are copied first, which protects
316
513
  /// against the callsite slicing the buffer the wrong way and incorrectly.
317
514
  /// Skips .reserved headers (gaps between headers).
318
515
  /// Zeroes the `dest` buffer in case the copy would underflow and leave a buffer bleed.
319
516
  /// Returns the number of headers actually copied.
320
517
  pub fn copy_latest_headers_between(
321
- self: *Self,
518
+ self: *const Self,
322
519
  op_min: u64,
323
520
  op_max: u64,
324
521
  dest: []Header,
325
522
  ) usize {
523
+ assert(self.recovered);
326
524
  assert(op_min <= op_max);
327
525
  assert(dest.len > 0);
328
526
 
329
527
  var copied: usize = 0;
330
- std.mem.set(Header, dest, Header.reserved());
528
+ // Poison all slots; only slots less than `copied` are used.
529
+ std.mem.set(Header, dest, undefined);
331
530
 
332
531
  // Start at op_max + 1 and do the decrement upfront to avoid overflow when op_min == 0:
333
532
  var op = op_max + 1;
334
533
  while (op > op_min) {
335
534
  op -= 1;
336
535
 
337
- if (self.entry_for_op_exact(op)) |header| {
536
+ if (self.header_with_op(op)) |header| {
338
537
  dest[copied] = header.*;
339
538
  assert(dest[copied].invalid() == null);
340
539
  copied += 1;
@@ -342,12 +541,16 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
342
541
  }
343
542
  }
344
543
 
345
- log.debug("copy_latest_headers_between: op_min={} op_max={} dest.len={} copied={}", .{
346
- op_min,
347
- op_max,
348
- dest.len,
349
- copied,
350
- });
544
+ log.debug(
545
+ "{}: copy_latest_headers_between: op_min={} op_max={} dest.len={} copied={}",
546
+ .{
547
+ self.replica,
548
+ op_min,
549
+ op_max,
550
+ dest.len,
551
+ copied,
552
+ },
553
+ );
351
554
 
352
555
  return copied;
353
556
  }
@@ -360,7 +563,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
360
563
  /// We expect that `op_min` and `op_max` (`replica.commit_min` and `replica.op`) must exist.
361
564
  /// A range will never include `op_min` because this is already committed.
362
565
  /// A range will never include `op_max` because this must be up to date as the latest op.
363
- /// We must therefore first resolve any view jump barrier so that we can trust `op_max`.
566
+ /// We must therefore first resolve any op uncertainty so that we can trust `op_max` here.
364
567
  ///
365
568
  /// For example: If ops 3, 9 and 10 are missing, returns: `{ .op_min = 9, .op_max = 10 }`.
366
569
  ///
@@ -382,11 +585,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
382
585
  op -= 1;
383
586
 
384
587
  // Get the entry at @mod(op) location, but only if entry.op == op, else null:
385
- var A = self.entry_for_op_exact(op);
588
+ var A = self.header_with_op(op);
386
589
  if (A) |a| {
387
590
  if (B) |b| {
388
591
  // If A was reordered then A may have a newer op than B (but an older view).
389
- // However, here we use entry_for_op_exact() to assert a.op + 1 == b.op:
592
+ // However, here we use header_with_op() to assert a.op + 1 == b.op:
390
593
  assert(a.op + 1 == b.op);
391
594
 
392
595
  // We do not assert a.view <= b.view here unless the chain is intact because
@@ -417,15 +620,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
417
620
  } else if (a.checksum == b.parent) {
418
621
  // A is connected to B, and B is connected or B is op_max.
419
622
  assert(a.view <= b.view);
420
- } else if (a.view < b.view) {
421
- // A is not connected to B, and A is older than B, open range:
623
+ } else if (a.view != b.view) {
624
+ // A is not connected to B, open range:
422
625
  assert(a.op > op_min);
626
+ assert(b.op <= op_max);
423
627
  range = .{ .op_min = a.op, .op_max = a.op };
424
- } else if (a.view > b.view) {
425
- // A is not connected to B, but A is newer than B, open and close range:
426
- assert(b.op < op_max);
427
- range = .{ .op_min = b.op, .op_max = b.op };
428
- break;
429
628
  } else {
430
629
  // Op numbers in the same view must be connected.
431
630
  unreachable;
@@ -471,6 +670,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
471
670
  return range;
472
671
  }
473
672
 
673
+ /// Read a prepare from disk. There must be a matching in-memory header.
474
674
  pub fn read_prepare(
475
675
  self: *Self,
476
676
  callback: fn (replica: *Replica, prepare: ?*Message, destination_replica: ?u8) void,
@@ -478,6 +678,9 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
478
678
  checksum: u128,
479
679
  destination_replica: ?u8,
480
680
  ) void {
681
+ assert(self.recovered);
682
+ assert(checksum != 0);
683
+
481
684
  const replica = @fieldParentPtr(Replica, "journal", self);
482
685
  if (op > replica.op) {
483
686
  self.read_prepare_log(op, checksum, "beyond replica.op");
@@ -485,39 +688,46 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
485
688
  return;
486
689
  }
487
690
 
488
- // Do not use this pointer beyond this function's scope, as the
489
- // header memory may then change:
490
- const exact = self.entry_for_op_exact_with_checksum(op, checksum) orelse {
691
+ const slot = self.slot_with_op_and_checksum(op, checksum) orelse {
491
692
  self.read_prepare_log(op, checksum, "no entry exactly");
492
693
  callback(replica, null, null);
493
694
  return;
494
695
  };
495
696
 
496
- if (self.faulty.bit(op)) {
497
- assert(self.dirty.bit(op));
498
-
499
- self.read_prepare_log(op, checksum, "faulty");
500
- callback(replica, null, null);
501
- return;
502
- }
503
-
504
- if (self.dirty.bit(op)) {
505
- self.read_prepare_log(op, checksum, "dirty");
697
+ if (self.prepare_inhabited[slot.index] and
698
+ self.prepare_checksums[slot.index] == checksum)
699
+ {
700
+ self.read_prepare_with_op_and_checksum(callback, op, checksum, destination_replica);
701
+ } else {
702
+ self.read_prepare_log(op, checksum, "no matching prepare");
506
703
  callback(replica, null, null);
507
- return;
508
704
  }
705
+ }
509
706
 
510
- const physical_size = vsr.sector_ceil(exact.size);
511
- assert(physical_size >= exact.size);
707
+ /// Read a prepare from disk. There may or may not be an in-memory header.
708
+ pub fn read_prepare_with_op_and_checksum(
709
+ self: *Self,
710
+ callback: fn (replica: *Replica, prepare: ?*Message, destination_replica: ?u8) void,
711
+ op: u64,
712
+ checksum: u128,
713
+ destination_replica: ?u8,
714
+ ) void {
715
+ const replica = @fieldParentPtr(Replica, "journal", self);
716
+ const slot = self.slot_for_op(op);
717
+ assert(self.recovered);
718
+ assert(self.prepare_inhabited[slot.index]);
719
+ assert(self.prepare_checksums[slot.index] == checksum);
512
720
 
513
721
  const message = replica.message_bus.get_message();
514
722
  defer replica.message_bus.unref(message);
515
723
 
516
- // Skip the disk read if the header is all we need:
517
- if (exact.size == @sizeOf(Header)) {
518
- message.header.* = exact.*;
519
- callback(replica, message, destination_replica);
520
- return;
724
+ // If the header is in-memory, we can skip the read from the disk.
725
+ if (self.header_with_op_and_checksum(op, checksum)) |exact| {
726
+ if (exact.size == @sizeOf(Header)) {
727
+ message.header.* = exact.*;
728
+ callback(replica, message, destination_replica);
729
+ return;
730
+ }
521
731
  }
522
732
 
523
733
  const read = self.reads.acquire() orelse {
@@ -536,29 +746,30 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
536
746
  .destination_replica = destination_replica,
537
747
  };
538
748
 
539
- assert(exact.offset + physical_size <= self.size_circular_buffer);
540
-
541
- const buffer = message.buffer[0..physical_size];
542
- const offset = self.offset_in_circular_buffer(exact.offset);
749
+ const buffer: []u8 = message.buffer[0..config.message_size_max];
750
+ const offset = offset_logical(.prepares, slot);
543
751
 
544
752
  // Memory must not be owned by `self.headers` as these may be modified concurrently:
545
753
  assert(@ptrToInt(buffer.ptr) < @ptrToInt(self.headers.ptr) or
546
- @ptrToInt(buffer.ptr) > @ptrToInt(self.headers.ptr) + self.size_headers);
754
+ @ptrToInt(buffer.ptr) > @ptrToInt(self.headers.ptr) + headers_size);
547
755
 
548
- log.debug(
549
- "{}: read_sectors: offset={} len={}",
550
- .{ replica.replica, offset, buffer.len },
756
+ assert_bounds(.prepares, offset, buffer.len);
757
+ self.storage.read_sectors(
758
+ read_prepare_with_op_and_checksum_callback,
759
+ &read.completion,
760
+ buffer,
761
+ .wal,
762
+ offset,
551
763
  );
552
-
553
- self.storage.read_sectors(on_read, &read.completion, buffer, offset);
554
764
  }
555
765
 
556
- fn on_read(completion: *Storage.Read) void {
766
+ fn read_prepare_with_op_and_checksum_callback(completion: *Storage.Read) void {
557
767
  const read = @fieldParentPtr(Self.Read, "completion", completion);
558
768
  const self = read.self;
559
769
  const replica = @fieldParentPtr(Replica, "journal", self);
560
770
  const op = read.op;
561
771
  const checksum = read.checksum;
772
+ assert(self.recovered);
562
773
 
563
774
  defer {
564
775
  replica.message_bus.unref(read.message);
@@ -571,43 +782,85 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
571
782
  return;
572
783
  }
573
784
 
574
- _ = replica.journal.entry_for_op_exact_with_checksum(op, checksum) orelse {
575
- self.read_prepare_log(op, checksum, "no entry exactly");
785
+ const checksum_inhabited = self.prepare_inhabited[self.slot_for_op(op).index];
786
+ const checksum_match = self.prepare_checksums[self.slot_for_op(op).index] == checksum;
787
+ if (!checksum_inhabited or !checksum_match) {
788
+ self.read_prepare_log(op, checksum, "prepare changed during read");
576
789
  read.callback(replica, null, null);
577
790
  return;
578
- };
791
+ }
792
+
793
+ // Check that the `headers` slot belongs to the same op that it did when the read began.
794
+ // The slot may not match the Read's op/checksum due to either:
795
+ // * The in-memory header changed since the read began.
796
+ // * The in-memory header is reserved+faulty; the read was via `prepare_checksums`
797
+ const slot = self.slot_with_op_and_checksum(op, checksum);
579
798
 
580
799
  if (!read.message.header.valid_checksum()) {
581
- self.faulty.set(op);
582
- self.dirty.set(op);
800
+ if (slot) |s| {
801
+ self.faulty.set(s);
802
+ self.dirty.set(s);
803
+ }
583
804
 
584
805
  self.read_prepare_log(op, checksum, "corrupt header after read");
585
806
  read.callback(replica, null, null);
586
807
  return;
587
808
  }
809
+ assert(read.message.header.invalid() == null);
810
+
811
+ if (read.message.header.cluster != replica.cluster) {
812
+ // This could be caused by a misdirected read or write.
813
+ // Though when a prepare spans multiple sectors, a misdirected read/write will
814
+ // likely manifest as a checksum failure instead.
815
+ if (slot) |s| {
816
+ self.faulty.set(s);
817
+ self.dirty.set(s);
818
+ }
588
819
 
589
- const body = read.message.buffer[@sizeOf(Header)..read.message.header.size];
590
- if (!read.message.header.valid_checksum_body(body)) {
591
- self.faulty.set(op);
592
- self.dirty.set(op);
593
-
594
- self.read_prepare_log(op, checksum, "corrupt body after read");
820
+ self.read_prepare_log(op, checksum, "wrong cluster");
595
821
  read.callback(replica, null, null);
596
822
  return;
597
823
  }
598
824
 
599
825
  if (read.message.header.op != op) {
826
+ // Possible causes:
827
+ // * The prepare was rewritten since the read began.
828
+ // * Misdirected read/write.
829
+ // * The combination of:
830
+ // * The leader is responding to a `request_prepare`.
831
+ // * The `request_prepare` did not include a checksum.
832
+ // * The requested op's slot is faulty, but the prepare is valid. Since the
833
+ // prepare is valid, WAL recovery set `prepare_checksums[slot]`. But on reading
834
+ // this entry it turns out not to have the right op.
835
+ // (This case (and the accompanying unnessary read) could be prevented by storing
836
+ // the op along with the checksum in `prepare_checksums`.)
837
+ assert(slot == null);
838
+
600
839
  self.read_prepare_log(op, checksum, "op changed during read");
601
840
  read.callback(replica, null, null);
602
841
  return;
603
842
  }
604
843
 
605
844
  if (read.message.header.checksum != checksum) {
845
+ // This can also be caused by a misdirected read/write.
846
+ assert(slot == null);
847
+
606
848
  self.read_prepare_log(op, checksum, "checksum changed during read");
607
849
  read.callback(replica, null, null);
608
850
  return;
609
851
  }
610
852
 
853
+ if (!read.message.header.valid_checksum_body(read.message.body())) {
854
+ if (slot) |s| {
855
+ self.faulty.set(s);
856
+ self.dirty.set(s);
857
+ }
858
+
859
+ self.read_prepare_log(op, checksum, "corrupt body after read");
860
+ read.callback(replica, null, null);
861
+ return;
862
+ }
863
+
611
864
  read.callback(replica, read.message, read.destination_replica);
612
865
  }
613
866
 
@@ -620,46 +873,37 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
620
873
 
621
874
  pub fn recover(self: *Self) void {
622
875
  assert(!self.recovered);
876
+ assert(!self.recovering);
877
+ assert(self.dirty.count == slot_count);
878
+ assert(self.faulty.count == slot_count);
623
879
 
624
- if (self.recovering) return;
625
880
  self.recovering = true;
626
881
 
627
882
  log.debug("{}: recover: recovering", .{self.replica});
628
883
 
629
- self.recover_headers(0, 0);
630
- self.recover_headers(0, 1);
884
+ self.recover_headers(0);
631
885
  }
632
886
 
633
- fn recover_headers(self: *Self, offset: u64, version: u1) void {
887
+ fn recover_headers(self: *Self, offset: u64) void {
634
888
  const replica = @fieldParentPtr(Replica, "journal", self);
635
889
 
636
890
  assert(!self.recovered);
637
891
  assert(self.recovering);
892
+ assert(self.dirty.count == slot_count);
893
+ assert(self.faulty.count == slot_count);
638
894
 
639
- if (offset == self.size_headers) {
640
- log.debug("{}: recover_headers: version={} recovered", .{
641
- self.replica,
642
- version,
643
- });
644
- if (self.reads.executing() == 0) {
645
- log.debug("{}: recover_headers: both versions recovered", .{self.replica});
646
- self.recovered = true;
647
- self.recovering = false;
648
- // The initialization op (TODO Snapshots):
649
- assert(!self.dirty.bit(0));
650
- assert(!self.faulty.bit(0));
651
- // From here it's over to the Recovery protocol from VRR 2012.
652
- }
895
+ if (offset == headers_size) {
896
+ log.debug("{}: recover_headers: complete", .{self.replica});
897
+ self.recover_prepares(Slot{ .index = 0 });
653
898
  return;
654
899
  }
655
- assert(offset < self.size_headers);
900
+ assert(offset < headers_size);
656
901
 
657
902
  const message = replica.message_bus.get_message();
658
903
  defer replica.message_bus.unref(message);
659
904
 
660
- // We use the count of reads executing to know when both versions have finished reading:
661
905
  // We expect that no other process is issuing reads while we are recovering.
662
- assert(self.reads.executing() < 2);
906
+ assert(self.reads.executing() == 0);
663
907
 
664
908
  const read = self.reads.acquire() orelse unreachable;
665
909
  read.* = .{
@@ -669,148 +913,547 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
669
913
  .callback = undefined,
670
914
  .op = undefined,
671
915
  .checksum = offset,
672
- .destination_replica = version,
916
+ .destination_replica = null,
673
917
  };
674
918
 
675
- const buffer = self.recover_headers_buffer(message, offset);
919
+ const buffer = recover_headers_buffer(message, offset);
676
920
  assert(buffer.len > 0);
677
921
 
678
- log.debug("{}: recover_headers: version={} offset={} size={} recovering", .{
922
+ log.debug("{}: recover_headers: offset={} size={} recovering", .{
679
923
  self.replica,
680
- version,
681
924
  offset,
682
925
  buffer.len,
683
926
  });
684
927
 
685
928
  self.storage.read_sectors(
686
- recover_headers_on_read,
929
+ recover_headers_callback,
687
930
  &read.completion,
688
931
  buffer,
689
- self.offset_in_headers_version(offset, version),
932
+ .wal,
933
+ offset,
690
934
  );
691
935
  }
692
936
 
693
- fn recover_headers_buffer(self: *Self, message: *Message, offset: u64) []u8 {
694
- const max = std.math.min(message.buffer.len, self.size_headers - offset);
695
- assert(max % config.sector_size == 0);
696
- return message.buffer[0..max];
697
- }
698
-
699
- fn recover_headers_on_read(completion: *Storage.Read) void {
937
+ fn recover_headers_callback(completion: *Storage.Read) void {
700
938
  const read = @fieldParentPtr(Self.Read, "completion", completion);
701
939
  const self = read.self;
702
940
  const replica = @fieldParentPtr(Replica, "journal", self);
703
941
  const message = read.message;
704
942
 
705
943
  const offset = @intCast(u64, read.checksum);
706
- const version = @intCast(u1, read.destination_replica.?);
707
- const buffer = self.recover_headers_buffer(message, offset);
944
+ const buffer = recover_headers_buffer(message, offset);
708
945
 
709
- log.debug("{}: recover_headers: version={} offset={} size={} recovered", .{
946
+ log.debug("{}: recover_headers: offset={} size={} recovered", .{
710
947
  self.replica,
711
- version,
712
948
  offset,
713
949
  buffer.len,
714
950
  });
715
951
 
952
+ assert(!self.recovered);
953
+ assert(self.recovering);
716
954
  assert(offset % @sizeOf(Header) == 0);
717
955
  assert(buffer.len >= @sizeOf(Header));
718
956
  assert(buffer.len % @sizeOf(Header) == 0);
957
+ assert(read.destination_replica == null);
958
+ assert(self.dirty.count == slot_count);
959
+ assert(self.faulty.count == slot_count);
960
+
961
+ // Directly store all the redundant headers in `self.headers_redundant` (including any
962
+ // that are invalid or corrupt). As the prepares are recovered, these will be replaced
963
+ // or removed as necessary.
964
+ const buffer_headers = std.mem.bytesAsSlice(Header, buffer);
965
+ std.mem.copy(
966
+ Header,
967
+ self.headers_redundant[@divExact(offset, @sizeOf(Header))..][0..buffer_headers.len],
968
+ buffer_headers,
969
+ );
719
970
 
720
- for (std.mem.bytesAsSlice(Header, buffer)) |*header, index| {
721
- const op = offset / @sizeOf(Header) + index;
722
-
723
- if (header.valid_checksum()) {
724
- // This header is valid.
725
- if (self.entry_for_op(op)) |existing| {
726
- if (existing.checksum == header.checksum) {
727
- // We also have the same header from the other version.
728
- assert(!self.faulty.bit(op));
729
- } else if (existing.command == .reserved) {
730
- self.set_entry_as_dirty(header);
731
- self.faulty.clear(op);
732
- } else {
733
- // Don't replace any existing op from the other version.
734
- // First come, first served.
735
- // We'll sort out the right order later when we recover higher up.
736
- assert(!self.faulty.bit(op));
737
- }
738
- } else if (header.command == .reserved) {
739
- self.dirty.set(op);
740
- self.faulty.clear(op);
741
- } else {
742
- self.set_entry_as_dirty(header);
743
- }
744
- } else {
745
- // This header is corrupt.
746
- if (self.entry_for_op(op)) |_| {
747
- // However, we have a valid header from the other version.
748
- } else {
749
- self.dirty.set(op);
750
- self.faulty.set(op);
751
- }
752
- }
753
- }
754
-
971
+ const offset_next = offset + buffer.len;
755
972
  // We must release before we call `recover_headers()` in case Storage is synchronous.
756
973
  // Otherwise, we would run out of messages and reads.
757
974
  replica.message_bus.unref(read.message);
758
975
  self.reads.release(read);
759
976
 
760
- self.recover_headers(offset + buffer.len, version);
977
+ self.recover_headers(offset_next);
761
978
  }
762
979
 
763
- /// A safe way of removing an entry, where the header must match the current entry to succeed.
764
- fn remove_entry(self: *Self, header: *const Header) void {
765
- // Copy the header.op by value to avoid a reset() followed by undefined header.op usage:
766
- const op = header.op;
767
- log.debug("{}: remove_entry: op={} checksum={}", .{
980
+ fn recover_headers_buffer(message: *Message, offset: u64) []align(@alignOf(Header)) u8 {
981
+ const max = std.math.min(message.buffer.len, headers_size - offset);
982
+ assert(max % config.sector_size == 0);
983
+ assert(max % @sizeOf(Header) == 0);
984
+ return @alignCast(@alignOf(Header), message.buffer[0..max]);
985
+ }
986
+
987
+ fn recover_prepares(self: *Self, slot: Slot) void {
988
+ const replica = @fieldParentPtr(Replica, "journal", self);
989
+ assert(!self.recovered);
990
+ assert(self.recovering);
991
+ assert(self.dirty.count == slot_count);
992
+ assert(self.faulty.count == slot_count);
993
+ // We expect that no other process is issuing reads while we are recovering.
994
+ assert(self.reads.executing() == 0);
995
+
996
+ if (slot.index == slot_count) {
997
+ self.recover_slots();
998
+ return;
999
+ }
1000
+ assert(slot.index < slot_count);
1001
+
1002
+ const message = replica.message_bus.get_message();
1003
+ defer replica.message_bus.unref(message);
1004
+
1005
+ const read = self.reads.acquire() orelse unreachable;
1006
+ read.* = .{
1007
+ .self = self,
1008
+ .completion = undefined,
1009
+ .message = message.ref(),
1010
+ .callback = undefined,
1011
+ .op = undefined,
1012
+ .checksum = slot.index,
1013
+ .destination_replica = null,
1014
+ };
1015
+
1016
+ log.debug("{}: recover_prepares: recovering slot={}", .{
768
1017
  self.replica,
769
- op,
770
- header.checksum,
1018
+ slot.index,
771
1019
  });
772
1020
 
773
- assert(self.entry(header).?.checksum == header.checksum);
774
- assert(self.headers[op].checksum == header.checksum); // TODO Snapshots
1021
+ self.storage.read_sectors(
1022
+ recover_prepares_callback,
1023
+ &read.completion,
1024
+ // We load the entire message to verify that it isn't torn or corrupt.
1025
+ // We don't know the message's size, so use the entire buffer.
1026
+ message.buffer[0..config.message_size_max],
1027
+ .wal,
1028
+ offset_logical(.prepares, slot),
1029
+ );
1030
+ }
1031
+
1032
+ fn recover_prepares_callback(completion: *Storage.Read) void {
1033
+ const read = @fieldParentPtr(Self.Read, "completion", completion);
1034
+ const self = read.self;
1035
+ const replica = @fieldParentPtr(Replica, "journal", self);
775
1036
 
776
- defer self.headers[op] = Header.reserved();
777
- self.dirty.clear(op);
778
- self.faulty.clear(op);
1037
+ assert(!self.recovered);
1038
+ assert(self.recovering);
1039
+ assert(self.dirty.count == slot_count);
1040
+ assert(self.faulty.count == slot_count);
1041
+ assert(read.destination_replica == null);
1042
+
1043
+ const slot = Slot{ .index = @intCast(u64, read.checksum) };
1044
+ assert(slot.index < slot_count);
1045
+
1046
+ // Check `valid_checksum_body` here rather than in `recover_done` so that we don't need
1047
+ // to hold onto the whole message (just the header).
1048
+ if (read.message.header.valid_checksum() and
1049
+ read.message.header.valid_checksum_body(read.message.body()))
1050
+ {
1051
+ self.headers[slot.index] = read.message.header.*;
1052
+ }
1053
+
1054
+ replica.message_bus.unref(read.message);
1055
+ self.reads.release(read);
1056
+
1057
+ self.recover_prepares(Slot{ .index = slot.index + 1 });
1058
+ }
1059
+
1060
+ /// When in doubt about whether a particular message was received, it must be marked as
1061
+ /// faulty to avoid nacking a prepare which was received then lost/misdirected/corrupted.
1062
+ ///
1063
+ ///
1064
+ /// There are two special cases where faulty slots must be carefully handled:
1065
+ ///
1066
+ /// A) Redundant headers are written in batches. Slots that are marked faulty are written
1067
+ /// as invalid (zeroed). This ensures that if the replica crashes and recovers, the
1068
+ /// entries are still faulty rather than reserved.
1069
+ /// The recovery process must be conservative about which headers are stored in
1070
+ /// `journal.headers`. To understand why this is important, consider what happens if it did
1071
+ /// load the faulty header into `journal.headers`, and then reads it back after a restart:
1072
+ ///
1073
+ /// 1. Suppose slot 8 is in case @D. Per the table below, mark slot 8 faulty.
1074
+ /// 2. Suppose slot 9 is also loaded as faulty.
1075
+ /// 3. Journal recovery finishes. The replica beings to repair its missing/broken messages.
1076
+ /// 4. VSR recovery protocol fetches the true prepare for slot 9.
1077
+ /// 5. The message from step 4 is written to slot 9 of the prepares.
1078
+ /// 6. The header from step 4 is written to slot 9 of the redundant headers.
1079
+ /// But writes to the redundant headers are done in batches of `headers_per_sector`!
1080
+ /// So if step 1 loaded slot 8's prepare header into `journal.headers`, slot 8's
1081
+ /// redundant header would be updated at the same time (in the same write) as slot 9.
1082
+ /// 7! Immediately after step 6's write finishes, suppose the replica crashes (e.g. due to
1083
+ /// power failure).
1084
+ /// 8! Journal recovery again — but now slot 8 is loaded *without* being marked faulty.
1085
+ /// So we may incorrectly nack slot 8's message.
1086
+ ///
1087
+ /// Therefore, recovery will never load a header into a slot *and* mark that slot faulty.
1088
+ ///
1089
+ ///
1090
+ /// B) When replica_count=1, repairing broken/lost prepares over VSR is not an option,
1091
+ /// so if a message is faulty the replica will abort.
1092
+ ///
1093
+ ///
1094
+ /// Recovery decision table:
1095
+ ///
1096
+ /// label @A @B @C @D @E @F @G @H @I @J @K @L @M @N
1097
+ /// header valid 0 1 1 0 0 0 1 1 1 1 1 1 1 1
1098
+ /// header reserved _ 1 0 _ _ _ 1 1 0 1 0 0 0 0
1099
+ /// prepare valid 0 0 0 1 1 1 1 1 1 1 1 1 1 1
1100
+ /// prepare reserved _ _ _ 1 0 0 0 0 1 1 0 0 0 0
1101
+ /// prepare.op is maximum _ _ _ _ 0 1 0 1 _ _ _ _ _ _
1102
+ /// match checksum _ _ _ _ _ _ _ _ _ !1 0 0 0 1
1103
+ /// match op _ _ _ _ _ _ _ _ _ !1 < > 1 !1
1104
+ /// match view _ _ _ _ _ _ _ _ _ !1 _ _ !0 !1
1105
+ /// decision (replicas>1) vsr vsr vsr vsr vsr fix vsr fix vsr nil fix vsr vsr eql
1106
+ /// decision (replicas=1) fix fix
1107
+ ///
1108
+ /// Legend:
1109
+ ///
1110
+ /// 0 false
1111
+ /// 1 true
1112
+ /// !0 assert false
1113
+ /// !1 assert true
1114
+ /// _ ignore
1115
+ /// < header.op < prepare.op
1116
+ /// > header.op > prepare.op
1117
+ /// eql The header and prepare are identical; no repair necessary.
1118
+ /// nil Reserved; dirty/faulty are clear, no repair necessary.
1119
+ /// fix When replicas=1, use intact prepare. When replicas>1, use VSR `request_prepare`.
1120
+ /// vsr Repair with VSR `request_prepare`.
1121
+ ///
1122
+ /// A "valid" header/prepare:
1123
+ /// 1. has a valid checksum
1124
+ /// 2. has the correct cluster
1125
+ /// 3. is in the correct slot (op % slot_count)
1126
+ /// 4. has command=reserved or command=prepare
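A worked reading of one column, as an editor's illustration (the locals below are hypothetical, and `recovery_case` is defined further down in this file): slot 8 from the scenario above falls under column @D.

```zig
// Editor's sketch, not part of the package diff. Column @D: the redundant header fails
// validation while the prepare slot holds a valid *reserved* header, so a lost prepare
// cannot be ruled out. The slot stays dirty+faulty and is repaired via `request_prepare`.
const case = recovery_case(null, reserved_prepare, prepare_op_max); // hypothetical locals
assert(std.mem.eql(u8, case.label, "@D"));
assert(case.decision(replica_count) == .vsr); // the same decision for any replica count
```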
1127
+ fn recover_slots(self: *Self) void {
1128
+ const replica = @fieldParentPtr(Replica, "journal", self);
1129
+
1130
+ assert(!self.recovered);
1131
+ assert(self.recovering);
1132
+ assert(self.reads.executing() == 0);
1133
+ assert(self.writes.executing() == 0);
1134
+ assert(self.dirty.count == slot_count);
1135
+ assert(self.faulty.count == slot_count);
1136
+
1137
+ const prepare_op_max = std.math.max(
1138
+ replica.op_checkpoint,
1139
+ op_maximum_headers_untrusted(replica.cluster, self.headers),
1140
+ );
1141
+
1142
+ var cases: [slot_count]*const Case = undefined;
1143
+
1144
+ for (self.headers) |_, index| {
1145
+ const slot = Slot{ .index = index };
1146
+ const header = header_ok(replica.cluster, slot, &self.headers_redundant[index]);
1147
+ const prepare = header_ok(replica.cluster, slot, &self.headers[index]);
1148
+
1149
+ cases[index] = recovery_case(header, prepare, prepare_op_max);
1150
+
1151
+ // `prepare_checksums` improves the availability of `request_prepare` by being more
1152
+ // flexible than `headers` regarding the prepares it references. It may hold a
1153
+ // prepare whose redundant header is broken, as long as the prepare itself is valid.
1154
+ if (prepare != null and prepare.?.command == .prepare) {
1155
+ assert(!self.prepare_inhabited[index]);
1156
+ self.prepare_inhabited[index] = true;
1157
+ self.prepare_checksums[index] = prepare.?.checksum;
1158
+ }
1159
+ }
1160
+ assert(self.headers.len == cases.len);
1161
+
1162
+ // Refine cases @B and @C: Repair (truncate) a prepare if it was torn during a crash.
1163
+ if (self.recover_torn_prepare(&cases)) |torn_slot| {
1164
+ assert(cases[torn_slot.index].decision(replica.replica_count) == .vsr);
1165
+ cases[torn_slot.index] = &case_cut;
1166
+ }
1167
+
1168
+ for (cases) |case, index| self.recover_slot(Slot{ .index = index }, case);
1169
+ assert(cases.len == slot_count);
1170
+
1171
+ log.debug("{}: recover_slots: dirty={} faulty={}", .{
1172
+ self.replica,
1173
+ self.dirty.count,
1174
+ self.faulty.count,
1175
+ });
1176
+
1177
+ self.recovered = true;
1178
+ self.recovering = false;
1179
+ self.assert_recovered();
1180
+ // From here it's over to the Recovery protocol from VRR 2012.
1181
+ }
1182
+
1183
+ /// Returns a slot that is safe to truncate.
1184
+ //
1185
+ /// Truncate any prepare that was torn while being appended to the log before a crash, when:
1186
+ /// * the maximum valid op is the same in the prepare headers and redundant headers,
1187
+ /// * in the slot following the maximum valid op:
1188
+ /// - the redundant header is valid,
1189
+ /// - the redundant header is reserved, and/or the op is at least a log cycle behind,
1190
+ /// - the prepare is corrupt, and
1191
+ /// * there are no faults except for those between `op_checkpoint` and `op_max + 1`,
1192
+ /// so that we can be sure that the maximum valid op is in fact the maximum.
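As a concrete illustration (editor's note): if op_max is 7 in both rings, the slot for op 8 holds a corrupt prepare beneath a reserved (or older-cycle) redundant header, and every other `vsr`-decided slot lies between the checkpoint slot and that slot, then op 8 was torn while being appended, and `recover_slots()` overrides its case to `case_cut` so the op is truncated locally instead of repaired remotely.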
1193
+ fn recover_torn_prepare(self: *const Self, cases: []const *const Case) ?Slot {
1194
+ const replica = @fieldParentPtr(Replica, "journal", self);
1195
+
1196
+ assert(!self.recovered);
1197
+ assert(self.recovering);
1198
+ assert(self.dirty.count == slot_count);
1199
+ assert(self.faulty.count == slot_count);
1200
+
1201
+ const op_max = op_maximum_headers_untrusted(replica.cluster, self.headers_redundant);
1202
+ if (op_max != op_maximum_headers_untrusted(replica.cluster, self.headers)) return null;
1203
+ if (op_max < replica.op_checkpoint) return null;
1204
+ // We can't assume that the header at `op_max` is a prepare — an empty journal with a
1205
+ // corrupt root prepare (op_max=0) will be repaired later.
1206
+
1207
+ const torn_op = op_max + 1;
1208
+ const torn_slot = self.slot_for_op(torn_op);
1209
+
1210
+ const torn_prepare_untrusted = &self.headers[torn_slot.index];
1211
+ if (torn_prepare_untrusted.valid_checksum()) return null;
1212
+ // The prepare is at least corrupt, possibly torn, but not valid and simply misdirected.
1213
+
1214
+ const header_untrusted = &self.headers_redundant[torn_slot.index];
1215
+ const header = header_ok(replica.cluster, torn_slot, header_untrusted) orelse return null;
1216
+ // The redundant header is valid, also for the correct cluster and not misdirected.
1217
+
1218
+ if (header.command == .prepare) {
1219
+ // The redundant header was already written, so the prepare is corrupt, not torn.
1220
+ if (header.op == torn_op) return null;
1221
+
1222
+ assert(header.op < torn_op); // Since torn_op > op_max.
1223
+ // The redundant header is from any previous log cycle.
1224
+ } else {
1225
+ assert(header.command == .reserved);
1226
+
1227
+ // This is the first log cycle.
1228
+
1229
+ // TODO Can we be more sure about this? What if op_max is clearly many cycles ahead?
1230
+ // Any previous log cycle is then expected to have a prepare, not a reserved header,
1231
+ // unless the prepare header was lost, in which case this slot may also not be torn.
1232
+ }
1233
+
1234
+ const checkpoint_index = self.slot_for_op(replica.op_checkpoint).index;
1235
+ if (checkpoint_index == torn_slot.index) {
1236
+ // The checkpoint and the torn op are in the same slot.
1237
+ assert(cases[checkpoint_index].decision(replica.replica_count) == .vsr);
1238
+ assert(slot_count > 1);
1239
+ assert(op_max >= replica.op_checkpoint);
1240
+ assert(torn_op == op_max + 1);
1241
+ assert(torn_op > replica.op_checkpoint);
1242
+ return null;
1243
+ }
1244
+
1245
+ const known_range = SlotRange{
1246
+ .head = Slot{ .index = checkpoint_index },
1247
+ .tail = torn_slot,
1248
+ };
1249
+
1250
+ // We must be certain that the torn prepare really was being appended to the WAL.
1251
+ // Return if any faults do not lie between the checkpoint and the torn prepare, such as:
1252
+ //
1253
+ // (fault [checkpoint..........torn] fault)
1254
+ // (...torn] fault fault [checkpoint......)
1255
+ for (cases) |case, index| {
1256
+ // Do not use `faulty.bit()` because the decisions have not been processed yet.
1257
+ if (case.decision(replica.replica_count) == .vsr and
1258
+ !known_range.contains(Slot{ .index = index }))
1259
+ {
1260
+ return null;
1261
+ }
1262
+ }
1263
+
1264
+ // The prepare is torn.
1265
+ assert(!self.prepare_inhabited[torn_slot.index]);
1266
+ assert(!torn_prepare_untrusted.valid_checksum());
1267
+ assert(cases[torn_slot.index].decision(replica.replica_count) == .vsr);
1268
+ return torn_slot;
1269
+ }
1270
+
1271
+ fn recover_slot(self: *Self, slot: Slot, case: *const Case) void {
1272
+ const replica = @fieldParentPtr(Replica, "journal", self);
1273
+ const cluster = replica.cluster;
1274
+
1275
+ assert(!self.recovered);
1276
+ assert(self.recovering);
1277
+ assert(self.dirty.bit(slot));
1278
+ assert(self.faulty.bit(slot));
1279
+
1280
+ const header = header_ok(cluster, slot, &self.headers_redundant[slot.index]);
1281
+ const prepare = header_ok(cluster, slot, &self.headers[slot.index]);
1282
+ const decision = case.decision(replica.replica_count);
1283
+ switch (decision) {
1284
+ .eql => {
1285
+ assert(header.?.command == .prepare);
1286
+ assert(prepare.?.command == .prepare);
1287
+ assert(header.?.checksum == prepare.?.checksum);
1288
+ assert(self.prepare_inhabited[slot.index]);
1289
+ assert(self.prepare_checksums[slot.index] == prepare.?.checksum);
1290
+ self.headers[slot.index] = header.?.*;
1291
+ self.dirty.clear(slot);
1292
+ self.faulty.clear(slot);
1293
+ },
1294
+ .nil => {
1295
+ assert(header.?.command == .reserved);
1296
+ assert(prepare.?.command == .reserved);
1297
+ assert(header.?.checksum == prepare.?.checksum);
1298
+ assert(header.?.checksum == Header.reserved(cluster, slot.index).checksum);
1299
+ assert(!self.prepare_inhabited[slot.index]);
1300
+ assert(self.prepare_checksums[slot.index] == 0);
1301
+ self.headers[slot.index] = header.?.*;
1302
+ self.dirty.clear(slot);
1303
+ self.faulty.clear(slot);
1304
+ },
1305
+ .fix => {
1306
+ // TODO Perhaps we should have 3 separate branches here for the different cases.
1307
+ // The header may be valid or invalid.
1308
+ // The header may be reserved or a prepare.
1309
+ assert(prepare.?.command == .prepare);
1310
+ assert(self.prepare_inhabited[slot.index]);
1311
+ assert(self.prepare_checksums[slot.index] == prepare.?.checksum);
1312
+
1313
+ self.headers[slot.index] = prepare.?.*;
1314
+ self.faulty.clear(slot);
1315
+ if (replica.replica_count == 1) {
1316
+ // @E, @F, @G, @H, @K:
1317
+ self.dirty.clear(slot);
1318
+ // TODO Repair header on disk to restore durability.
1319
+ } else {
1320
+ // @F, @H, @K:
1321
+ // TODO Repair without retrieving remotely (i.e. don't set dirty or faulty).
1322
+ assert(self.dirty.bit(slot));
1323
+ }
1324
+ },
1325
+ .vsr => {
1326
+ self.headers[slot.index] = Header.reserved(cluster, slot.index);
1327
+ assert(self.dirty.bit(slot));
1328
+ assert(self.faulty.bit(slot));
1329
+ },
1330
+ .cut => {
1331
+ assert(header != null);
1332
+ assert(prepare == null);
1333
+ assert(!self.prepare_inhabited[slot.index]);
1334
+ assert(self.prepare_checksums[slot.index] == 0);
1335
+ self.headers[slot.index] = Header.reserved(cluster, slot.index);
1336
+ self.dirty.clear(slot);
1337
+ self.faulty.clear(slot);
1338
+ },
1339
+ }
1340
+
1341
+ switch (decision) {
1342
+ .eql, .nil => {
1343
+ log.debug("{}: recover_slot: recovered slot={} label={s} decision={s}", .{
1344
+ self.replica,
1345
+ slot.index,
1346
+ case.label,
1347
+ @tagName(decision),
1348
+ });
1349
+ },
1350
+ .fix, .vsr, .cut => {
1351
+ log.warn("{}: recover_slot: recovered slot={} label={s} decision={s}", .{
1352
+ self.replica,
1353
+ slot.index,
1354
+ case.label,
1355
+ @tagName(decision),
1356
+ });
1357
+ },
1358
+ }
1359
+ }
1360
+
1361
+ fn assert_recovered(self: *const Self) void {
1362
+ const replica = @fieldParentPtr(Replica, "journal", self);
1363
+
1364
+ assert(self.recovered);
1365
+ assert(!self.recovering);
1366
+
1367
+ assert(self.dirty.count <= slot_count);
1368
+ assert(self.faulty.count <= slot_count);
1369
+ assert(self.faulty.count <= self.dirty.count);
1370
+
1371
+ // Abort if all slots are faulty, since something is very wrong.
1372
+ if (self.faulty.count == slot_count) @panic("WAL is completely corrupt");
1373
+ if (self.faulty.count > 0 and replica.replica_count == 1) @panic("WAL is corrupt");
1374
+
1375
+ if (self.headers[0].op == 0 and self.headers[0].command == .prepare) {
1376
+ assert(self.headers[0].checksum == Header.root_prepare(replica.cluster).checksum);
1377
+ assert(!self.faulty.bit(Slot{ .index = 0 }));
1378
+ }
1379
+
1380
+ for (self.headers) |*header, index| {
1381
+ assert(header.valid_checksum());
1382
+ assert(header.cluster == replica.cluster);
1383
+ if (header.command == .reserved) {
1384
+ assert(header.op == index);
1385
+ } else {
1386
+ assert(header.command == .prepare);
1387
+ assert(header.op % slot_count == index);
1388
+ assert(self.prepare_inhabited[index]);
1389
+ assert(self.prepare_checksums[index] == header.checksum);
1390
+ assert(!self.faulty.bit(Slot{ .index = index }));
1391
+ }
1392
+ }
779
1393
  }
780
1394
 
781
1395
  /// Removes entries from `op_min` (inclusive) onwards.
782
- /// This is used after a view change to remove uncommitted entries discarded by the new leader.
1396
+ /// Used after a view change to remove uncommitted entries discarded by the new leader.
783
1397
  pub fn remove_entries_from(self: *Self, op_min: u64) void {
784
- // TODO Snapshots
785
- // TODO Optimize to jump directly to op:
1398
+ const replica = @fieldParentPtr(Replica, "journal", self);
1399
+
1400
+ assert(self.recovered);
786
1401
  assert(op_min > 0);
1402
+
787
1403
  log.debug("{}: remove_entries_from: op_min={}", .{ self.replica, op_min });
788
- for (self.headers) |*header| {
789
- if (header.op >= op_min and header.command == .prepare) {
790
- self.remove_entry(header);
1404
+
1405
+ for (self.headers) |*header, index| {
1406
+ // We must remove the header regardless of whether it is a prepare or reserved,
1407
+ // since a reserved header may have been marked faulty for case @G, and
1408
+ // since the caller expects the WAL to be truncated, with clean slots.
1409
+ if (header.op >= op_min) {
1410
+ // TODO Explore scenarios where the data on disk may resurface after a crash.
1411
+ const slot = self.slot_for_op(header.op);
1412
+ assert(slot.index == index);
1413
+ self.headers[slot.index] = Header.reserved(replica.cluster, slot.index);
1414
+ self.dirty.clear(slot);
1415
+ self.faulty.clear(slot);
1416
+ // Do not clear `prepare_inhabited`/`prepare_checksums`. The prepare is
1417
+ // untouched on disk, and may be useful later. Consider this scenario:
1418
+ //
1419
+ // 1. Op 4 is received; start writing it.
1420
+ // 2. Op 4's prepare is written (setting `prepare_checksums`), start writing
1421
+ // the headers.
1422
+ // 3. View change. Op 4 is discarded by `remove_entries_from`.
1423
+ // 4. View change. Op 4 (the same one from before) is back, marked as dirty. But
1424
+ // we don't start a write, because `journal.writing()` says it is already in
1425
+ // progress.
1426
+ // 5. Op 4's header write finishes (`write_prepare_on_write_header`).
1427
+ //
1428
+ // If `remove_entries_from` cleared `prepare_checksums`,
1429
+ // `write_prepare_on_write_header` would clear `dirty`/`faulty` for a slot with
1430
+ // `prepare_inhabited=false`.
791
1431
  }
792
1432
  }
793
- self.assert_headers_reserved_from(op_min);
794
- // TODO At startup we need to handle entries that may have been removed but now reappear.
795
- // This is because we do not call `write_headers_between()` here.
796
1433
  }
797
1434
 
798
- pub fn set_entry_as_dirty(self: *Self, header: *const Header) void {
799
- log.debug("{}: set_entry_as_dirty: op={} checksum={}", .{
1435
+ pub fn set_header_as_dirty(self: *Self, header: *const Header) void {
1436
+ assert(self.recovered);
1437
+ assert(header.command == .prepare);
1438
+
1439
+ log.debug("{}: set_header_as_dirty: op={} checksum={}", .{
800
1440
  self.replica,
801
1441
  header.op,
802
1442
  header.checksum,
803
1443
  });
804
- if (self.entry(header)) |existing| {
805
- if (existing.checksum != header.checksum) {
806
- self.faulty.clear(header.op);
807
- }
1444
+ const slot = self.slot_for_header(header);
1445
+
1446
+ if (self.has(header)) {
1447
+ assert(self.dirty.bit(slot));
1448
+ // Do not clear any faulty bit for the same entry.
1449
+ } else {
1450
+ self.headers[slot.index] = header.*;
1451
+ self.dirty.set(slot);
1452
+ self.faulty.clear(slot);
808
1453
  }
809
- self.headers[header.op] = header.*;
810
- self.dirty.set(header.op);
811
- // Do not clear any faulty bit for the same entry.
812
1454
  }
813
1455
 
1456
+ /// `write_prepare` uses `write_sectors` to prevent concurrent disk writes.
814
1457
  pub fn write_prepare(
815
1458
  self: *Self,
816
1459
  callback: fn (self: *Replica, wrote: ?*Message, trigger: Write.Trigger) void,
@@ -819,17 +1462,23 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
819
1462
  ) void {
820
1463
  const replica = @fieldParentPtr(Replica, "journal", self);
821
1464
 
1465
+ assert(self.recovered);
822
1466
  assert(message.header.command == .prepare);
823
1467
  assert(message.header.size >= @sizeOf(Header));
824
1468
  assert(message.header.size <= message.buffer.len);
1469
+ assert(self.has(message.header));
825
1470
 
826
1471
  // The underlying header memory must be owned by the buffer and not by self.headers:
827
1472
  // Otherwise, concurrent writes may modify the memory of the pointer while we write.
828
1473
  assert(@ptrToInt(message.header) == @ptrToInt(message.buffer.ptr));
829
1474
 
830
- if (!self.dirty.bit(message.header.op)) {
1475
+ const slot = self.slot_with_header(message.header).?;
1476
+
1477
+ if (!self.dirty.bit(slot)) {
831
1478
  // Any function that sets the faulty bit should also set the dirty bit:
832
- assert(!self.faulty.bit(message.header.op));
1479
+ assert(!self.faulty.bit(slot));
1480
+ assert(self.prepare_inhabited[slot.index]);
1481
+ assert(self.prepare_checksums[slot.index] == message.header.checksum);
833
1482
  self.write_prepare_debug(message.header, "skipping (clean)");
834
1483
  callback(replica, message, trigger);
835
1484
  return;
@@ -854,22 +1503,21 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
854
1503
  };
855
1504
 
856
1505
  // Slice the message to the nearest sector, we don't want to write the whole buffer:
857
- const sectors = message.buffer[0..vsr.sector_ceil(message.header.size)];
858
- assert(message.header.offset + sectors.len <= self.size_circular_buffer);
1506
+ const buffer = message.buffer[0..vsr.sector_ceil(message.header.size)];
1507
+ const offset = offset_logical(.prepares, slot);
859
1508
 
860
1509
  if (builtin.mode == .Debug) {
861
1510
  // Assert that any sector padding has already been zeroed:
862
- var sum_of_sector_padding_bytes: u32 = 0;
863
- for (sectors[message.header.size..]) |byte| sum_of_sector_padding_bytes += byte;
1511
+ var sum_of_sector_padding_bytes: u8 = 0;
1512
+ for (buffer[message.header.size..]) |byte| sum_of_sector_padding_bytes |= byte;
864
1513
  assert(sum_of_sector_padding_bytes == 0);
865
1514
  }
866
1515
 
867
- self.write_sectors(
868
- write_prepare_header,
869
- write,
870
- sectors,
871
- self.offset_in_circular_buffer(message.header.offset),
872
- );
1516
+ self.prepare_inhabited[slot.index] = false;
1517
+ self.prepare_checksums[slot.index] = 0;
1518
+
1519
+ assert_bounds(.prepares, offset, buffer.len);
1520
+ self.write_sectors(write_prepare_header, write, buffer, offset);
873
1521
  }
874
1522
 
875
1523
  /// Attempt to lock the in-memory sector containing the header being written.
@@ -877,8 +1525,13 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
877
1525
  fn write_prepare_header(write: *Self.Write) void {
878
1526
  const self = write.self;
879
1527
  const message = write.message;
1528
+ assert(self.recovered);
880
1529
 
881
- if (!self.has(message.header)) {
1530
+ if (self.slot_with_op_and_checksum(message.header.op, message.header.checksum)) |slot| {
1531
+ assert(!self.prepare_inhabited[slot.index]);
1532
+ self.prepare_inhabited[slot.index] = true;
1533
+ self.prepare_checksums[slot.index] = message.header.checksum;
1534
+ } else {
882
1535
  self.write_prepare_debug(message.header, "entry changed while writing sectors");
883
1536
  self.write_prepare_release(write, null);
884
1537
  return;
@@ -887,14 +1540,19 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
887
1540
  assert(!write.header_sector_locked);
888
1541
  assert(write.header_sector_next == null);
889
1542
 
1543
+ const write_offset = self.offset_logical_in_headers_for_message(message);
1544
+
890
1545
  var it = self.writes.iterate();
891
1546
  while (it.next()) |other| {
892
1547
  if (other == write) continue;
893
1548
  if (!other.header_sector_locked) continue;
894
1549
 
895
- if (other.header_sector_same(write)) {
896
- write.header_sector_next = other.header_sector_next;
897
- other.header_sector_next = write;
1550
+ const other_offset = self.offset_logical_in_headers_for_message(other.message);
1551
+ if (other_offset == write_offset) {
1552
+ // The `other` and `write` target the same sector; append to the list.
1553
+ var tail = other;
1554
+ while (tail.header_sector_next) |next| tail = next;
1555
+ tail.header_sector_next = write;
898
1556
  return;
899
1557
  }
900
1558
  }
@@ -904,6 +1562,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
904
1562
  }
905
1563
 
906
1564
  fn write_prepare_on_lock_header_sector(self: *Self, write: *Write) void {
1565
+ assert(self.recovered);
907
1566
  assert(write.header_sector_locked);
908
1567
 
909
1568
  // TODO It's possible within this section that the header has since been replaced but we
@@ -912,13 +1571,65 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
912
1571
  // For this, we'll need to have a way to tweak write_prepare_release() to release locks.
913
1572
  // At present, we don't return early here simply because it doesn't yet do that.
914
1573
 
1574
+ const replica = @fieldParentPtr(Replica, "journal", self);
915
1575
  const message = write.message;
916
- const offset = write_prepare_header_offset(write.message);
917
- std.mem.copy(
918
- u8,
919
- write.header_sector(self),
920
- std.mem.sliceAsBytes(self.headers)[offset..][0..config.sector_size],
921
- );
1576
+ const slot_of_message = self.slot_for_header(message.header);
1577
+ const slot_first = Slot{
1578
+ .index = @divFloor(slot_of_message.index, headers_per_sector) * headers_per_sector,
1579
+ };
1580
+
1581
+ const offset = offset_logical(.headers, slot_of_message);
1582
+ assert(offset % config.sector_size == 0);
1583
+
1584
+ const buffer: []u8 = write.header_sector(self);
1585
+ const buffer_headers = std.mem.bytesAsSlice(Header, buffer);
1586
+ assert(buffer_headers.len == headers_per_sector);
1587
+
1588
+ var i: usize = 0;
1589
+ while (i < headers_per_sector) : (i += 1) {
1590
+ const slot = Slot{ .index = slot_first.index + i };
1591
+
1592
+ if (self.faulty.bit(slot)) {
1593
+ // Redundant faulty headers are deliberately written as invalid.
1594
+ // This ensures that faulty headers are still faulty when they are read back
1595
+ // from disk during recovery. This prevents faulty entries from changing to
1596
+ // reserved (and clean) after a crash and restart (e.g. accidentally converting
1597
+ // a case `@D` to a `@J` after a restart).
1598
+ buffer_headers[i] = .{
1599
+ .checksum = 0,
1600
+ .cluster = replica.cluster,
1601
+ .command = .reserved,
1602
+ };
1603
+ assert(!buffer_headers[i].valid_checksum());
1604
+ } else if (message.header.op < slot_count and
1605
+ !self.prepare_inhabited[slot.index] and
1606
+ message.header.command == .prepare and
1607
+ self.dirty.bit(slot))
1608
+ {
1609
+ // When:
1610
+ // * this is the first wrap of the WAL, and
1611
+ // * this prepare slot is not inhabited (never has been), and
1612
+ // * this prepare slot is a dirty prepare,
1613
+ // write a reserved header instead of the in-memory prepare header.
1614
+ //
1615
+ // This can be triggered by the following sequence of events:
1616
+ // 1. Ops 6 and 7 arrive.
1617
+ // 2. The write of prepare 7 finishes (before prepare 6).
1618
+ // 3. Op 7 continues on to write the redundant headers.
1619
+ // Because prepare 6 is not yet written, header 6 is written as reserved.
1620
+ // 4. (If at this point the replica crashes & restarts, slot 6 is in case `@J`
1621
+ // (decision=nil) which can be locally repaired. In contrast, if op 6's
1622
+ // header was written in step 3, it would be case `@I`, which requires
1623
+ // remote repair.)
1624
+ //
1625
+ // * When `replica_count=1`, case `@I` is not recoverable.
1626
+ // * When `replica_count>1` this marginally improves availability by enabling
1627
+ // local repair.
1628
+ buffer_headers[i] = Header.reserved(replica.cluster, slot.index);
1629
+ } else {
1630
+ buffer_headers[i] = self.headers[slot.index];
1631
+ }
1632
+ }
922
1633
 
923
1634
  log.debug("{}: write_header: op={} sectors[{}..{}]", .{
924
1635
  self.replica,
@@ -927,35 +1638,12 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
927
1638
  offset + config.sector_size,
928
1639
  });
929
1640
 
930
- // TODO Snapshots
931
- if (self.write_prepare_header_once(message.header)) {
932
- const version = self.write_headers_increment_version();
933
- self.write_prepare_header_to_version(write, write_prepare_on_write_header, version, write.header_sector(self), offset);
934
- } else {
935
- // Versions must be incremented upfront:
936
- // If we don't increment upfront we could end up writing to the same copy twice.
937
- // We would then lose the redundancy required to locate headers or even overwrite all copies.
938
- const version = self.write_headers_increment_version();
939
- _ = self.write_headers_increment_version();
940
- switch (version) {
941
- 0 => self.write_prepare_header_to_version(write, write_prepare_on_write_header_version_0, 0, write.header_sector(self), offset),
942
- 1 => self.write_prepare_header_to_version(write, write_prepare_on_write_header_version_1, 1, write.header_sector(self), offset),
943
- }
944
- }
945
- }
946
-
947
- fn write_prepare_on_write_header_version_0(write: *Self.Write) void {
948
- const self = write.self;
949
- const offset = write_prepare_header_offset(write.message);
950
- // Pass the opposite version bit from the one we just finished writing.
951
- self.write_prepare_header_to_version(write, write_prepare_on_write_header, 1, write.header_sector(self), offset);
952
- }
1641
+ // Memory must not be owned by self.headers as these may be modified concurrently:
1642
+ assert(@ptrToInt(buffer.ptr) < @ptrToInt(self.headers.ptr) or
1643
+ @ptrToInt(buffer.ptr) > @ptrToInt(self.headers.ptr) + headers_size);
953
1644
 
954
- fn write_prepare_on_write_header_version_1(write: *Self.Write) void {
955
- const self = write.self;
956
- const offset = write_prepare_header_offset(write.message);
957
- // Pass the opposite version bit from the one we just finished writing.
958
- self.write_prepare_header_to_version(write, write_prepare_on_write_header, 0, write.header_sector(self), offset);
1645
+ assert_bounds(.headers, offset, buffer.len);
1646
+ self.write_sectors(write_prepare_on_write_header, write, buffer, offset);
959
1647
  }
960
1648
 
961
1649
  fn write_prepare_on_write_header(write: *Self.Write) void {
@@ -973,9 +1661,10 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
973
1661
 
974
1662
  self.write_prepare_debug(message.header, "complete, marking clean");
975
1663
  // TODO Snapshots
976
- assert(self.has(message.header));
977
- self.dirty.clear(message.header.op);
978
- self.faulty.clear(message.header.op);
1664
+
1665
+ const slot = self.slot_with_header(message.header).?;
1666
+ self.dirty.clear(slot);
1667
+ self.faulty.clear(slot);
979
1668
 
980
1669
  self.write_prepare_release(write, message);
981
1670
  }
@@ -1006,119 +1695,62 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1006
1695
  self.writes.release(write);
1007
1696
  }
1008
1697
 
1009
- fn write_prepare_debug(self: *Self, header: *const Header, status: []const u8) void {
1010
- log.debug("{}: write: view={} op={} offset={} len={}: {} {s}", .{
1698
+ fn write_prepare_debug(self: *const Self, header: *const Header, status: []const u8) void {
1699
+ log.debug("{}: write: view={} op={} len={}: {} {s}", .{
1011
1700
  self.replica,
1012
1701
  header.view,
1013
1702
  header.op,
1014
- header.offset,
1015
1703
  header.size,
1016
1704
  header.checksum,
1017
1705
  status,
1018
1706
  });
1019
1707
  }
1020
1708
 
1021
- pub fn offset_in_circular_buffer(self: *Self, offset: u64) u64 {
1022
- assert(offset < self.size_circular_buffer);
1023
- return self.size_headers + offset;
1024
- }
1025
-
1026
- fn offset_in_headers_version(self: *Self, offset: u64, version: u1) u64 {
1027
- assert(offset < self.size_headers);
1028
- return switch (version) {
1029
- 0 => offset,
1030
- 1 => self.size_headers + self.size_circular_buffer + offset,
1031
- };
1032
- }
1033
-
1034
- fn write_prepare_header_offset(message: *Message) u64 {
1035
- comptime assert(config.sector_size % @sizeOf(Header) == 0);
1036
- return vsr.sector_floor(message.header.op * @sizeOf(Header));
1037
- }
1038
-
1039
- fn write_headers_increment_version(self: *Self) u1 {
1040
- self.headers_version +%= 1;
1041
- return self.headers_version;
1709
+ fn assert_bounds(ring: Ring, offset: u64, size: u64) void {
1710
+ switch (ring) {
1711
+ .headers => assert(offset + size <= headers_size),
1712
+ .prepares => {
1713
+ assert(offset >= headers_size);
1714
+ assert(offset + size <= headers_size + prepares_size);
1715
+ },
1716
+ }
1042
1717
  }
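For orientation, the WAL address space is two fixed rings: redundant headers occupy [0, headers_size) and prepares occupy [headers_size, headers_size + prepares_size). A rough editor's sketch of the offsets produced by `offset_logical` below, assuming (these constants are not taken from this diff) a 4096-byte sector, a 128-byte Header, and a 1 MiB message_size_max:

```zig
// Editor's illustration with assumed constants; the real values come from config.zig.
const slot = Slot{ .index = 40 };
// Headers ring: vsr.sector_floor(40 * 128) == 4096, i.e. the second sector of the ring.
const header_offset = offset_logical(.headers, slot);
// Prepares ring: config.journal_size_headers + 40 * config.message_size_max.
const prepare_offset = offset_logical(.prepares, slot);
assert(header_offset < headers_size);
assert(prepare_offset >= headers_size);
```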
1043
1718
 
1044
- /// Since we allow gaps in the journal, we may have to write our headers twice.
1045
- /// If a dirty header is being written as reserved (empty) then write twice to make this clear.
1046
- /// If a dirty header has no previous clean chained entry to give its offset then write twice.
1047
- /// Otherwise, we only need to write the headers once because their other copy can be located in
1048
- /// the body of the journal (using the previous entry's offset and size).
1049
- fn write_prepare_header_once(self: *Self, header: *const Header) bool {
1050
- // TODO Optimize this to decide whether to write once or twice once we add support to
1051
- // recover from either header version at startup.
1052
- const always_write_twice = true;
1053
- if (always_write_twice) return false;
1054
-
1055
- // TODO Snapshots
1056
- if (header.command == .reserved) {
1057
- log.debug("{}: write_prepare_header_once: dirty reserved header", .{
1058
- self.replica,
1059
- });
1060
- return false;
1719
+ fn offset_logical(ring: Ring, slot: Slot) u64 {
1720
+ assert(slot.index < slot_count);
1721
+
1722
+ switch (ring) {
1723
+ .headers => {
1724
+ comptime assert(config.sector_size % @sizeOf(Header) == 0);
1725
+ const offset = vsr.sector_floor(slot.index * @sizeOf(Header));
1726
+ assert(offset < headers_size);
1727
+ return offset;
1728
+ },
1729
+ .prepares => {
1730
+ const offset = config.message_size_max * slot.index;
1731
+ assert(offset < prepares_size);
1732
+ return offset + config.journal_size_headers;
1733
+ },
1061
1734
  }
1062
- if (self.previous_entry(header)) |previous| {
1063
- assert(previous.command == .prepare);
1064
- if (previous.checksum != header.parent) {
1065
- log.debug("{}: write_headers_once: no hash chain", .{self.replica});
1066
- return false;
1067
- }
1068
- // TODO Add is_dirty(header)
1069
- // TODO Snapshots
1070
- if (self.dirty.bit(previous.op)) {
1071
- log.debug("{}: write_prepare_header_once: previous entry is dirty", .{
1072
- self.replica,
1073
- });
1074
- return false;
1075
- }
1076
- } else {
1077
- log.debug("{}: write_prepare_header_once: no previous entry", .{self.replica});
1078
- return false;
1079
- }
1080
- return true;
1081
1735
  }
1082
1736
 
1083
- fn write_prepare_header_to_version(
1084
- self: *Self,
1085
- write: *Self.Write,
1086
- callback: fn (completion: *Self.Write) void,
1087
- version: u1,
1088
- buffer: []const u8,
1089
- offset: u64,
1090
- ) void {
1091
- log.debug("{}: write_prepare_header_to_version: version={} offset={} len={}", .{
1092
- self.replica,
1093
- version,
1094
- offset,
1095
- buffer.len,
1096
- });
1097
- assert(offset + buffer.len <= self.size_headers);
1098
- // Memory must not be owned by self.headers as self.headers may be modified concurrently:
1099
- assert(@ptrToInt(buffer.ptr) < @ptrToInt(self.headers.ptr) or
1100
- @ptrToInt(buffer.ptr) > @ptrToInt(self.headers.ptr) + self.size_headers);
1101
-
1102
- self.write_sectors(
1103
- callback,
1104
- write,
1105
- buffer,
1106
- self.offset_in_headers_version(offset, version),
1107
- );
1737
+ fn offset_logical_in_headers_for_message(self: *const Self, message: *Message) u64 {
1738
+ return offset_logical(.headers, self.slot_for_header(message.header));
1108
1739
  }
1109
1740
 
1741
+ // TODO Add a `Ring` argument, and make the offset relative to that.
1110
1742
  fn write_sectors(
1111
1743
  self: *Self,
1112
1744
  callback: fn (write: *Self.Write) void,
1113
1745
  write: *Self.Write,
1114
1746
  buffer: []const u8,
1115
- offset: u64,
1747
+ offset_in_wal: u64,
1116
1748
  ) void {
1117
1749
  write.range = .{
1118
1750
  .callback = callback,
1119
1751
  .completion = undefined,
1120
1752
  .buffer = buffer,
1121
- .offset = offset,
1753
+ .offset = offset_in_wal,
1122
1754
  .locked = false,
1123
1755
  };
1124
1756
  self.lock_sectors(write);
@@ -1136,8 +1768,9 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1136
1768
  if (!other.range.locked) continue;
1137
1769
 
1138
1770
  if (other.range.overlaps(&write.range)) {
1139
- write.range.next = other.range.next;
1140
- other.range.next = &write.range;
1771
+ var tail = &other.range;
1772
+ while (tail.next) |next| tail = next;
1773
+ tail.next = &write.range;
1141
1774
  return;
1142
1775
  }
1143
1776
  }
@@ -1153,10 +1786,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1153
1786
  write_sectors_on_write,
1154
1787
  &write.range.completion,
1155
1788
  write.range.buffer,
1789
+ .wal,
1156
1790
  write.range.offset,
1157
1791
  );
1158
- // We rely on the Storage.write_sectors() implementation being either always synchronous,
1159
- // in which case writes never actually need to be queued, or always always asynchronous,
1792
+ // We rely on the Storage.write_sectors() implementation being always synchronous,
1793
+ // in which case writes never actually need to be queued, or always asynchronous,
1160
1794
  // in which case write_sectors_on_write() doesn't have to handle lock_sectors()
1161
1795
  // synchronously completing a write and making a nested write_sectors_on_write() call.
1162
1796
  //
@@ -1193,7 +1827,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1193
1827
  self.lock_sectors(@fieldParentPtr(Self.Write, "range", waiting));
1194
1828
  }
1195
1829
 
1196
- // The callback may set range, so we can't set range to undefined after running the callback.
1830
+ // The callback may set range, so we can't set range to undefined after the callback.
1197
1831
  const callback = range.callback;
1198
1832
  range.* = undefined;
1199
1833
  callback(write);
@@ -1208,7 +1842,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1208
1842
  // However, we compare against the 64-bit op first, since it's a cheap machine word.
1209
1843
  if (write.message.header.op == op and write.message.header.checksum == checksum) {
1210
1844
  // If we truly are writing, then the dirty bit must be set:
1211
- assert(self.dirty.bit(op));
1845
+ assert(self.dirty.bit(self.slot_for_op(op)));
1212
1846
  return true;
1213
1847
  }
1214
1848
  }
@@ -1219,147 +1853,428 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1219
1853
 
1220
1854
  // TODO Snapshots
1221
1855
  pub const BitSet = struct {
1222
- bits: []bool,
1856
+ bits: std.DynamicBitSetUnmanaged,
1223
1857
 
1224
1858
  /// The number of bits set (updated incrementally as bits are set or cleared):
1225
- len: u64 = 0,
1859
+ count: u64 = 0,
1226
1860
 
1227
- fn init(allocator: Allocator, count: u64) !BitSet {
1228
- const bits = try allocator.alloc(bool, count);
1229
- errdefer allocator.free(bits);
1230
- std.mem.set(bool, bits, false);
1861
+ fn init(allocator: Allocator, count: usize) !BitSet {
1862
+ const bits = try std.DynamicBitSetUnmanaged.initEmpty(allocator, count);
1863
+ errdefer bits.deinit(allocator);
1231
1864
 
1232
1865
  return BitSet{ .bits = bits };
1233
1866
  }
1234
1867
 
1235
1868
  fn deinit(self: *BitSet, allocator: Allocator) void {
1236
- allocator.free(self.bits);
1869
+ self.bits.deinit(allocator);
1237
1870
  }
1238
1871
 
1239
- /// Clear the bit for an op (idempotent):
1240
- pub fn clear(self: *BitSet, op: u64) void {
1241
- if (self.bits[op]) {
1242
- self.bits[op] = false;
1243
- self.len -= 1;
1872
+ /// Clear the bit for a slot (idempotent):
1873
+ pub fn clear(self: *BitSet, slot: Slot) void {
1874
+ if (self.bits.isSet(slot.index)) {
1875
+ self.bits.unset(slot.index);
1876
+ self.count -= 1;
1244
1877
  }
1245
1878
  }
1246
1879
 
1247
- /// Whether the bit for an op is set:
1248
- pub fn bit(self: *BitSet, op: u64) bool {
1249
- return self.bits[op];
1880
+ /// Whether the bit for a slot is set:
1881
+ pub fn bit(self: *const BitSet, slot: Slot) bool {
1882
+ return self.bits.isSet(slot.index);
1250
1883
  }
1251
1884
 
1252
- /// Set the bit for an op (idempotent):
1253
- pub fn set(self: *BitSet, op: u64) void {
1254
- if (!self.bits[op]) {
1255
- self.bits[op] = true;
1256
- self.len += 1;
1257
- assert(self.len <= self.bits.len);
1885
+ /// Set the bit for a slot (idempotent):
1886
+ pub fn set(self: *BitSet, slot: Slot) void {
1887
+ if (!self.bits.isSet(slot.index)) {
1888
+ self.bits.set(slot.index);
1889
+ self.count += 1;
1890
+ assert(self.count <= self.bits.bit_length);
1258
1891
  }
1259
1892
  }
1260
1893
  };
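A minimal usage sketch for the reworked BitSet (editor's addition, mirroring how the `dirty` and `faulty` sets are driven; the slot count of 8 is arbitrary):

```zig
test "BitSet: set/clear are idempotent and count tracks the set bits" {
    var bit_set = try BitSet.init(std.testing.allocator, 8);
    defer bit_set.deinit(std.testing.allocator);

    const slot = Slot{ .index = 3 };
    bit_set.set(slot);
    bit_set.set(slot); // Setting an already-set bit leaves the count unchanged.
    try std.testing.expect(bit_set.bit(slot));
    try std.testing.expectEqual(@as(u64, 1), bit_set.count);

    bit_set.clear(slot);
    bit_set.clear(slot); // Clearing is idempotent as well.
    try std.testing.expectEqual(@as(u64, 0), bit_set.count);
}
```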
1261
1894
 
1262
- /// Take a u6 to limit to 64 items max (2^6 = 64)
1263
- pub fn IOPS(comptime T: type, comptime size: u6) type {
1264
- const Map = std.meta.Int(.unsigned, size);
1265
- const MapLog2 = math.Log2Int(Map);
1266
- return struct {
1267
- const Self = @This();
1895
+ /// @B and @C:
1896
+ /// This prepare header is corrupt.
1897
+ /// We may have a valid redundant header, but need to recover the full message.
1898
+ ///
1899
+ /// Case @B may be caused by crashing while writing the prepare (torn write).
1900
+ ///
1901
+ /// @E:
1902
+ /// Valid prepare, corrupt header. One of:
1903
+ ///
1904
+ /// 1. The replica crashed while writing the redundant header (torn write).
1905
+ /// 2. The read to the header is corrupt or misdirected.
1906
+ /// 3. Multiple faults, for example: the redundant header read is corrupt, and the prepare read is
1907
+ /// misdirected.
1908
+ ///
1909
+ ///
1910
+ /// @F and @H:
1911
+ /// The replica is recovering from a crash after writing the prepare, but before writing the
1912
+ /// redundant header.
1913
+ ///
1914
+ ///
1915
+ /// @G:
1916
+ /// One of:
1917
+ ///
1918
+ /// * A misdirected read to a reserved header.
1919
+ /// * The redundant header's write was lost or misdirected.
1920
+ ///
1921
+ /// For multi-replica clusters, don't repair locally to prevent data loss in case of 2 lost writes.
1922
+ ///
1923
+ ///
1924
+ /// @I:
1925
+ /// The redundant header is present & valid, but the corresponding prepare was a lost or misdirected
1926
+ /// read or write.
1927
+ ///
1928
+ ///
1929
+ /// @J:
1930
+ /// This slot is legitimately reserved — this may be the first fill of the log.
1931
+ ///
1932
+ ///
1933
+ /// @K and @L:
1934
+ /// When the redundant header & prepare header are both valid but distinct ops, always pick the
1935
+ /// higher op.
1936
+ ///
1937
+ /// For example, consider slot_count=10, the op to the left is 12, the op to the right is 14, and
1938
+ /// the tiebreak is between an op=3 and op=13. Choosing op=13 over op=3 is safe because the op=3
1939
+ /// must be from a previous wrap — it is too far back (>pipeline) to have been replaced by a view
1940
+ /// change.
1941
+ ///
1942
+ /// The length of the prepare pipeline is the upper bound on how many ops can be reordered during a
1943
+ /// view change.
1944
+ ///
1945
+ /// @K:
1946
+ /// When the higher op belongs to the prepare, repair locally.
1947
+ /// The most likely cause for this case is that the log wrapped, but the redundant header write was
1948
+ /// lost.
1949
+ ///
1950
+ /// @L:
1951
+ /// When the higher op belongs to the header, mark faulty.
1952
+ ///
1953
+ ///
1954
+ /// @M:
1955
+ /// The message was rewritten due to a view change.
1956
+ /// A single-replica cluster doesn't ever change views.
1957
+ ///
1958
+ ///
1959
+ /// @N:
1960
+ /// The redundant header matches the message's header.
1961
+ /// This is the usual case: both the prepare and header are correct and equivalent.
1962
+ const recovery_cases = table: {
1963
+ const __ = Matcher.any;
1964
+ const _0 = Matcher.is_false;
1965
+ const _1 = Matcher.is_true;
1966
+ // The replica will abort if any of these checks fail:
1967
+ const a0 = Matcher.assert_is_false;
1968
+ const a1 = Matcher.assert_is_true;
1969
+
1970
+ break :table [_]Case{
1971
+ // Legend:
1972
+ //
1973
+ // R>1 replica_count > 1
1974
+ // R=1 replica_count = 1
1975
+ // ok valid checksum ∧ valid cluster ∧ valid slot ∧ valid command
1976
+ // nil command == reserved
1977
+ // ✓∑ header.checksum == prepare.checksum
1978
+ // op⌈ prepare.op is maximum of all prepare.ops
1979
+ // op= header.op == prepare.op
1980
+ // op< header.op < prepare.op
1981
+ // view header.view == prepare.view
1982
+ //
1983
+ // Label Decision Header Prepare Compare
1984
+ // R>1 R=1 ok nil ok nil op⌈ ✓∑ op= op< view
1985
+ Case.init("@A", .vsr, .vsr, .{ _0, __, _0, __, __, __, __, __, __ }),
1986
+ Case.init("@B", .vsr, .vsr, .{ _1, _1, _0, __, __, __, __, __, __ }),
1987
+ Case.init("@C", .vsr, .vsr, .{ _1, _0, _0, __, __, __, __, __, __ }),
1988
+ Case.init("@D", .vsr, .vsr, .{ _0, __, _1, _1, __, __, __, __, __ }),
1989
+ Case.init("@E", .vsr, .fix, .{ _0, __, _1, _0, _0, __, __, __, __ }),
1990
+ Case.init("@F", .fix, .fix, .{ _0, __, _1, _0, _1, __, __, __, __ }),
1991
+ Case.init("@G", .vsr, .fix, .{ _1, _1, _1, _0, _0, __, __, __, __ }),
1992
+ Case.init("@H", .fix, .fix, .{ _1, _1, _1, _0, _1, __, __, __, __ }),
1993
+ Case.init("@I", .vsr, .vsr, .{ _1, _0, _1, _1, __, __, __, __, __ }),
1994
+ Case.init("@J", .nil, .nil, .{ _1, _1, _1, _1, __, a1, a1, a0, a1 }), // normal path: reserved
1995
+ Case.init("@K", .fix, .fix, .{ _1, _0, _1, _0, __, _0, _0, _1, __ }), // header.op < prepare.op
1996
+ Case.init("@L", .vsr, .vsr, .{ _1, _0, _1, _0, __, _0, _0, _0, __ }), // header.op > prepare.op
1997
+ Case.init("@M", .vsr, .vsr, .{ _1, _0, _1, _0, __, _0, _1, a0, a0 }),
1998
+ Case.init("@N", .eql, .eql, .{ _1, _0, _1, _0, __, _1, a1, a0, a1 }), // normal path: prepare
1999
+ };
2000
+ };
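How a row is consumed (editor's sketch; `reserved_header`, `prepare`, and `prepare_op_max` are hypothetical locals describing one slot's on-disk state): a valid reserved redundant header next to a valid prepare whose op is not the WAL's maximum is row @G, one of the two columns where the replicas=1 decision diverges.

```zig
// Editor's illustration, not part of the package diff.
const case = recovery_case(reserved_header, prepare, prepare_op_max);
assert(std.mem.eql(u8, case.label, "@G"));
assert(case.decision(3) == .vsr); // replicas>1: refetch remotely, guarding against 2 lost writes.
assert(case.decision(1) == .fix); // replicas=1: trust the intact local prepare.
```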
1268
2001
 
1269
- items: [size]T = undefined,
1270
- /// 1 bits are free items
1271
- free: Map = math.maxInt(Map),
2002
+ const case_cut = Case{
2003
+ .label = "@Truncate",
2004
+ .decision_multiple = .cut,
2005
+ .decision_single = .cut,
2006
+ .pattern = undefined,
2007
+ };
1272
2008
 
1273
- pub fn acquire(self: *Self) ?*T {
1274
- const i = @ctz(Map, self.free);
1275
- assert(i <= @bitSizeOf(Map));
1276
- if (i == @bitSizeOf(Map)) return null;
1277
- self.free &= ~(@as(Map, 1) << @intCast(MapLog2, i));
1278
- return &self.items[i];
1279
- }
2009
+ const RecoveryDecision = enum {
2010
+ /// The header and prepare are identical; no repair necessary.
2011
+ eql,
2012
+ /// Reserved; dirty/faulty are clear, no repair necessary.
2013
+ nil,
2014
+ /// If replica_count>1: Repair with VSR `request_prepare`. Mark dirty, clear faulty.
2015
+ /// If replica_count=1: Use intact prepare. Clear dirty, clear faulty.
2016
+ /// (Don't set faulty, because we have the valid message.)
2017
+ fix,
2018
+ /// If replica_count>1: Repair with VSR `request_prepare`. Mark dirty, mark faulty.
2019
+ /// If replica_count=1: Fail; cannot recover safely.
2020
+ vsr,
2021
+ /// Truncate the op, setting it to reserved. Dirty/faulty are clear.
2022
+ cut,
2023
+ };
1280
2024
 
1281
- pub fn release(self: *Self, item: *T) void {
1282
- item.* = undefined;
1283
- const i = (@ptrToInt(item) - @ptrToInt(&self.items)) / @sizeOf(T);
1284
- assert(self.free & (@as(Map, 1) << @intCast(MapLog2, i)) == 0);
1285
- self.free |= (@as(Map, 1) << @intCast(MapLog2, i));
1286
- }
2025
+ const Matcher = enum { any, is_false, is_true, assert_is_false, assert_is_true };
2026
+
2027
+ const Case = struct {
2028
+ label: []const u8,
2029
+ /// Decision when replica_count>1.
2030
+ decision_multiple: RecoveryDecision,
2031
+ /// Decision when replica_count=1.
2032
+ decision_single: RecoveryDecision,
2033
+ /// 0: header_ok(header)
2034
+ /// 1: header.command == reserved
2035
+ /// 2: header_ok(prepare) ∧ valid_checksum_body
2036
+ /// 3: prepare.command == reserved
2037
+ /// 4: prepare.op is maximum of all prepare.ops
2038
+ /// 5: header.checksum == prepare.checksum
2039
+ /// 6: header.op == prepare.op
2040
+ /// 7: header.op < prepare.op
2041
+ /// 8: header.view == prepare.view
2042
+ pattern: [9]Matcher,
2043
+
2044
+ fn init(
2045
+ label: []const u8,
2046
+ decision_multiple: RecoveryDecision,
2047
+ decision_single: RecoveryDecision,
2048
+ pattern: [9]Matcher,
2049
+ ) Case {
2050
+ return .{
2051
+ .label = label,
2052
+ .decision_multiple = decision_multiple,
2053
+ .decision_single = decision_single,
2054
+ .pattern = pattern,
2055
+ };
2056
+ }
1287
2057
 
1288
- /// Returns true if there is at least one IOP available
1289
- pub fn available(self: *const Self) math.Log2IntCeil(Map) {
1290
- return @popCount(Map, self.free);
2058
+ fn check(self: *const Case, parameters: [9]bool) !bool {
2059
+ for (parameters) |b, i| {
2060
+ switch (self.pattern[i]) {
2061
+ .any => {},
2062
+ .is_false => if (b) return false,
2063
+ .is_true => if (!b) return false,
2064
+ .assert_is_false => if (b) return error.ExpectFalse,
2065
+ .assert_is_true => if (!b) return error.ExpectTrue,
2066
+ }
1291
2067
  }
2068
+ return true;
2069
+ }
1292
2070
 
1293
- /// Returns true if there is at least one IOP in use
1294
- pub fn executing(self: *const Self) math.Log2IntCeil(Map) {
1295
- return @popCount(Map, math.maxInt(Map)) - @popCount(Map, self.free);
2071
+ fn decision(self: *const Case, replica_count: u8) RecoveryDecision {
2072
+ assert(replica_count > 0);
2073
+ if (replica_count == 1) {
2074
+ return self.decision_single;
2075
+ } else {
2076
+ return self.decision_multiple;
1296
2077
  }
2078
+ }
2079
+ };
1297
2080
 
1298
- pub const Iterator = struct {
1299
- iops: *Self,
1300
- /// On iteration start this is a copy of the free map, but
1301
- /// inverted so we can use @ctz() to find occupied instead of free slots.
1302
- unseen: Map,
2081
+ fn recovery_case(header: ?*const Header, prepare: ?*const Header, prepare_op_max: u64) *const Case {
2082
+ const h_ok = header != null;
2083
+ const p_ok = prepare != null;
2084
+
2085
+ if (h_ok) assert(header.?.invalid() == null);
2086
+ if (p_ok) assert(prepare.?.invalid() == null);
2087
+
2088
+ const parameters = .{
2089
+ h_ok,
2090
+ if (h_ok) header.?.command == .reserved else false,
2091
+ p_ok,
2092
+ if (p_ok) prepare.?.command == .reserved else false,
2093
+ if (p_ok) prepare.?.op == prepare_op_max else false,
2094
+ if (h_ok and p_ok) header.?.checksum == prepare.?.checksum else false,
2095
+ if (h_ok and p_ok) header.?.op == prepare.?.op else false,
2096
+ if (h_ok and p_ok) header.?.op < prepare.?.op else false,
2097
+ if (h_ok and p_ok) header.?.view == prepare.?.view else false,
2098
+ };
1303
2099
 
1304
- pub fn next(iterator: *Iterator) ?*T {
1305
- const i = @ctz(Map, iterator.unseen);
1306
- assert(i <= @bitSizeOf(Map));
1307
- if (i == @bitSizeOf(Map)) return null;
1308
- // Set this bit of unseen to 1 to indicate this slot has been seen.
1309
- iterator.unseen &= ~(@as(Map, 1) << @intCast(MapLog2, i));
1310
- return &iterator.iops.items[i];
1311
- }
2100
+ var result: ?*const Case = null;
2101
+ for (recovery_cases) |*case| {
2102
+ const match = case.check(parameters) catch {
2103
+ log.err("recovery_case: impossible state: case={s} parameters={any}", .{
2104
+ case.label,
2105
+ parameters,
2106
+ });
2107
+ unreachable;
1312
2108
  };
1313
-
1314
- pub fn iterate(self: *Self) Iterator {
1315
- return .{ .iops = self, .unseen = ~self.free };
2109
+ if (match) {
2110
+ assert(result == null);
2111
+ result = case;
1316
2112
  }
2113
+ }
2114
+ // The recovery table is exhaustive.
2115
+ // Every combination of parameters matches exactly one case.
2116
+ return result.?;
2117
+ }
2118
+
2119
+ /// Returns the header, only if the header:
2120
+ /// * has a valid checksum, and
2121
+ /// * has the expected cluster, and
2122
+ /// * has an expected command, and
2123
+ /// * resides in the correct slot.
2124
+ fn header_ok(cluster: u32, slot: Slot, header: *const Header) ?*const Header {
2125
+ // We must first validate the header checksum before accessing any fields.
2126
+ // Otherwise, we may hit undefined data or an out-of-bounds enum and cause a runtime crash.
2127
+ if (!header.valid_checksum()) return null;
2128
+
2129
+ // A header with the wrong cluster, or in the wrong slot, may indicate a misdirected read/write.
2130
+ // All journalled headers should be reserved or else prepares.
2131
+ // A misdirected read/write to or from another storage zone may return the wrong message.
2132
+ const valid_cluster_command_and_slot = switch (header.command) {
2133
+ .prepare => header.cluster == cluster and slot.index == header.op % slot_count,
2134
+ .reserved => header.cluster == cluster and slot.index == header.op,
2135
+ else => false,
1317
2136
  };
2137
+
2138
+ // Do not check the checksum here, because that would run only after the other field accesses.
2139
+ return if (valid_cluster_command_and_slot) header else null;
1318
2140
  }
1319
2141
 
1320
- test {
1321
- const testing = std.testing;
1322
- var iops = IOPS(u32, 4){};
2142
+ test "recovery_cases" {
2143
+ // Verify that every pattern matches exactly one case.
2144
+ //
2145
+ // Every possible combination of parameters must either:
2146
+ // * have a matching case
2147
+ // * have a case that fails (which would result in a panic).
2148
+ var i: usize = 0;
2149
+ while (i <= std.math.maxInt(u8)) : (i += 1) {
2150
+ var parameters: [9]bool = undefined;
2151
+ comptime var j: usize = 0;
2152
+ inline while (j < parameters.len) : (j += 1) {
2153
+ parameters[j] = i & (1 << j) != 0;
2154
+ }
1323
2155
 
1324
- try testing.expectEqual(@as(u4, 4), iops.available());
1325
- try testing.expectEqual(@as(u4, 0), iops.executing());
2156
+ var case_match: ?*const Case = null;
2157
+ for (recovery_cases) |*case| {
2158
+ if (case.check(parameters) catch true) {
2159
+ try std.testing.expectEqual(case_match, null);
2160
+ case_match = case;
2161
+ }
2162
+ }
2163
+ if (case_match == null) @panic("no matching case");
2164
+ }
2165
+ }
1326
2166
 
1327
- var one = iops.acquire().?;
2167
+ /// Format part of a new WAL, writing to `target`.
2168
+ ///
2169
+ /// `offset_logical` is relative to the beginning of the WAL.
2170
+ /// Returns the number of bytes written to `target`.
2171
+ pub fn format_journal(cluster: u32, offset_logical: u64, target: []u8) usize {
2172
+ assert(offset_logical <= config.journal_size_max);
2173
+ assert(offset_logical % config.sector_size == 0);
2174
+ assert(target.len > 0);
2175
+ assert(target.len % config.sector_size == 0);
2176
+
2177
+ const sector_max = @divExact(config.journal_size_max, config.sector_size);
2178
+ var sectors = std.mem.bytesAsSlice([config.sector_size]u8, target);
2179
+ for (sectors) |*sector_data, i| {
2180
+ const sector = @divExact(offset_logical, config.sector_size) + i;
2181
+ if (sector == sector_max) {
2182
+ if (i == 0) {
2183
+ assert(offset_logical == config.journal_size_max);
2184
+ }
2185
+ return i * config.sector_size;
2186
+ } else {
2187
+ format_journal_sector(cluster, sector, sector_data);
2188
+ }
2189
+ }
2190
+ return target.len;
2191
+ }
1328
2192
 
1329
- try testing.expectEqual(@as(u4, 3), iops.available());
1330
- try testing.expectEqual(@as(u4, 1), iops.executing());
2193
+ fn format_journal_sector(cluster: u32, sector: usize, sector_data: *[config.sector_size]u8) void {
2194
+ assert(sector < @divExact(config.journal_size_max, config.sector_size));
1331
2195
 
1332
- var two = iops.acquire().?;
1333
- var three = iops.acquire().?;
2196
+ var sector_headers = std.mem.bytesAsSlice(Header, sector_data);
1334
2197
 
1335
- try testing.expectEqual(@as(u4, 1), iops.available());
1336
- try testing.expectEqual(@as(u4, 3), iops.executing());
2198
+ if (sector * headers_per_sector < slot_count) {
2199
+ for (sector_headers) |*header, i| {
2200
+ const slot = sector * headers_per_sector + i;
2201
+ if (sector == 0 and i == 0) {
2202
+ header.* = Header.root_prepare(cluster);
2203
+ assert(header.op == 0);
2204
+ assert(header.command == .prepare);
2205
+ assert(header.operation == .root);
2206
+ } else {
2207
+ header.* = Header.reserved(cluster, slot);
2208
+ }
2209
+ }
2210
+ return;
2211
+ }
2212
+
2213
+ const sectors_per_message = @divExact(config.message_size_max, config.sector_size);
2214
+ const sector_in_prepares = sector - @divExact(slot_count, headers_per_sector);
2215
+ const message_slot = @divFloor(sector_in_prepares, sectors_per_message);
2216
+ assert(message_slot < slot_count);
2217
+
2218
+ std.mem.set(u8, sector_data, 0);
2219
+ if (sector_in_prepares % sectors_per_message == 0) {
2220
+ // The header goes in the first sector of the message.
2221
+ if (message_slot == 0) {
2222
+ sector_headers[0] = Header.root_prepare(cluster);
2223
+ } else {
2224
+ sector_headers[0] = Header.reserved(cluster, message_slot);
2225
+ }
2226
+ }
2227
+ }
1337
2228
 
1338
- var four = iops.acquire().?;
1339
- try testing.expectEqual(@as(?*u32, null), iops.acquire());
2229
+ test "format_journal" {
2230
+ const cluster = 123;
2231
+ const write_sizes = [_]usize{
2232
+ config.sector_size,
2233
+ config.sector_size * 2,
2234
+ config.sector_size * 3,
2235
+ config.journal_size_max,
2236
+ };
1340
2237
 
1341
- try testing.expectEqual(@as(u4, 0), iops.available());
1342
- try testing.expectEqual(@as(u4, 4), iops.executing());
2238
+ for (write_sizes) |write_size_max| {
2239
+ var wal_data = try std.testing.allocator.alloc(u8, config.journal_size_max);
2240
+ defer std.testing.allocator.free(wal_data);
1343
2241
 
1344
- iops.release(two);
2242
+ var write_data = try std.testing.allocator.alloc(u8, write_size_max);
2243
+ defer std.testing.allocator.free(write_data);
1345
2244
 
1346
- try testing.expectEqual(@as(u4, 1), iops.available());
1347
- try testing.expectEqual(@as(u4, 3), iops.executing());
2245
+ var headers_ring = std.mem.bytesAsSlice(Header, @alignCast(@alignOf(Header), wal_data[0..config.journal_size_headers]));
2246
+ var prepare_ring = std.mem.bytesAsSlice([config.message_size_max]u8, wal_data[config.journal_size_headers..]);
2247
+ try std.testing.expectEqual(@as(usize, config.journal_slot_count), headers_ring.len);
2248
+ try std.testing.expectEqual(@as(usize, config.journal_slot_count), prepare_ring.len);
1348
2249
 
1349
- // there is only one slot free, so we will get the same pointer back.
1350
- try testing.expectEqual(@as(?*u32, two), iops.acquire());
2250
+ var offset: u64 = 0;
2251
+ while (true) {
2252
+ const write_size = format_journal(cluster, offset, write_data);
2253
+ if (write_size == 0) break;
2254
+ std.mem.copy(u8, wal_data[offset..][0..write_size], write_data[0..write_size]);
2255
+ offset += write_size;
2256
+ }
1351
2257
 
1352
- iops.release(four);
1353
- iops.release(two);
1354
- iops.release(one);
1355
- iops.release(three);
2258
+ for (headers_ring) |*header, slot| {
2259
+ try std.testing.expect(header.valid_checksum());
2260
+ try std.testing.expect(header.valid_checksum_body(&[0]u8{}));
2261
+ try std.testing.expectEqual(header.invalid(), null);
2262
+ try std.testing.expectEqual(header.cluster, cluster);
2263
+ try std.testing.expectEqual(header.op, slot);
2264
+ try std.testing.expectEqual(header.size, @sizeOf(Header));
2265
+ if (slot == 0) {
2266
+ try std.testing.expectEqual(header.command, .prepare);
2267
+ try std.testing.expectEqual(header.operation, .root);
2268
+ } else {
2269
+ try std.testing.expectEqual(header.command, .reserved);
2270
+ }
1356
2271
 
1357
- try testing.expectEqual(@as(u4, 4), iops.available());
1358
- try testing.expectEqual(@as(u4, 0), iops.executing());
2272
+ const prepare_bytes = prepare_ring[slot];
2273
+ const prepare_header = std.mem.bytesAsValue(Header, prepare_bytes[0..@sizeOf(Header)]);
2274
+ const prepare_body = prepare_bytes[@sizeOf(Header)..];
1359
2275
 
1360
- one = iops.acquire().?;
1361
- two = iops.acquire().?;
1362
- three = iops.acquire().?;
1363
- four = iops.acquire().?;
1364
- try testing.expectEqual(@as(?*u32, null), iops.acquire());
2276
+ try std.testing.expectEqual(header.*, prepare_header.*);
2277
+ for (prepare_body) |byte| try std.testing.expectEqual(byte, 0);
2278
+ }
2279
+ }
1365
2280
  }