tigerbeetle-node 0.11.5 → 0.11.7

This diff shows the contents of publicly available package versions as published to their respective public registries. It is provided for informational purposes only.
Files changed (77)
  1. package/dist/.client.node.sha256 +1 -1
  2. package/dist/index.d.ts +41 -42
  3. package/dist/index.js +41 -42
  4. package/dist/index.js.map +1 -1
  5. package/package.json +2 -2
  6. package/src/index.ts +0 -1
  7. package/src/tigerbeetle/scripts/benchmark.bat +7 -3
  8. package/src/tigerbeetle/scripts/benchmark.sh +2 -3
  9. package/src/tigerbeetle/scripts/install.bat +7 -0
  10. package/src/tigerbeetle/scripts/install.sh +2 -3
  11. package/src/tigerbeetle/src/benchmark.zig +3 -3
  12. package/src/tigerbeetle/src/config.zig +24 -3
  13. package/src/tigerbeetle/src/constants.zig +8 -5
  14. package/src/tigerbeetle/src/ewah.zig +6 -5
  15. package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
  16. package/src/tigerbeetle/src/io/darwin.zig +19 -0
  17. package/src/tigerbeetle/src/io/linux.zig +8 -0
  18. package/src/tigerbeetle/src/io/windows.zig +20 -2
  19. package/src/tigerbeetle/src/iops.zig +7 -1
  20. package/src/tigerbeetle/src/lsm/compaction.zig +27 -72
  21. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +10 -11
  22. package/src/tigerbeetle/src/lsm/grid.zig +267 -267
  23. package/src/tigerbeetle/src/lsm/groove.zig +3 -0
  24. package/src/tigerbeetle/src/lsm/level_iterator.zig +18 -1
  25. package/src/tigerbeetle/src/lsm/manifest.zig +29 -1
  26. package/src/tigerbeetle/src/lsm/manifest_level.zig +1 -0
  27. package/src/tigerbeetle/src/lsm/manifest_log.zig +5 -5
  28. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +19 -11
  29. package/src/tigerbeetle/src/lsm/merge_iterator.zig +106 -0
  30. package/src/tigerbeetle/src/lsm/posted_groove.zig +1 -0
  31. package/src/tigerbeetle/src/lsm/segmented_array.zig +1 -0
  32. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +26 -70
  33. package/src/tigerbeetle/src/lsm/table.zig +56 -0
  34. package/src/tigerbeetle/src/lsm/table_iterator.zig +29 -2
  35. package/src/tigerbeetle/src/lsm/table_mutable.zig +49 -15
  36. package/src/tigerbeetle/src/lsm/test.zig +10 -7
  37. package/src/tigerbeetle/src/lsm/tree.zig +27 -6
  38. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +302 -263
  39. package/src/tigerbeetle/src/message_pool.zig +2 -1
  40. package/src/tigerbeetle/src/simulator.zig +22 -84
  41. package/src/tigerbeetle/src/{test/accounting → state_machine}/auditor.zig +8 -8
  42. package/src/tigerbeetle/src/{test/accounting → state_machine}/workload.zig +108 -48
  43. package/src/tigerbeetle/src/state_machine.zig +20 -14
  44. package/src/tigerbeetle/src/storage.zig +58 -6
  45. package/src/tigerbeetle/src/test/cluster.zig +14 -11
  46. package/src/tigerbeetle/src/test/conductor.zig +2 -3
  47. package/src/tigerbeetle/src/test/id.zig +10 -0
  48. package/src/tigerbeetle/src/test/state_checker.zig +1 -1
  49. package/src/tigerbeetle/src/test/state_machine.zig +151 -46
  50. package/src/tigerbeetle/src/test/storage.zig +22 -1
  51. package/src/tigerbeetle/src/tigerbeetle.zig +0 -1
  52. package/src/tigerbeetle/src/tracer.zig +50 -28
  53. package/src/tigerbeetle/src/unit_tests.zig +11 -6
  54. package/src/tigerbeetle/src/vopr.zig +4 -4
  55. package/src/tigerbeetle/src/vsr/client.zig +5 -5
  56. package/src/tigerbeetle/src/vsr/clock.zig +2 -2
  57. package/src/tigerbeetle/src/vsr/journal.zig +647 -537
  58. package/src/tigerbeetle/src/vsr/replica.zig +333 -333
  59. package/src/tigerbeetle/src/vsr/replica_format.zig +7 -4
  60. package/src/tigerbeetle/src/vsr/superblock.zig +87 -39
  61. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +114 -93
  62. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
  63. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +11 -8
  64. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +3 -3
  65. package/src/tigerbeetle/src/vsr.zig +60 -13
  66. package/src/tigerbeetle/src/c/tb_client/context.zig +0 -304
  67. package/src/tigerbeetle/src/c/tb_client/echo_client.zig +0 -108
  68. package/src/tigerbeetle/src/c/tb_client/packet.zig +0 -80
  69. package/src/tigerbeetle/src/c/tb_client/signal.zig +0 -286
  70. package/src/tigerbeetle/src/c/tb_client/thread.zig +0 -88
  71. package/src/tigerbeetle/src/c/tb_client.h +0 -221
  72. package/src/tigerbeetle/src/c/tb_client.zig +0 -177
  73. package/src/tigerbeetle/src/c/tb_client_header.zig +0 -218
  74. package/src/tigerbeetle/src/c/tb_client_header_test.zig +0 -135
  75. package/src/tigerbeetle/src/c/test.zig +0 -371
  76. package/src/tigerbeetle/src/cli.zig +0 -375
  77. package/src/tigerbeetle/src/main.zig +0 -245
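The hunks below are an excerpt from file 57 in the list above: package/src/tigerbeetle/src/vsr/journal.zig (+647 -537).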
@@ -46,8 +46,10 @@ const Ring = enum {
 };
 
 const headers_per_sector = @divExact(constants.sector_size, @sizeOf(Header));
+const headers_per_message = @divExact(constants.message_size_max, @sizeOf(Header));
 comptime {
     assert(headers_per_sector > 0);
+    assert(headers_per_message > 0);
 }
 
 /// A slot is an index within:
@@ -60,7 +62,7 @@ comptime {
 /// - `journal.faulty`
 ///
 /// A header's slot is `header.op % constants.journal_slot_count`.
-const Slot = struct { index: u64 };
+const Slot = struct { index: usize };
 
 /// An inclusive, non-empty range of slots.
 const SlotRange = struct {
@@ -74,26 +76,20 @@ const SlotRange = struct {
     /// * `head < tail` → ` head··tail `
     /// * `head > tail` → `··tail head··` (The range wraps around).
     /// * `head = tail` → panic (Caller must handle this case separately).
-    fn contains(self: *const SlotRange, slot: Slot) bool {
+    fn contains(range: *const SlotRange, slot: Slot) bool {
         // To avoid confusion, the empty range must be checked separately by the caller.
-        assert(self.head.index != self.tail.index);
+        assert(range.head.index != range.tail.index);
 
-        if (self.head.index < self.tail.index) {
-            return self.head.index <= slot.index and slot.index <= self.tail.index;
+        if (range.head.index < range.tail.index) {
+            return range.head.index <= slot.index and slot.index <= range.tail.index;
         }
-        if (self.head.index > self.tail.index) {
-            return slot.index <= self.tail.index or self.head.index <= slot.index;
+        if (range.head.index > range.tail.index) {
+            return slot.index <= range.tail.index or range.head.index <= slot.index;
         }
         unreachable;
     }
 };
 
-const Status = enum {
-    init,
-    recovering,
-    recovered,
-};
-
 const slot_count = constants.journal_slot_count;
 const headers_size = constants.journal_size_headers;
 const prepares_size = constants.journal_size_prepares;
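The wrap-around case in `SlotRange.contains()` above is the subtle one. Here is a minimal self-contained sketch of the same containment rule, simplified to bare indices (this `SlotRange` is a stand-in for illustration, not the package's type):

    const std = @import("std");

    // Same rule as SlotRange.contains(): a non-empty, inclusive range on a
    // ring buffer. When head > tail the range wraps around the end.
    const SlotRange = struct {
        head: usize,
        tail: usize,

        fn contains(range: SlotRange, slot: usize) bool {
            std.debug.assert(range.head != range.tail);
            if (range.head < range.tail) {
                return range.head <= slot and slot <= range.tail;
            } else {
                return slot <= range.tail or range.head <= slot;
            }
        }
    };

    test "wrapping slot range" {
        const range = SlotRange{ .head = 6, .tail = 1 }; // ··tail head·· (wraps)
        try std.testing.expect(range.contains(7));
        try std.testing.expect(range.contains(0));
        try std.testing.expect(!range.contains(3));
    }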
@@ -117,14 +113,21 @@ comptime {
     assert(prepares_size % constants.message_size_max == 0);
 }
 
-pub fn Journal(comptime Replica: type, comptime Storage: type) type {
+pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
     return struct {
-        const Self = @This();
+        const Journal = @This();
+        const Sector = *align(constants.sector_size) [constants.sector_size]u8;
+
+        const Status = union(enum) {
+            init: void,
+            recovering: fn (journal: *Journal) void,
+            recovered: void,
+        };
 
         pub const Read = struct {
-            self: *Self,
+            journal: *Journal,
             completion: Storage.Read,
-            callback: fn (self: *Replica, prepare: ?*Message, destination_replica: ?u8) void,
+            callback: fn (replica: *Replica, prepare: ?*Message, destination_replica: ?u8) void,
 
             message: *Message,
             op: u64,
@@ -133,10 +136,10 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         };
 
         pub const Write = struct {
-            pub const Trigger = enum { append, repair, pipeline };
+            pub const Trigger = enum { append, fix, repair, pipeline };
 
-            self: *Self,
-            callback: fn (self: *Replica, wrote: ?*Message, trigger: Trigger) void,
+            journal: *Journal,
+            callback: fn (replica: *Replica, wrote: ?*Message, trigger: Trigger) void,
 
             message: *Message,
             trigger: Trigger,
@@ -150,26 +153,13 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
             /// This is reset to undefined and reused for each Storage.write_sectors() call.
             range: Range,
-
-            const Sector = *align(constants.sector_size) [constants.sector_size]u8;
-
-            fn header_sector(write: *Self.Write, journal: *Self) Sector {
-                assert(journal.writes.items.len == journal.headers_iops.len);
-                const i = @divExact(
-                    @ptrToInt(write) - @ptrToInt(&journal.writes.items),
-                    @sizeOf(Self.Write),
-                );
-                // TODO The compiler should not need this align cast as the type of `headers_iops`
-                // ensures that each buffer is properly aligned.
-                return @alignCast(constants.sector_size, &journal.headers_iops[i]);
-            }
         };
 
         /// State that needs to be persisted while waiting for an overlapping
         /// concurrent write to complete. This is a range on the physical disk.
         const Range = struct {
            completion: Storage.Write,
-            callback: fn (write: *Self.Write) void,
+            callback: fn (write: *Journal.Write) void,
             buffer: []const u8,
             ring: Ring,
             /// Offset within the ring.
@@ -181,17 +171,19 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             /// True if a Storage.write_sectors() operation is in progress for this buffer/offset.
             locked: bool,
 
-            fn overlaps(self: *const Range, other: *const Range) bool {
-                if (self.ring != other.ring) return false;
+            fn overlaps(journal: *const Range, other: *const Range) bool {
+                if (journal.ring != other.ring) return false;
 
-                if (self.offset < other.offset) {
-                    return self.offset + self.buffer.len > other.offset;
+                if (journal.offset < other.offset) {
+                    return journal.offset + journal.buffer.len > other.offset;
                 } else {
-                    return other.offset + other.buffer.len > self.offset;
+                    return other.offset + other.buffer.len > journal.offset;
                 }
             }
         };
 
+        const HeaderChunks = std.StaticBitSet(util.div_ceil(slot_count, headers_per_message));
+
         storage: *Storage,
         replica: u8,
 
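`Range.overlaps()` above is the standard interval-intersection test on byte ranges within the same ring, and the new `HeaderChunks` bitset sizes one bit per message-sized chunk of the headers zone. A small sketch of the overlap predicate, reduced to bare offsets and lengths (hypothetical free function, not the package's API):

    const std = @import("std");

    // Two ranges [offset, offset + len) intersect iff each starts before the
    // other ends — the same test Range.overlaps() applies after ruling out
    // buffers in different rings.
    fn overlaps(a_offset: u64, a_len: u64, b_offset: u64, b_len: u64) bool {
        if (a_offset < b_offset) {
            return a_offset + a_len > b_offset;
        } else {
            return b_offset + b_len > a_offset;
        }
    }

    test "interval overlap" {
        try std.testing.expect(overlaps(0, 8, 4, 8)); // partial overlap
        try std.testing.expect(!overlaps(0, 8, 8, 8)); // adjacent, disjoint
        try std.testing.expect(overlaps(4, 2, 0, 16)); // containment
    }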
@@ -224,6 +216,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         /// The buffers belong to the IOP at the corresponding index in IOPS.
         headers_iops: *align(constants.sector_size) [constants.journal_iops_write_max][constants.sector_size]u8,
 
+        /// A set bit indicates a chunk of redundant headers that no read has been issued to yet.
+        header_chunks_requested: HeaderChunks = HeaderChunks.initFull(),
+        /// A set bit indicates a chunk of redundant headers that has been recovered.
+        header_chunks_recovered: HeaderChunks = HeaderChunks.initEmpty(),
+
         /// Statically allocated read IO operation context data.
         reads: IOPS(Read, constants.journal_iops_read_max) = .{},
 
@@ -262,7 +259,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
         status: Status = .init,
 
-        pub fn init(allocator: Allocator, storage: *Storage, replica: u8) !Self {
+        pub fn init(allocator: Allocator, storage: *Storage, replica: u8) !Journal {
             // TODO Fix this assertion:
             // assert(write_ahead_log_zone_size <= storage.size);
 
@@ -284,13 +281,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             errdefer allocator.free(headers_redundant);
             for (headers_redundant) |*header| header.* = undefined;
 
-            var dirty = try BitSet.init(allocator, slot_count);
+            var dirty = try BitSet.init_full(allocator, slot_count);
             errdefer dirty.deinit(allocator);
-            for (headers) |_, index| dirty.set(Slot{ .index = index });
 
-            var faulty = try BitSet.init(allocator, slot_count);
+            var faulty = try BitSet.init_full(allocator, slot_count);
             errdefer faulty.deinit(allocator);
-            for (headers) |_, index| faulty.set(Slot{ .index = index });
 
             var prepare_checksums = try allocator.alloc(u128, slot_count);
             errdefer allocator.free(prepare_checksums);
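The `init_full` change above removes the init-then-set loops: `dirty` and `faulty` now start with every slot marked and the set-bit `count` already consistent. A hypothetical simplified model of that contract (TigerBeetle's `BitSet` is its own heap-allocated wrapper; this sketch uses a static bitset instead):

    const std = @import("std");

    fn BitSetFull(comptime slot_count: usize) type {
        return struct {
            bits: std.StaticBitSet(slot_count),
            count: usize,

            // init_full(): every bit set and `count` consistent from the start,
            // replacing init() followed by a per-slot set() loop.
            fn init_full() @This() {
                return .{
                    .bits = std.StaticBitSet(slot_count).initFull(),
                    .count = slot_count,
                };
            }
        };
    }

    test "init_full starts with every slot marked" {
        const dirty = BitSetFull(1024).init_full();
        try std.testing.expectEqual(@as(usize, 1024), dirty.count);
        try std.testing.expect(dirty.bits.isSet(0));
    }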
@@ -316,7 +311,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 std.fmt.fmtIntSizeBin(prepares_size),
             });
 
-            var self = Self{
+            var journal = Journal{
                 .storage = storage,
                 .replica = replica,
                 .headers = headers,
@@ -328,37 +323,37 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 .headers_iops = headers_iops,
             };
 
-            assert(@mod(@ptrToInt(&self.headers[0]), constants.sector_size) == 0);
-            assert(self.dirty.bits.bit_length == slot_count);
-            assert(self.faulty.bits.bit_length == slot_count);
-            assert(self.dirty.count == slot_count);
-            assert(self.faulty.count == slot_count);
-            assert(self.prepare_checksums.len == slot_count);
-            assert(self.prepare_inhabited.len == slot_count);
+            assert(@mod(@ptrToInt(&journal.headers[0]), constants.sector_size) == 0);
+            assert(journal.dirty.bits.bit_length == slot_count);
+            assert(journal.faulty.bits.bit_length == slot_count);
+            assert(journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
+            assert(journal.prepare_checksums.len == slot_count);
+            assert(journal.prepare_inhabited.len == slot_count);
 
-            for (self.headers) |*h| assert(!h.valid_checksum());
-            for (self.headers_redundant) |*h| assert(!h.valid_checksum());
+            for (journal.headers) |*h| assert(!h.valid_checksum());
+            for (journal.headers_redundant) |*h| assert(!h.valid_checksum());
 
-            return self;
+            return journal;
         }
 
-        pub fn deinit(self: *Self, allocator: Allocator) void {
-            const replica = @fieldParentPtr(Replica, "journal", self);
+        pub fn deinit(journal: *Journal, allocator: Allocator) void {
+            const replica = @fieldParentPtr(Replica, "journal", journal);
 
-            self.dirty.deinit(allocator);
-            self.faulty.deinit(allocator);
-            allocator.free(self.headers);
-            allocator.free(self.headers_redundant);
-            allocator.free(self.headers_iops);
-            allocator.free(self.prepare_checksums);
-            allocator.free(self.prepare_inhabited);
+            journal.dirty.deinit(allocator);
+            journal.faulty.deinit(allocator);
+            allocator.free(journal.headers);
+            allocator.free(journal.headers_redundant);
+            allocator.free(journal.headers_iops);
+            allocator.free(journal.prepare_checksums);
+            allocator.free(journal.prepare_inhabited);
 
             {
-                var it = self.reads.iterate();
+                var it = journal.reads.iterate();
                 while (it.next()) |read| replica.message_bus.unref(read.message);
             }
             {
-                var it = self.writes.iterate();
+                var it = journal.writes.iterate();
                 while (it.next()) |write| replica.message_bus.unref(write.message);
             }
         }
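`deinit` (like most callbacks in this file) recovers the owning `Replica` from the embedded journal with `@fieldParentPtr`. A standalone sketch of that intrusive-context pattern, using hypothetical `Replica`/`Journal` types and the pre-0.11 three-argument builtin that this codebase targets:

    const std = @import("std");

    const Journal = struct { replica_index: u8 };

    const Replica = struct {
        cluster: u32,
        journal: Journal,

        // Given a pointer to the embedded field, recover the containing struct.
        fn of(journal: *Journal) *Replica {
            return @fieldParentPtr(Replica, "journal", journal);
        }
    };

    test "fieldParentPtr recovers the owner" {
        var replica = Replica{ .cluster = 7, .journal = .{ .replica_index = 0 } };
        try std.testing.expectEqual(@as(u32, 7), Replica.of(&replica.journal).cluster);
    }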
@@ -369,79 +364,78 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         ///
         /// Called by the replica immediately after WAL recovery completes, but before the replica
         /// issues any I/O from handling messages.
-        pub fn is_empty(self: *const Self) bool {
-            assert(self.status == .recovered);
-            assert(self.writes.executing() == 0);
+        pub fn is_empty(journal: *const Journal) bool {
+            assert(journal.status == .recovered);
+            assert(journal.writes.executing() == 0);
 
-            if (!self.headers[0].valid_checksum()) return false;
-            if (self.headers[0].operation != .root) return false;
+            if (!journal.headers[0].valid_checksum()) return false;
+            if (journal.headers[0].operation != .root) return false;
 
-            const replica = @fieldParentPtr(Replica, "journal", self);
-            assert(self.headers[0].checksum == Header.root_prepare(replica.cluster).checksum);
-            assert(self.headers[0].checksum == self.prepare_checksums[0]);
-            assert(self.prepare_inhabited[0]);
+            const replica = @fieldParentPtr(Replica, "journal", journal);
+            assert(journal.headers[0].checksum == Header.root_prepare(replica.cluster).checksum);
+            assert(journal.headers[0].checksum == journal.prepare_checksums[0]);
+            assert(journal.prepare_inhabited[0]);
 
             // If any message is faulty, we must fall back to VSR recovery protocol (i.e. treat
             // this as a non-empty WAL) since that message may have been a prepare.
-            if (self.faulty.count > 0) return false;
+            if (journal.faulty.count > 0) return false;
 
-            for (self.headers[1..]) |*header| {
+            for (journal.headers[1..]) |*header| {
                 if (header.command == .prepare) return false;
             }
 
-            for (self.prepare_inhabited[1..]) |inhabited| {
+            for (journal.prepare_inhabited[1..]) |inhabited| {
                 if (inhabited) return false;
             }
 
             return true;
         }
 
-        pub fn slot_for_op(_: *const Self, op: u64) Slot {
+        pub fn slot_for_op(_: *const Journal, op: u64) Slot {
             return Slot{ .index = op % slot_count };
         }
 
-        pub fn slot_with_op(self: *const Self, op: u64) ?Slot {
-            if (self.header_with_op(op)) |_| {
-                return self.slot_for_op(op);
+        pub fn slot_with_op(journal: *const Journal, op: u64) ?Slot {
+            if (journal.header_with_op(op)) |_| {
+                return journal.slot_for_op(op);
             } else {
                 return null;
             }
         }
 
-        pub fn slot_with_op_and_checksum(self: *const Self, op: u64, checksum: u128) ?Slot {
-            if (self.header_with_op_and_checksum(op, checksum)) |_| {
-                return self.slot_for_op(op);
+        pub fn slot_with_op_and_checksum(journal: *const Journal, op: u64, checksum: u128) ?Slot {
+            if (journal.header_with_op_and_checksum(op, checksum)) |_| {
+                return journal.slot_for_op(op);
             } else {
                 return null;
             }
         }
 
-        pub fn slot_for_header(self: *const Self, header: *const Header) Slot {
+        pub fn slot_for_header(journal: *const Journal, header: *const Header) Slot {
             assert(header.command == .prepare);
-            return self.slot_for_op(header.op);
+            return journal.slot_for_op(header.op);
         }
 
-        pub fn slot_with_header(self: *const Self, header: *const Header) ?Slot {
+        pub fn slot_with_header(journal: *const Journal, header: *const Header) ?Slot {
             assert(header.command == .prepare);
-            return self.slot_with_op(header.op);
+            return journal.slot_with_op(header.op);
         }
 
         /// Returns any existing header at the location indicated by header.op.
         /// The existing header may have an older or newer op number.
-        pub fn header_for_prepare(self: *const Self, header: *const Header) ?*const Header {
+        pub fn header_for_prepare(journal: *const Journal, header: *const Header) ?*const Header {
             assert(header.command == .prepare);
-            return self.header_for_op(header.op);
+            return journal.header_for_op(header.op);
         }
 
         /// We use `op` directly to index into the headers array and locate ops without a scan.
         /// The existing header may have an older or newer op number.
-        pub fn header_for_op(self: *const Self, op: u64) ?*const Header {
-            // TODO Snapshots
-            const slot = self.slot_for_op(op);
-            const existing = &self.headers[slot.index];
+        pub fn header_for_op(journal: *const Journal, op: u64) ?*const Header {
+            const slot = journal.slot_for_op(op);
+            const existing = &journal.headers[slot.index];
             switch (existing.command) {
                 .prepare => {
-                    assert(self.slot_for_op(existing.op).index == slot.index);
+                    assert(journal.slot_for_op(existing.op).index == slot.index);
                     return existing;
                 },
                 .reserved => {
@@ -454,8 +448,8 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
         /// Returns the entry at `@mod(op)` location, but only if `entry.op == op`, else `null`.
         /// Be careful of using this without considering that there may still be an existing op.
-        pub fn header_with_op(self: *const Self, op: u64) ?*const Header {
-            if (self.header_for_op(op)) |existing| {
+        pub fn header_with_op(journal: *const Journal, op: u64) ?*const Header {
+            if (journal.header_for_op(op)) |existing| {
                 if (existing.op == op) return existing;
             }
             return null;
@@ -463,37 +457,35 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
         /// As per `header_with_op()`, but only if there is an optional checksum match.
         pub fn header_with_op_and_checksum(
-            self: *const Self,
+            journal: *const Journal,
             op: u64,
             checksum: ?u128,
         ) ?*const Header {
-            if (self.header_with_op(op)) |existing| {
+            if (journal.header_with_op(op)) |existing| {
                 assert(existing.op == op);
                 if (checksum == null or existing.checksum == checksum.?) return existing;
             }
             return null;
         }
 
-        // TODO How should we handle the case where the current header argument is the same as
-        // op_checkpoint?
-        pub fn previous_entry(self: *const Self, header: *const Header) ?*const Header {
+        pub fn previous_entry(journal: *const Journal, header: *const Header) ?*const Header {
             if (header.op == 0) {
                 return null;
             } else {
-                return self.header_for_op(header.op - 1);
+                return journal.header_for_op(header.op - 1);
             }
         }
 
-        pub fn next_entry(self: *const Self, header: *const Header) ?*const Header {
-            return self.header_for_op(header.op + 1);
+        pub fn next_entry(journal: *const Journal, header: *const Header) ?*const Header {
+            return journal.header_for_op(header.op + 1);
         }
 
         /// Returns the highest op number prepared, in any slot without reference to the checkpoint.
-        pub fn op_maximum(self: *const Self) u64 {
-            assert(self.status == .recovered);
+        pub fn op_maximum(journal: *const Journal) u64 {
+            assert(journal.status == .recovered);
 
             var op: u64 = 0;
-            for (self.headers) |*header| {
+            for (journal.headers) |*header| {
                 if (header.command == .prepare) {
                     if (header.op > op) op = header.op;
                 } else {
@@ -519,12 +511,12 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             return op;
         }
 
-        pub fn has(self: *const Self, header: *const Header) bool {
-            assert(self.status == .recovered);
+        pub fn has(journal: *const Journal, header: *const Header) bool {
+            assert(journal.status == .recovered);
             assert(header.command == .prepare);
 
-            const slot = self.slot_for_op(header.op);
-            const existing = &self.headers[slot.index];
+            const slot = journal.slot_for_op(header.op);
+            const existing = &journal.headers[slot.index];
             if (existing.command == .reserved) {
                 return false;
             } else {
@@ -538,19 +530,19 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             }
         }
 
-        pub fn has_clean(self: *const Self, header: *const Header) bool {
-            if (self.slot_with_op_and_checksum(header.op, header.checksum)) |slot| {
-                if (!self.dirty.bit(slot)) {
-                    assert(self.prepare_inhabited[slot.index]);
-                    assert(self.prepare_checksums[slot.index] == header.checksum);
+        pub fn has_clean(journal: *const Journal, header: *const Header) bool {
+            if (journal.slot_with_op_and_checksum(header.op, header.checksum)) |slot| {
+                if (!journal.dirty.bit(slot)) {
+                    assert(journal.prepare_inhabited[slot.index]);
+                    assert(journal.prepare_checksums[slot.index] == header.checksum);
                     return true;
                 }
             }
             return false;
         }
 
-        pub fn has_dirty(self: *const Self, header: *const Header) bool {
-            return self.has(header) and self.dirty.bit(self.slot_with_header(header).?);
+        pub fn has_dirty(journal: *const Journal, header: *const Header) bool {
+            return journal.has(header) and journal.dirty.bit(journal.slot_with_header(header).?);
         }
 
         /// Copies latest headers between `op_min` and `op_max` (both inclusive) as fit in `dest`.
@@ -561,12 +553,12 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         /// Zeroes the `dest` buffer in case the copy would underflow and leave a buffer bleed.
         /// Returns the number of headers actually copied.
         pub fn copy_latest_headers_between(
-            self: *const Self,
+            journal: *const Journal,
             op_min: u64,
             op_max: u64,
             dest: []Header,
         ) usize {
-            assert(self.status == .recovered);
+            assert(journal.status == .recovered);
             assert(op_min <= op_max);
             assert(dest.len > 0);
 
@@ -579,7 +571,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             while (op > op_min) {
                 op -= 1;
 
-                if (self.header_with_op(op)) |header| {
+                if (journal.header_with_op(op)) |header| {
                     dest[copied] = header.*;
                     assert(dest[copied].invalid() == null);
                     copied += 1;
@@ -590,7 +582,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             log.debug(
                 "{}: copy_latest_headers_between: op_min={} op_max={} dest.len={} copied={}",
                 .{
-                    self.replica,
+                    journal.replica,
                     op_min,
                     op_max,
                     dest.len,
@@ -616,7 +608,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         /// Another example: If op 17 is disconnected from op 18, 16 is connected to 17, and 12-15
         /// are missing, returns: `{ .op_min = 12, .op_max = 17 }`.
         pub fn find_latest_headers_break_between(
-            self: *const Self,
+            journal: *const Journal,
             op_min: u64,
             op_max: u64,
         ) ?HeaderRange {
@@ -632,7 +624,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 op -= 1;
 
                 // Get the entry at @mod(op) location, but only if entry.op == op, else null:
-                var A = self.header_with_op(op);
+                var A = journal.header_with_op(op);
                 if (A) |a| {
                     if (B) |b| {
                         // If A was reordered then A may have a newer op than B (but an older view).
@@ -719,51 +711,51 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
         /// Read a prepare from disk. There must be a matching in-memory header.
         pub fn read_prepare(
-            self: *Self,
+            journal: *Journal,
             callback: fn (replica: *Replica, prepare: ?*Message, destination_replica: ?u8) void,
             op: u64,
             checksum: u128,
             destination_replica: ?u8,
         ) void {
-            assert(self.status == .recovered);
+            assert(journal.status == .recovered);
             assert(checksum != 0);
 
-            const replica = @fieldParentPtr(Replica, "journal", self);
+            const replica = @fieldParentPtr(Replica, "journal", journal);
             if (op > replica.op) {
-                self.read_prepare_log(op, checksum, "beyond replica.op");
+                journal.read_prepare_log(op, checksum, "beyond replica.op");
                 callback(replica, null, null);
                 return;
             }
 
-            const slot = self.slot_with_op_and_checksum(op, checksum) orelse {
-                self.read_prepare_log(op, checksum, "no entry exactly");
+            const slot = journal.slot_with_op_and_checksum(op, checksum) orelse {
+                journal.read_prepare_log(op, checksum, "no entry exactly");
                 callback(replica, null, null);
                 return;
             };
 
-            if (self.prepare_inhabited[slot.index] and
-                self.prepare_checksums[slot.index] == checksum)
+            if (journal.prepare_inhabited[slot.index] and
+                journal.prepare_checksums[slot.index] == checksum)
             {
-                self.read_prepare_with_op_and_checksum(callback, op, checksum, destination_replica);
+                journal.read_prepare_with_op_and_checksum(callback, op, checksum, destination_replica);
             } else {
-                self.read_prepare_log(op, checksum, "no matching prepare");
+                journal.read_prepare_log(op, checksum, "no matching prepare");
                 callback(replica, null, null);
             }
         }
 
         /// Read a prepare from disk. There may or may not be an in-memory header.
         pub fn read_prepare_with_op_and_checksum(
-            self: *Self,
+            journal: *Journal,
             callback: fn (replica: *Replica, prepare: ?*Message, destination_replica: ?u8) void,
             op: u64,
             checksum: u128,
             destination_replica: ?u8,
         ) void {
-            const replica = @fieldParentPtr(Replica, "journal", self);
-            const slot = self.slot_for_op(op);
-            assert(self.status == .recovered);
-            assert(self.prepare_inhabited[slot.index]);
-            assert(self.prepare_checksums[slot.index] == checksum);
+            const replica = @fieldParentPtr(Replica, "journal", journal);
+            const slot = journal.slot_for_op(op);
+            assert(journal.status == .recovered);
+            assert(journal.prepare_inhabited[slot.index]);
+            assert(journal.prepare_checksums[slot.index] == checksum);
 
             const message = replica.message_bus.get_message();
             defer replica.message_bus.unref(message);
@@ -771,7 +763,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             var message_size: usize = constants.message_size_max;
 
             // If the header is in-memory, we can skip the read from the disk.
-            if (self.header_with_op_and_checksum(op, checksum)) |exact| {
+            if (journal.header_with_op_and_checksum(op, checksum)) |exact| {
                 if (exact.size == @sizeOf(Header)) {
                     message.header.* = exact.*;
                     // Normally the message's padding would have been zeroed by the MessageBus,
@@ -787,14 +779,14 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 }
             }
 
-            const read = self.reads.acquire() orelse {
-                self.read_prepare_log(op, checksum, "waiting for IOP");
+            const read = journal.reads.acquire() orelse {
+                journal.read_prepare_log(op, checksum, "waiting for IOP");
                 callback(replica, null, null);
                 return;
             };
 
             read.* = .{
-                .self = self,
+                .journal = journal,
                 .completion = undefined,
                 .message = message.ref(),
                 .callback = callback,
@@ -805,11 +797,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
             const buffer: []u8 = message.buffer[0..message_size];
 
-            // Memory must not be owned by `self.headers` as these may be modified concurrently:
-            assert(@ptrToInt(buffer.ptr) < @ptrToInt(self.headers.ptr) or
-                @ptrToInt(buffer.ptr) > @ptrToInt(self.headers.ptr) + headers_size);
+            // Memory must not be owned by `journal.headers` as these may be modified concurrently:
+            assert(@ptrToInt(buffer.ptr) < @ptrToInt(journal.headers.ptr) or
+                @ptrToInt(buffer.ptr) > @ptrToInt(journal.headers.ptr) + headers_size);
 
-            self.storage.read_sectors(
+            journal.storage.read_sectors(
                 read_prepare_with_op_and_checksum_callback,
                 &read.completion,
                 buffer,
@@ -819,28 +811,28 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         }
 
         fn read_prepare_with_op_and_checksum_callback(completion: *Storage.Read) void {
-            const read = @fieldParentPtr(Self.Read, "completion", completion);
-            const self = read.self;
-            const replica = @fieldParentPtr(Replica, "journal", self);
+            const read = @fieldParentPtr(Journal.Read, "completion", completion);
+            const journal = read.journal;
+            const replica = @fieldParentPtr(Replica, "journal", journal);
             const op = read.op;
             const checksum = read.checksum;
-            assert(self.status == .recovered);
+            assert(journal.status == .recovered);
 
             defer {
                 replica.message_bus.unref(read.message);
-                self.reads.release(read);
+                journal.reads.release(read);
             }
 
             if (op > replica.op) {
-                self.read_prepare_log(op, checksum, "beyond replica.op");
+                journal.read_prepare_log(op, checksum, "beyond replica.op");
                 read.callback(replica, null, null);
                 return;
             }
 
-            const checksum_inhabited = self.prepare_inhabited[self.slot_for_op(op).index];
-            const checksum_match = self.prepare_checksums[self.slot_for_op(op).index] == checksum;
+            const checksum_inhabited = journal.prepare_inhabited[journal.slot_for_op(op).index];
+            const checksum_match = journal.prepare_checksums[journal.slot_for_op(op).index] == checksum;
             if (!checksum_inhabited or !checksum_match) {
-                self.read_prepare_log(op, checksum, "prepare changed during read");
+                journal.read_prepare_log(op, checksum, "prepare changed during read");
                 read.callback(replica, null, null);
                 return;
             }
@@ -849,15 +841,15 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             // The slot may not match the Read's op/checksum due to either:
             // * The in-memory header changed since the read began.
             // * The in-memory header is reserved+faulty; the read was via `prepare_checksums`
-            const slot = self.slot_with_op_and_checksum(op, checksum);
+            const slot = journal.slot_with_op_and_checksum(op, checksum);
 
             if (!read.message.header.valid_checksum()) {
                 if (slot) |s| {
-                    self.faulty.set(s);
-                    self.dirty.set(s);
+                    journal.faulty.set(s);
+                    journal.dirty.set(s);
                 }
 
-                self.read_prepare_log(op, checksum, "corrupt header after read");
+                journal.read_prepare_log(op, checksum, "corrupt header after read");
                 read.callback(replica, null, null);
                 return;
             }
@@ -868,11 +860,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 // Though when a prepare spans multiple sectors, a misdirected read/write will
                 // likely manifest as a checksum failure instead.
                 if (slot) |s| {
-                    self.faulty.set(s);
-                    self.dirty.set(s);
+                    journal.faulty.set(s);
+                    journal.dirty.set(s);
                 }
 
-                self.read_prepare_log(op, checksum, "wrong cluster");
+                journal.read_prepare_log(op, checksum, "wrong cluster");
                 read.callback(replica, null, null);
                 return;
             }
@@ -882,7 +874,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 // * The prepare was rewritten since the read began.
                 // * Misdirected read/write.
                 // * The combination of:
-                //   * The leader is responding to a `request_prepare`.
+                //   * The primary is responding to a `request_prepare`.
                 //   * The `request_prepare` did not include a checksum.
                 //   * The requested op's slot is faulty, but the prepare is valid. Since the
                 //     prepare is valid, WAL recovery set `prepare_checksums[slot]`. But on reading
@@ -891,7 +883,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 //     the op along with the checksum in `prepare_checksums`.)
                 assert(slot == null);
 
-                self.read_prepare_log(op, checksum, "op changed during read");
+                journal.read_prepare_log(op, checksum, "op changed during read");
                 read.callback(replica, null, null);
                 return;
             }
@@ -900,18 +892,18 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 // This can also be caused by a misdirected read/write.
                 assert(slot == null);
 
-                self.read_prepare_log(op, checksum, "checksum changed during read");
+                journal.read_prepare_log(op, checksum, "checksum changed during read");
                 read.callback(replica, null, null);
                 return;
             }
 
             if (!read.message.header.valid_checksum_body(read.message.body())) {
                 if (slot) |s| {
-                    self.faulty.set(s);
-                    self.dirty.set(s);
+                    journal.faulty.set(s);
+                    journal.dirty.set(s);
                 }
 
-                self.read_prepare_log(op, checksum, "corrupt body after read");
+                journal.read_prepare_log(op, checksum, "corrupt body after read");
                 read.callback(replica, null, null);
                 return;
             }
@@ -919,68 +911,80 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             read.callback(replica, read.message, read.destination_replica);
         }
 
-        fn read_prepare_log(self: *Self, op: u64, checksum: ?u128, notice: []const u8) void {
+        fn read_prepare_log(journal: *Journal, op: u64, checksum: ?u128, notice: []const u8) void {
             log.info(
                 "{}: read_prepare: op={} checksum={}: {s}",
-                .{ self.replica, op, checksum, notice },
+                .{ journal.replica, op, checksum, notice },
             );
         }
 
-        pub fn recover(self: *Self) void {
-            assert(self.status == .init);
-            assert(self.dirty.count == slot_count);
-            assert(self.faulty.count == slot_count);
+        pub fn recover(journal: *Journal, callback: fn (journal: *Journal) void) void {
+            assert(journal.status == .init);
+            assert(journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
+            assert(journal.reads.executing() == 0);
+            assert(journal.writes.executing() == 0);
+            assert(journal.header_chunks_requested.count() == HeaderChunks.bit_length);
+            assert(journal.header_chunks_recovered.count() == 0);
 
-            self.status = .recovering;
+            journal.status = .{ .recovering = callback };
+            log.debug("{}: recover: recovering", .{journal.replica});
 
-            log.debug("{}: recover: recovering", .{self.replica});
+            var available: usize = journal.reads.available();
+            while (available > 0) : (available -= 1) journal.recover_headers();
 
-            self.recover_headers(0);
+            assert(journal.header_chunks_recovered.count() == 0);
+            assert(journal.header_chunks_requested.count() ==
+                HeaderChunks.bit_length - journal.reads.executing());
         }
 
-        fn recover_headers(self: *Self, offset: u64) void {
-            const replica = @fieldParentPtr(Replica, "journal", self);
-
-            assert(self.status == .recovering);
-            assert(self.dirty.count == slot_count);
-            assert(self.faulty.count == slot_count);
+        fn recover_headers(journal: *Journal) void {
+            const replica = @fieldParentPtr(Replica, "journal", journal);
+            assert(journal.status == .recovering);
+            assert(journal.reads.available() > 0);
 
-            if (offset == headers_size) {
-                log.debug("{}: recover_headers: complete", .{self.replica});
-                self.recover_prepares(Slot{ .index = 0 });
+            if (journal.header_chunks_recovered.count() == HeaderChunks.bit_length) {
+                assert(journal.header_chunks_requested.count() == 0);
+                log.debug("{}: recover_headers: complete", .{journal.replica});
+                journal.recover_prepares();
                 return;
             }
-            assert(offset < headers_size);
+
+            const chunk_index = journal.header_chunks_requested.findFirstSet() orelse return;
+            assert(!journal.header_chunks_recovered.isSet(chunk_index));
 
             const message = replica.message_bus.get_message();
             defer replica.message_bus.unref(message);
 
-            // We expect that no other process is issuing reads while we are recovering.
-            assert(self.reads.executing() == 0);
-
-            const read = self.reads.acquire() orelse unreachable;
-            read.* = .{
-                .self = self,
+            const chunk_read = journal.reads.acquire() orelse unreachable;
+            chunk_read.* = .{
+                .journal = journal,
                 .completion = undefined,
                 .message = message.ref(),
                 .callback = undefined,
-                .op = undefined,
-                .checksum = offset,
+                .op = chunk_index,
+                .checksum = undefined,
                 .destination_replica = null,
             };
 
+            const offset = constants.message_size_max * chunk_index;
+            assert(offset < headers_size);
+
             const buffer = recover_headers_buffer(message, offset);
             assert(buffer.len > 0);
+            assert(buffer.len <= constants.message_size_max);
+            assert(buffer.len + offset <= headers_size);
 
             log.debug("{}: recover_headers: offset={} size={} recovering", .{
-                self.replica,
+                journal.replica,
                 offset,
                 buffer.len,
             });
 
-            self.storage.read_sectors(
+            journal.header_chunks_requested.unset(chunk_index);
+            journal.storage.read_sectors(
                 recover_headers_callback,
-                &read.completion,
+                &chunk_read.completion,
                 buffer,
                 .wal_headers,
                 offset,
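The new `recover()` above fans out up to `reads.available()` concurrent header reads, coordinated by the two chunk bitsets: `header_chunks_requested` hands out work and `header_chunks_recovered` detects completion. A minimal synchronous sketch of that hand-off (hypothetical, with I/O elided and completed inline, so the recursive call stands in for the read callback):

    const std = @import("std");

    const chunk_count = 8;
    const Chunks = std.StaticBitSet(chunk_count);

    // `requested` starts full (chunks still to be read); `recovered` starts
    // empty. Issuing a read clears a requested bit; the completion sets the
    // matching recovered bit and immediately tries to issue the next read.
    var requested = Chunks.initFull();
    var recovered = Chunks.initEmpty();

    fn recover_chunk() void {
        if (recovered.count() == chunk_count) return; // recovery complete
        const chunk = requested.findFirstSet() orelse return; // all in flight
        requested.unset(chunk);
        // A real implementation issues an asynchronous read_sectors() here;
        // this sketch completes the "read" inline.
        recovered.set(chunk);
        recover_chunk();
    }

    test "every chunk is requested once and recovered" {
        recover_chunk();
        try std.testing.expectEqual(@as(usize, 0), requested.count());
        try std.testing.expectEqual(@as(usize, chunk_count), recovered.count());
    }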
@@ -988,90 +992,116 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         }
 
         fn recover_headers_callback(completion: *Storage.Read) void {
-            const read = @fieldParentPtr(Self.Read, "completion", completion);
-            const self = read.self;
-            const replica = @fieldParentPtr(Replica, "journal", self);
-            const message = read.message;
-
-            const offset = @intCast(u64, read.checksum);
-            const buffer = recover_headers_buffer(message, offset);
+            const chunk_read = @fieldParentPtr(Journal.Read, "completion", completion);
+            const journal = chunk_read.journal;
+            const replica = @fieldParentPtr(Replica, "journal", journal);
+            assert(journal.status == .recovering);
+            assert(chunk_read.destination_replica == null);
+
+            const chunk_index = chunk_read.op;
+            assert(!journal.header_chunks_requested.isSet(chunk_index));
+            assert(!journal.header_chunks_recovered.isSet(chunk_index));
+
+            const chunk_buffer = recover_headers_buffer(
+                chunk_read.message,
+                chunk_index * constants.message_size_max,
+            );
+            assert(chunk_buffer.len >= @sizeOf(Header));
+            assert(chunk_buffer.len % @sizeOf(Header) == 0);
 
             log.debug("{}: recover_headers: offset={} size={} recovered", .{
-                self.replica,
-                offset,
-                buffer.len,
+                journal.replica,
+                chunk_index * constants.message_size_max,
+                chunk_buffer.len,
             });
 
-            assert(self.status == .recovering);
-            assert(offset % @sizeOf(Header) == 0);
-            assert(buffer.len >= @sizeOf(Header));
-            assert(buffer.len % @sizeOf(Header) == 0);
-            assert(read.destination_replica == null);
-            assert(self.dirty.count == slot_count);
-            assert(self.faulty.count == slot_count);
-
-            // Directly store all the redundant headers in `self.headers_redundant` (including any
+            // Directly store all the redundant headers in `journal.headers_redundant` (including any
             // that are invalid or corrupt). As the prepares are recovered, these will be replaced
             // or removed as necessary.
-            const buffer_headers = std.mem.bytesAsSlice(Header, buffer);
+            const chunk_headers = std.mem.bytesAsSlice(Header, chunk_buffer);
             util.copy_disjoint(
                 .exact,
                 Header,
-                self.headers_redundant[@divExact(offset, @sizeOf(Header))..][0..buffer_headers.len],
-                buffer_headers,
+                journal.headers_redundant[chunk_index * headers_per_message ..][0..chunk_headers.len],
+                chunk_headers,
             );
 
-            const offset_next = offset + buffer.len;
             // We must release before we call `recover_headers()` in case Storage is synchronous.
             // Otherwise, we would run out of messages and reads.
-            replica.message_bus.unref(read.message);
-            self.reads.release(read);
+            replica.message_bus.unref(chunk_read.message);
+            journal.reads.release(chunk_read);
 
-            self.recover_headers(offset_next);
+            journal.header_chunks_recovered.set(chunk_index);
+            journal.recover_headers();
         }
 
         fn recover_headers_buffer(message: *Message, offset: u64) []align(@alignOf(Header)) u8 {
-            const max = std.math.min(message.buffer.len, headers_size - offset);
+            const max = std.math.min(constants.message_size_max, headers_size - offset);
             assert(max % constants.sector_size == 0);
             assert(max % @sizeOf(Header) == 0);
             return message.buffer[0..max];
         }
 
-        fn recover_prepares(self: *Self, slot: Slot) void {
-            const replica = @fieldParentPtr(Replica, "journal", self);
-            assert(self.status == .recovering);
-            assert(self.dirty.count == slot_count);
-            assert(self.faulty.count == slot_count);
-            // We expect that no other process is issuing reads while we are recovering.
-            assert(self.reads.executing() == 0);
+        /// Recover the prepares ring. Reads are issued concurrently.
+        /// - `dirty` is initially full.
+        ///   Bits are cleared when a read is issued to the slot.
+        ///   All bits are set again before recover_slots() is called.
+        /// - `faulty` is initially full.
+        ///   Bits are cleared when the slot's read finishes.
+        ///   All bits are set again before recover_slots() is called.
+        /// - The prepare's headers are loaded into `journal.headers`.
+        fn recover_prepares(journal: *Journal) void {
+            assert(journal.status == .recovering);
+            assert(journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
+            assert(journal.reads.executing() == 0);
+            assert(journal.writes.executing() == 0);
+
+            var available: usize = journal.reads.available();
+            while (available > 0) : (available -= 1) journal.recover_prepare();
+
+            assert(journal.writes.executing() == 0);
+            assert(journal.reads.executing() > 0);
+            assert(journal.reads.executing() + journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
+        }
 
-            if (slot.index == slot_count) {
-                self.recover_slots();
-                return;
+        fn recover_prepare(journal: *Journal) void {
+            const replica = @fieldParentPtr(Replica, "journal", journal);
+            assert(journal.status == .recovering);
+            assert(journal.reads.available() > 0);
+            assert(journal.dirty.count <= journal.faulty.count);
+
+            if (journal.faulty.count == 0) {
+                for (journal.headers) |_, index| journal.dirty.set(Slot{ .index = index });
+                for (journal.headers) |_, index| journal.faulty.set(Slot{ .index = index });
+                return journal.recover_slots();
             }
-            assert(slot.index < slot_count);
 
+            const slot_index = journal.dirty.bits.findFirstSet() orelse return;
+            const slot = Slot{ .index = slot_index };
             const message = replica.message_bus.get_message();
             defer replica.message_bus.unref(message);
 
-            const read = self.reads.acquire() orelse unreachable;
+            const read = journal.reads.acquire() orelse unreachable;
             read.* = .{
-                .self = self,
+                .journal = journal,
                 .completion = undefined,
                 .message = message.ref(),
                 .callback = undefined,
-                .op = undefined,
-                .checksum = slot.index,
+                .op = slot.index,
+                .checksum = undefined,
                 .destination_replica = null,
             };
 
-            log.debug("{}: recover_prepares: recovering slot={}", .{
-                self.replica,
+            log.debug("{}: recover_prepare: recovering slot={}", .{
+                journal.replica,
                 slot.index,
             });
 
-            self.storage.read_sectors(
-                recover_prepares_callback,
+            journal.dirty.clear(slot);
+            journal.storage.read_sectors(
+                recover_prepare_callback,
                 &read.completion,
                 // We load the entire message to verify that it isn't torn or corrupt.
                 // We don't know the message's size, so use the entire buffer.
@@ -1081,31 +1111,33 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             );
         }
 
-        fn recover_prepares_callback(completion: *Storage.Read) void {
-            const read = @fieldParentPtr(Self.Read, "completion", completion);
-            const self = read.self;
-            const replica = @fieldParentPtr(Replica, "journal", self);
+        fn recover_prepare_callback(completion: *Storage.Read) void {
+            const read = @fieldParentPtr(Journal.Read, "completion", completion);
+            const journal = read.journal;
+            const replica = @fieldParentPtr(Replica, "journal", journal);
 
-            assert(self.status == .recovering);
-            assert(self.dirty.count == slot_count);
-            assert(self.faulty.count == slot_count);
+            assert(journal.status == .recovering);
+            assert(journal.dirty.count <= journal.faulty.count);
             assert(read.destination_replica == null);
 
-            const slot = Slot{ .index = @intCast(u64, read.checksum) };
+            const slot = Slot{ .index = @intCast(u64, read.op) };
             assert(slot.index < slot_count);
+            assert(!journal.dirty.bit(slot));
+            assert(journal.faulty.bit(slot));
 
             // Check `valid_checksum_body` here rather than in `recover_done` so that we don't need
             // to hold onto the whole message (just the header).
             if (read.message.header.valid_checksum() and
                 read.message.header.valid_checksum_body(read.message.body()))
             {
-                self.headers[slot.index] = read.message.header.*;
+                journal.headers[slot.index] = read.message.header.*;
             }
 
             replica.message_bus.unref(read.message);
-            self.reads.release(read);
+            journal.reads.release(read);
 
-            self.recover_prepares(Slot{ .index = slot.index + 1 });
+            journal.faulty.clear(slot);
+            journal.recover_prepare();
         }
 
         /// When in doubt about whether a particular message was received, it must be marked as
@@ -1175,65 +1207,63 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         /// 2. has the correct cluster
        /// 3. is in the correct slot (op % slot_count)
         /// 4. has command=reserved or command=prepare
-        fn recover_slots(self: *Self) void {
-            const replica = @fieldParentPtr(Replica, "journal", self);
+        fn recover_slots(journal: *Journal) void {
+            const replica = @fieldParentPtr(Replica, "journal", journal);
 
-            assert(self.status == .recovering);
-            assert(self.reads.executing() == 0);
-            assert(self.writes.executing() == 0);
-            assert(self.dirty.count == slot_count);
-            assert(self.faulty.count == slot_count);
+            assert(journal.status == .recovering);
+            assert(journal.reads.executing() == 0);
+            assert(journal.writes.executing() == 0);
+            assert(journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
 
             const prepare_op_max = std.math.max(
                 replica.op_checkpoint,
-                op_maximum_headers_untrusted(replica.cluster, self.headers),
+                op_maximum_headers_untrusted(replica.cluster, journal.headers),
             );
 
             var cases: [slot_count]*const Case = undefined;
 
-            for (self.headers) |_, index| {
+            for (journal.headers) |_, index| {
                 const slot = Slot{ .index = index };
-                const header = header_ok(replica.cluster, slot, &self.headers_redundant[index]);
-                const prepare = header_ok(replica.cluster, slot, &self.headers[index]);
+                const header = header_ok(replica.cluster, slot, &journal.headers_redundant[index]);
+                const prepare = header_ok(replica.cluster, slot, &journal.headers[index]);
 
                 cases[index] = recovery_case(header, prepare, prepare_op_max);
 
                 // `prepare_checksums` improves the availability of `request_prepare` by being more
                 // flexible than `headers` regarding the prepares it references. It may hold a
-                // prepare whose redundant header is broken, as long as the prepare itself is valid.
+                // prepare whose redundant header is broken, as long as the prepare itjournal is valid.
                 if (prepare != null and prepare.?.command == .prepare) {
-                    assert(!self.prepare_inhabited[index]);
-                    self.prepare_inhabited[index] = true;
-                    self.prepare_checksums[index] = prepare.?.checksum;
+                    assert(!journal.prepare_inhabited[index]);
+                    journal.prepare_inhabited[index] = true;
+                    journal.prepare_checksums[index] = prepare.?.checksum;
                 }
             }
-            assert(self.headers.len == cases.len);
+            assert(journal.headers.len == cases.len);
 
             // Refine cases @B and @C: Repair (truncate) a prepare if it was torn during a crash.
-            if (self.recover_torn_prepare(&cases)) |torn_slot| {
+            if (journal.recover_torn_prepare(&cases)) |torn_slot| {
                 assert(cases[torn_slot.index].decision(replica.replica_count) == .vsr);
                 cases[torn_slot.index] = &case_cut;
 
                 log.warn("{}: recover_slots: torn prepare in slot={}", .{
-                    self.replica,
+                    journal.replica,
                     torn_slot.index,
                 });
             }
 
-            for (cases) |case, index| self.recover_slot(Slot{ .index = index }, case);
+            for (cases) |case, index| journal.recover_slot(Slot{ .index = index }, case);
             assert(cases.len == slot_count);
 
-            util.copy_disjoint(.exact, Header, self.headers_redundant, self.headers);
+            util.copy_disjoint(.exact, Header, journal.headers_redundant, journal.headers);
 
             log.debug("{}: recover_slots: dirty={} faulty={}", .{
-                self.replica,
-                self.dirty.count,
-                self.faulty.count,
+                journal.replica,
+                journal.dirty.count,
+                journal.faulty.count,
             });
 
-            self.status = .recovered;
-            self.assert_recovered();
-            // From here it's over to the Recovery protocol from VRR 2012.
+            journal.recover_fix();
         }
 
         /// Returns a slot that is safe to truncate.
@@ -1246,27 +1276,27 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         /// - the prepare is corrupt, and
         /// * there are no faults except for those between `op_checkpoint` and `op_max + 1`,
         ///   so that we can be sure that the maximum valid op is in fact the maximum.
-        fn recover_torn_prepare(self: *const Self, cases: []const *const Case) ?Slot {
-            const replica = @fieldParentPtr(Replica, "journal", self);
+        fn recover_torn_prepare(journal: *const Journal, cases: []const *const Case) ?Slot {
+            const replica = @fieldParentPtr(Replica, "journal", journal);
 
-            assert(self.status == .recovering);
-            assert(self.dirty.count == slot_count);
-            assert(self.faulty.count == slot_count);
+            assert(journal.status == .recovering);
+            assert(journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
 
-            const op_max = op_maximum_headers_untrusted(replica.cluster, self.headers_redundant);
-            if (op_max != op_maximum_headers_untrusted(replica.cluster, self.headers)) return null;
+            const op_max = op_maximum_headers_untrusted(replica.cluster, journal.headers_redundant);
+            if (op_max != op_maximum_headers_untrusted(replica.cluster, journal.headers)) return null;
             if (op_max < replica.op_checkpoint) return null;
             // We can't assume that the header at `op_max` is a prepare — an empty journal with a
             // corrupt root prepare (op_max=0) will be repaired later.
 
             const torn_op = op_max + 1;
-            const torn_slot = self.slot_for_op(torn_op);
+            const torn_slot = journal.slot_for_op(torn_op);
 
-            const torn_prepare_untrusted = &self.headers[torn_slot.index];
+            const torn_prepare_untrusted = &journal.headers[torn_slot.index];
             if (torn_prepare_untrusted.valid_checksum()) return null;
             // The prepare is at least corrupt, possibly torn, but not valid and simply misdirected.
 
-            const header_untrusted = &self.headers_redundant[torn_slot.index];
+            const header_untrusted = &journal.headers_redundant[torn_slot.index];
             const header = header_ok(replica.cluster, torn_slot, header_untrusted) orelse return null;
             // The redundant header is valid, also for the correct cluster and not misdirected.
 
@@ -1286,7 +1316,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  // unless the prepare header was lost, in which case this slot may also not be torn.
  }
 
- const checkpoint_index = self.slot_for_op(replica.op_checkpoint).index;
+ const checkpoint_index = journal.slot_for_op(replica.op_checkpoint).index;
  const known_range = SlotRange{
  .head = Slot{ .index = checkpoint_index },
  .tail = torn_slot,
@@ -1304,7 +1334,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  // truncate).
  //
  // When the checkpoint and torn op are in the same slot, then we can only be certain
  // if there are no faults other than the torn op itself.
  for (cases) |case, index| {
  // Do not use `faulty.bit()` because the decisions have not been processed yet.
  if (case.decision(replica.replica_count) == .vsr) {
@@ -1319,81 +1349,78 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  }
 
  // The prepare is torn.
- assert(!self.prepare_inhabited[torn_slot.index]);
+ assert(!journal.prepare_inhabited[torn_slot.index]);
  assert(!torn_prepare_untrusted.valid_checksum());
  assert(cases[torn_slot.index].decision(replica.replica_count) == .vsr);
  return torn_slot;
  }
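The torn-prepare heuristic above leans on the WAL being a fixed-size ring indexed by `op % slot_count`, so the only slot that can hold a torn write is the one for `op_max + 1`. A minimal sketch of that arithmetic, with an illustrative ring size (the journal's real `slot_count` comes from configuration):

    const std = @import("std");

    // Illustrative ring size only; the journal's slot_count comes from config.
    const slot_count = 8;

    fn slot_for_op(op: u64) u64 {
        return op % slot_count;
    }

    test "the torn write can only be in the slot of op_max + 1" {
        const op_max: u64 = 11; // Highest op on which both header copies agree.
        const torn_op = op_max + 1; // The only op that may have been mid-write at the crash.
        try std.testing.expectEqual(@as(u64, 4), slot_for_op(torn_op));
    }
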
- fn recover_slot(self: *Self, slot: Slot, case: *const Case) void {
- const replica = @fieldParentPtr(Replica, "journal", self);
+ fn recover_slot(journal: *Journal, slot: Slot, case: *const Case) void {
+ const replica = @fieldParentPtr(Replica, "journal", journal);
  const cluster = replica.cluster;
 
- assert(self.status == .recovering);
- assert(self.dirty.bit(slot));
- assert(self.faulty.bit(slot));
+ assert(journal.status == .recovering);
+ assert(journal.dirty.bit(slot));
+ assert(journal.faulty.bit(slot));
 
- const header = header_ok(cluster, slot, &self.headers_redundant[slot.index]);
- const prepare = header_ok(cluster, slot, &self.headers[slot.index]);
+ const header = header_ok(cluster, slot, &journal.headers_redundant[slot.index]);
+ const prepare = header_ok(cluster, slot, &journal.headers[slot.index]);
  const decision = case.decision(replica.replica_count);
  switch (decision) {
  .eql => {
  assert(header.?.command == .prepare);
  assert(prepare.?.command == .prepare);
  assert(header.?.checksum == prepare.?.checksum);
- assert(self.prepare_inhabited[slot.index]);
- assert(self.prepare_checksums[slot.index] == prepare.?.checksum);
- self.headers[slot.index] = header.?.*;
- self.dirty.clear(slot);
- self.faulty.clear(slot);
+ assert(journal.prepare_inhabited[slot.index]);
+ assert(journal.prepare_checksums[slot.index] == prepare.?.checksum);
+ journal.headers[slot.index] = header.?.*;
+ journal.dirty.clear(slot);
+ journal.faulty.clear(slot);
  },
  .nil => {
  assert(header.?.command == .reserved);
  assert(prepare.?.command == .reserved);
  assert(header.?.checksum == prepare.?.checksum);
  assert(header.?.checksum == Header.reserved(cluster, slot.index).checksum);
- assert(!self.prepare_inhabited[slot.index]);
- assert(self.prepare_checksums[slot.index] == 0);
- self.headers[slot.index] = header.?.*;
- self.dirty.clear(slot);
- self.faulty.clear(slot);
+ assert(!journal.prepare_inhabited[slot.index]);
+ assert(journal.prepare_checksums[slot.index] == 0);
+ journal.headers[slot.index] = header.?.*;
+ journal.dirty.clear(slot);
+ journal.faulty.clear(slot);
  },
  .fix => {
- self.headers[slot.index] = prepare.?.*;
- self.faulty.clear(slot);
+ journal.headers[slot.index] = prepare.?.*;
+ journal.faulty.clear(slot);
+ assert(journal.dirty.bit(slot));
  if (replica.replica_count == 1) {
- // @D, @E, @F, @G, @H, @K:
- self.dirty.clear(slot);
- // TODO Repair header on disk to restore durability.
+ // @D, @E, @F, @G, @H, @K
  } else {
  assert(prepare.?.command == .prepare);
- assert(self.prepare_inhabited[slot.index]);
- assert(self.prepare_checksums[slot.index] == prepare.?.checksum);
- // @F, @H, @K:
- // TODO Repair without retrieving remotely (i.e. don't set dirty or faulty).
- assert(self.dirty.bit(slot));
+ assert(journal.prepare_inhabited[slot.index]);
+ assert(journal.prepare_checksums[slot.index] == prepare.?.checksum);
+ // @F, @H, @K
  }
  },
  .vsr => {
- self.headers[slot.index] = Header.reserved(cluster, slot.index);
- assert(self.dirty.bit(slot));
- assert(self.faulty.bit(slot));
+ journal.headers[slot.index] = Header.reserved(cluster, slot.index);
+ assert(journal.dirty.bit(slot));
+ assert(journal.faulty.bit(slot));
  },
  .cut => {
  assert(header != null);
  assert(prepare == null);
- assert(!self.prepare_inhabited[slot.index]);
- assert(self.prepare_checksums[slot.index] == 0);
- self.headers[slot.index] = Header.reserved(cluster, slot.index);
- self.dirty.clear(slot);
- self.faulty.clear(slot);
+ assert(!journal.prepare_inhabited[slot.index]);
+ assert(journal.prepare_checksums[slot.index] == 0);
+ journal.headers[slot.index] = Header.reserved(cluster, slot.index);
+ journal.dirty.clear(slot);
+ journal.faulty.clear(slot);
  },
  }
 
  switch (decision) {
  .eql, .nil => {
  log.debug("{}: recover_slot: recovered slot={} label={s} decision={s}", .{
- self.replica,
+ journal.replica,
  slot.index,
  case.label,
  @tagName(decision),
@@ -1401,7 +1428,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  },
  .fix, .vsr, .cut => {
  log.warn("{}: recover_slot: recovered slot={} label={s} decision={s}", .{
- self.replica,
+ journal.replica,
  slot.index,
  case.label,
  @tagName(decision),
@@ -1410,69 +1437,126 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  }
  }
 
- fn assert_recovered(self: *const Self) void {
- const replica = @fieldParentPtr(Replica, "journal", self);
+ /// Repair the redundant headers for slots with decision=fix, one sector at a time.
+ fn recover_fix(journal: *Journal) void {
+ assert(journal.status == .recovering);
+ assert(journal.writes.executing() == 0);
+ assert(journal.dirty.count >= journal.faulty.count);
+ assert(journal.dirty.count <= slot_count);
+
+ var fix_sector: ?usize = null;
+ var dirty_iterator = journal.dirty.bits.iterator(.{ .kind = .set });
+ while (dirty_iterator.next()) |dirty_slot| {
+ if (journal.faulty.bit(Slot{ .index = dirty_slot })) continue;
+
+ const dirty_slot_sector = @divFloor(dirty_slot, headers_per_sector);
+ if (fix_sector) |fix_sector_| {
+ if (fix_sector_ != dirty_slot_sector) break;
+ } else {
+ fix_sector = dirty_slot_sector;
+ }
+ journal.dirty.clear(Slot{ .index = dirty_slot });
+ }
+
+ if (fix_sector == null) return journal.recover_done();
+
+ const write = journal.writes.acquire().?;
+ write.* = .{
+ .journal = journal,
+ .callback = undefined,
+ .message = undefined,
+ .trigger = .fix,
+ .range = undefined,
+ };
+
+ const buffer: []u8 = journal.header_sector(fix_sector.?, write);
+ const buffer_headers = std.mem.bytesAsSlice(Header, buffer);
+ assert(buffer_headers.len == headers_per_sector);
+
+ const offset = Ring.headers.offset(Slot{ .index = fix_sector.? * headers_per_sector });
+ journal.write_sectors(recover_fix_callback, write, buffer, .headers, offset);
+ }
+
+ fn recover_fix_callback(write: *Journal.Write) void {
+ const journal = write.journal;
+ assert(journal.status == .recovering);
+ assert(write.trigger == .fix);
 
- assert(self.status == .recovered);
+ journal.writes.release(write);
+ journal.recover_fix();
+ }
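`recover_fix` repairs at most one sector of redundant headers per pass, and the write completion re-enters it until nothing repairable remains. A reduced sketch of the sector-grouping rule it applies, assuming an illustrative `headers_per_sector` (the real value is derived from the sector and header sizes):

    const std = @import("std");

    // Illustrative only; in the journal this is sector_size / @sizeOf(Header).
    const headers_per_sector = 4;

    fn sector_of(slot: usize) usize {
        return slot / headers_per_sector;
    }

    test "dirty slots are repaired one sector per pass" {
        // Slots 1 and 3 share sector 0, so one write fixes both;
        // slot 5 is in sector 1 and waits for the next pass.
        try std.testing.expectEqual(sector_of(1), sector_of(3));
        try std.testing.expect(sector_of(5) != sector_of(1));
    }
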
- assert(self.dirty.count <= slot_count);
- assert(self.faulty.count <= slot_count);
- assert(self.faulty.count <= self.dirty.count);
+ fn recover_done(journal: *Journal) void {
+ assert(journal.status == .recovering);
+ assert(journal.reads.executing() == 0);
+ assert(journal.writes.executing() == 0);
+ assert(journal.dirty.count <= slot_count);
+ assert(journal.faulty.count <= slot_count);
+ assert(journal.faulty.count == journal.dirty.count);
+ assert(journal.header_chunks_requested.count() == 0);
+ assert(journal.header_chunks_recovered.count() == HeaderChunks.bit_length);
+
+ const replica = @fieldParentPtr(Replica, "journal", journal);
+ const callback = journal.status.recovering;
+ journal.status = .recovered;
 
  // Abort if all slots are faulty, since something is very wrong.
- if (self.faulty.count == slot_count) @panic("WAL is completely corrupt");
- if (self.faulty.count > 0 and replica.replica_count == 1) @panic("WAL is corrupt");
+ if (journal.faulty.count == slot_count) @panic("WAL is completely corrupt");
+ if (journal.faulty.count > 0 and replica.replica_count == 1) @panic("WAL is corrupt");
 
- if (self.headers[0].op == 0 and self.headers[0].command == .prepare) {
- assert(self.headers[0].checksum == Header.root_prepare(replica.cluster).checksum);
- assert(!self.faulty.bit(Slot{ .index = 0 }));
+ if (journal.headers[0].op == 0 and journal.headers[0].command == .prepare) {
+ assert(journal.headers[0].checksum == Header.root_prepare(replica.cluster).checksum);
+ assert(!journal.faulty.bit(Slot{ .index = 0 }));
  }
 
- for (self.headers) |*header, index| {
+ for (journal.headers) |*header, index| {
  assert(header.valid_checksum());
  assert(header.cluster == replica.cluster);
- assert(std.meta.eql(header.*, self.headers_redundant[index]));
+ assert(std.meta.eql(header.*, journal.headers_redundant[index]));
  if (header.command == .reserved) {
  assert(header.op == index);
  } else {
  assert(header.command == .prepare);
  assert(header.op % slot_count == index);
- assert(self.prepare_inhabited[index]);
- assert(self.prepare_checksums[index] == header.checksum);
- assert(!self.faulty.bit(Slot{ .index = index }));
+ assert(journal.prepare_inhabited[index]);
+ assert(journal.prepare_checksums[index] == header.checksum);
+ assert(!journal.faulty.bit(Slot{ .index = index }));
  }
  }
+
+ // From here it's over to the Recovery protocol from VRR 2012.
+ callback(journal);
  }
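`recover_done` reads the caller's callback out of `journal.status.recovering` before flipping the status to `.recovered`, which implies a tagged union along these lines. This is a simplified sketch; everything except the `.recovering` payload shape is assumed:

    const Journal = struct { status: Status };

    const Status = union(enum) {
        init,
        /// Carries the caller's callback until recovery completes.
        recovering: fn (journal: *Journal) void,
        recovered,
    };
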
  /// Removes entries from `op_min` (inclusive) onwards.
- /// Used after a view change to remove uncommitted entries discarded by the new leader.
- pub fn remove_entries_from(self: *Self, op_min: u64) void {
- assert(self.status == .recovered);
+ /// Used after a view change to remove uncommitted entries discarded by the new primary.
+ pub fn remove_entries_from(journal: *Journal, op_min: u64) void {
+ assert(journal.status == .recovered);
  assert(op_min > 0);
 
- log.debug("{}: remove_entries_from: op_min={}", .{ self.replica, op_min });
+ log.debug("{}: remove_entries_from: op_min={}", .{ journal.replica, op_min });
 
- for (self.headers) |*header, index| {
+ for (journal.headers) |*header, index| {
  // We must remove the header regardless of whether it is a prepare or reserved,
  // since a reserved header may have been marked faulty for case @G, and
  // since the caller expects the WAL to be truncated, with clean slots.
  if (header.op >= op_min) {
  // TODO Explore scenarios where the data on disk may resurface after a crash.
- const slot = self.slot_for_op(header.op);
+ const slot = journal.slot_for_op(header.op);
  assert(slot.index == index);
- self.remove_entry(slot);
+ journal.remove_entry(slot);
  }
  }
  }
 
- pub fn remove_entry(self: *Self, slot: Slot) void {
- const replica = @fieldParentPtr(Replica, "journal", self);
+ pub fn remove_entry(journal: *Journal, slot: Slot) void {
+ const replica = @fieldParentPtr(Replica, "journal", journal);
 
  const reserved = Header.reserved(replica.cluster, slot.index);
- self.headers[slot.index] = reserved;
- self.headers_redundant[slot.index] = reserved;
- self.dirty.clear(slot);
- self.faulty.clear(slot);
+ journal.headers[slot.index] = reserved;
+ journal.headers_redundant[slot.index] = reserved;
+ journal.dirty.clear(slot);
+ journal.faulty.clear(slot);
  // Do not clear `prepare_inhabited`/`prepare_checksums`. The prepare is
  // untouched on disk, and may be useful later. Consider this scenario:
  //
@@ -1490,29 +1574,29 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  // `prepare_inhabited=false`.
  }
 
- pub fn set_header_as_dirty(self: *Self, header: *const Header) void {
- assert(self.status == .recovered);
+ pub fn set_header_as_dirty(journal: *Journal, header: *const Header) void {
+ assert(journal.status == .recovered);
  assert(header.command == .prepare);
 
  log.debug("{}: set_header_as_dirty: op={} checksum={}", .{
- self.replica,
+ journal.replica,
  header.op,
  header.checksum,
  });
 
- const slot = self.slot_for_header(header);
+ const slot = journal.slot_for_header(header);
 
- if (self.has(header)) {
- assert(self.dirty.bit(slot));
+ if (journal.has(header)) {
+ assert(journal.dirty.bit(slot));
  // Do not clear any faulty bit for the same entry.
  } else {
  // Overwriting a new op with an old op would be a correctness bug; it could cause a
  // message to be uncommitted.
- assert(self.headers[slot.index].op <= header.op);
+ assert(journal.headers[slot.index].op <= header.op);
 
- self.headers[slot.index] = header.*;
- self.dirty.set(slot);
- self.faulty.clear(slot);
+ journal.headers[slot.index] = header.*;
+ journal.dirty.set(slot);
+ journal.faulty.clear(slot);
  }
  }
 
@@ -1520,49 +1604,49 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  // TODO To guard against torn writes, don't write simultaneously to all redundant header
  // sectors. (This is mostly a risk for single-replica clusters with small WALs).
  pub fn write_prepare(
- self: *Self,
- callback: fn (self: *Replica, wrote: ?*Message, trigger: Write.Trigger) void,
+ journal: *Journal,
+ callback: fn (replica: *Replica, wrote: ?*Message, trigger: Write.Trigger) void,
  message: *Message,
- trigger: Self.Write.Trigger,
+ trigger: Journal.Write.Trigger,
  ) void {
- const replica = @fieldParentPtr(Replica, "journal", self);
+ const replica = @fieldParentPtr(Replica, "journal", journal);
 
- assert(self.status == .recovered);
+ assert(journal.status == .recovered);
  assert(message.header.command == .prepare);
  assert(message.header.size >= @sizeOf(Header));
  assert(message.header.size <= message.buffer.len);
- assert(self.has(message.header));
- assert(replica.replica_count != 1 or self.writes.executing() == 0);
+ assert(journal.has(message.header));
+ assert(replica.replica_count != 1 or journal.writes.executing() == 0);
 
- // The underlying header memory must be owned by the buffer and not by self.headers:
+ // The underlying header memory must be owned by the buffer and not by journal.headers:
  // Otherwise, concurrent writes may modify the memory of the pointer while we write.
  assert(@ptrToInt(message.header) == @ptrToInt(message.buffer.ptr));
 
- const slot = self.slot_with_header(message.header).?;
+ const slot = journal.slot_with_header(message.header).?;
 
- if (!self.dirty.bit(slot)) {
+ if (!journal.dirty.bit(slot)) {
  // Any function that sets the faulty bit should also set the dirty bit:
- assert(!self.faulty.bit(slot));
- assert(self.prepare_inhabited[slot.index]);
- assert(self.prepare_checksums[slot.index] == message.header.checksum);
- assert(self.headers_redundant[slot.index].checksum == message.header.checksum);
- self.write_prepare_debug(message.header, "skipping (clean)");
+ assert(!journal.faulty.bit(slot));
+ assert(journal.prepare_inhabited[slot.index]);
+ assert(journal.prepare_checksums[slot.index] == message.header.checksum);
+ assert(journal.headers_redundant[slot.index].checksum == message.header.checksum);
+ journal.write_prepare_debug(message.header, "skipping (clean)");
  callback(replica, message, trigger);
  return;
  }
 
- assert(self.has_dirty(message.header));
+ assert(journal.has_dirty(message.header));
 
- const write = self.writes.acquire() orelse {
- self.write_prepare_debug(message.header, "waiting for IOP");
+ const write = journal.writes.acquire() orelse {
+ journal.write_prepare_debug(message.header, "waiting for IOP");
  callback(replica, null, trigger);
  return;
  };
 
- self.write_prepare_debug(message.header, "starting");
+ journal.write_prepare_debug(message.header, "starting");
 
  write.* = .{
- .self = self,
+ .journal = journal,
  .callback = callback,
  .message = message.ref(),
  .trigger = trigger,
@@ -1580,47 +1664,47 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  assert(sum_of_sector_padding_bytes == 0);
  }
 
- self.prepare_inhabited[slot.index] = false;
- self.prepare_checksums[slot.index] = 0;
+ journal.prepare_inhabited[slot.index] = false;
+ journal.prepare_checksums[slot.index] = 0;
 
- self.write_sectors(write_prepare_header, write, buffer, .prepares, offset);
+ journal.write_sectors(write_prepare_header, write, buffer, .prepares, offset);
  }
 
  /// Attempt to lock the in-memory sector containing the header being written.
  /// If the sector is already locked, add this write to the wait queue.
- fn write_prepare_header(write: *Self.Write) void {
- const self = write.self;
+ fn write_prepare_header(write: *Journal.Write) void {
+ const journal = write.journal;
  const message = write.message;
- assert(self.status == .recovered);
+ assert(journal.status == .recovered);
 
  {
  // `prepare_inhabited[slot.index]` is usually false here, but may be true if two
  // (or more) writes to the same slot were queued concurrently and this is not the
  // first to finish writing its prepare.
- const slot = self.slot_for_header(message.header);
- self.prepare_inhabited[slot.index] = true;
- self.prepare_checksums[slot.index] = message.header.checksum;
+ const slot = journal.slot_for_header(message.header);
+ journal.prepare_inhabited[slot.index] = true;
+ journal.prepare_checksums[slot.index] = message.header.checksum;
  }
 
- if (self.slot_with_op_and_checksum(message.header.op, message.header.checksum)) |slot| {
- self.headers_redundant[slot.index] = message.header.*;
+ if (journal.slot_with_op_and_checksum(message.header.op, message.header.checksum)) |slot| {
+ journal.headers_redundant[slot.index] = message.header.*;
  } else {
- self.write_prepare_debug(message.header, "entry changed while writing sectors");
- self.write_prepare_release(write, null);
+ journal.write_prepare_debug(message.header, "entry changed while writing sectors");
+ journal.write_prepare_release(write, null);
  return;
  }
 
  assert(!write.header_sector_locked);
  assert(write.header_sector_next == null);
 
- const write_offset = self.offset_logical_in_headers_for_message(message);
+ const write_offset = journal.offset_logical_in_headers_for_message(message);
 
- var it = self.writes.iterate();
+ var it = journal.writes.iterate();
  while (it.next()) |other| {
  if (other == write) continue;
  if (!other.header_sector_locked) continue;
 
- const other_offset = self.offset_logical_in_headers_for_message(other.message);
+ const other_offset = journal.offset_logical_in_headers_for_message(other.message);
  if (other_offset == write_offset) {
  // The `other` and `write` target the same sector; append to the list.
  var tail = other;
@@ -1631,11 +1715,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  }
 
  write.header_sector_locked = true;
- self.write_prepare_on_lock_header_sector(write);
+ journal.write_prepare_on_lock_header_sector(write);
  }
 
- fn write_prepare_on_lock_header_sector(self: *Self, write: *Write) void {
- assert(self.status == .recovered);
+ fn write_prepare_on_lock_header_sector(journal: *Journal, write: *Write) void {
+ assert(journal.status == .recovered);
  assert(write.header_sector_locked);
 
  // TODO It's possible within this section that the header has since been replaced but we
@@ -1644,84 +1728,55 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  // For this, we'll need to have a way to tweak write_prepare_release() to release locks.
  // At present, we don't return early here simply because it doesn't yet do that.
 
- const replica = @fieldParentPtr(Replica, "journal", self);
  const message = write.message;
- const slot_of_message = self.slot_for_header(message.header);
- const slot_first = Slot{
- .index = @divFloor(slot_of_message.index, headers_per_sector) * headers_per_sector,
- };
-
+ const slot_of_message = journal.slot_for_header(message.header);
  const offset = Ring.headers.offset(slot_of_message);
  assert(offset % constants.sector_size == 0);
 
- const buffer: []u8 = write.header_sector(self);
- const buffer_headers = std.mem.bytesAsSlice(Header, buffer);
- assert(buffer_headers.len == headers_per_sector);
-
- var i: usize = 0;
- while (i < headers_per_sector) : (i += 1) {
- const slot = Slot{ .index = slot_first.index + i };
-
- if (self.faulty.bit(slot)) {
- // Redundant faulty headers are deliberately written as invalid.
- // This ensures that faulty headers are still faulty when they are read back
- // from disk during recovery. This prevents faulty entries from changing to
- // reserved (and clean) after a crash and restart (e.g. accidentally converting
- // a case `@D` to a `@J` after a restart).
- buffer_headers[i] = .{
- .checksum = 0,
- .cluster = replica.cluster,
- .command = .reserved,
- };
- assert(!buffer_headers[i].valid_checksum());
- } else {
- // Write headers from `headers_redundant` instead of `headers` — we need to
- // avoid writing (leaking) a redundant header before its corresponding prepare
- // is on disk.
- buffer_headers[i] = self.headers_redundant[slot.index];
- }
- }
+ const buffer: []u8 = journal.header_sector(
+ @divFloor(slot_of_message.index, headers_per_sector),
+ write,
+ );
 
  log.debug("{}: write_header: op={} sectors[{}..{}]", .{
- self.replica,
+ journal.replica,
  message.header.op,
  offset,
  offset + constants.sector_size,
  });
 
- // Memory must not be owned by self.headers as these may be modified concurrently:
- assert(@ptrToInt(buffer.ptr) < @ptrToInt(self.headers.ptr) or
- @ptrToInt(buffer.ptr) > @ptrToInt(self.headers.ptr) + headers_size);
+ // Memory must not be owned by journal.headers as these may be modified concurrently:
+ assert(@ptrToInt(buffer.ptr) < @ptrToInt(journal.headers.ptr) or
+ @ptrToInt(buffer.ptr) > @ptrToInt(journal.headers.ptr) + headers_size);
 
- self.write_sectors(write_prepare_on_write_header, write, buffer, .headers, offset);
+ journal.write_sectors(write_prepare_on_write_header, write, buffer, .headers, offset);
  }
 
- fn write_prepare_on_write_header(write: *Self.Write) void {
- const self = write.self;
+ fn write_prepare_on_write_header(write: *Journal.Write) void {
+ const journal = write.journal;
  const message = write.message;
 
  assert(write.header_sector_locked);
- self.write_prepare_unlock_header_sector(write);
+ journal.write_prepare_unlock_header_sector(write);
 
- if (!self.has(message.header)) {
- self.write_prepare_debug(message.header, "entry changed while writing headers");
- self.write_prepare_release(write, null);
+ if (!journal.has(message.header)) {
+ journal.write_prepare_debug(message.header, "entry changed while writing headers");
+ journal.write_prepare_release(write, null);
  return;
  }
 
- self.write_prepare_debug(message.header, "complete, marking clean");
- // TODO Snapshots
+ journal.write_prepare_debug(message.header, "complete, marking clean");
 
- const slot = self.slot_with_header(message.header).?;
- self.dirty.clear(slot);
- self.faulty.clear(slot);
+ const slot = journal.slot_with_header(message.header).?;
+ journal.dirty.clear(slot);
+ journal.faulty.clear(slot);
 
- self.write_prepare_release(write, message);
+ journal.write_prepare_release(write, message);
  }
 
  /// Release the lock held by a write on an in-memory header sector and pass
  /// it to a waiting Write, if any.
- fn write_prepare_unlock_header_sector(self: *Self, write: *Self.Write) void {
+ fn write_prepare_unlock_header_sector(journal: *Journal, write: *Journal.Write) void {
  assert(write.header_sector_locked);
  write.header_sector_locked = false;
 
@@ -1733,13 +1788,13 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
  assert(waiting.header_sector_locked == false);
  waiting.header_sector_locked = true;
- self.write_prepare_on_lock_header_sector(waiting);
+ journal.write_prepare_on_lock_header_sector(waiting);
  }
  assert(write.header_sector_next == null);
  }
 
- fn write_prepare_release(self: *Self, write: *Self.Write, wrote: ?*Message) void {
- const replica = @fieldParentPtr(Replica, "journal", self);
+ fn write_prepare_release(journal: *Journal, write: *Journal.Write, wrote: ?*Message) void {
+ const replica = @fieldParentPtr(Replica, "journal", journal);
  const write_callback = write.callback;
  const write_trigger = write.trigger;
  const write_message = write.message;
@@ -1747,14 +1802,14 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  // Release the write prior to returning control to the caller.
  // This allows us to enforce journal.writes.len≤1 when replica_count=1, because the
  // callback may immediately start the next write.
- self.writes.release(write);
+ journal.writes.release(write);
  write_callback(replica, wrote, write_trigger);
  replica.message_bus.unref(write_message);
  }
 
- fn write_prepare_debug(self: *const Self, header: *const Header, status: []const u8) void {
+ fn write_prepare_debug(journal: *const Journal, header: *const Header, status: []const u8) void {
  log.debug("{}: write: view={} op={} len={}: {} {s}", .{
- self.replica,
+ journal.replica,
  header.view,
  header.op,
  header.size,
@@ -1763,14 +1818,14 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  });
  }
 
- fn offset_logical_in_headers_for_message(self: *const Self, message: *Message) u64 {
- return Ring.headers.offset(self.slot_for_header(message.header));
+ fn offset_logical_in_headers_for_message(journal: *const Journal, message: *Message) u64 {
+ return Ring.headers.offset(journal.slot_for_header(message.header));
  }
 
  fn write_sectors(
- self: *Self,
- callback: fn (write: *Self.Write) void,
- write: *Self.Write,
+ journal: *Journal,
+ callback: fn (write: *Journal.Write) void,
+ write: *Journal.Write,
  buffer: []const u8,
  ring: Ring,
  offset: u64, // Offset within the Ring.
@@ -1783,16 +1838,16 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  .offset = offset,
  .locked = false,
  };
- self.lock_sectors(write);
+ journal.lock_sectors(write);
  }
 
  /// Start the write on the current range or add it to the proper queue
  /// if an overlapping range is currently being written.
- fn lock_sectors(self: *Self, write: *Self.Write) void {
+ fn lock_sectors(journal: *Journal, write: *Journal.Write) void {
  assert(!write.range.locked);
  assert(write.range.next == null);
 
- var it = self.writes.iterate();
+ var it = journal.writes.iterate();
  while (it.next()) |other| {
  if (other == write) continue;
  if (!other.range.locked) continue;
@@ -1806,14 +1861,14 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  }
 
  log.debug("{}: write_sectors: ring={} offset={} len={} locked", .{
- self.replica,
+ journal.replica,
  write.range.ring,
  write.range.offset,
  write.range.buffer.len,
  });
 
  write.range.locked = true;
- self.storage.write_sectors(
+ journal.storage.write_sectors(
  write_sectors_on_write,
  &write.range.completion,
  write.range.buffer,
@@ -1839,14 +1894,14 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
  fn write_sectors_on_write(completion: *Storage.Write) void {
  const range = @fieldParentPtr(Range, "completion", completion);
- const write = @fieldParentPtr(Self.Write, "range", range);
- const self = write.self;
+ const write = @fieldParentPtr(Journal.Write, "range", range);
+ const journal = write.journal;
 
  assert(write.range.locked);
  write.range.locked = false;
 
  log.debug("{}: write_sectors: ring={} offset={} len={} unlocked", .{
- self.replica,
+ journal.replica,
  write.range.ring,
  write.range.offset,
  write.range.buffer.len,
@@ -1859,18 +1914,70 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  assert(waiting.locked == false);
  current = waiting.next;
  waiting.next = null;
- self.lock_sectors(@fieldParentPtr(Self.Write, "range", waiting));
+ journal.lock_sectors(@fieldParentPtr(Journal.Write, "range", waiting));
  }
 
  range.callback(write);
  }
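`lock_sectors` serializes writes whose byte ranges intersect within the same ring; non-overlapping writes proceed concurrently. The overlap test this implies is the standard half-open interval check, sketched here (the function name is hypothetical):

    const std = @import("std");

    fn overlaps(a_offset: u64, a_len: u64, b_offset: u64, b_len: u64) bool {
        return a_offset < b_offset + b_len and b_offset < a_offset + a_len;
    }

    test "ranges conflict only when their sectors intersect" {
        try std.testing.expect(overlaps(0, 1024, 512, 1024)); // Shared sectors: serialize.
        try std.testing.expect(!overlaps(0, 512, 512, 512)); // Adjacent ranges: may run concurrently.
    }
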
- pub fn writing(self: *Self, op: u64, checksum: u128) bool {
- const slot = self.slot_for_op(op);
+ /// Returns a sector of redundant headers, ready to be written to the specified sector.
+ /// `sector_index` is relative to the start of the redundant header zone.
+ fn header_sector(
+ journal: *const Journal,
+ sector_index: usize,
+ write: *const Journal.Write,
+ ) Sector {
+ assert(journal.status != .init);
+ assert(journal.writes.items.len == journal.headers_iops.len);
+ assert(sector_index < @divFloor(slot_count, headers_per_sector));
+
+ const replica = @fieldParentPtr(Replica, "journal", journal);
+ const sector_slot = Slot{ .index = sector_index * headers_per_sector };
+ assert(sector_slot.index < slot_count);
+
+ const write_index = @divExact(
+ @ptrToInt(write) - @ptrToInt(&journal.writes.items),
+ @sizeOf(Journal.Write),
+ );
+
+ // TODO The compiler should not need this align cast as the type of `headers_iops`
+ // ensures that each buffer is properly aligned.
+ const sector = @alignCast(constants.sector_size, &journal.headers_iops[write_index]);
+ const sector_headers = std.mem.bytesAsSlice(Header, sector);
+ assert(sector_headers.len == headers_per_sector);
+
+ var i: usize = 0;
+ while (i < headers_per_sector) : (i += 1) {
+ const slot = Slot{ .index = sector_slot.index + i };
+
+ if (journal.faulty.bit(slot)) {
+ // Redundant faulty headers are deliberately written as invalid.
+ // This ensures that faulty headers are still faulty when they are read back
+ // from disk during recovery. This prevents faulty entries from changing to
+ // reserved (and clean) after a crash and restart (e.g. accidentally converting
+ // a case `@D` to a `@J` after a restart).
+ sector_headers[i] = .{
+ .checksum = 0,
+ .cluster = replica.cluster,
+ .command = .reserved,
+ };
+ assert(!sector_headers[i].valid_checksum());
+ } else {
+ // Write headers from `headers_redundant` instead of `headers` — we need to
+ // avoid writing (leaking) a redundant header before its corresponding prepare
+ // is on disk.
+ sector_headers[i] = journal.headers_redundant[slot.index];
+ }
+ }
+ return sector;
+ }
+
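`header_sector` pairs each in-flight Write with a dedicated IOP buffer by recovering the write's index from its address, as the `@divExact(@ptrToInt(write) - @ptrToInt(&journal.writes.items), @sizeOf(Journal.Write))` expression above does. The same pointer arithmetic in miniature (array and element type are illustrative):

    const std = @import("std");

    test "an IOP's index is recovered from its pointer" {
        var iops: [4]u64 = .{ 0, 0, 0, 0 };
        const iop = &iops[2];
        const index = @divExact(@ptrToInt(iop) - @ptrToInt(&iops), @sizeOf(u64));
        try std.testing.expectEqual(@as(usize, 2), index);
    }
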
+ pub fn writing(journal: *Journal, op: u64, checksum: u128) bool {
+ const slot = journal.slot_for_op(op);
  var found: bool = false;
- var it = self.writes.iterate();
+ var it = journal.writes.iterate();
  while (it.next()) |write| {
- const write_slot = self.slot_for_op(write.message.header.op);
+ const write_slot = journal.slot_for_op(write.message.header.op);
 
  // It's possible that we might be writing the same op but with a different checksum.
  // For example, if the op we are writing did not survive the view change and was
@@ -1878,7 +1985,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  // However, we compare against the 64-bit op first, since it's a cheap machine word.
  if (write.message.header.op == op and write.message.header.checksum == checksum) {
  // If we truly are writing, then the dirty bit must be set:
- assert(self.dirty.bit(self.slot_for_op(op)));
+ assert(journal.dirty.bit(journal.slot_for_op(op)));
  found = true;
  } else if (write_slot.index == slot.index) {
  // If the in-progress write of '{op, checksum}' will be overwritten by another
@@ -1891,46 +1998,6 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
  };
  }
 
- pub const BitSet = struct {
- bits: std.DynamicBitSetUnmanaged,
-
- /// The number of bits set (updated incrementally as bits are set or cleared):
- count: u64 = 0,
-
- fn init(allocator: Allocator, count: usize) !BitSet {
- const bits = try std.DynamicBitSetUnmanaged.initEmpty(allocator, count);
- errdefer bits.deinit(allocator);
-
- return BitSet{ .bits = bits };
- }
-
- fn deinit(self: *BitSet, allocator: Allocator) void {
- self.bits.deinit(allocator);
- }
-
- /// Clear the bit for a slot (idempotent):
- pub fn clear(self: *BitSet, slot: Slot) void {
- if (self.bits.isSet(slot.index)) {
- self.bits.unset(slot.index);
- self.count -= 1;
- }
- }
-
- /// Whether the bit for a slot is set:
- pub fn bit(self: *const BitSet, slot: Slot) bool {
- return self.bits.isSet(slot.index);
- }
-
- /// Set the bit for a slot (idempotent):
- pub fn set(self: *BitSet, slot: Slot) void {
- if (!self.bits.isSet(slot.index)) {
- self.bits.set(slot.index);
- self.count += 1;
- assert(self.count <= self.bits.bit_length);
- }
- }
- };
-
  /// @B and @C:
  /// This prepare header is corrupt.
  /// We may have a valid redundant header, but need to recover the full message.
@@ -2058,9 +2125,7 @@ const RecoveryDecision = enum {
  eql,
  /// Reserved; dirty/faulty are clear, no repair necessary.
  nil,
- /// If replica_count>1: Repair with VSR `request_prepare`. Mark dirty, clear faulty.
- /// If replica_count=1: Use intact prepare. Clear dirty, clear faulty.
- /// (Don't set faulty, because we have the valid message.)
+ /// Use intact prepare to repair redundant header. Dirty/faulty are clear.
  fix,
  /// If replica_count>1: Repair with VSR `request_prepare`. Mark dirty, mark faulty.
  /// If replica_count=1: Fail; cannot recover safely.
@@ -2102,9 +2167,9 @@ const Case = struct {
  };
  }
 
- fn check(self: *const Case, parameters: [9]bool) !bool {
+ fn check(case: *const Case, parameters: [9]bool) !bool {
  for (parameters) |b, i| {
- switch (self.pattern[i]) {
+ switch (case.pattern[i]) {
  .any => {},
  .is_false => if (b) return false,
  .is_true => if (!b) return false,
@@ -2115,12 +2180,12 @@ const Case = struct {
  return true;
  }
 
- fn decision(self: *const Case, replica_count: u8) RecoveryDecision {
+ fn decision(case: *const Case, replica_count: u8) RecoveryDecision {
  assert(replica_count > 0);
  if (replica_count == 1) {
- return self.decision_single;
+ return case.decision_single;
  } else {
- return self.decision_multiple;
+ return case.decision_multiple;
  }
  }
  };
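`Case.check` matches nine recovery parameters against a three-valued pattern (`any`, `is_false`, `is_true`). Reduced to a single parameter, the matching rule looks like this sketch (names are hypothetical, not the module's):

    const std = @import("std");

    const Pattern = enum { any, is_false, is_true };

    fn matches(pattern: Pattern, value: bool) bool {
        return switch (pattern) {
            .any => true,
            .is_false => !value,
            .is_true => value,
        };
    }

    test "three-valued pattern matching" {
        try std.testing.expect(matches(.any, true) and matches(.any, false));
        try std.testing.expect(matches(.is_false, false) and !matches(.is_false, true));
        try std.testing.expect(matches(.is_true, true) and !matches(.is_true, false));
    }
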
@@ -2211,6 +2276,51 @@ test "recovery_cases" {
  }
  }
 
+ pub const BitSet = struct {
+ bits: std.DynamicBitSetUnmanaged,
+
+ /// The number of bits set (updated incrementally as bits are set or cleared):
+ count: u64 = 0,
+
+ fn init_full(allocator: Allocator, count: usize) !BitSet {
+ const bits = try std.DynamicBitSetUnmanaged.initFull(allocator, count);
+ errdefer bits.deinit(allocator);
+
+ return BitSet{
+ .bits = bits,
+ .count = count,
+ };
+ }
+
+ fn deinit(bit_set: *BitSet, allocator: Allocator) void {
+ assert(bit_set.count == bit_set.bits.count());
+
+ bit_set.bits.deinit(allocator);
+ }
+
+ /// Clear the bit for a slot (idempotent):
+ pub fn clear(bit_set: *BitSet, slot: Slot) void {
+ if (bit_set.bits.isSet(slot.index)) {
+ bit_set.bits.unset(slot.index);
+ bit_set.count -= 1;
+ }
+ }
+
+ /// Whether the bit for a slot is set:
+ pub fn bit(bit_set: *const BitSet, slot: Slot) bool {
+ return bit_set.bits.isSet(slot.index);
+ }
+
+ /// Set the bit for a slot (idempotent):
+ pub fn set(bit_set: *BitSet, slot: Slot) void {
+ if (!bit_set.bits.isSet(slot.index)) {
+ bit_set.bits.set(slot.index);
+ bit_set.count += 1;
+ assert(bit_set.count <= bit_set.bits.bit_length);
+ }
+ }
+ };
+
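The relocated BitSet now starts full via `init_full` (every slot is presumed dirty and faulty until recovery proves otherwise), and `count` is maintained incrementally so callers never rescan the bits. A small check of that contract against the underlying std container, using the same allocator-first call order as above:

    const std = @import("std");

    test "init_full starts with every bit set and count tracks mutations" {
        const allocator = std.testing.allocator;
        var bits = try std.DynamicBitSetUnmanaged.initFull(allocator, 8);
        defer bits.deinit(allocator);

        try std.testing.expectEqual(@as(usize, 8), bits.count());
        bits.unset(3);
        try std.testing.expectEqual(@as(usize, 7), bits.count());
    }
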
  /// Format part of a new WAL's Zone.wal_headers, writing to `target`.
  ///
  /// `offset_logical` is relative to the beginning of the `wal_headers` zone.