tigerbeetle-node 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. package/README.md +305 -103
  2. package/dist/index.d.ts +70 -67
  3. package/dist/index.js +70 -67
  4. package/dist/index.js.map +1 -1
  5. package/package.json +6 -6
  6. package/scripts/download_node_headers.sh +14 -7
  7. package/src/index.ts +11 -10
  8. package/src/node.zig +22 -20
  9. package/src/tigerbeetle/scripts/benchmark.bat +4 -3
  10. package/src/tigerbeetle/scripts/benchmark.sh +25 -10
  11. package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
  12. package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
  13. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
  14. package/src/tigerbeetle/scripts/install.sh +20 -4
  15. package/src/tigerbeetle/scripts/install_zig.bat +5 -1
  16. package/src/tigerbeetle/scripts/install_zig.sh +32 -26
  17. package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
  18. package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
  19. package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
  20. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
  21. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +12 -3
  22. package/src/tigerbeetle/src/benchmark.zig +19 -9
  23. package/src/tigerbeetle/src/benchmark_array_search.zig +317 -0
  24. package/src/tigerbeetle/src/benchmarks/perf.zig +299 -0
  25. package/src/tigerbeetle/src/c/tb_client/context.zig +103 -0
  26. package/src/tigerbeetle/src/c/tb_client/packet.zig +80 -0
  27. package/src/tigerbeetle/src/c/tb_client/signal.zig +288 -0
  28. package/src/tigerbeetle/src/c/tb_client/thread.zig +328 -0
  29. package/src/tigerbeetle/src/c/tb_client.h +221 -0
  30. package/src/tigerbeetle/src/c/tb_client.zig +104 -0
  31. package/src/tigerbeetle/src/c/test.zig +1 -0
  32. package/src/tigerbeetle/src/cli.zig +143 -84
  33. package/src/tigerbeetle/src/config.zig +161 -20
  34. package/src/tigerbeetle/src/demo.zig +14 -8
  35. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +2 -2
  36. package/src/tigerbeetle/src/ewah.zig +318 -0
  37. package/src/tigerbeetle/src/ewah_benchmark.zig +121 -0
  38. package/src/tigerbeetle/src/eytzinger_benchmark.zig +317 -0
  39. package/src/tigerbeetle/src/fifo.zig +17 -1
  40. package/src/tigerbeetle/src/io/darwin.zig +12 -10
  41. package/src/tigerbeetle/src/io/linux.zig +25 -9
  42. package/src/tigerbeetle/src/io/windows.zig +13 -9
  43. package/src/tigerbeetle/src/iops.zig +101 -0
  44. package/src/tigerbeetle/src/lsm/README.md +214 -0
  45. package/src/tigerbeetle/src/lsm/binary_search.zig +341 -0
  46. package/src/tigerbeetle/src/lsm/bloom_filter.zig +125 -0
  47. package/src/tigerbeetle/src/lsm/compaction.zig +557 -0
  48. package/src/tigerbeetle/src/lsm/composite_key.zig +77 -0
  49. package/src/tigerbeetle/src/lsm/direction.zig +11 -0
  50. package/src/tigerbeetle/src/lsm/eytzinger.zig +587 -0
  51. package/src/tigerbeetle/src/lsm/forest.zig +204 -0
  52. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +412 -0
  53. package/src/tigerbeetle/src/lsm/grid.zig +549 -0
  54. package/src/tigerbeetle/src/lsm/groove.zig +1002 -0
  55. package/src/tigerbeetle/src/lsm/k_way_merge.zig +474 -0
  56. package/src/tigerbeetle/src/lsm/level_iterator.zig +315 -0
  57. package/src/tigerbeetle/src/lsm/manifest.zig +580 -0
  58. package/src/tigerbeetle/src/lsm/manifest_level.zig +925 -0
  59. package/src/tigerbeetle/src/lsm/manifest_log.zig +953 -0
  60. package/src/tigerbeetle/src/lsm/node_pool.zig +231 -0
  61. package/src/tigerbeetle/src/lsm/posted_groove.zig +387 -0
  62. package/src/tigerbeetle/src/lsm/segmented_array.zig +1318 -0
  63. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
  64. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
  65. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +894 -0
  66. package/src/tigerbeetle/src/lsm/table.zig +967 -0
  67. package/src/tigerbeetle/src/lsm/table_immutable.zig +203 -0
  68. package/src/tigerbeetle/src/lsm/table_iterator.zig +306 -0
  69. package/src/tigerbeetle/src/lsm/table_mutable.zig +174 -0
  70. package/src/tigerbeetle/src/lsm/test.zig +423 -0
  71. package/src/tigerbeetle/src/lsm/tree.zig +1090 -0
  72. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +457 -0
  73. package/src/tigerbeetle/src/main.zig +141 -109
  74. package/src/tigerbeetle/src/message_bus.zig +49 -48
  75. package/src/tigerbeetle/src/message_pool.zig +22 -12
  76. package/src/tigerbeetle/src/ring_buffer.zig +126 -30
  77. package/src/tigerbeetle/src/simulator.zig +205 -140
  78. package/src/tigerbeetle/src/state_machine.zig +1268 -721
  79. package/src/tigerbeetle/src/static_allocator.zig +65 -0
  80. package/src/tigerbeetle/src/storage.zig +40 -14
  81. package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
  82. package/src/tigerbeetle/src/test/accounting/workload.zig +819 -0
  83. package/src/tigerbeetle/src/test/cluster.zig +104 -88
  84. package/src/tigerbeetle/src/test/conductor.zig +365 -0
  85. package/src/tigerbeetle/src/test/fuzz.zig +121 -0
  86. package/src/tigerbeetle/src/test/id.zig +89 -0
  87. package/src/tigerbeetle/src/test/message_bus.zig +15 -24
  88. package/src/tigerbeetle/src/test/network.zig +26 -17
  89. package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
  90. package/src/tigerbeetle/src/test/state_checker.zig +94 -68
  91. package/src/tigerbeetle/src/test/state_machine.zig +135 -69
  92. package/src/tigerbeetle/src/test/storage.zig +78 -28
  93. package/src/tigerbeetle/src/tigerbeetle.zig +19 -16
  94. package/src/tigerbeetle/src/unit_tests.zig +15 -0
  95. package/src/tigerbeetle/src/util.zig +51 -0
  96. package/src/tigerbeetle/src/vopr.zig +494 -0
  97. package/src/tigerbeetle/src/vopr_hub/README.md +58 -0
  98. package/src/tigerbeetle/src/vopr_hub/SETUP.md +199 -0
  99. package/src/tigerbeetle/src/vopr_hub/go.mod +3 -0
  100. package/src/tigerbeetle/src/vopr_hub/main.go +1022 -0
  101. package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +3 -0
  102. package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +403 -0
  103. package/src/tigerbeetle/src/vsr/client.zig +34 -7
  104. package/src/tigerbeetle/src/vsr/journal.zig +164 -174
  105. package/src/tigerbeetle/src/vsr/replica.zig +1602 -651
  106. package/src/tigerbeetle/src/vsr/superblock.zig +1761 -0
  107. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +255 -0
  108. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +644 -0
  109. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +561 -0
  110. package/src/tigerbeetle/src/vsr.zig +118 -170
  111. package/src/tigerbeetle/scripts/vopr.bat +0 -48
  112. package/src/tigerbeetle/scripts/vopr.sh +0 -33
package/src/tigerbeetle/src/vsr/replica.zig
@@ -4,14 +4,20 @@ const assert = std.debug.assert;
 
  const config = @import("../config.zig");
 
+ const StaticAllocator = @import("../static_allocator.zig");
+ const GridType = @import("../lsm/grid.zig").GridType;
+ const MessagePool = @import("../message_pool.zig").MessagePool;
  const Message = @import("../message_pool.zig").MessagePool.Message;
  const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
+ const ClientTable = @import("superblock_client_table.zig").ClientTable;
+ const format_journal = @import("./journal.zig").format_journal;
 
  const vsr = @import("../vsr.zig");
  const Header = vsr.Header;
  const Timeout = vsr.Timeout;
  const Command = vsr.Command;
  const Version = vsr.Version;
+ const VSRState = vsr.VSRState;
 
  const log = std.log.scoped(.replica);
 
@@ -39,32 +45,6 @@ pub const Status = enum {
  recovering,
  };
 
- const ClientTable = std.AutoHashMapUnmanaged(u128, ClientTableEntry);
-
- /// We found two bugs in the VRR paper relating to the client table:
- ///
- /// 1. a correctness bug, where successive client crashes may cause request numbers to collide for
- /// different request payloads, resulting in requests receiving the wrong reply, and
- ///
- /// 2. a liveness bug, where if the client table is updated for request and prepare messages with
- /// the client's latest request number, then the client may be locked out from the cluster if the
- /// request is ever reordered through a view change.
- ///
- /// We therefore take a different approach with the implementation of our client table, to:
- ///
- /// 1. register client sessions explicitly through the state machine to ensure that client session
- /// numbers always increase, and
- ///
- /// 2. make a more careful distinction between uncommitted and committed request numbers,
- /// considering that uncommitted requests may not survive a view change.
- const ClientTableEntry = struct {
- /// The client's session number as committed to the cluster by a register request.
- session: u64,
-
- /// The reply sent to the client's latest committed request.
- reply: *Message,
- };
-
  const Nonce = u128;
 
  const Prepare = struct {
@@ -84,18 +64,40 @@ const quorum_messages_null = [_]?*Message{null} ** config.replicas_max;
  const QuorumCounter = std.StaticBitSet(config.replicas_max);
  const quorum_counter_null = QuorumCounter.initEmpty();
 
- pub fn Replica(
+ // CRITICAL: The number of prepare headers to include in the body:
+ // We must provide enough headers to cover all uncommitted headers so that the new
+ // leader (if we are in a view change) can decide whether to discard uncommitted headers
+ // that cannot be repaired because they are gaps, and this must be relative to the
+ // cluster as a whole (not relative to the difference between our op and commit number)
+ // as otherwise we would break correctness.
+ const view_change_headers_count = config.pipeline_max;
+
+ comptime {
+ assert(view_change_headers_count > 0);
+ assert(view_change_headers_count >= config.pipeline_max);
+ assert(view_change_headers_count <=
+ @divFloor(config.message_size_max - @sizeOf(Header), @sizeOf(Header)));
+ }
+
+ pub fn ReplicaType(
  comptime StateMachine: type,
  comptime MessageBus: type,
  comptime Storage: type,
  comptime Time: type,
  ) type {
+ const Grid = GridType(Storage);
+ const SuperBlock = vsr.SuperBlockType(Storage);
+
  return struct {
  const Self = @This();
 
  const Journal = vsr.Journal(Self, Storage);
  const Clock = vsr.Clock(Time);
 
+ /// We use this allocator during open/init and then disable it.
+ /// An accidental dynamic allocation after open/init will cause an assertion failure.
+ static_allocator: StaticAllocator,
+
  /// The number of the cluster to which this replica belongs:
  cluster: u32,
 
@@ -111,6 +113,8 @@ pub fn Replica(
  /// The minimum number of replicas required to form a view change quorum:
  quorum_view_change: u8,
 
+ time: Time,
+
  /// A distributed fault-tolerant clock for lower and upper bounds on the leader's wall clock:
  clock: Clock,
 
@@ -118,14 +122,17 @@ pub fn Replica(
  journal: Journal,
 
  /// An abstraction to send messages from the replica to another replica or client.
- /// The message bus will also deliver messages to this replica by calling `on_message()`.
- message_bus: *MessageBus,
+ /// The message bus will also deliver messages to this replica by calling `on_message_from_bus()`.
+ message_bus: MessageBus,
 
  /// For executing service up-calls after an operation has been committed:
- state_machine: *StateMachine,
+ state_machine: StateMachine,
 
- /// The client table records for each client the latest session and the latest committed reply.
- client_table: ClientTable,
+ // TODO Document.
+ superblock: SuperBlock,
+ superblock_context: SuperBlock.Context = undefined,
+ grid: Grid,
+ opened: bool,
 
  /// The current view, initially 0:
  view: u32,
@@ -136,24 +143,46 @@ pub fn Replica(
  /// The current status, either normal, view_change, or recovering:
  status: Status = .recovering,
 
- /// The op number assigned to the most recently prepared operation:
+ /// The op number assigned to the most recently prepared operation.
+ ///
+ /// Invariants (not applicable during status=recovering):
+ /// * `replica.op` exists in the Journal.
+ /// * `replica.op ≥ replica.commit_min`.
+ /// * `replica.op ≤ replica.op_checkpoint_trigger`: don't wrap the WAL until we are sure
+ /// that the overwritten entry will not be required for recovery.
+ // TODO: When recovery protocol is removed, load the `op` from the WAL, and verify that it is ≥op_checkpoint.
+ // Also verify that a corresponding header exists in the WAL.
  op: u64,
 
  /// The op of the highest checkpointed message.
- // TODO Update this to use LSM storage.
  // TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
- // TODO Enforce invariant op≥op_checkpoint.
- op_checkpoint: u64 = 0,
+ op_checkpoint: u64,
 
  /// The op number of the latest committed and executed operation (according to the replica):
  /// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
+ ///
+ /// Invariants (not applicable during status=recovering):
+ /// * `replica.commit_min` exists in the Journal.
+ /// * `replica.commit_min ≤ replica.op`
+ /// * `replica.commit_min ≥ replica.op_checkpoint`.
+ /// * never decreases
  commit_min: u64,
 
  /// The op number of the latest committed operation (according to the cluster):
  /// This is the commit number in terms of the VRR paper.
+ ///
+ /// Invariants:
+ /// * `replica.commit_max ≥ replica.commit_min`.
+ /// * never decreases
  commit_max: u64,
 
- /// Whether we are reading a prepare from storage in order to commit.
+ /// Guards against concurrent commits.
+ ///
+ /// Set while:
+ /// * prefetching from storage, in preparation for a commit
+ /// * reading a prepare from storage in order to commit
+ /// * compacting storage
+ /// * checkpointing
  committing: bool = false,
 
  /// Whether we are reading a prepare from storage in order to push to the pipeline.
@@ -164,7 +193,7 @@ pub fn Replica(
  ///
  /// After a view change, the old leader's pipeline is left untouched so that it is able to
  /// help the new leader repair, even in the face of local storage faults.
- pipeline: RingBuffer(Prepare, config.pipeline_max) = .{},
+ pipeline: RingBuffer(Prepare, config.pipeline_max, .array) = .{},
 
  /// In some cases, a replica may send a message to itself. We do not submit these messages
  /// to the message bus but rather queue them here for guaranteed immediate delivery, which
@@ -236,18 +265,112 @@ pub fn Replica(
 
  on_change_state: ?fn (replica: *Self) void = null,
 
- pub fn init(
- allocator: Allocator,
+ /// Called when `commit_prepare` finishes committing.
+ commit_callback: ?fn (*Self) void = null,
+
+ /// The prepare message being committed.
+ commit_prepare: ?*Message = null,
+
+ const OpenOptions = struct {
+ replica_count: u8,
+ storage: *Storage,
+ message_pool: *MessagePool,
+ time: Time,
+ state_machine_options: StateMachine.Options,
+ message_bus_options: MessageBus.Options,
+ };
+
+ /// Initializes and opens the provided replica using the options.
+ pub fn open(self: *Self, parent_allocator: std.mem.Allocator, options: OpenOptions) !void {
+ self.static_allocator = StaticAllocator.init(parent_allocator);
+ const allocator = self.static_allocator.allocator();
+
+ self.superblock = try SuperBlock.init(
+ allocator,
+ options.storage,
+ options.message_pool,
+ );
+
+ // Once initialized, the replica is in charge of calling superblock.deinit().
+ var initialized = false;
+ errdefer if (!initialized) self.superblock.deinit(allocator);
+
+ // Open the superblock:
+ self.opened = false;
+ self.superblock.open(superblock_open_callback, &self.superblock_context);
+ while (!self.opened) self.superblock.storage.tick();
+ assert(self.superblock.working.vsr_state.internally_consistent());
+
+ if (self.superblock.working.replica >= options.replica_count) {
+ log.err("{}: open: no address for replica (replica_count={})", .{
+ self.superblock.working.replica,
+ options.replica_count,
+ });
+ return error.NoAddress;
+ }
+
+ // Initialize the replica:
+ try self.init(allocator, .{
+ .cluster = self.superblock.working.cluster,
+ .replica_index = self.superblock.working.replica,
+ .replica_count = options.replica_count,
+ .storage = options.storage,
+ .time = options.time,
+ .message_pool = options.message_pool,
+ .state_machine_options = options.state_machine_options,
+ .message_bus_options = options.message_bus_options,
+ });
+
+ // Disable all dynamic allocation from this point onwards.
+ self.static_allocator.transition_from_init_to_static();
+
+ initialized = true;
+ errdefer self.deinit(allocator);
+
+ // Open the (Forest inside) StateMachine:
+ self.opened = false;
+ self.state_machine.open(state_machine_open_callback);
+ while (!self.opened) {
+ self.grid.tick();
+ self.superblock.storage.tick();
+ }
+ }
+
+ fn superblock_open_callback(superblock_context: *SuperBlock.Context) void {
+ const self = @fieldParentPtr(Self, "superblock_context", superblock_context);
+ assert(!self.opened);
+ self.opened = true;
+ }
+
+ fn state_machine_open_callback(state_machine: *StateMachine) void {
+ const self = @fieldParentPtr(Self, "state_machine", state_machine);
+ assert(!self.opened);
+ self.opened = true;
+ }
+
+ const Options = struct {
  cluster: u32,
  replica_count: u8,
- replica: u8,
- time: *Time,
+ replica_index: u8,
+ time: Time,
  storage: *Storage,
- message_bus: *MessageBus,
- state_machine: *StateMachine,
- ) !Self {
+ message_pool: *MessagePool,
+ // TODO With https://github.com/coilhq/tigerbeetle/issues/71,
+ // the separate message_bus_options won't be necessary.
+ message_bus_options: MessageBus.Options,
+ state_machine_options: StateMachine.Options,
+ };
+
+ /// NOTE: self.superblock must be initialized and opened prior to this call.
+ fn init(self: *Self, allocator: Allocator, options: Options) !void {
+ const replica_count = options.replica_count;
+ const replica_index = options.replica_index;
  assert(replica_count > 0);
- assert(replica < replica_count);
+ assert(replica_index < replica_count);
+
+ assert(self.opened);
+ assert(self.superblock.opened);
+ assert(self.superblock.working.vsr_state.internally_consistent());
 
  const majority = (replica_count / 2) + 1;
  assert(majority <= replica_count);
@@ -277,91 +400,112 @@ pub fn Replica(
  // Flexible quorums are safe if these two quorums intersect so that this relation holds:
  assert(quorum_replication + quorum_view_change > replica_count);
 
- var client_table: ClientTable = .{};
- errdefer client_table.deinit(allocator);
- try client_table.ensureTotalCapacity(allocator, @intCast(u32, config.clients_max));
- assert(client_table.capacity() >= config.clients_max);
+ self.time = options.time;
+ self.clock = try Clock.init(
+ allocator,
+ replica_count,
+ replica_index,
+ &self.time,
+ );
+ errdefer self.clock.deinit(allocator);
 
- const root_prepare = Header.root_prepare(cluster);
+ self.journal = try Journal.init(allocator, options.storage, replica_index);
+ errdefer self.journal.deinit(allocator);
 
- var clock = try Clock.init(
+ self.message_bus = try MessageBus.init(
  allocator,
- replica_count,
- replica,
- time,
+ options.cluster,
+ .{ .replica = options.replica_index },
+ options.message_pool,
+ Self.on_message_from_bus,
+ options.message_bus_options,
  );
- errdefer clock.deinit(allocator);
+ errdefer self.message_bus.deinit(allocator);
+
+ self.grid = try Grid.init(allocator, &self.superblock);
+ errdefer self.grid.deinit(allocator);
 
- const journal = try Journal.init(allocator, storage, replica);
- errdefer journal.deinit(allocator);
+ self.state_machine = try StateMachine.init(
+ allocator,
+ &self.grid,
+ options.state_machine_options,
+ );
+ errdefer self.state_machine.deinit(allocator);
 
  const recovery_nonce = blk: {
  var nonce: [@sizeOf(Nonce)]u8 = undefined;
  var hash = std.crypto.hash.Blake3.init(.{});
- hash.update(std.mem.asBytes(&clock.monotonic()));
- hash.update(&[_]u8{replica});
+ hash.update(std.mem.asBytes(&self.clock.monotonic()));
+ hash.update(&[_]u8{replica_index});
  hash.final(&nonce);
  break :blk @bitCast(Nonce, nonce);
  };
 
- var self = Self{
- .cluster = cluster,
+ self.* = Self{
+ .static_allocator = self.static_allocator,
+ .cluster = options.cluster,
  .replica_count = replica_count,
- .replica = replica,
+ .replica = replica_index,
  .quorum_replication = quorum_replication,
  .quorum_view_change = quorum_view_change,
- .clock = clock,
- .journal = journal,
- .message_bus = message_bus,
- .state_machine = state_machine,
- .client_table = client_table,
- .view = root_prepare.view,
- .view_normal = root_prepare.view,
- .op = root_prepare.op,
- .commit_min = root_prepare.commit,
- .commit_max = root_prepare.commit,
+ // Copy the (already-initialized) time back, to avoid regressing the monotonic
+ // clock guard.
+ .time = self.time,
+ .clock = self.clock,
+ .journal = self.journal,
+ .message_bus = self.message_bus,
+ .state_machine = self.state_machine,
+ .superblock = self.superblock,
+ .grid = self.grid,
+ .opened = self.opened,
+ .view = self.superblock.working.vsr_state.view,
+ .view_normal = self.superblock.working.vsr_state.view_normal,
+ .op = 0,
+ .op_checkpoint = self.superblock.working.vsr_state.commit_min,
+ .commit_min = self.superblock.working.vsr_state.commit_min,
+ .commit_max = self.superblock.working.vsr_state.commit_max,
  .ping_timeout = Timeout{
  .name = "ping_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 100,
  },
  .prepare_timeout = Timeout{
  .name = "prepare_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 50,
  },
  .commit_timeout = Timeout{
  .name = "commit_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 100,
  },
  .normal_status_timeout = Timeout{
  .name = "normal_status_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 500,
  },
  .view_change_status_timeout = Timeout{
  .name = "view_change_status_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 500,
  },
  .view_change_message_timeout = Timeout{
  .name = "view_change_message_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 50,
  },
  .repair_timeout = Timeout{
  .name = "repair_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 50,
  },
  .recovery_timeout = Timeout{
  .name = "recovery_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 200,
  },
  .recovery_nonce = recovery_nonce,
- .prng = std.rand.DefaultPrng.init(replica),
+ .prng = std.rand.DefaultPrng.init(replica_index),
  };
 
  log.debug("{}: init: replica_count={} quorum_view_change={} quorum_replication={}", .{
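The quorum arithmetic asserted near the top of this hunk can be exercised in isolation. A sketch under the assumption that quorum_replication is capped at 3, the way config.quorum_replication_max caps it (the cap value and the exact formulas are assumptions, not read from this diff):

    const std = @import("std");
    const assert = std.debug.assert;

    fn quorums(replica_count: u64) struct { replication: u64, view_change: u64 } {
        const majority = (replica_count / 2) + 1;
        const replication = std.math.min(3, majority); // Assumed quorum_replication_max = 3.
        const view_change = std.math.max(majority, replica_count - replication + 1);
        // Safety: any replication quorum must intersect any view-change quorum.
        assert(replication + view_change > replica_count);
        return .{ .replication = replication, .view_change = view_change };
    }

    test "quorum intersection" {
        const q = quorums(5);
        assert(q.replication == 3 and q.view_change == 3);
    }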
@@ -375,28 +519,24 @@ pub fn Replica(
  // always overallocate capacity by a factor of two.
  log.debug("{}: init: client_table.capacity()={} for config.clients_max={} entries", .{
  self.replica,
- self.client_table.capacity(),
+ self.client_table().capacity(),
  config.clients_max,
  });
 
  assert(self.status == .recovering);
-
- return self;
  }
 
  /// Free all memory and unref all messages held by the replica
  /// This does not deinitialize the StateMachine, MessageBus, Storage, or Time
  pub fn deinit(self: *Self, allocator: Allocator) void {
+ self.static_allocator.transition_from_static_to_deinit();
+
  self.journal.deinit(allocator);
  self.clock.deinit(allocator);
-
- {
- var it = self.client_table.iterator();
- while (it.next()) |entry| {
- self.message_bus.unref(entry.value_ptr.reply);
- }
- self.client_table.deinit(allocator);
- }
+ self.state_machine.deinit(allocator);
+ self.superblock.deinit(allocator);
+ self.grid.deinit(allocator);
+ defer self.message_bus.deinit(allocator);
 
  while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
 
@@ -406,6 +546,15 @@ pub fn Replica(
  self.loopback_queue = null;
  }
 
+ if (self.commit_prepare) |message| {
+ assert(self.committing);
+ assert(self.commit_callback != null);
+ self.message_bus.unref(message);
+ self.commit_prepare = null;
+ } else {
+ assert(self.commit_callback == null);
+ }
+
  for (self.do_view_change_from_all_replicas) |message| {
  if (message) |m| self.message_bus.unref(m);
  }
@@ -415,6 +564,11 @@ pub fn Replica(
  }
  }
 
+ /// The client table records for each client the latest session and the latest committed reply.
+ inline fn client_table(self: *Self) *ClientTable {
+ return &self.superblock.client_table;
+ }
+
  /// Time is measured in logical ticks that are incremented on every call to tick().
  /// This eliminates a dependency on the system time and enables deterministic testing.
  pub fn tick(self: *Self) void {
@@ -424,8 +578,15 @@ pub fn Replica(
  // decrease throughput significantly.
  assert(self.loopback_queue == null);
 
+ // TODO Replica owns Time; should it tick() here instead of Clock?
  self.clock.tick();
 
+ // Storage/IO is ticked by top-level in case of multiple replicas sharing the same IO.
+ // self.journal.storage.tick();
+
+ self.grid.tick();
+ self.message_bus.tick();
+
  if (!self.journal.recovered) {
  if (!self.journal.recovering) self.journal.recover();
  return;
@@ -442,6 +603,10 @@ pub fn Replica(
  // The data file is brand new — no messages have ever been written.
  // Transition to normal status; no need to run the VSR recovery protocol.
  assert(self.journal.faulty.count == 0);
+ assert(self.commit_min == 0);
+ assert(self.commit_max == 0);
+ assert(self.op_checkpoint == 0);
+ assert(self.op == 0);
  self.transition_to_normal_from_recovering_status(0);
  assert(self.status == .normal);
  } else if (self.replica_count == 1) {
@@ -449,8 +614,13 @@ pub fn Replica(
  if (self.journal.faulty.count != 0) @panic("journal is corrupt");
  if (self.committing) return;
  assert(self.op == 0);
+ // TODO Assert that this path isn't taken more than once.
  self.op = self.journal.op_maximum();
- self.commit_ops(self.op);
+ assert(self.op >= self.commit_min);
+ assert(self.op >= self.op_checkpoint);
+ assert(self.op <= self.op_checkpoint_trigger());
+ assert(self.journal.header_with_op(self.op) != null);
+ self.commit_journal(self.op);
  // The recovering→normal transition is deferred until all ops are committed.
  } else {
  // The journal just finished recovery.
@@ -482,6 +652,11 @@ pub fn Replica(
  }
 
  /// Called by the MessageBus to deliver a message to the replica.
+ fn on_message_from_bus(message_bus: *MessageBus, message: *Message) void {
+ const self = @fieldParentPtr(Self, "message_bus", message_bus);
+ self.on_message(message);
+ }
+
  pub fn on_message(self: *Self, message: *Message) void {
  assert(self.loopback_queue == null);
  assert(message.references > 0);
@@ -533,6 +708,7 @@ pub fn Replica(
  .request_start_view => self.on_request_start_view(message),
  .request_prepare => self.on_request_prepare(message),
  .request_headers => self.on_request_headers(message),
+ .request_block => unreachable, // TODO
  .headers => self.on_headers(message),
  .nack_prepare => self.on_nack_prepare(message),
  // A replica should never handle misdirected messages intended for a client:
@@ -543,6 +719,7 @@ pub fn Replica(
  });
  return;
  },
+ .block => unreachable, // TODO
  .reserved => unreachable,
  }
 
@@ -731,7 +908,7 @@ pub fn Replica(
  }
 
  // Verify that the new request will fit in the WAL.
- if (message.header.op >= self.op_checkpoint + config.journal_slot_count) {
+ if (message.header.op > self.op_checkpoint_trigger()) {
  log.debug("{}: on_prepare: ignoring op={} (too far ahead, checkpoint={})", .{
  self.replica,
  message.header.op,
@@ -749,13 +926,15 @@ pub fn Replica(
  assert(message.header.op > self.op_checkpoint);
  assert(message.header.op > self.op);
  assert(message.header.op > self.commit_min);
- assert(message.header.op < self.op_checkpoint + config.journal_slot_count);
+ assert(message.header.op <= self.op_checkpoint_trigger());
 
  if (self.follower()) self.normal_status_timeout.reset();
 
  if (message.header.op > self.op + 1) {
  log.debug("{}: on_prepare: newer op", .{self.replica});
  self.jump_to_newer_op_in_normal_status(message.header);
+ // "`replica.op` exists" invariant is temporarily broken.
+ assert(self.journal.header_with_op(message.header.op - 1) == null);
  }
 
  if (self.journal.previous_entry(message.header)) |previous| {
@@ -782,7 +961,7 @@ pub fn Replica(
 
  if (self.follower()) {
  // A prepare may already be committed if requested by repair() so take the max:
- self.commit_ops(std.math.max(message.header.commit, self.commit_max));
+ self.commit_journal(std.math.max(message.header.commit, self.commit_max));
  assert(self.commit_max >= message.header.commit);
  }
  }
@@ -802,7 +981,10 @@ pub fn Replica(
  assert(prepare.message.header.op <= self.op);
 
  // Wait until we have `f + 1` prepare_ok messages (including ourself) for quorum:
- const threshold = self.quorum_replication;
+ // const threshold = self.quorum_replication;
+ // TODO: When Block recover & state transfer are implemented, this can be removed.
+ const threshold =
+ if (prepare.message.header.op == self.op_checkpoint_trigger()) self.replica_count else self.quorum_replication;
 
  const count = self.count_message_and_receive_quorum_exactly_once(
  &prepare.ok_from_all_replicas,
@@ -867,7 +1049,7 @@ pub fn Replica(
  }
 
  self.normal_status_timeout.reset();
- self.commit_ops(message.header.commit);
+ self.commit_journal(message.header.commit);
  }
 
  fn on_repair(self: *Self, message: *Message) void {
@@ -894,7 +1076,9 @@ pub fn Replica(
  }
 
  if (self.status == .view_change and !self.do_view_change_quorum) {
- log.debug("{}: on_repair: ignoring (view change, waiting for quorum)", .{self.replica});
+ log.debug("{}: on_repair: ignoring (view change, waiting for quorum)", .{
+ self.replica,
+ });
  return;
  }
 
@@ -911,6 +1095,7 @@ pub fn Replica(
 
  if (self.journal.has_clean(message.header)) {
  log.debug("{}: on_repair: ignoring (duplicate)", .{self.replica});
+
  self.send_prepare_ok(message.header);
  defer self.flush_loopback_queue();
  return;
@@ -985,6 +1170,28 @@ pub fn Replica(
  /// informs the other replicas of the completion of the view change by sending
  /// ⟨start_view v, l, n, k⟩ messages to the other replicas, where l is the new log, n is the
  /// op number, and k is the commit number.
+ ///
+ /// For each DVC in the quorum:
+ ///
+ /// * The headers must all belong to the same hash chain. (Gaps are allowed).
+ /// (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
+ /// loaded into the new leader with `replace_header()`, not `repair_header()`).
+ ///
+ /// Across all DVCs in the quorum:
+ ///
+ /// * The headers of every DVC with the same view_normal must agree. In other words:
+ /// dvc₁.headers[i].op == dvc₂.headers[j].op implies
+ /// dvc₁.headers[i].checksum == dvc₂.headers[j].checksum.
+ /// (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
+ /// loaded into the new leader with `replace_header()`, not `repair_header()`).
+ ///
+ /// Perhaps unintuitively, it is safe to advertise a header before its message is prepared
+ /// (e.g. the write is still queued). The header is either:
+ ///
+ /// * committed — so another replica in the quorum must have a copy, according to the quorum
+ /// intersection property. Or,
+ /// * uncommitted — if the header is chosen, but cannot be recovered from any replica, then
+ /// it will be discarded by the nack protocol.
  fn on_do_view_change(self: *Self, message: *Message) void {
  if (self.ignore_view_change_message(message)) return;
 
@@ -1000,8 +1207,9 @@ pub fn Replica(
  // We may receive a `do_view_change` quorum from other replicas, which already have a
  // `start_view_change_quorum`, before we receive a `start_view_change_quorum`:
  if (!self.start_view_change_quorum) {
- log.debug("{}: on_do_view_change: waiting for start_view_change quorum", .{
+ log.debug("{}: on_do_view_change: waiting for start_view_change quorum (view={})", .{
  self.replica,
+ self.view,
  });
  return;
  }
@@ -1023,75 +1231,14 @@ pub fn Replica(
  self.view,
  });
 
- var v: ?u32 = null;
- var k: ?u64 = null;
- var latest = Header.reserved(self.cluster, 0);
-
- for (self.do_view_change_from_all_replicas) |received, replica| {
- if (received) |m| {
- assert(m.header.command == .do_view_change);
- assert(m.header.cluster == self.cluster);
- assert(m.header.replica == replica);
- assert(m.header.view == self.view);
-
- // The latest normal view experienced by this replica:
- // This may be higher than the view in any of the prepare headers.
- var replica_view_normal = @intCast(u32, m.header.timestamp);
- assert(replica_view_normal < m.header.view);
-
- var replica_latest = Header.reserved(self.cluster, 0);
- set_latest_op(self.message_body_as_headers(m), &replica_latest);
- assert(replica_latest.op == m.header.op);
-
- log.debug(
- "{}: on_do_view_change: replica={} v'={} op={} commit={} latest={}",
- .{
- self.replica,
- m.header.replica,
- replica_view_normal,
- m.header.op,
- m.header.commit,
- replica_latest,
- },
- );
-
- if (v == null or replica_view_normal > v.?) {
- v = replica_view_normal;
- latest = replica_latest;
- } else if (replica_view_normal == v.? and replica_latest.op > latest.op) {
- v = replica_view_normal;
- latest = replica_latest;
- }
-
- if (k == null or m.header.commit > k.?) k = m.header.commit;
- }
- }
-
- self.set_latest_op_and_k(&latest, k.?, "on_do_view_change");
-
- // Now that we have the latest op in place, repair any other headers:
- for (self.do_view_change_from_all_replicas) |received| {
- if (received) |m| {
- for (self.message_body_as_headers(m)) |*h| {
- _ = self.repair_header(h);
- }
- }
- }
-
- // Verify that the repairs above have not replaced or advanced the latest op:
- assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
-
  assert(self.start_view_change_quorum);
  assert(!self.do_view_change_quorum);
  self.do_view_change_quorum = true;
 
- self.discard_uncommitted_headers();
+ self.set_log_from_do_view_change_messages();
  assert(self.op >= self.commit_max);
-
- const prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
- if (self.state_machine.prepare_timestamp < prepare_timestamp) {
- self.state_machine.prepare_timestamp = prepare_timestamp;
- }
+ assert(self.state_machine.prepare_timestamp >=
+ self.journal.header_with_op(self.op).?.timestamp);
 
  // Start repairs according to the CTRL protocol:
  assert(!self.repair_timeout.ticking);
@@ -1109,6 +1256,16 @@ pub fn Replica(
  fn on_start_view(self: *Self, message: *const Message) void {
  if (self.ignore_view_change_message(message)) return;
 
+ if (message.header.op > self.op_checkpoint_trigger()) {
+ // This replica is too far behind, i.e. the new `self.op` is too far ahead of the
+ // last checkpoint. If we wrap now, we overwrite un-checkpointed transfers in the WAL,
+ // precluding recovery.
+ //
+ // TODO State transfer. Currently this is unreachable because the
+ // leader won't checkpoint until all replicas are caught up.
+ unreachable;
+ }
+
  assert(self.status == .view_change or self.status == .normal);
  assert(message.header.view >= self.view);
  assert(message.header.replica != self.replica);
@@ -1118,20 +1275,12 @@ pub fn Replica(
 
  assert(self.status == .view_change);
  assert(message.header.view == self.view);
+ assert(message.header.op == op_highest(message_body_as_headers(message)));
 
- var latest = Header.reserved(self.cluster, 0);
- set_latest_op(self.message_body_as_headers(message), &latest);
- assert(latest.op == message.header.op);
-
- self.set_latest_op_and_k(&latest, message.header.commit, "on_start_view");
+ self.set_op_and_commit_max(message.header.op, message.header.commit, "on_start_view");
+ self.replace_headers(message_body_as_headers(message));
 
- // Now that we have the latest op in place, repair any other headers:
- for (self.message_body_as_headers(message)) |*h| {
- _ = self.repair_header(h);
- }
-
- // Verify that the repairs above have not replaced or advanced the latest op:
- assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
+ assert(self.op == message.header.op);
 
  if (self.status == .view_change) {
  self.transition_to_normal_from_view_change_status(message.header.view);
@@ -1142,7 +1291,7 @@ pub fn Replica(
  assert(message.header.view == self.view);
  assert(self.follower());
 
- self.commit_ops(self.commit_max);
+ self.commit_journal(self.commit_max);
 
  self.repair();
  }
@@ -1201,8 +1350,45 @@ pub fn Replica(
  .commit = self.commit_max,
  };
 
- const count_max = 8; // The maximum number of prepare headers to include in the body.
- const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, response);
+ // A recovery response attaches at least as many headers as a DVC message attaches.
+ // To understand why, consider this scenario, where:
+ //
+ // replica_count 3
+ // do_view_change.headers.len 3 (= pipeline_max)
+ // recovery_response.headers.len 2 (!)
+ // replica 0 log 3, 4a, 5a, 6a, 7a, 8a (status=normal, leader)
+ // replica 1 log 3, 4a, 5a, --, --, -- (status=normal, follower)
+ // replica 2 log 3, 4b, 5b, --, --, -- (status=recovering)
+ //
+ // 1. Replica 2 receives a recovery_response quorum.
+ // 2. Replica 2 sets `replica.op` to 8a.
+ // 3. Replica 2 sets its headers from the leader's recovery_response (8a, 7a)
+ // (via `replace_header()`).
+ // 4. Replica 2 transitions to status=normal.
+ // 5. Replica 0 fails (before replica 2 has a chance to repair its hash chain.)
+ // 6. Replica 1 initiates a view change.
+ // 7. Replica 1 collects a DVC quorum:
+ // replica 1: 3, 4a, 5a (view_normal=latest)
+ // replica 2: 5b, 7a, 8a (view_normal=latest)
+ // Replicas 1 and 2 share the highest view_normal, so both sets of headers are canonical.
+ // 8. Replica 1 loads the canonical headers (via `replace_header()`) from both DVCs.
+ // Messages 8a and 7a will be dropped via `do_view_change_op_max()` (due to the
+ // gap at op 6). But there is a conflict at op=5. For correctness, replica 1 must
+ // pick 5a — 5a may be committed by replica 0.
+ // Without replica 0's assistance, replica 1 has no way to pick between 5a/5b.
+ //
+ // Including at least as many headers in the recovery response as the DVC maintains the
+ // invariant: DVCs with the same view_normal must never disagree on the identity of a
+ // message.
+ //
+ // (DVCs can still safely include gaps — but they must be of the form [4a,__,6a],
+ // not [4a,__,6b]).
+ const count = self.copy_latest_headers_and_set_size(
+ 0,
+ self.op,
+ view_change_headers_count,
+ response,
+ );
  assert(count > 0); // We expect that self.op always exists.
  assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
@@ -1258,7 +1444,7 @@ pub fn Replica(
  // receiver's state changed in the mean time.
 
  log.debug(
- "{}: on_recovery_response: replica={} view={}..{} op={}..{} commit={}..{}",
+ "{}: on_recovery_response: replacing response replica={} view={}..{} op={}..{} commit={}..{}",
  .{
  self.replica,
  existing.header.replica,
@@ -1371,17 +1557,41 @@ pub fn Replica(
  // protocol), if the view number indicates that this replica is a leader, it must
  // transition to status=view_change instead of status=normal.
 
- const leader_headers = self.message_body_as_headers(leader_response.?);
+ const leader_headers = message_body_as_headers(leader_response.?);
  assert(leader_headers.len > 0);
 
  const commit = leader_response.?.header.commit;
  {
- var latest = Header.reserved(self.cluster, 0);
- set_latest_op(leader_headers, &latest);
- assert(latest.op == leader_response.?.header.op);
+ const op = op_highest(leader_headers);
+ assert(op == leader_response.?.header.op);
+
+ self.set_op_and_commit_max(op, commit, "on_recovery_response");
+
+ // TODO If the view's primary is >1 WAL ahead of us, these headers could cause
+ // problems. We don't want to jump this far ahead to repair, but we still need to
+ // use the hash chain to figure out which headers to request. Maybe include our
+ // `op_checkpoint` in the recovery (request) message so that the response can give
+ // more useful (i.e. older) headers.
+ self.replace_headers(leader_headers);
+
+ if (self.op < config.journal_slot_count) {
+ if (self.journal.header_with_op(0)) |header| {
+ assert(header.command == .prepare);
+ assert(header.operation == .root);
+ } else {
+ // This is the first wrap of the log, and the root prepare is corrupt.
+ // Repair the root prepare. This is necessary to maintain the invariant that
+ // the op=commit_min exists in-memory.
+ //
+ // op=0 wouldn't have been repaired by replace_headers above, because it is
+ // already "checkpointed".
+ const header = Header.root_prepare(self.cluster);
+ self.journal.set_header_as_dirty(&header);
+ log.debug("{}: on_recovery_response: repair root op", .{self.replica});
+ }
+ }
 
- self.set_latest_op_and_k(&latest, commit, "on_recovery_response");
- assert(self.op == latest.op);
+ assert(self.op == op);
  assert(self.journal.header_with_op(self.op) != null);
  }
 
@@ -1390,30 +1600,7 @@ pub fn Replica(
  assert(self.status == .normal);
  assert(self.follower());
 
- // TODO If the view's primary is >1 WAL ahead of us, these headers could cause
- // problems. We don't want to jump this far ahead to repair, but we still need to use
- // the hash chain to figure out which headers to request. Maybe include our
- // `op_checkpoint` in the recovery (request) message so that the response can give more
- // useful (i.e. older) headers.
- for (leader_headers) |*header| {
- _ = self.repair_header(header);
- }
-
- if (self.op < config.journal_slot_count) {
- if (self.journal.header_with_op(0)) |header| {
- assert(header.command == .prepare);
- assert(header.operation == .root);
- } else {
- // This is the first wrap of the log, and the root prepare is corrupt.
- // Repair the root repair. This is necessary to maintain the invariant that the
- // op=commit_min exists in-memory.
- const header = Header.root_prepare(self.cluster);
- self.journal.set_header_as_dirty(&header);
- log.debug("{}: on_recovery_response: repair root op", .{self.replica});
- }
- }
-
- log.debug("{}: on_recovery_response: responses={} view={} headers={}..{}" ++
+ log.info("{}: on_recovery_response: recovery done responses={} view={} headers={}..{}" ++
  " commit={} dirty={} faulty={}", .{
  self.replica,
  count,
@@ -1429,7 +1616,7 @@ pub fn Replica(
  // `state_machine.commit_timestamp` is updated as messages are committed.
 
  self.reset_quorum_recovery_response();
- self.commit_ops(commit);
+ self.commit_journal(commit);
  self.repair();
  }
 
@@ -1486,28 +1673,18 @@ pub fn Replica(
  checksum,
  });
 
- if (self.journal.header_with_op_and_checksum(op, checksum)) |_| {
- // The header for the target prepare is already in-memory.
- // This is preferable to the `else` case since we have the prepare's
- // `header.size` in-memory, so the read can be (potentially) shorter.
- // TODO Do not reissue the read if we are already reading in order to send
- // to this particular destination replica.
- self.journal.read_prepare(
- on_request_prepare_read,
- op,
- prepare_checksum,
- message.header.replica,
- );
- } else {
- // TODO Do not reissue the read if we are already reading in order to send to
- // this particular destination replica.
- self.journal.read_prepare_with_op_and_checksum(
- on_request_prepare_read,
- op,
- prepare_checksum,
- message.header.replica,
- );
- }
+ // Improve availability by calling `read_prepare_with_op_and_checksum` instead
+ // of `read_prepare` even if `journal.headers` contains the target message.
+ // The latter skips the read when the target prepare is present but dirty (e.g.
+ // it was recovered with decision=fix).
+ // TODO Do not reissue the read if we are already reading in order to send to
+ // this particular destination replica.
+ self.journal.read_prepare_with_op_and_checksum(
+ on_request_prepare_read,
+ op,
+ prepare_checksum,
+ message.header.replica,
+ );
 
  // We have guaranteed the prepare (not safe to nack).
  // Our copy may or may not be valid, but we will try to read & forward it.
@@ -1734,7 +1911,7 @@ pub fn Replica(
 
  var op_min: ?u64 = null;
  var op_max: ?u64 = null;
- for (self.message_body_as_headers(message)) |*h| {
+ for (message_body_as_headers(message)) |*h| {
  if (op_min == null or h.op < op_min.?) op_min = h.op;
  if (op_max == null or h.op > op_max.?) op_max = h.op;
  _ = self.repair_header(h);
@@ -1944,10 +2121,44 @@ pub fn Replica(
  assert(m.header.replica == message.header.replica);
  assert(m.header.view == message.header.view);
  assert(m.header.op == message.header.op);
- assert(m.header.commit == message.header.commit);
  assert(m.header.checksum_body == message.header.checksum_body);
- assert(m.header.checksum == message.header.checksum);
- log.debug("{}: on_{s}: ignoring (duplicate message)", .{ self.replica, command });
+
+ if (message.header.command == .do_view_change) {
+ // Replicas don't resend `do_view_change` messages to themselves.
+ assert(message.header.replica != self.replica);
+ // A replica may resend a `do_view_change` with a different commit if it was
+ // committing originally. Keep the one with the highest commit.
+ // This is *not* necessary for correctness.
+ if (m.header.commit < message.header.commit) {
+ log.debug("{}: on_{s}: replacing (newer message replica={} commit={}..{})", .{
+ self.replica,
+ command,
+ message.header.replica,
+ m.header.commit,
+ message.header.commit,
+ });
+ // TODO(Buggify): skip updating the DVC, since it isn't required for correctness.
+ self.message_bus.unref(m);
+ messages[message.header.replica] = message.ref();
+ } else if (m.header.commit > message.header.commit) {
+ log.debug("{}: on_{s}: ignoring (older message replica={})", .{
+ self.replica,
+ command,
+ message.header.replica,
+ });
+ } else {
+ assert(m.header.checksum == message.header.checksum);
+ }
+ } else {
+ assert(m.header.commit == message.header.commit);
+ assert(m.header.checksum == message.header.checksum);
+ }
+
+ log.debug("{}: on_{s}: ignoring (duplicate message replica={})", .{
+ self.replica,
+ command,
+ message.header.replica,
+ });
  return null;
  }
 
@@ -2004,6 +2215,7 @@ pub fn Replica(
  if (self.replica_count == 2) assert(threshold == 1);
 
  assert(self.status == .view_change);
+ assert(self.replica != message.header.replica);
  },
  .nack_prepare => {
  assert(self.replica_count > 1);
@@ -2011,6 +2223,8 @@ pub fn Replica(
 
  assert(self.status == .view_change);
  assert(self.leader_index(self.view) == self.replica);
+ assert(message.header.replica != self.replica);
+ assert(message.header.op == self.nack_prepare_op.?);
  },
  else => unreachable,
  }
@@ -2065,9 +2279,15 @@ pub fn Replica(
  // In a cluster-of-one, the prepares must always be written to the WAL sequentially
  // (never concurrently). This ensures that there will be no gaps in the WAL during
  // crash recovery.
- log.debug("{}: append: serializing append op={}", .{ self.replica, message.header.op });
+ log.debug("{}: append: serializing append op={}", .{
+ self.replica,
+ message.header.op,
+ });
  } else {
- log.debug("{}: append: appending to journal", .{self.replica});
+ log.debug("{}: append: appending to journal op={}", .{
+ self.replica,
+ message.header.op,
+ });
  self.write_prepare(message, .append);
  }
  }
@@ -2115,9 +2335,9 @@ pub fn Replica(
  }
 
  /// Commit ops up to commit number `commit` (inclusive).
- /// A function which calls `commit_ops()` to set `commit_max` must first call `view_jump()`.
- /// Otherwise, we may fork the log.
- fn commit_ops(self: *Self, commit: u64) void {
+ /// A function which calls `commit_journal()` to set `commit_max` must first call
+ /// `view_jump()`. Otherwise, we may fork the log.
+ fn commit_journal(self: *Self, commit: u64) void {
  // TODO Restrict `view_change` status only to the leader purely as defense-in-depth.
  // Be careful of concurrency when doing this, as successive view changes can happen quickly.
  assert(self.status == .normal or self.status == .view_change or
@@ -2131,9 +2351,9 @@ pub fn Replica(
  if (commit <= self.commit_min) return;
 
  // We must update `commit_max` even if we are already committing, otherwise we will lose
- // information that we should know, and `set_latest_op_and_k()` will catch us out:
+ // information that we should know, and `set_op_and_commit_max()` will catch us out:
  if (commit > self.commit_max) {
- log.debug("{}: commit_ops: advancing commit_max={}..{}", .{
+ log.debug("{}: commit_journal: advancing commit_max={}..{}", .{
  self.replica,
  self.commit_max,
  commit,
@@ -2141,9 +2361,9 @@ pub fn Replica(
  self.commit_max = commit;
  }
 
- // Guard against multiple concurrent invocations of commit_ops():
+ // Guard against multiple concurrent invocations of commit_journal()/commit_pipeline():
  if (self.committing) {
- log.debug("{}: commit_ops: already committing...", .{self.replica});
+ log.debug("{}: commit_journal: already committing...", .{self.replica});
  return;
  }
 
@@ -2160,19 +2380,19 @@ pub fn Replica(
  assert(!self.committing);
  self.committing = true;
 
- self.commit_ops_read();
+ self.commit_journal_next();
  }
 
- fn commit_ops_read(self: *Self) void {
+ fn commit_journal_next(self: *Self) void {
  assert(self.committing);
  assert(self.status == .normal or self.status == .view_change or
  (self.status == .recovering and self.replica_count == 1));
  assert(self.commit_min <= self.commit_max);
  assert(self.commit_min <= self.op);
 
- if (!self.valid_hash_chain("commit_ops_read")) {
- self.committing = false;
+ if (!self.valid_hash_chain("commit_journal_next")) {
  assert(self.replica_count > 1);
+ self.commit_ops_done();
  return;
  }
  assert(self.op >= self.commit_max);
@@ -2182,9 +2402,9 @@ pub fn Replica(
  if (self.commit_min < self.commit_max and self.commit_min < self.op) {
  const op = self.commit_min + 1;
  const checksum = self.journal.header_with_op(op).?.checksum;
- self.journal.read_prepare(commit_ops_commit, op, checksum, null);
+ self.journal.read_prepare(commit_journal_next_callback, op, checksum, null);
  } else {
- self.committing = false;
+ self.commit_ops_done();
  // This is an optimization to expedite the view change before the `repair_timeout`:
  if (self.status == .view_change and self.repairs_allowed()) self.repair();
 
@@ -2194,33 +2414,43 @@ pub fn Replica(
  assert(self.commit_min == self.op);
  self.transition_to_normal_from_recovering_status(0);
  } else {
- // We expect that a cluster-of-one only calls commit_ops() in recovering status.
+ // We expect that a cluster-of-one only calls commit_journal() in recovering status.
  assert(self.replica_count > 1);
  }
  }
  }
 
- fn commit_ops_commit(self: *Self, prepare: ?*Message, destination_replica: ?u8) void {
- assert(destination_replica == null);
-
+ fn commit_journal_next_callback(self: *Self, prepare: ?*Message, destination_replica: ?u8) void {
  assert(self.committing);
- self.committing = false;
+ assert(destination_replica == null);
 
  if (prepare == null) {
- log.debug("{}: commit_ops_commit: prepare == null", .{self.replica});
+ self.commit_ops_done();
+ log.debug("{}: commit_journal_next_callback: prepare == null", .{self.replica});
  if (self.replica_count == 1) @panic("cannot recover corrupt prepare");
  return;
  }
 
+ const slot = self.journal.slot_with_op_and_checksum(
+ prepare.?.header.op,
+ prepare.?.header.checksum,
+ ).?;
+ assert(self.journal.prepare_inhabited[slot.index]);
+ assert(self.journal.prepare_checksums[slot.index] == prepare.?.header.checksum);
+ assert(self.journal.has(prepare.?.header));
+
  switch (self.status) {
  .normal => {},
  .view_change => {
  if (self.leader_index(self.view) != self.replica) {
- log.debug("{}: commit_ops_commit: no longer leader", .{self.replica});
+ self.commit_ops_done();
+ log.debug("{}: commit_journal_next_callback: no longer leader view={}", .{
+ self.replica,
+ self.view,
+ });
  assert(self.replica_count > 1);
  return;
  }
-
  // Only the leader may commit during a view change before starting the new view.
  // Fall through if this is indeed the case.
  },
@@ -2231,31 +2461,194 @@ pub fn Replica(
  }

  const op = self.commit_min + 1;
+ assert(prepare.?.header.op == op);

- if (prepare.?.header.op != op) {
- log.debug("{}: commit_ops_commit: op changed", .{self.replica});
- assert(self.replica_count > 1);
- return;
+ self.commit_op_prefetch(prepare.?, commit_journal_callback);
+ }
+
+ fn commit_journal_callback(self: *Self) void {
+ assert(self.committing);
+ assert(self.commit_min <= self.commit_max);
+ assert(self.commit_min <= self.op);
+
+ self.commit_journal_next();
+ }
+
+ /// Begin the commit path that is common between `commit_pipeline` and `commit_journal`:
+ ///
+ /// 1. prefetch
+ /// 2. commit_op: Update the state machine and the replica's commit_min/commit_max.
+ /// 3. compact
+ /// 4. checkpoint: (Only called when `commit_min == op_checkpoint_trigger`).
+ /// 5. done: Call the `callback` that was passed to `commit_op_prefetch`.
+ fn commit_op_prefetch(
+ self: *Self,
+ prepare: *Message,
+ callback: fn (*Self) void,
+ ) void {
+ assert(self.committing);
+ assert(self.status == .normal or self.status == .view_change or
+ (self.status == .recovering and self.replica_count == 1));
+ assert(self.commit_prepare == null);
+ assert(self.commit_callback == null);
+ assert(prepare.header.command == .prepare);
+ assert(prepare.header.operation != .root);
+ assert(prepare.header.op == self.commit_min + 1);
+ assert(prepare.header.op <= self.op);
+
+ self.commit_prepare = prepare.ref();
+ self.commit_callback = callback;
+ self.state_machine.prefetch(
+ commit_op_prefetch_callback,
+ prepare.header.op,
+ prepare.header.operation.cast(StateMachine),
+ prepare.body(),
+ );
+ }
+
+ fn commit_op_prefetch_callback(state_machine: *StateMachine) void {
+ const self = @fieldParentPtr(Self, "state_machine", state_machine);
+ assert(self.committing);
+ assert(self.commit_prepare != null);
+ assert(self.commit_callback != null);
+ assert(self.commit_prepare.?.header.op == self.commit_min + 1);
+
+ self.commit_op(self.commit_prepare.?);
+ assert(self.commit_min == self.commit_prepare.?.header.op);
+ assert(self.commit_min <= self.commit_max);
+
+ if (self.status == .normal and self.leader()) {
+ const prepare = self.pipeline.pop().?;
+ assert(self.commit_min == self.commit_max);
+ assert(prepare.message.header.checksum == self.commit_prepare.?.header.checksum);
+ assert(prepare.message.header.op == self.commit_min);
+ assert(prepare.message.header.op == self.commit_max);
+ assert(self.prepare_timeout.ticking);
+
+ self.message_bus.unref(prepare.message);
+
+ if (self.pipeline.head_ptr()) |next| {
+ assert(next.message.header.op == self.commit_min + 1);
+ assert(next.message.header.op == self.commit_prepare.?.header.op + 1);
+
+ if (self.replica_count == 1) {
+ // Write the next message in the queue.
+ // A cluster-of-one writes prepares sequentially to avoid gaps in the
+ // WAL caused by reordered writes.
+ log.debug("{}: append: appending to journal op={}", .{
+ self.replica,
+ next.message.header.op,
+ });
+ self.write_prepare(next.message, .append);
+ }
+ } else {
+ // When the pipeline is empty, stop the prepare timeout.
+ // The timeout will be restarted when another entry arrives for the pipeline.
+ self.prepare_timeout.stop();
+ }
  }

- if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
- log.debug("{}: commit_ops_commit: checksum changed", .{self.replica});
- assert(self.replica_count > 1);
- return;
+ self.state_machine.compact(commit_op_compact_callback, self.commit_prepare.?.header.op);
+ }
+
+ fn commit_op_compact_callback(state_machine: *StateMachine) void {
+ const self = @fieldParentPtr(Self, "state_machine", state_machine);
+ assert(self.committing);
+ assert(self.commit_callback != null);
+ assert(self.op_checkpoint == self.superblock.staging.vsr_state.commit_min);
+ assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
+
+ const op = self.commit_prepare.?.header.op;
+ assert(op == self.commit_min);
+
+ if (op == self.op_checkpoint_trigger()) {
+ assert(op == self.op);
+ assert((op + 1) % config.lsm_batch_multiple == 0);
+ log.debug("{}: commit_op_compact_callback: checkpoint start " ++
+ "(op={} current_checkpoint={} next_checkpoint={})", .{
+ self.replica,
+ self.op,
+ self.op_checkpoint,
+ self.op_checkpoint_next(),
+ });
+ self.state_machine.checkpoint(commit_op_checkpoint_state_machine_callback);
+ } else {
+ assert(op < self.op_checkpoint_trigger());
+ self.commit_op_done();
  }
+ }

- self.commit_op(prepare.?);
+ fn commit_op_checkpoint_state_machine_callback(state_machine: *StateMachine) void {
+ const self = @fieldParentPtr(Self, "state_machine", state_machine);
+ assert(self.committing);
+ assert(self.commit_callback != null);
+ assert(self.commit_prepare.?.header.op == self.op);
+ assert(self.commit_prepare.?.header.op == self.commit_min);
+ assert(self.commit_prepare.?.header.op == self.op_checkpoint_trigger());

- assert(self.commit_min == op);
- assert(self.commit_min <= self.commit_max);
- assert(self.commit_min <= self.op);
+ // For the given WAL (journal_slot_count=8, lsm_batch_multiple=2, op=commit_min=7):
+ //
+ // A B C D E
+ // |01|23|45|67|
+ //
+ // The checkpoint is triggered at "E".
+ // At this point, ops 6 and 7 are in the in-memory immutable table.
+ // They will only be compacted to disk in the next bar.
+ // Therefore, only ops "A..D" are committed to disk.
+ // Thus, the SuperBlock's `commit_min` is set to 7-2=5.
+ const vsr_state_new = .{
+ .commit_min = self.op_checkpoint_next(),
+ .commit_max = self.commit_max,
+ .view_normal = self.view_normal,
+ .view = self.view,
+ };
+ assert(VSRState.monotonic(self.superblock.working.vsr_state, vsr_state_new));

- self.committing = true;
- self.commit_ops_read();
+ self.superblock.staging.vsr_state = vsr_state_new;
+ self.superblock.checkpoint(
+ commit_op_checkpoint_superblock_callback,
+ &self.superblock_context,
+ );
+ }
+
+ fn commit_op_checkpoint_superblock_callback(superblock_context: *SuperBlock.Context) void {
+ const self = @fieldParentPtr(Self, "superblock_context", superblock_context);
+ assert(self.committing);
+ assert(self.commit_callback != null);
+ assert(self.commit_prepare.?.header.op == self.op);
+ assert(self.commit_prepare.?.header.op == self.commit_min);
+
+ self.op_checkpoint = self.op_checkpoint_next();
+ assert(self.op_checkpoint == self.commit_min - config.lsm_batch_multiple);
+ assert(self.op_checkpoint == self.superblock.staging.vsr_state.commit_min);
+ assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
+
+ log.debug("{}: commit_op_checkpoint_superblock_callback: checkpoint done (op={} new_checkpoint={})", .{
+ self.replica,
+ self.op,
+ self.op_checkpoint,
+ });
+
+ self.commit_op_done();
+ }
+
+ fn commit_op_done(self: *Self) void {
+ const callback = self.commit_callback.?;
+ assert(self.committing);
+ assert(self.commit_prepare.?.header.op == self.commit_min);
+ assert(self.commit_prepare.?.header.op < self.op_checkpoint_trigger());
+
+ self.message_bus.unref(self.commit_prepare.?);
+ self.commit_prepare = null;
+ self.commit_callback = null;
+ callback(self);
  }

  fn commit_op(self: *Self, prepare: *const Message) void {
  // TODO Can we add more checks around allowing commit_op() during a view change?
+ assert(self.committing);
+ assert(self.commit_prepare.? == prepare);
+ assert(self.commit_callback != null);
  assert(self.status == .normal or self.status == .view_change or
  (self.status == .recovering and self.replica_count == 1));
  assert(prepare.header.command == .prepare);
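This hunk replaces the old synchronous `commit_ops_commit` with a chain of asynchronous callbacks: `commit_op_prefetch` stores the caller's continuation, each completion callback starts the next stage (prefetch, commit_op, compact, checkpoint), and `commit_op_done` clears the stored state before resuming the caller. A minimal standalone sketch of that callback-chaining pattern, with hypothetical simplified names (`Committer`, `on_commit`) rather than the replica's actual types:

```zig
// Sketch of the stored-callback commit chain (hypothetical, simplified).
const std = @import("std");

const Committer = struct {
    committing: bool = false,
    commit_callback: ?fn (*Committer) void = null,

    // Step 1: store the caller's callback, then start the async chain.
    fn commit_op_prefetch(self: *Committer, callback: fn (*Committer) void) void {
        std.debug.assert(self.commit_callback == null);
        self.commit_callback = callback;
        self.prefetch_done(); // A real implementation would yield to I/O here.
    }

    fn prefetch_done(self: *Committer) void {
        // Steps 2 and 3 (commit_op, then compact) would run here.
        self.compact_done();
    }

    fn compact_done(self: *Committer) void {
        // Step 4 (checkpoint) is interposed only when the commit reaches the
        // checkpoint trigger; otherwise fall straight through.
        self.commit_op_done();
    }

    // Step 5: clear the stored state before invoking the callback, so that
    // the callback may immediately begin committing the next op.
    fn commit_op_done(self: *Committer) void {
        const callback = self.commit_callback.?;
        self.commit_callback = null;
        callback(self);
    }
};

fn on_commit(committer: *Committer) void {
    committer.committing = false;
}

pub fn main() void {
    var committer = Committer{ .committing = true };
    committer.commit_op_prefetch(on_commit);
    std.debug.assert(!committer.committing);
}
```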
@@ -2263,10 +2656,12 @@ pub fn Replica(
  assert(prepare.header.op == self.commit_min + 1);
  assert(prepare.header.op <= self.op);

- // If we are a follower committing through `commit_ops()` then a view change may have
- // happened since we last checked in `commit_ops_read()`. However, this would relate to
- // subsequent ops, since by now we have already verified the hash chain for this commit.
+ // If we are a follower committing through `commit_journal()` then a view change may
+ // have happened since we last checked in `commit_journal_next()`. However, this would
+ // relate to subsequent ops, since by now we have already verified the hash chain for
+ // this commit.

+ assert(self.journal.has(prepare.header));
  assert(self.journal.header_with_op(self.commit_min).?.checksum ==
  prepare.header.parent);

@@ -2282,10 +2677,16 @@ pub fn Replica(
  const reply = self.message_bus.get_message();
  defer self.message_bus.unref(reply);

+ log.debug("{}: commit_op: commit_timestamp={} prepare.header.timestamp={}", .{
+ self.replica,
+ self.state_machine.commit_timestamp,
+ prepare.header.timestamp,
+ });
  assert(self.state_machine.commit_timestamp < prepare.header.timestamp);

  const reply_body_size = @intCast(u32, self.state_machine.commit(
  prepare.header.client,
+ prepare.header.op,
  prepare.header.operation.cast(StateMachine),
  prepare.buffer[@sizeOf(Header)..prepare.header.size],
  reply.buffer[@sizeOf(Header)..],
@@ -2310,20 +2711,38 @@ pub fn Replica(
  .replica = prepare.header.replica,
  .view = prepare.header.view,
  .op = prepare.header.op,
+ .timestamp = prepare.header.timestamp,
  .commit = prepare.header.op,
  .size = @sizeOf(Header) + reply_body_size,
  };
- assert(reply.header.timestamp == 0);
  assert(reply.header.epoch == 0);

  reply.header.set_checksum_body(reply.body());
  reply.header.set_checksum();

- if (reply.header.operation == .register) {
- self.create_client_table_entry(reply);
- } else {
- self.update_client_table_entry(reply);
- }
+ if (self.superblock.working.vsr_state.op_compacted(prepare.header.op)) {
+ // We are recovering from a checkpoint. Prior to the crash, the client table was
+ // updated with entries for one bar beyond the op_checkpoint.
+ assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
+ if (self.client_table().get(prepare.header.client)) |entry| {
+ assert(entry.reply.header.command == .reply);
+ assert(entry.reply.header.op >= prepare.header.op);
+ } else {
+ assert(self.client_table().count() == self.client_table().capacity());
+ }
+
+ log.debug("{}: commit_op: skip client table update: prepare.op={} checkpoint={}", .{
+ self.replica,
+ prepare.header.op,
+ self.op_checkpoint,
+ });
+ } else {
+ if (reply.header.operation == .register) {
+ self.create_client_table_entry(reply);
+ } else {
+ self.update_client_table_entry(reply);
+ }
+ }

  if (self.leader_index(self.view) == self.replica) {
  log.debug("{}: commit_op: replying to client: {}", .{ self.replica, reply.header });
@@ -2332,22 +2751,38 @@ pub fn Replica(
  }

  /// Commits, frees and pops as many prepares at the head of the pipeline as have quorum.
+ /// Can be called only when the replica is the leader.
  /// Can be called only when the pipeline has at least one prepare.
- /// Stops the prepare timeout and resets the timeouts counter if the pipeline becomes empty.
  fn commit_pipeline(self: *Self) void {
  assert(self.status == .normal);
  assert(self.leader());
  assert(self.pipeline.count > 0);

- while (self.pipeline.head_ptr()) |prepare| {
- assert(self.pipeline.count > 0);
+ // Guard against multiple concurrent invocations of commit_journal()/commit_pipeline():
+ if (self.committing) {
+ log.debug("{}: commit_pipeline: already committing...", .{self.replica});
+ return;
+ }
+
+ self.committing = true;
+ self.commit_pipeline_next();
+ }
+
+ fn commit_pipeline_next(self: *Self) void {
+ assert(self.committing);
+ assert(self.status == .normal);
+ assert(self.leader());
+
+ if (self.pipeline.head_ptr()) |prepare| {
  assert(self.commit_min == self.commit_max);
- assert(self.commit_max + self.pipeline.count == self.op);
- assert(self.commit_max + 1 == prepare.message.header.op);
+ assert(self.commit_min + 1 == prepare.message.header.op);
+ assert(self.commit_min + self.pipeline.count == self.op);
+ assert(self.journal.has(prepare.message.header));

  if (!prepare.ok_quorum_received) {
  // Eventually handled by on_prepare_timeout().
  log.debug("{}: commit_pipeline: waiting for quorum", .{self.replica});
+ self.commit_ops_done();
  return;
  }

@@ -2355,26 +2790,30 @@ pub fn Replica(
  assert(count >= self.quorum_replication);
  assert(count <= self.replica_count);

- self.commit_op(prepare.message);
-
- assert(self.commit_min == self.commit_max);
- assert(self.commit_max == prepare.message.header.op);
+ self.commit_op_prefetch(prepare.message, commit_pipeline_callback);
+ } else {
+ self.commit_ops_done();
+ }
+ }

- self.message_bus.unref(self.pipeline.pop().?.message);
+ fn commit_pipeline_callback(self: *Self) void {
+ assert(self.committing);
+ assert(self.commit_min <= self.commit_max);
+ assert(self.commit_min <= self.op);

- if (self.replica_count == 1) {
- if (self.pipeline.head_ptr()) |head| {
- // Write the next message in the queue.
- // A cluster-of-one writes prepares sequentially to avoid gaps in the WAL.
- self.write_prepare(head.message, .append);
- // The loop will wrap around and exit when `!ok_quorum_received`.
- }
+ if (self.status == .normal and self.leader()) {
+ if (self.pipeline.head_ptr()) |pipeline_head| {
+ assert(pipeline_head.message.header.op == self.commit_min + 1);
  }
+ self.commit_pipeline_next();
+ } else {
+ self.commit_ops_done();
  }
+ }

- assert(self.prepare_timeout.ticking);
-
- if (self.pipeline.count == 0) self.prepare_timeout.stop();
+ fn commit_ops_done(self: *Self) void {
+ assert(self.committing);
+ self.committing = false;
  }
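Both `commit_journal()` and `commit_pipeline()` now funnel through the shared `committing` flag: whichever path starts first wins, the other backs off, and every exit path ends in `commit_ops_done()`. A minimal sketch of that re-entrancy guard, assuming a single hypothetical global flag and free functions rather than the replica's methods:

```zig
// Sketch of the committing guard shared by both commit entry points.
const std = @import("std");

var committing: bool = false;

fn commit_start() bool {
    if (committing) return false; // Already committing: caller must back off.
    committing = true;
    return true;
}

fn commit_done() void {
    std.debug.assert(committing);
    committing = false;
}

pub fn main() void {
    std.debug.assert(commit_start());
    std.debug.assert(!commit_start()); // A concurrent invocation is rejected.
    commit_done();
    std.debug.assert(commit_start()); // After release, committing may resume.
    commit_done();
}
```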

  fn copy_latest_headers_and_set_size(
@@ -2402,7 +2841,10 @@ pub fn Replica(
  const count = self.journal.copy_latest_headers_between(
  op_min,
  op_max,
- std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..][0..body_size_max]),
+ std.mem.bytesAsSlice(
+ Header,
+ message.buffer[@sizeOf(Header)..][0..body_size_max],
+ ),
  );

  message.header.size = @intCast(u32, @sizeOf(Header) * (1 + count));
@@ -2426,17 +2868,8 @@ pub fn Replica(
  assert(m.header.context == context);
  assert(m.header.replica == replica);
  switch (command) {
- .start_view_change => {
- assert(m.header.replica != self.replica);
- assert(m.header.view == self.view);
- },
  .do_view_change => assert(m.header.view == self.view),
  .recovery_response => assert(m.header.replica != self.replica),
- .nack_prepare => {
- // TODO See if we can restrict this branch further.
- assert(m.header.replica != self.replica);
- assert(m.header.op == self.nack_prepare_op.?);
- },
  else => unreachable,
  }
  count += 1;
@@ -2473,12 +2906,12 @@ pub fn Replica(
  // we do require that all entries have different commit numbers and are iterated.
  // This ensures that we will always pick the entry with the oldest commit number.
  // We also check that a client has only one entry in the hash map (or it's buggy).
- const clients = self.client_table.count();
+ const clients = self.client_table().count();
  assert(clients <= config.clients_max);
  if (clients == config.clients_max) {
  var evictee: ?*Message = null;
  var iterated: usize = 0;
- var iterator = self.client_table.valueIterator();
+ var iterator = self.client_table().iterator();
  while (iterator.next()) |entry| : (iterated += 1) {
  assert(entry.reply.header.command == .reply);
  assert(entry.reply.header.context == 0);
@@ -2503,8 +2936,7 @@ pub fn Replica(
  config.clients_max,
  evictee.?.header.client,
  });
- assert(self.client_table.remove(evictee.?.header.client));
- assert(!self.client_table.contains(evictee.?.header.client));
+ self.client_table().remove(evictee.?.header.client);
  self.message_bus.unref(evictee.?);
  }

@@ -2517,11 +2949,11 @@ pub fn Replica(

  // Any duplicate .register requests should have received the same session number if the
  // client table entry already existed, or been dropped if a session was being committed:
- self.client_table.putAssumeCapacityNoClobber(reply.header.client, .{
+ self.client_table().put(&.{
  .session = session,
  .reply = reply.ref(),
  });
- assert(self.client_table.count() <= config.clients_max);
+ assert(self.client_table().count() <= config.clients_max);
  }

  /// The caller owns the returned message, if any, which has exactly 1 reference.
@@ -2545,19 +2977,16 @@ pub fn Replica(
  // We use the `timestamp` field to send this in addition to the current view number:
  .timestamp = if (command == .do_view_change) self.view_normal else 0,
  .op = self.op,
- .commit = self.commit_max,
+ // See the comment in `on_do_view_change()` for why `commit_min` is crucial:
+ .commit = if (command == .do_view_change) self.commit_min else self.commit_max,
  };

- // CRITICAL: The number of prepare headers to include in the body:
- // We must provide enough headers to cover all uncommitted headers so that the new
- // leader (if we are in a view change) can decide whether to discard uncommitted headers
- // that cannot be repaired because they are gaps, and this must be relative to the
- // cluster as a whole (not relative to the difference between our op and commit number)
- // as otherwise we would break correctness.
- const count_max = config.pipeline_max;
- assert(count_max > 0);
-
- const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, message);
+ const count = self.copy_latest_headers_and_set_size(
+ 0,
+ self.op,
+ view_change_headers_count,
+ message,
+ );
  assert(count > 0); // We expect that self.op always exists.
  assert(@divExact(message.header.size, @sizeOf(Header)) == 1 + count);

@@ -2585,106 +3014,65 @@ pub fn Replica(
  return message.ref();
  }

- /// Discards uncommitted headers during a view change before the new leader starts the view.
- /// This is required to maximize availability in the presence of storage faults.
- /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
+ /// Returns the op of the highest canonical message, according to this replica (the new
+ /// leader) prior to loading the current view change's DVC quorum headers.
+ /// When this replica participated in the last `view_normal`, this is just `replica.op`.
  ///
- /// It's possible for the new leader to have done an op jump in a previous view, and so
- /// introduced a header gap for an op, which was then discarded by another leader during a
- /// newer view change, before surviving into this view as a gap because our latest op was
- /// set as the latest op for the quorum.
+ /// - A *canonical* message was part of the last view_normal.
+ /// - An *uncanonical* message may have been removed/changed by a prior view.
+ /// - Canonical messages do not necessarily survive into the new view, but they take
+ /// precedence over uncanonical messages.
+ /// - Canonical messages may be committed or uncommitted.
  ///
- /// In this case, it may be impossible for the new leader to repair the missing header since
- /// the rest of the cluster may have already discarded it. We therefore iterate over our
- /// uncommitted header gaps and compare them with the quorum of do_view_change messages
- /// received from other replicas, before starting the new view, to discard any that may be
- /// impossible to repair.
+ /// Consider these logs:
  ///
- /// For example, if the old primary replicates ops=7,8,9 (all uncommitted) but only op=9 is
- /// prepared on another replica before the old primary crashes, then this function finds a
- /// gap for ops=7,8 and will attempt to discard ops 7,8,9.
- // TODO To improve availability, potentially call this before the local headers are
- // repaired during the view change, so that we can participate in nacking headers.
- fn discard_uncommitted_headers(self: *Self) void {
+ /// replica 0: 4, 5, 6b, 7b, 8b (commit_min=6b, leader, status=normal, view=X)
+ /// replica 1: 4, 5, 6b, --, -- (commit_min=5, follower, status=normal, view=X)
+ /// replica 2: 4, 5, 6a, --, 8b (view<X)
+ ///
+ /// 1. Replica 0 crashes immediately after committing 6b.
+ /// 2. Replicas 1 and 2 must determine the new chain HEAD.
+ /// 3. 8b is discarded due to the gap in 7.
+ /// 4. To distinguish between 6a and 6b (and safely discard 6a), the new leader trusts ops
+ /// from the DVC(s) with the greatest `view_normal`.
+ fn op_canonical_max(self: *const Self, view_normal_canonical: u64) usize {
+ assert(self.replica_count > 1);
  assert(self.status == .view_change);
  assert(self.leader_index(self.view) == self.replica);
  assert(self.do_view_change_quorum);
  assert(!self.repair_timeout.ticking);
- assert(self.op >= self.commit_max);
- assert(self.replica_count > 1);
- assert(self.op - self.commit_max <= config.journal_slot_count);
+ assert(self.journal.header_with_op(self.op) != null);
+ assert(self.view_normal <= view_normal_canonical);

- const threshold = self.replica_count - self.quorum_replication;
- if (threshold == 0) {
- assert(self.replica_count == 2);
- return;
- }
+ if (self.view_normal == view_normal_canonical) return self.op;

- // Iterating > commit_max does not in itself guarantee that the header is uncommitted.
- // We must also count nacks from the quorum, since the old primary may have committed
- // another op just before crashing, if there was sufficient quorum. Counting nacks
- // ensures that the old primary could not possibly have committed the header.
- var op = self.op;
- while (op > self.commit_max) : (op -= 1) {
- if (self.journal.header_with_op(op) != null) continue;
-
- log.debug("{}: discard_uncommitted_headers: op={} gap", .{ self.replica, op });
-
- var nacks: usize = 0;
- for (self.do_view_change_from_all_replicas) |received, replica| {
- if (received) |m| {
- assert(m.header.command == .do_view_change);
- assert(m.header.cluster == self.cluster);
- assert(m.header.replica == replica);
- assert(m.header.view == self.view);
- assert(m.header.commit <= self.commit_max);
-
- if (replica != self.replica) {
- // Check for a gap in the uncommitted headers from this replica.
- const received_headers = self.message_body_as_headers(m);
- assert(received_headers.len >= 1);
-
- const received_op_min = received_headers[received_headers.len - 1].op;
- const received_op_max = received_headers[0].op;
- assert(received_op_max >= received_op_min);
-
- const nack = for (received_headers) |*h| {
- if (h.op == op) break false;
- } else nack: {
- // Don't nack ops that didn't fit in the message's attached headers.
- break :nack op >= received_op_min;
- };
-
- if (nack) nacks += 1;
- log.debug("{}: discard_uncommitted_headers: replica={} op={} nack={}", .{
- self.replica,
- m.header.replica,
- op,
- nack,
- });
- }
- }
- }
+ const uncanonical_op_count = std.math.min(
+ // Do not reset any ops that we have already committed.
+ self.op - self.commit_min,
+ // The number of uncommitted ops cannot be more than the length of the pipeline.
+ // Do not reset any ops that we did not include in our do_view_change message.
+ config.pipeline_max,
+ );

- log.debug("{}: discard_uncommitted_headers: op={} nacks={} threshold={}", .{
- self.replica,
- op,
- nacks,
- threshold,
- });
+ assert(uncanonical_op_count <= config.pipeline_max);
+ if (uncanonical_op_count == 0) return self.op;

- if (nacks >= threshold) {
- assert(op > self.commit_max);
+ // * When uncanonical_op_count = self.op - self.commit_min,
+ // self.op - uncanonical_op_count = self.commit_min.
+ // * When uncanonical_op_count = config.pipeline_max,
+ // config.pipeline_max < self.op - self.commit_min holds.
+ const canonical_op_max = self.op - uncanonical_op_count;

- self.journal.remove_entries_from(op);
- self.op = op - 1;
+ log.debug("{}: on_do_view_change: not canonical ops={}..{}", .{
+ self.replica,
+ canonical_op_max + 1,
+ self.op,
+ });

- const slot = self.journal.slot_for_op(op);
- assert(self.journal.header_for_op(op) == null);
- assert(!self.journal.dirty.bit(slot));
- assert(!self.journal.faulty.bit(slot));
- }
- }
+ assert(canonical_op_max <= self.op);
+ assert(canonical_op_max >= self.commit_min);
+ assert(canonical_op_max + config.pipeline_max >= self.op);
+ return canonical_op_max;
  }
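To make the clamp in `op_canonical_max` concrete, here is a small worked example of the `std.math.min` arithmetic. The numbers and the `pipeline_max = 8` value are hypothetical, chosen only for illustration:

```zig
// Worked example of the uncanonical-op clamp.
const std = @import("std");

pub fn main() void {
    const pipeline_max: u64 = 8;

    // Case 1: few uncommitted ops. op=10, commit_min=6:
    // uncanonical = min(10 - 6, 8) = 4, so canonical_op_max = 10 - 4 = 6 (= commit_min).
    std.debug.assert(10 - std.math.min(10 - 6, pipeline_max) == 6);

    // Case 2: many uncommitted ops. op=20, commit_min=5:
    // uncanonical = min(15, 8) = 8, so canonical_op_max = 20 - 8 = 12 (> commit_min).
    std.debug.assert(20 - std.math.min(20 - 5, pipeline_max) == 12);
}
```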

  /// Discards uncommitted ops during a view change from after and including `op`.
@@ -2710,8 +3098,8 @@ pub fn Replica(
  self.view,
  });

- self.journal.remove_entries_from(op);
  self.op = op - 1;
+ self.journal.remove_entries_from(op);

  assert(self.journal.header_for_op(op) == null);
  assert(!self.journal.dirty.bit(slot));
@@ -2729,8 +3117,8 @@ pub fn Replica(
  }

  fn flush_loopback_queue(self: *Self) void {
- // There are three cases where a replica will send a message to itself:
- // However, of these three cases, only two cases will call send_message_to_replica().
+ // There are four cases where a replica will send a message to itself:
+ // However, of these four cases, all but one call send_message_to_replica().
  //
  // 1. In on_request(), the leader sends a synchronous prepare to itself, but this is
  // done by calling on_prepare() directly, and subsequent prepare timeout retries will
@@ -2739,6 +3127,8 @@ pub fn Replica(
  // asynchronous prepare_ok to itself.
  // 3. In on_start_view_change(), after receiving a quorum of start_view_change
  // messages, the new leader sends a synchronous do_view_change to itself.
+ // 4. In start_view_as_the_new_leader(), the new leader sends itself a prepare_ok
+ // message for each uncommitted message.
  if (self.loopback_queue) |message| {
  defer self.message_bus.unref(message);

@@ -2891,10 +3281,10 @@ pub fn Replica(

  // Verify that the new request will fit in the WAL.
  // The message's op hasn't been assigned yet, but it will be `self.op + 1`.
- if (self.op + 1 >= self.op_checkpoint + config.journal_slot_count) {
+ if (self.op == self.op_checkpoint_trigger()) {
  log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint={})", .{
  self.replica,
- message.header.op,
+ self.op + 1,
  self.op_checkpoint,
  });
  return true;
@@ -2915,7 +3305,7 @@ pub fn Replica(
  assert(message.header.context == 0 or message.header.operation != .register);
  assert(message.header.request == 0 or message.header.operation != .register);

- if (self.client_table.getPtr(message.header.client)) |entry| {
+ if (self.client_table().get(message.header.client)) |entry| {
  assert(entry.reply.header.command == .reply);
  assert(entry.reply.header.client == message.header.client);

@@ -3105,6 +3495,81 @@ pub fn Replica(
  return false;
  }

+ fn is_repair(self: *const Self, message: *const Message) bool {
+ assert(message.header.command == .prepare);
+
+ if (self.status == .normal) {
+ if (message.header.view < self.view) return true;
+ if (message.header.view == self.view and message.header.op <= self.op) return true;
+ } else if (self.status == .view_change) {
+ if (message.header.view < self.view) return true;
+ // The view has already started or is newer.
+ }
+
+ return false;
+ }
+
+ /// Returns whether the replica is the leader for the current view.
+ /// This may be used only when the replica status is normal.
+ fn leader(self: *const Self) bool {
+ assert(self.status == .normal);
+ return self.leader_index(self.view) == self.replica;
+ }
+
+ /// Returns the index into the configuration of the leader for a given view.
+ fn leader_index(self: *const Self, view: u32) u8 {
+ return @intCast(u8, @mod(view, self.replica_count));
+ }
+
+ /// Advances `op` to where we need to be before `header` can be processed as a prepare.
+ ///
+ /// This function temporarily violates the "replica.op must exist in WAL" invariant.
+ fn jump_to_newer_op_in_normal_status(self: *Self, header: *const Header) void {
+ assert(self.status == .normal);
+ assert(self.follower());
+ assert(header.view == self.view);
+ assert(header.op > self.op + 1);
+ // We may have learned of a higher `commit_max` through a commit message before jumping
+ // to a newer op that is less than `commit_max` but greater than `commit_min`:
+ assert(header.op > self.commit_min);
+ // Never overwrite an op that still needs to be checkpointed.
+ assert(header.op <= self.op_checkpoint_trigger());
+
+ log.debug("{}: jump_to_newer_op: advancing: op={}..{} checksum={}..{}", .{
+ self.replica,
+ self.op,
+ header.op - 1,
+ self.journal.header_with_op(self.op).?.checksum,
+ header.parent,
+ });
+
+ self.op = header.op - 1;
+ assert(self.op >= self.commit_min);
+ assert(self.op + 1 == header.op);
+ assert(self.journal.header_with_op(self.op) == null);
+ }
+
+ fn message_body_as_headers(message: *const Message) []const Header {
+ assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
+ assert(message.header.command == .do_view_change or
+ message.header.command == .start_view or
+ message.header.command == .headers or
+ message.header.command == .recovery_response);
+
+ const headers = std.mem.bytesAsSlice(
+ Header,
+ message.buffer[@sizeOf(Header)..message.header.size],
+ );
+
+ for (headers[0 .. headers.len - 1]) |header, index| {
+ // Headers must be provided in reverse order for the sake of `repair_header()`.
+ // Otherwise, headers may never be repaired where the hash chain never connects.
+ assert(header.op > headers[index + 1].op);
+ }
+
+ return headers;
+ }
+
@@ -3169,70 +3634,83 @@ pub fn Replica(
  return true;
  }

- fn is_repair(self: *const Self, message: *const Message) bool {
- assert(message.header.command == .prepare);
-
- if (self.status == .normal) {
- if (message.header.view < self.view) return true;
- if (message.header.view == self.view and message.header.op <= self.op) return true;
- } else if (self.status == .view_change) {
- if (message.header.view < self.view) return true;
- // The view has already started or is newer.
- }
-
- return false;
- }
+ /// Returns the op that will be `op_checkpoint` after the next checkpoint.
+ ///
+ /// For a replica with journal_slot_count=8 and lsm_batch_multiple=2:
+ ///
+ /// checkpoint() call 0 1 2 3
+ /// op_checkpoint 0 5 11 17
+ /// op_checkpoint_next 5 11 17 23
+ /// op_checkpoint_trigger 7 13 19 25
+ ///
+ /// commit log (ops) │ write-ahead log (slots)
+ /// 0 4 8 2 6 0 4 │ 0---4---
+ /// 0 ─────✓·% │ 01234✓6% initial log fill
+ /// 1 ───────────✓·% │ 890✓2%45 first wrap of log
+ /// 2 ─────────────────✓·% │ 6✓8%0123 second wrap of log
+ /// 3 ───────────────────────✓·% │ 4%67890✓ third wrap of log
+ ///
+ /// Legend:
+ ///
+ /// ─/✓ op on disk at checkpoint
+ /// ·/% op in memory at checkpoint
+ /// ✓ op_checkpoint
+ /// % op_checkpoint_trigger
+ ///
+ fn op_checkpoint_next(self: *const Self) u64 {
+ assert(self.op_checkpoint <= self.commit_min);
+ assert(self.op_checkpoint <= self.op);
+ assert(self.op_checkpoint == 0 or
+ (self.op_checkpoint + 1) % config.lsm_batch_multiple == 0);

- /// Returns whether the replica is the leader for the current view.
- /// This may be used only when the replica status is normal.
- fn leader(self: *Self) bool {
- assert(self.status == .normal);
- return self.leader_index(self.view) == self.replica;
+ const op = if (self.op_checkpoint == 0)
+ // First wrap: op_checkpoint_next = 8-2-1 = 5
+ config.journal_slot_count - config.lsm_batch_multiple - 1
+ else
+ // Second wrap: op_checkpoint_next = 5+8-2 = 11
+ // Third wrap: op_checkpoint_next = 11+8-2 = 17
+ self.op_checkpoint + config.journal_slot_count - config.lsm_batch_multiple;
+ assert((op + 1) % config.lsm_batch_multiple == 0);
+ // The checkpoint always advances.
+ assert(op > self.op_checkpoint);
+
+ return op;
  }

- /// Returns the index into the configuration of the leader for a given view.
- fn leader_index(self: *Self, view: u32) u8 {
- return @intCast(u8, @mod(view, self.replica_count));
+ /// Returns the next op that will trigger a checkpoint.
+ ///
+ /// Receiving and storing an op higher than `op_checkpoint_trigger()` is forbidden; doing so
+ /// would overwrite a message (or the slot of a message) that has not yet been committed and
+ /// checkpointed.
+ ///
+ /// See `op_checkpoint_next` for more detail.
+ fn op_checkpoint_trigger(self: *const Self) u64 {
+ return self.op_checkpoint_next() + config.lsm_batch_multiple;
  }
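The checkpoint arithmetic above can be checked in isolation. This sketch reproduces the documented table (op_checkpoint 0, 5, 11, 17 with triggers 7, 13, 19, 25) using the example configuration from the doc comment, journal_slot_count=8 and lsm_batch_multiple=2:

```zig
// Standalone check of the op_checkpoint_next()/op_checkpoint_trigger() table.
const std = @import("std");

const journal_slot_count: u64 = 8;
const lsm_batch_multiple: u64 = 2;

fn op_checkpoint_next(op_checkpoint: u64) u64 {
    return if (op_checkpoint == 0)
        journal_slot_count - lsm_batch_multiple - 1 // First wrap: 8-2-1 = 5.
    else
        op_checkpoint + journal_slot_count - lsm_batch_multiple; // +6 per wrap.
}

pub fn main() void {
    var op_checkpoint: u64 = 0;
    const next_expected = [_]u64{ 5, 11, 17, 23 };
    const trigger_expected = [_]u64{ 7, 13, 19, 25 };
    for (next_expected) |next, i| {
        std.debug.assert(op_checkpoint_next(op_checkpoint) == next);
        // The trigger trails the next checkpoint by one bar.
        std.debug.assert(next + lsm_batch_multiple == trigger_expected[i]);
        op_checkpoint = next;
    }
}
```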

- /// Advances `op` to where we need to be before `header` can be processed as a prepare:
- fn jump_to_newer_op_in_normal_status(self: *Self, header: *const Header) void {
- assert(self.status == .normal);
- assert(self.follower());
- assert(header.view == self.view);
- assert(header.op > self.op + 1);
- // We may have learned of a higher `commit_max` through a commit message before jumping
- // to a newer op that is less than `commit_max` but greater than `commit_min`:
- assert(header.op > self.commit_min);
- // Never overwrite an op that still needs to be checkpointed.
- assert(header.op - self.op_checkpoint < config.journal_slot_count);
+ /// Finds the header with the highest op number in a slice of headers from a replica.
+ /// The headers must be continuous, in reverse order, all connected, and with no gaps.
+ fn op_highest(headers: []const Header) u64 {
+ assert(headers.len > 0);

- log.debug("{}: jump_to_newer_op: advancing: op={}..{} checksum={}..{}", .{
- self.replica,
- self.op,
- header.op - 1,
- self.journal.header_with_op(self.op).?.checksum,
- header.parent,
- });
+ for (headers) |header, index| {
+ assert(header.valid_checksum());
+ assert(header.invalid() == null);
+ assert(header.command == .prepare);

- self.op = header.op - 1;
- assert(self.op >= self.commit_min);
- assert(self.op + 1 == header.op);
- }
+ if (index > 0) {
+ assert(header.op + 1 == headers[index - 1].op);
+ assert(header.checksum == headers[index - 1].parent);
+ }
+ }

- fn message_body_as_headers(_: *Self, message: *const Message) []Header {
- // TODO Assert message commands that we expect this to be called for.
- assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
- return std.mem.bytesAsSlice(
- Header,
- message.buffer[@sizeOf(Header)..message.header.size],
- );
+ return headers[0].op;
  }
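`op_highest` relies on the headers arriving in descending op order, with each header's checksum chaining into the `parent` of the next-newer header. A simplified illustration using a toy three-field `Header` (hypothetical, not the VSR wire header):

```zig
// Simplified illustration of the reverse-order hash-chain invariant.
const std = @import("std");

const Header = struct {
    op: u64,
    parent: u64,
    checksum: u64,
};

fn op_highest(headers: []const Header) u64 {
    std.debug.assert(headers.len > 0);
    for (headers) |header, index| {
        if (index > 0) {
            // Descending, gapless, and chained: each header's checksum is the
            // parent of the (newer) header just before it in the slice.
            std.debug.assert(header.op + 1 == headers[index - 1].op);
            std.debug.assert(header.checksum == headers[index - 1].parent);
        }
    }
    return headers[0].op;
}

pub fn main() void {
    const headers = [_]Header{
        .{ .op = 9, .parent = 80, .checksum = 90 },
        .{ .op = 8, .parent = 70, .checksum = 80 },
        .{ .op = 7, .parent = 60, .checksum = 70 },
    };
    std.debug.assert(op_highest(&headers) == 9);
}
```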

  /// Panics if immediate neighbors in the same view would have a broken hash chain.
  /// Assumes gaps and does not require that a precedes b.
  fn panic_if_hash_chain_would_break_in_the_same_view(
- self: *Self,
+ self: *const Self,
  a: *const Header,
  b: *const Header,
  ) void {
@@ -3279,7 +3757,7 @@ pub fn Replica(

  var op = self.commit_max + 1;
  var parent = self.journal.header_with_op(self.commit_max).?.checksum;
- var iterator = self.pipeline.iterator();
+ var iterator = self.pipeline.iterator_mutable();
  while (iterator.next_ptr()) |prepare| {
  assert(prepare.message.header.command == .prepare);
  assert(prepare.message.header.op == op);
@@ -3380,10 +3858,7 @@ pub fn Replica(

  // The replica repairs backwards from `commit_max`. But if `commit_max` is too high
  // (>1 WAL ahead), then bound it such that uncommitted WAL entries are not overwritten.
- const commit_max_limit = std.math.min(
- self.commit_max,
- self.op_checkpoint + config.journal_slot_count,
- );
+ const commit_max_limit = std.math.min(self.commit_max, self.op_checkpoint_trigger());

  // Request outstanding committed prepares to advance our op number:
  // This handles the case of an idle cluster, where a follower will not otherwise advance.
@@ -3460,13 +3935,12 @@ pub fn Replica(
  // Commit ops, which may in turn discover faulty prepares and drive more repairs:
  if (self.commit_min < self.commit_max) {
  assert(self.replica_count > 1);
- self.commit_ops(self.commit_max);
+ self.commit_journal(self.commit_max);
  return;
  }

  if (self.status == .view_change and self.leader_index(self.view) == self.replica) {
  if (self.repair_pipeline_op() != null) return self.repair_pipeline();
-
  // Start the view as the new leader:
  self.start_view_as_the_new_leader();
  }
@@ -3505,6 +3979,9 @@ pub fn Replica(
  /// with an older view number may be committed instead of an op with a newer view number:
  /// http://web.stanford.edu/~ouster/cgi-bin/papers/OngaroPhD.pdf.
  ///
+ /// * Do not replace an op belonging to the current WAL wrap with an op belonging to a
+ /// previous wrap. In other words, don't repair checkpointed ops.
+ ///
  fn repair_header(self: *Self, header: *const Header) bool {
  assert(header.valid_checksum());
  assert(header.invalid() == null);
@@ -3517,145 +3994,121 @@ pub fn Replica(
  }

  if (header.op > self.op) {
- log.debug("{}: repair_header: false (advances self.op={})", .{
+ log.debug("{}: repair_header: op={} checksum={} (advances hash chain head)", .{
  self.replica,
- self.op,
+ header.op,
+ header.checksum,
+ });
+ return false;
+ } else if (header.op == self.op and !self.journal.has(header)) {
+ assert(self.journal.header_with_op(self.op) != null);
+ log.debug("{}: repair_header: op={} checksum={} (changes hash chain head)", .{
+ self.replica,
+ header.op,
+ header.checksum,
  });
  return false;
- } else if (header.op == self.op) {
- if (self.journal.header_with_op_and_checksum(self.op, header.checksum)) |_| {
- // Fall through below to check if self.op is uncommitted AND reordered,
- // which we would see by the presence of an earlier op with higher view number,
- // that breaks the chain with self.op. In this case, we must skip the repair to
- // avoid overwriting any overlapping op.
+ }
+
+ if (header.op <= self.op_checkpoint) {
+ if (header.op == 0 and self.op_checkpoint == 0) {
+ // Repairing the root op is allowed until the first checkpoint.
  } else {
- log.debug("{}: repair_header: false (changes self.op={})", .{
+ // Otherwise don't repair checkpointed ops, since their slots now belong to
+ // the next wrap of the WAL.
+ log.debug("{}: repair_header: false (precedes self.op_checkpoint={})", .{
  self.replica,
- self.op,
+ self.op_checkpoint,
  });
  return false;
  }
  }

- if (self.journal.header_for_entry(header)) |existing| {
- assert(existing.op == header.op);
-
- // Do not replace any existing op lightly as doing so may impair durability and even
- // violate correctness by undoing a prepare already acknowledged to the leader:
+ if (self.journal.header_for_prepare(header)) |existing| {
  if (existing.checksum == header.checksum) {
- const slot = self.journal.slot_with_header(header).?;
- if (!self.journal.dirty.bit(slot)) {
- log.debug("{}: repair_header: op={} false (checksum clean)", .{
+ if (self.journal.has_clean(header)) {
+ log.debug("{}: repair_header: op={} checksum={} (checksum clean)", .{
  self.replica,
  header.op,
+ header.checksum,
  });
  return false;
+ } else {
+ log.debug("{}: repair_header: op={} checksum={} (checksum dirty)", .{
+ self.replica,
+ header.op,
+ header.checksum,
+ });
  }
-
- log.debug("{}: repair_header: op={} exists, checksum dirty", .{
- self.replica,
- header.op,
- });
  } else if (existing.view == header.view) {
  // The journal must have wrapped:
- // We expect that the same view and op will have the same checksum.
+ // We expect that the same view and op would have had the same checksum.
  assert(existing.op != header.op);
-
  if (existing.op > header.op) {
- log.debug("{}: repair_header: op={} false (view has newer op)", .{
+ log.debug("{}: repair_header: op={} checksum={} (same view, newer op)", .{
  self.replica,
  header.op,
+ header.checksum,
  });
- return false;
- }
-
- log.debug("{}: repair_header: op={} exists, view has older op", .{
- self.replica,
- header.op,
- });
- } else {
- assert(existing.view != header.view);
- assert(existing.op == header.op or existing.op != header.op);
-
- if (!self.repair_header_would_connect_hash_chain(header)) {
- // We cannot replace this op until we are sure that doing so would not
- // violate any prior commitments made to the leader.
- log.debug("{}: repair_header: op={} false (exists)", .{
+ } else {
+ log.debug("{}: repair_header: op={} checksum={} (same view, older op)", .{
  self.replica,
  header.op,
+ header.checksum,
  });
- return false;
  }
+ } else {
+ assert(existing.view != header.view);
+ assert(existing.op == header.op or existing.op != header.op);

- log.debug("{}: repair_header: op={} exists, connects hash chain", .{
+ log.debug("{}: repair_header: op={} checksum={} (different view)", .{
  self.replica,
  header.op,
+ header.checksum,
  });
  }
  } else {
- log.debug("{}: repair_header: op={} gap", .{ self.replica, header.op });
- }
-
- // Caveat: Do not repair an existing op or gap if doing so would break the hash chain:
- if (self.repair_header_would_break_hash_chain_with_next_entry(header)) {
- log.debug("{}: repair_header: op={} false (breaks hash chain)", .{
+ log.debug("{}: repair_header: op={} checksum={} (gap)", .{
  self.replica,
  header.op,
+ header.checksum,
  });
- return false;
  }

- // TODO Snapshots: Skip if this header is already snapshotted.
-
  assert(header.op < self.op or
  self.journal.header_with_op(self.op).?.checksum == header.checksum);

- self.journal.set_header_as_dirty(header);
- return true;
- }
-
- /// If we repair this header, then would this break the hash chain only to our immediate right?
- /// This offers a weak guarantee compared to `repair_header_would_connect_hash_chain()` below.
- /// However, this is useful for allowing repairs when the hash chain is sparse.
- fn repair_header_would_break_hash_chain_with_next_entry(
- self: *Self,
- header: *const Header,
- ) bool {
- if (self.journal.previous_entry(header)) |previous| {
- self.panic_if_hash_chain_would_break_in_the_same_view(previous, header);
+ if (!self.repair_header_would_connect_hash_chain(header)) {
+ // We cannot replace this op until we are sure that this would not:
+ // 1. undermine any prior prepare_ok guarantee made to the primary, and
+ // 2. leak stale ops back into our in-memory headers (and so into a view change).
+ log.debug("{}: repair_header: op={} checksum={} (disconnected from hash chain)", .{
+ self.replica,
+ header.op,
+ header.checksum,
+ });
+ return false;
  }

- if (self.journal.next_entry(header)) |next| {
- self.panic_if_hash_chain_would_break_in_the_same_view(header, next);
-
- if (header.checksum == next.parent) {
- assert(header.view <= next.view);
- assert(header.op + 1 == next.op);
- // We don't break with `next` but this is no guarantee that `next` does not
- // break.
- return false;
- } else {
- // If the journal has wrapped, then err in favor of a break regardless of op
- // order:
- return true;
+ if (header.op <= self.commit_min) {
+ if (self.journal.header_with_op(header.op)) |existing| {
+ // If we already committed this op, the repair must be the identical message.
+ assert(header.checksum == existing.checksum);
  }
  }

- // We are not completely sure since there is no entry to the immediate right:
- return false;
+ self.journal.set_header_as_dirty(header);
+ return true;
  }

- /// If we repair this header, then would this connect the hash chain through to the latest
- /// op? This offers a strong guarantee that may be used to replace or overlap an existing
- /// op.
+ /// If we repair this header, would this connect the hash chain through to the latest op?
+ /// This offers a strong guarantee that may be used to replace an existing op.
  ///
  /// Here is an example of what could go wrong if we did not check for complete connection:
  ///
  /// 1. We do a prepare that's going to be committed.
- /// 2. We do a stale prepare to the right of this, ignoring the hash chain break to the
- /// left.
- /// 3. We do another stale prepare that replaces the first op because it connects to the
- /// second.
+ /// 2. We do a stale prepare to the right, ignoring the hash chain break to the left.
+ /// 3. We do another stale prepare that replaces the first since it connects to the second.
  ///
  /// This would violate our quorum replication commitment to the leader.
  /// The mistake in this example was not that we ignored the break to the left, which we must
@@ -4086,6 +4539,55 @@ pub fn Replica(
  }
  }

+ /// The caller must ensure that the headers are trustworthy.
+ ///
+ /// Asserts that sequential ops are hash-chained. (Gaps are permitted).
+ fn replace_headers(self: *Self, headers: []const Header) void {
+ for (headers) |*header, i| {
+ if (i > 0) {
+ const next = &headers[i - 1];
+ assert(next.view >= header.view);
+ if (next.op == header.op + 1) {
+ assert(next.parent == header.checksum);
+ } else {
+ assert(next.op > header.op);
+ }
+ }
+
+ self.replace_header(header);
+ }
+ }
+
+ /// Replaces the header if the header is different and not already committed.
+ /// The caller must ensure that the header is trustworthy.
+ fn replace_header(self: *Self, header: *const Header) void {
+ assert(self.op_checkpoint <= self.commit_min);
+ assert(header.command == .prepare);
+ assert(header.op <= self.op); // Never advance the op.
+ assert(header.op <= self.op_checkpoint_trigger());
+
+ if (header.op <= self.commit_min) {
+ if (self.journal.header_with_op(header.op)) |existing_header| {
+ assert(existing_header.checksum == header.checksum);
+ return;
+ } else {
+ if (header.op <= self.op_checkpoint) {
+ // Never replace a checkpointed op — those slots are needed by the following
+ // WAL wrap.
+ return;
+ } else {
+ // If an op is committed but not checkpointed, we must still have the header.
+ @panic("missing committed, uncheckpointed header");
+ }
+ }
+ }
+
+ // Do not set an op as dirty if we already have it exactly because:
+ // 1. this would trigger a repair and delay the view change, or worse,
+ // 2. prevent repairs to another replica when we have the op.
+ if (!self.journal.has(header)) self.journal.set_header_as_dirty(header);
+ }
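`replace_header` reduces to a small decision per header: committed-and-still-held headers are kept as-is, committed-but-checkpointed headers are never touched (their slots belong to the next WAL wrap), and everything else is marked dirty for repair. A standalone sketch of that decision logic, hypothetical and simplified to pure values:

```zig
// Simplified decision table for replace_header() (hypothetical).
const std = @import("std");

const Decision = enum { keep_existing, skip_checkpointed, set_dirty };

fn replace_decision(
    op: u64,
    commit_min: u64,
    op_checkpoint: u64,
    have_identical_header: bool,
) Decision {
    if (op <= commit_min) {
        // Committed: either we still hold the identical header (keep it),
        // or the op is checkpointed and its slot belongs to the next wrap.
        if (have_identical_header) return .keep_existing;
        std.debug.assert(op <= op_checkpoint); // Otherwise: missing committed header.
        return .skip_checkpointed;
    }
    return .set_dirty;
}

pub fn main() void {
    // op=3 committed and still held: nothing to do.
    std.debug.assert(replace_decision(3, 5, 2, true) == .keep_existing);
    // op=2 committed, header gone, but checkpointed: never repair it.
    std.debug.assert(replace_decision(2, 5, 2, false) == .skip_checkpointed);
    // op=7 uncommitted: mark dirty so that repair fetches the prepare.
    std.debug.assert(replace_decision(7, 5, 2, false) == .set_dirty);
}
```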
4590
+
4089
4591
  /// Replicates to the next replica in the configuration (until we get back to the leader):
4090
4592
  /// Replication starts and ends with the leader, we never forward back to the leader.
4091
4593
  /// Does not flood the network with prepares that have already committed.
@@ -4149,7 +4651,7 @@ pub fn Replica(
4149
4651
  assert(replica < self.replica_count);
4150
4652
  }
4151
4653
 
4152
- counter.setIntersection(quorum_counter_null);
4654
+ counter.* = quorum_counter_null;
4153
4655
  assert(counter.count() == 0);
4154
4656
 
4155
4657
  var replica: usize = 0;
@@ -4168,6 +4670,20 @@ pub fn Replica(
             self.nack_prepare_op = null;
         }

+        fn reset_quorum_prepare_ok(self: *Self) void {
+            // "prepare_ok"s from previous views are not valid, even if the pipeline entry
+            // is reused after a cycle of view changes. In other words, when a view change
+            // cycles around, so that the original primary becomes the primary of a new
+            // view, pipeline entries may be reused. However, the pipeline's prepare_ok
+            // quorums must not be reused, since the replicas that sent them may have
+            // swapped them out during a previous view change.
+            var iterator = self.pipeline.iterator_mutable();
+            while (iterator.next_ptr()) |prepare| {
+                prepare.ok_quorum_received = false;
+                prepare.ok_from_all_replicas = quorum_counter_null;
+                assert(prepare.ok_from_all_replicas.count() == 0);
+            }
+        }
+
         fn reset_quorum_start_view_change(self: *Self) void {
             self.reset_quorum_counter(&self.start_view_change_from_other_replicas);
             self.start_view_change_quorum = false;
@@ -4296,8 +4812,15 @@ pub fn Replica(
             assert(message.header.command == .do_view_change);
             assert(message.header.view == self.view);
             assert(message.header.op == self.op);
-            assert(message.header.op == self.message_body_as_headers(message)[0].op);
-            assert(message.header.commit == self.commit_max);
+            assert(message.header.op == message_body_as_headers(message)[0].op);
+            // Each replica must advertise its own commit number, so that the new primary
+            // can know which headers must be replaced in its log. Otherwise, a gap in the
+            // log may prevent the new primary from repairing its log, resulting in the log
+            // being forked if the new primary also discards uncommitted operations.
+            // It is also safe not to use `commit_max` here because the new primary will
+            // assume that operations after the highest `commit_min` may yet have been
+            // committed before the old primary crashed. The new primary will use the NACK
+            // protocol to be sure of a discard.
+            assert(message.header.commit == self.commit_min);

             self.send_message_to_replica(self.leader_index(self.view), message);
         }
@@ -4389,6 +4912,7 @@ pub fn Replica(
                 .prepare_ok => {
                     assert(self.status == .normal);
                     assert(message.header.view == self.view);
+                    assert(message.header.op <= self.op_checkpoint_trigger());
                     // We must only ever send a prepare_ok to the latest leader of the active view:
                     // We must never straddle views by sending to a leader in an older view.
                     // Otherwise, we would be enabling a partitioned leader to commit.
@@ -4407,6 +4931,7 @@ pub fn Replica(
                     assert(message.header.view == self.view);
                     assert(message.header.replica == self.replica);
                     assert(message.header.op == self.op);
+                    assert(message.header.commit == self.commit_min);
                     assert(replica == self.leader_index(self.view));
                 },
                 .start_view => switch (self.status) {
@@ -4479,46 +5004,13 @@ pub fn Replica(
             }
         }

-        /// Finds the header with the highest op number in a slice of headers from a replica.
-        /// Searches only by op number to find the highest `self.op` for the replica.
-        fn set_latest_op(headers: []const Header, latest: *Header) void {
-            switch (latest.command) {
-                .reserved, .prepare => assert(latest.valid_checksum()),
-                else => unreachable,
-            }
-
-            for (headers) |header| {
-                assert(header.valid_checksum());
-                assert(header.invalid() == null);
-                assert(header.command == .prepare);
-
-                if (latest.command == .reserved or header.op > latest.op) {
-                    // We are simply trying to find the latest `self.op` in the replica's log.
-                    // We therefore do not compare views here.
-                    latest.* = header;
-                }
-            }
-        }
-
-        fn set_latest_op_and_k(
-            self: *Self,
-            latest: *const Header,
-            k: u64,
-            method: []const u8,
-        ) void {
+        fn set_op_and_commit_max(self: *Self, op: u64, commit_max: u64, method: []const u8) void {
             assert(self.status == .view_change or self.status == .recovering);
             assert(self.journal.recovered);
-            assert(latest.valid_checksum());
-            assert(latest.invalid() == null);
-            assert(latest.command == .prepare);
-            assert(latest.cluster == self.cluster);

             switch (self.status) {
                 .normal => unreachable,
-                .view_change => {
-                    // The view may have started already, so we can have a prepare in the same view:
-                    assert(latest.view <= self.view);
-                },
+                .view_change => {},
                 .recovering => {
                     // The replica's view hasn't been set yet.
                     // It will be set shortly, when we transition to normal status.
@@ -4526,73 +5018,406 @@ pub fn Replica(
                 },
             }

-            log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={}", .{
-                self.replica,
-                method,
-                self.view,
-                self.op,
-                latest.op,
-                self.commit_max,
-                k,
-                latest.checksum,
-            });
-
-            // Uncommitted ops may not survive a view change so we must assert `latest.op` against
+            // Uncommitted ops may not survive a view change so we must assert `op` against
             // `commit_max` and not `self.op`. However, committed ops (`commit_max`) must survive:
-            assert(latest.op >= self.commit_max);
-            assert(latest.op >= latest.commit);
-            assert(latest.op >= k);
-            // We expect that `commit_max` (and `commit_min`) may be greater than `latest.commit`
-            // because `latest.commit` is the commit number at the time the `latest.op` prepared.
-            // We expect that `commit_max` (and `commit_min`) may also be greater even than `k`
-            // because we may be the old leader joining towards the end of the view change and we
-            // may have committed the `latest.op` already. However, this is bounded by pipelining.
-            // The intersection property only requires that all "possibly" committed operations must
-            // survive into the new view so that they can then be committed by the new leader. This
-            // guarantees that if the old leader "possibly" committed the operation, then the new
-            // leader will also commit the operation.
-            if (k < self.commit_max and self.commit_min == self.commit_max) {
+            assert(op >= self.commit_max);
+            assert(op >= commit_max);
+            assert(op <= self.op_checkpoint_trigger());
+
+            // We expect that our commit numbers may also be greater even than `commit_max`
+            // because we may be the old leader joining towards the end of the view change
+            // and we may have committed `op` already. However, this is bounded by
+            // pipelining.
+            // The intersection property only requires that all possibly committed
+            // operations must survive into the new view so that they can then be committed
+            // by the new leader. This guarantees that if the old leader possibly committed
+            // the operation, then the new leader will also commit the operation.
+            if (commit_max < self.commit_max and self.commit_min == self.commit_max) {
                 log.debug("{}: {s}: k={} < commit_max={} and commit_min == commit_max", .{
                     self.replica,
                     method,
-                    k,
+                    commit_max,
                     self.commit_max,
                 });
             }
-            assert(k >= latest.commit);
-            assert(k >= self.commit_max - std.math.min(config.pipeline_max, self.commit_max));
+
+            assert(commit_max >=
+                self.commit_max - std.math.min(config.pipeline_max, self.commit_max));

             assert(self.commit_min <= self.commit_max);
             assert(self.op >= self.commit_max or self.op < self.commit_max);

-            self.op = latest.op;
+            const previous_op = self.op;
+            const previous_commit_max = self.commit_max;
+
+            self.op = op;
+            self.journal.remove_entries_from(self.op + 1);
+
             // Crucially, we must never rewind `commit_max` (and then `commit_min`) because
             // `commit_min` represents what we have already applied to our state machine:
-            self.commit_max = std.math.max(self.commit_max, k);
+            self.commit_max = std.math.max(self.commit_max, commit_max);

             assert(self.commit_min <= self.commit_max);
-            assert(self.op >= self.commit_max);
+            assert(self.commit_max <= self.op);

-            // Do not set the latest op as dirty if we already have it exactly:
-            // Otherwise, this would trigger a repair and delay the view change, or worse, it would
-            // prevent us from assisting another replica to recover when we do in fact have the op.
-            if (self.journal.has(latest)) {
-                log.debug("{}: {s}: latest op exists exactly", .{ self.replica, method });
-            } else {
-                self.journal.set_header_as_dirty(latest);
+            log.debug("{}: {s}: view={} op={}..{} commit={}..{}", .{
+                self.replica,
+                method,
+                self.view,
+                previous_op,
+                self.op,
+                previous_commit_max,
+                self.commit_max,
+            });
+        }
+
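
A quick worked check of the pipeline bound asserted in set_op_and_commit_max() above: a DVC's commit number may trail ours by at most the pipeline depth, and the `min` term keeps the subtraction from underflowing early in the log. The constants below are illustrative assumptions (`pipeline_max = 8` stands in for `config.pipeline_max`); this is a sketch, not the package's test suite.

    const std = @import("std");

    test "a DVC's commit_max may trail ours by at most the pipeline depth" {
        const pipeline_max: u64 = 8; // Assumed value; stands in for config.pipeline_max.

        // Mid-log: the lowest commit_max another replica may legitimately report.
        const our_commit_max: u64 = 100;
        const floor = our_commit_max - std.math.min(pipeline_max, our_commit_max);
        try std.testing.expectEqual(@as(u64, 92), floor);

        // Early in the log: the min() term prevents the subtraction from underflowing.
        const floor_small = @as(u64, 3) - std.math.min(pipeline_max, 3);
        try std.testing.expectEqual(@as(u64, 0), floor_small);
    }
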
+        /// Load the new view's headers from the DVC quorum.
+        ///
+        /// The iteration order of DVCs for repair does not impact the final result.
+        /// In other words, you can't end up in a situation with a DVC quorum like:
+        ///
+        ///   replica   headers       commit_min
+        ///         0   4 5 _ _ 8      4  (new leader; handling DVC quorum)
+        ///         1   4 _ 6 _ 8      4
+        ///         2   4 _ _ 7 8      4
+        ///         3   (4 5 6 7 8)    8  (didn't participate in view change)
+        ///         4   (4 5 6 7 8)    8  (didn't participate in view change)
+        ///
+        /// where the new leader's headers depend on which of replica 1's and 2's DVC is
+        /// used for repair before the other (i.e. whether they repair op 6 or 7 first).
+        ///
+        /// For the above case to occur, replicas 0, 1, and 2 must all share the highest
+        /// `view_normal`. And since they share the latest `view_normal`, ops 5, 6, and 7
+        /// were just installed by `replace_header`, which is order-independent (it doesn't
+        /// use the hash chain).
+        ///
+        /// (If replica 0's view_normal was greater than 1/2's, then replica 0 must have all
+        /// headers from previous views. Which means 6,7 are from the current view. But since
+        /// replica 0 doesn't have 6/7, then replica 1/2 must share the latest view_normal. ∎)
+        fn set_log_from_do_view_change_messages(self: *Self) void {
+            assert(self.status == .view_change);
+            assert(self.leader_index(self.view) == self.replica);
+            assert(self.replica_count > 1);
+            assert(self.start_view_change_quorum);
+            assert(self.do_view_change_quorum);
+
+            const do_view_change_head = self.do_view_change_quorum_head();
+            assert(do_view_change_head.view_normal >= self.view_normal);
+            assert(do_view_change_head.op >= self.commit_min);
+            assert(do_view_change_head.op >= do_view_change_head.commit_min_max);
+            assert(do_view_change_head.commit_min_max >= self.commit_min);
+
+            // The `prepare_timestamp` prevents a primary's own clock from running backwards.
+            // Therefore, `prepare_timestamp`:
+            // 1. is advanced if behind the cluster, but never reset if ahead of the cluster,
+            // 2. may not always reflect the timestamp of the latest prepared op, and
+            // 3. should be advanced before discarding the timestamps of any uncommitted headers.
+            if (self.state_machine.prepare_timestamp < do_view_change_head.timestamp) {
+                self.state_machine.prepare_timestamp = do_view_change_head.timestamp;
             }

-            assert(self.op == latest.op);
-            self.journal.remove_entries_from(self.op + 1);
-            assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
+            const view_normal_canonical = do_view_change_head.view_normal;
+            // `op_canonical` must be computed before calling `set_op_and_commit_max()`,
+            // since that may change `replica.op`.
+            //
+            // Don't remove the uncanonical headers yet — even though the removed headers
+            // are a subset of the DVC headers, removing and then adding them back would
+            // cause clean headers to become dirty.
+            const op_canonical = self.op_canonical_max(view_normal_canonical);
+            assert(op_canonical <= self.op);
+            assert(op_canonical >= self.op -| config.pipeline_max);
+            assert(op_canonical >= self.commit_min);
+
+            if (do_view_change_head.op > self.op_checkpoint_trigger()) {
+                // This replica is too far behind, i.e. the new `self.op` is too far ahead
+                // of the last checkpoint. If we wrap now, we overwrite un-checkpointed
+                // transfers in the WAL, precluding recovery.
+                //
+                // TODO State transfer. Currently this is unreachable because the
+                // leader won't checkpoint until all replicas are caught up.
+                unreachable;
+            }
+
+            self.set_op_and_commit_max(
+                do_view_change_head.op,
+                // `set_op_and_commit_max()` expects the highest commit_max that we know of.
+                // But DVCs include the replica's `commit_min`, not `commit_max`.
+                std.math.max(
+                    self.commit_max,
+                    do_view_change_head.commit_min_max,
+                ),
+                "on_do_view_change",
+            );
+            // The "`replica.op` exists" invariant may be broken until after the canonical
+            // DVC headers are installed.
+
+            // First, set all the canonical headers from the replica(s) with the highest
+            // `view_normal`:
+            for (self.do_view_change_from_all_replicas) |received| {
+                if (received) |message| {
+                    const view_normal = @intCast(u32, message.header.timestamp);
+                    // The view in which this replica's status was normal must be before
+                    // this view.
+                    assert(view_normal < message.header.view);
+
+                    if (view_normal < view_normal_canonical) continue;
+                    assert(view_normal == view_normal_canonical);
+
+                    const message_headers = message_body_as_headers(message);
+                    for (message_headers) |*header| {
+                        log.debug(
+                            "{}: on_do_view_change: canonical: replica={} op={} checksum={}",
+                            .{
+                                self.replica,
+                                message.header.replica,
+                                header.op,
+                                header.checksum,
+                            },
+                        );
+                    }
+                    self.replace_headers(message_headers);
+                }
+            }
+
+            // Since we used do_view_change_head to set the replica.op, it must have been
+            // loaded into the headers (if it wasn't present already).
+            assert(self.journal.header_with_op(self.op) != null);
+
+            // Now that the canonical headers are all in place, repair any other headers:
+            for (self.do_view_change_from_all_replicas) |received| {
+                if (received) |message| {
+                    const view_normal = @intCast(u32, message.header.timestamp);
+                    assert(view_normal < message.header.view);
+
+                    if (view_normal == view_normal_canonical) continue;
+                    assert(view_normal < view_normal_canonical);
+
+                    for (message_body_as_headers(message)) |*header| {
+                        // We must trust headers that other replicas have committed, because
+                        // repair_header() will not repair a header if the hash chain has a gap.
+                        if (header.op <= message.header.commit) {
+                            log.debug(
+                                "{}: on_do_view_change: committed: replica={} op={} checksum={}",
+                                .{
+                                    self.replica,
+                                    message.header.replica,
+                                    header.op,
+                                    header.checksum,
+                                },
+                            );
+                            self.replace_header(header);
+                        } else {
+                            _ = self.repair_header(header);
+                        }
+                    }
+                }
+            }
+
+            const op_max = self.do_view_change_op_max(op_canonical);
+            assert(op_max <= self.op);
+            assert(op_max >= self.commit_min);
+            if (op_max != self.op) {
+                log.debug("{}: set_log_from_do_view_change_messages: discard op={}..{}", .{
+                    self.replica,
+                    op_max + 1,
+                    self.op,
+                });
+                self.journal.remove_entries_from(op_max + 1);
+                self.op = op_max;
+            }
+            assert(self.journal.header_with_op(self.op) != null);
         }

-        fn start_view_as_the_new_leader(self: *Self) void {
+        fn do_view_change_quorum_head(self: *const Self) struct {
+            /// The highest `view_normal` of any DVC.
+            ///
+            /// The headers bundled with DVCs with the highest `view_normal` are canonical,
+            /// since the replica has knowledge of previous view changes in which headers
+            /// were replaced.
+            view_normal: u32,
+            /// The highest `commit_min` from any DVC (this is not a `commit_max`).
+            commit_min_max: u64,
+            /// The highest `op` from a DVC with the highest `view_normal`.
+            op: u64,
+            /// The highest timestamp from any DVC.
+            timestamp: u64,
+        } {
             assert(self.status == .view_change);
             assert(self.leader_index(self.view) == self.replica);
+            assert(self.replica_count > 1);
+            assert(self.start_view_change_quorum);
             assert(self.do_view_change_quorum);
+            assert(self.do_view_change_from_all_replicas[self.replica] != null);

-            assert(!self.committing);
+            var v: ?u32 = null; // The highest `view_normal` from any replica.
+            var n: ?u64 = null; // The highest `op` for the highest `view_normal` from any replica.
+            var k: ?u64 = null; // The highest `commit_min` from any replica.
+            var t: ?u64 = null; // The highest `timestamp` from any replica.
+
+            for (self.do_view_change_from_all_replicas) |received, replica| {
+                if (received) |message| {
+                    assert(message.header.command == .do_view_change);
+                    assert(message.header.cluster == self.cluster);
+                    assert(message.header.replica == replica);
+                    assert(message.header.view == self.view);
+                    assert(message.header.op >= message.header.commit);
+                    assert(message.header.op - message.header.commit <= config.journal_slot_count);
+
+                    // The view when this replica was last in normal status, which:
+                    // * may be higher than the view in any of the prepare headers.
+                    // * must be lower than the view of this view change.
+                    const view_normal = @intCast(u32, message.header.timestamp);
+                    assert(view_normal < message.header.view);
+
+                    if (replica == self.replica) {
+                        assert(view_normal == self.view_normal);
+                        assert(message.header.op == self.op);
+                        // We may have a newer commit than our DVC due to async commits (see below).
+                        assert(message.header.commit <= self.commit_min);
+                    }
+
+                    log.debug(
+                        "{}: on_do_view_change: " ++
+                            "replica={} view_normal={} op={} commit_min={}",
+                        .{
+                            self.replica,
+                            message.header.replica,
+                            view_normal,
+                            message.header.op,
+                            message.header.commit, // The `commit_min` of the replica.
+                        },
+                    );
+
+                    if (v == null or view_normal > v.?) {
+                        v = view_normal;
+                        n = message.header.op;
+                    } else if (view_normal == v.? and message.header.op > n.?) {
+                        n = message.header.op;
+                    }
+
+                    if (k == null or message.header.commit > k.?) k = message.header.commit;
+
+                    const message_headers = message_body_as_headers(message);
+                    if (t == null or t.? < message_headers[0].timestamp) {
+                        t = message_headers[0].timestamp;
+                    }
+                }
+            }
+
+            // Consider the case:
+            // 1. Start committing op=N…M.
+            // 2. Send `do_view_change` to self.
+            // 3. Finish committing op=N…M.
+            // 4. Remaining `do_view_change` messages arrive, completing the quorum.
+            // In this scenario, our own DVC's commit is `N-1`, but `commit_min=M`.
+            // Don't let the commit backtrack.
+            if (k.? < self.commit_min) {
+                assert(self.commit_min >
+                    self.do_view_change_from_all_replicas[self.replica].?.header.commit);
+                log.debug("{}: on_do_view_change: bump commit_min view={} commit={}..{}", .{
+                    self.replica,
+                    self.view,
+                    k.?,
+                    self.commit_min,
+                });
+                k = self.commit_min;
+            }
+
+            assert(v.? >= self.view_normal);
+            assert(k.? >= self.commit_min);
+
+            return .{
+                .view_normal = v.?,
+                .commit_min_max = k.?,
+                .op = n.?,
+                .timestamp = t.?,
+            };
+        }
+
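
To see how the head is selected, here is a minimal sketch of the v/n/k/t fold above. `Dvc` and `quorum_head` are hypothetical simplifications that assume a non-empty quorum and omit the self-DVC special case and the commit_min bump handled above: `op` must come from a DVC with the highest `view_normal`, while the `commit_min` maximum and the timestamp maximum may come from any DVC.

    const std = @import("std");

    // Hypothetical, simplified DVC summary: only the fields the fold inspects.
    const Dvc = struct { view_normal: u32, op: u64, commit_min: u64, timestamp: u64 };

    // v = highest view_normal; n = highest op among DVCs with that view_normal;
    // k = highest commit_min from any DVC; t = highest timestamp from any DVC.
    fn quorum_head(dvcs: []const Dvc) Dvc {
        var head: ?Dvc = null; // Assumes dvcs is non-empty (a quorum always is).
        for (dvcs) |dvc| {
            if (head) |*h| {
                if (dvc.view_normal > h.view_normal) {
                    h.view_normal = dvc.view_normal;
                    h.op = dvc.op;
                } else if (dvc.view_normal == h.view_normal and dvc.op > h.op) {
                    h.op = dvc.op;
                }
                if (dvc.commit_min > h.commit_min) h.commit_min = dvc.commit_min;
                if (dvc.timestamp > h.timestamp) h.timestamp = dvc.timestamp;
            } else head = dvc;
        }
        return head.?;
    }

    test "op comes from the highest view_normal; commit_min may come from anywhere" {
        const head = quorum_head(&[_]Dvc{
            .{ .view_normal = 3, .op = 8, .commit_min = 4, .timestamp = 70 },
            .{ .view_normal = 5, .op = 6, .commit_min = 5, .timestamp = 90 },
            .{ .view_normal = 5, .op = 7, .commit_min = 6, .timestamp = 80 },
        });
        try std.testing.expectEqual(@as(u32, 5), head.view_normal);
        try std.testing.expectEqual(@as(u64, 7), head.op); // Not 8: its view_normal=3 is stale.
        try std.testing.expectEqual(@as(u64, 6), head.commit_min);
        try std.testing.expectEqual(@as(u64, 90), head.timestamp);
    }
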
+        /// Identify headers to discard during a view change before the primary starts the view.
+        /// This is required to maximize availability in the presence of storage faults.
+        /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
+        ///
+        /// Returns the highest op that:
+        /// - precedes any hash chain breaks in the uncanonical headers, and
+        /// - precedes any gaps in the uncommitted headers.
+        ///
+        /// Breaks:
+        ///
+        /// If there is a hash chain break, none of the headers from the canonical DVCs
+        /// replaced the broken (leftover uncanonical) op.
+        /// Removing these is necessary for correctness and liveness, to ensure that
+        /// disconnected headers do not remain in place in lieu of gaps.
+        ///
+        /// Gaps:
+        ///
+        /// It is possible for the new primary to have done an op jump in a previous view,
+        /// introducing a header gap for an op. That gap may then have been discarded by
+        /// another primary during a view change, yet survive into this view as a gap,
+        /// because our latest op was set as the latest op for the quorum.
+        ///
+        /// In this case, it may be impossible for the new primary to repair the missing
+        /// header, as the rest of the cluster may have already discarded it. We therefore
+        /// iterate over our uncommitted header gaps to discard any that may be impossible
+        /// to repair.
+        ///
+        /// For example, if the old primary replicates ops=7,8,9 (all uncommitted) but only
+        /// op=9 is prepared on another replica before the old primary crashes, then this
+        /// function finds a gap for ops=7,8 and will attempt to discard ops 7,8,9.
+        fn do_view_change_op_max(self: *const Self, op_canonical: u64) u64 {
+            assert(self.replica_count > 1);
+            assert(self.status == .view_change);
+            assert(self.leader_index(self.view) == self.replica);
+            assert(self.do_view_change_quorum);
+            assert(!self.repair_timeout.ticking);
+            assert(self.op >= self.commit_max);
+            // At least one replica in the new quorum committed in the new replica.op's
+            // WAL wrap — wrapping implies a checkpoint (which implies a commit).
+            assert(self.op - self.commit_max <= config.journal_slot_count);
+            assert(self.op - self.commit_min <= config.journal_slot_count);
+
+            assert(op_canonical <= self.op);
+            assert(op_canonical >= self.commit_min);
+
+            // Any uncanonical ops remaining either:
+            // * connect to the hash chain on the right, or
+            // * do not connect on the right (hash chain break).
+            //
+            // If there is a hash chain break, none of the headers from the canonical DVCs
+            // replaced the broken op. It is truncated like a gap.
+            //
+            // Removing these is necessary for correctness and liveness, to ensure that
+            // disconnected headers do not remain in place in lieu of gaps.
+            const op_before_break = blk: {
+                var op: u64 = op_canonical;
+                while (op < self.op) : (op += 1) {
+                    if (self.journal.header_with_op(op)) |header| {
+                        if (self.journal.header_with_op(op + 1)) |next| {
+                            // Broken hash chain.
+                            if (header.checksum != next.parent) break :blk op;
+                        }
+                    }
+                } else break :blk self.op;
+            };
+
+            // Find the beginning of the lowest gap.
+            //
+            // While op > commit_max does not in itself guarantee that an op is uncommitted
+            // (the old primary may have committed the op shortly before crashing),
+            // nevertheless, if it had been committed it would have survived into the new
+            // view as a header, not a gap.
+            const op_before_gap = blk: {
+                // An op cannot be uncommitted if it is definitely outside the pipeline.
+                const op_committed = std.math.max(self.commit_max, self.op -| config.pipeline_max);
+                assert(op_committed <= self.op);
+
+                var op = op_committed;
+                while (op < self.op) : (op += 1) {
+                    if (self.journal.header_with_op(op + 1) == null) break :blk op;
+                } else break :blk self.op;
+            };
+
+            return std.math.min(op_before_break, op_before_gap);
+        }
+
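
The ops=7,8,9 example from the doc comment can be traced with a small sketch. `op_before_gap` below mirrors the gap scan, modelling a hypothetical journal as a slice of optional checksums (index = op, null = no header) in place of `journal.header_with_op()`:

    const std = @import("std");

    // Hypothetical journal: index = op, null = gap (no header for that op).
    fn op_before_gap(journal: []const ?u64, op_committed: usize, op_head: usize) usize {
        var op = op_committed;
        while (op < op_head) : (op += 1) {
            if (journal[op + 1] == null) return op;
        }
        return op_head;
    }

    test "ops 7,8,9 uncommitted; only op 9 survives, so truncate back to op 6" {
        // Headers exist for ops 0..6 and 9; ops 7,8 were never prepared here.
        const journal = [_]?u64{ 0, 1, 2, 3, 4, 5, 6, null, null, 9 };
        // With commit_max=6 the scan starts at op 6 and stops at the gap before op 7,
        // so do_view_change_op_max() would discard ops 7..9 via remove_entries_from():
        try std.testing.expectEqual(@as(usize, 6), op_before_gap(&journal, 6, 9));
    }
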
+        fn start_view_as_the_new_leader(self: *Self) void {
+            assert(self.status == .view_change);
+            assert(self.leader_index(self.view) == self.replica);
+            assert(self.do_view_change_quorum);
             assert(!self.repairing_pipeline);

             assert(self.commit_min == self.commit_max);
@@ -4630,6 +5455,9 @@ pub fn Replica(
         fn transition_to_normal_from_recovering_status(self: *Self, new_view: u32) void {
             assert(self.status == .recovering);
             assert(self.view == 0);
+            assert(!self.committing);
+            assert(self.replica_count > 1 or new_view == 0);
+            assert(self.journal.header_with_op(self.op) != null);
             self.view = new_view;
             self.view_normal = new_view;
             self.status = .normal;
@@ -4679,6 +5507,7 @@ pub fn Replica(
             // For example, this could happen after a state transfer triggered by an op jump.
             assert(self.status == .view_change);
             assert(new_view >= self.view);
+            assert(self.journal.header_with_op(self.op) != null);
             self.view = new_view;
             self.view_normal = new_view;
             self.status = .normal;
@@ -4724,6 +5553,7 @@ pub fn Replica(
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();
+            self.reset_quorum_prepare_ok();

             assert(self.start_view_change_quorum == false);
             assert(self.do_view_change_quorum == false);
@@ -4763,6 +5593,7 @@ pub fn Replica(
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();
+            self.reset_quorum_prepare_ok();

             assert(self.start_view_change_quorum == false);
             assert(self.do_view_change_quorum == false);
@@ -4780,7 +5611,7 @@ pub fn Replica(
             assert(reply.header.commit > 0);
             assert(reply.header.request > 0);

-            if (self.client_table.getPtr(reply.header.client)) |entry| {
+            if (self.client_table().get(reply.header.client)) |entry| {
                 assert(entry.reply.header.command == .reply);
                 assert(entry.reply.header.context == 0);
                 assert(entry.reply.header.op == entry.reply.header.commit);
@@ -4868,12 +5699,16 @@ pub fn Replica(
             }
         }

         fn verify_pipeline(self: *Self) void {
+            assert(self.status == .view_change);
+
             var op = self.commit_max + 1;
             var parent = self.journal.header_with_op(self.commit_max).?.checksum;

             var iterator = self.pipeline.iterator();
             while (iterator.next_ptr()) |prepare| {
                 assert(prepare.message.header.command == .prepare);
+                assert(!prepare.ok_quorum_received);
+                assert(prepare.ok_from_all_replicas.count() == 0);

                 log.debug("{}: verify_pipeline: op={} checksum={x} parent={x}", .{
                     self.replica,
@@ -4971,6 +5806,12 @@ pub fn Replica(
             assert(message.header.view <= self.view);
             assert(message.header.op <= self.op);

+            if (message.header.op == self.op_checkpoint) {
+                assert(message.header.op == 0);
+            } else {
+                assert(message.header.op > self.op_checkpoint);
+            }
+
             if (!self.journal.has(message.header)) {
                 log.debug("{}: write_prepare: ignoring op={} checksum={} (header changed)", .{
                     self.replica,
@@ -5013,3 +5854,113 @@ pub fn Replica(
         }
     };
 }
+
+/// Initialize the TigerBeetle replica's data file.
+pub fn format(
+    comptime Storage: type,
+    allocator: std.mem.Allocator,
+    cluster: u32,
+    replica: u8,
+    storage: *Storage,
+    superblock: *vsr.SuperBlockType(Storage),
+) !void {
+    const ReplicaFormat = ReplicaFormatType(Storage);
+    var replica_format = ReplicaFormat{};
+
+    try replica_format.format_wal(allocator, cluster, storage);
+    assert(!replica_format.formatting);
+
+    superblock.format(
+        ReplicaFormat.format_superblock_callback,
+        &replica_format.superblock_context,
+        .{
+            .cluster = cluster,
+            .replica = replica,
+            .size_max = config.size_max, // This can later become a runtime arg, to cap storage.
+        },
+    );
+
+    replica_format.formatting = true;
+    while (replica_format.formatting) storage.tick();
+}
+
+fn ReplicaFormatType(comptime Storage: type) type {
+    const SuperBlock = vsr.SuperBlockType(Storage);
+    return struct {
+        const Self = @This();
+
+        formatting: bool = false,
+        superblock_context: SuperBlock.Context = undefined,
+        wal_write: Storage.Write = undefined,
+
+        fn format_wal(
+            self: *Self,
+            allocator: std.mem.Allocator,
+            cluster: u32,
+            storage: *Storage,
+        ) !void {
+            const header_zeroes = [_]u8{0} ** @sizeOf(Header);
+            const wal_write_size_max = 4 * 1024 * 1024;
+            assert(wal_write_size_max % config.sector_size == 0);
+
+            // Direct I/O requires the buffer to be sector-aligned.
+            var wal_buffer = try allocator.allocAdvanced(
+                u8,
+                config.sector_size,
+                wal_write_size_max,
+                .exact,
+            );
+            defer allocator.free(wal_buffer);
+
+            // The logical offset *within the WAL*.
+            var wal_offset: u64 = 0;
+            while (wal_offset < config.journal_size_max) {
+                const size = format_journal(cluster, wal_offset, wal_buffer);
+                assert(size % config.sector_size == 0);
+                assert(size > 0);
+
+                for (std.mem.bytesAsSlice(Header, wal_buffer[0..size])) |*header| {
+                    if (std.mem.eql(u8, std.mem.asBytes(header), &header_zeroes)) {
+                        // This is the (empty) body of a reserved or root Prepare.
+                    } else {
+                        // This is either a Prepare's header or a redundant header.
+                        assert(header.valid_checksum());
+                        if (header.op == 0) {
+                            assert(header.command == .prepare);
+                            assert(header.operation == .root);
+                        } else {
+                            assert(header.command == .reserved);
+                            assert(header.operation == .reserved);
+                        }
+                    }
+                }
+
+                storage.write_sectors(
+                    format_wal_sectors_callback,
+                    &self.wal_write,
+                    wal_buffer[0..size],
+                    .wal,
+                    wal_offset,
+                );
+                self.formatting = true;
+                while (self.formatting) storage.tick();
+                wal_offset += size;
+            }
+
+            // There is nothing left to write.
+            assert(format_journal(cluster, wal_offset, wal_buffer) == 0);
+        }
+
+        fn format_wal_sectors_callback(write: *Storage.Write) void {
+            const self = @fieldParentPtr(Self, "wal_write", write);
+            assert(self.formatting);
+            self.formatting = false;
+        }
+
+        fn format_superblock_callback(superblock_context: *SuperBlock.Context) void {
+            const self = @fieldParentPtr(Self, "superblock_context", superblock_context);
+            assert(self.formatting);
+            self.formatting = false;
+        }
+    };
+}
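
Both format_wal() and format() rely on the same completion convention: submit one storage operation, then poll `storage.tick()` until the callback clears `formatting`. A minimal sketch of that pattern follows; `FakeStorage` and `on_write` are hypothetical (the real `Storage` performs actual asynchronous I/O), so this illustrates the control flow only.

    const std = @import("std");
    const assert = std.debug.assert;

    // Hypothetical single-slot storage: completions fire from tick(), never inline.
    const FakeStorage = struct {
        pending: ?fn (*FakeStorage) void = null,

        fn write_sectors(self: *FakeStorage, callback: fn (*FakeStorage) void) void {
            assert(self.pending == null); // Only one WAL write in flight at a time.
            self.pending = callback;
        }

        fn tick(self: *FakeStorage) void {
            if (self.pending) |callback| {
                self.pending = null;
                callback(self);
            }
        }
    };

    var formatting: bool = false;

    fn on_write(_: *FakeStorage) void {
        assert(formatting);
        formatting = false; // The callback is the only thing that ends the polling loop.
    }

    test "tick until the write completes" {
        var storage = FakeStorage{};
        storage.write_sectors(on_write);
        formatting = true;
        while (formatting) storage.tick(); // Mirrors format()'s polling loop.
        assert(storage.pending == null);
    }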