tigerbeetle-node 0.11.8 → 0.11.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/dist/.client.node.sha256 +1 -1
  2. package/package.json +4 -3
  3. package/scripts/build_lib.sh +41 -0
  4. package/src/node.zig +1 -1
  5. package/src/tigerbeetle/scripts/validate_docs.sh +7 -1
  6. package/src/tigerbeetle/src/benchmark.zig +3 -3
  7. package/src/tigerbeetle/src/config.zig +31 -16
  8. package/src/tigerbeetle/src/constants.zig +48 -9
  9. package/src/tigerbeetle/src/ewah.zig +5 -5
  10. package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
  11. package/src/tigerbeetle/src/lsm/binary_search.zig +1 -1
  12. package/src/tigerbeetle/src/lsm/bloom_filter.zig +1 -1
  13. package/src/tigerbeetle/src/lsm/compaction.zig +34 -21
  14. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +84 -104
  15. package/src/tigerbeetle/src/lsm/grid.zig +19 -13
  16. package/src/tigerbeetle/src/lsm/manifest_log.zig +8 -10
  17. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +18 -13
  18. package/src/tigerbeetle/src/lsm/merge_iterator.zig +1 -1
  19. package/src/tigerbeetle/src/lsm/segmented_array.zig +17 -17
  20. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +1 -1
  21. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +1 -1
  22. package/src/tigerbeetle/src/lsm/table.zig +8 -20
  23. package/src/tigerbeetle/src/lsm/table_immutable.zig +1 -1
  24. package/src/tigerbeetle/src/lsm/table_iterator.zig +3 -3
  25. package/src/tigerbeetle/src/lsm/table_mutable.zig +14 -2
  26. package/src/tigerbeetle/src/lsm/test.zig +5 -4
  27. package/src/tigerbeetle/src/lsm/tree.zig +1 -2
  28. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +85 -115
  29. package/src/tigerbeetle/src/message_bus.zig +4 -4
  30. package/src/tigerbeetle/src/message_pool.zig +7 -10
  31. package/src/tigerbeetle/src/ring_buffer.zig +22 -12
  32. package/src/tigerbeetle/src/simulator.zig +366 -239
  33. package/src/tigerbeetle/src/state_machine/auditor.zig +5 -5
  34. package/src/tigerbeetle/src/state_machine/workload.zig +3 -3
  35. package/src/tigerbeetle/src/state_machine.zig +190 -178
  36. package/src/tigerbeetle/src/{util.zig → stdx.zig} +2 -0
  37. package/src/tigerbeetle/src/storage.zig +13 -6
  38. package/src/tigerbeetle/src/{test → testing/cluster}/message_bus.zig +3 -3
  39. package/src/tigerbeetle/src/{test → testing/cluster}/network.zig +46 -22
  40. package/src/tigerbeetle/src/testing/cluster/state_checker.zig +169 -0
  41. package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +202 -0
  42. package/src/tigerbeetle/src/testing/cluster.zig +443 -0
  43. package/src/tigerbeetle/src/{test → testing}/fuzz.zig +0 -0
  44. package/src/tigerbeetle/src/testing/hash_log.zig +66 -0
  45. package/src/tigerbeetle/src/{test → testing}/id.zig +0 -0
  46. package/src/tigerbeetle/src/testing/packet_simulator.zig +365 -0
  47. package/src/tigerbeetle/src/{test → testing}/priority_queue.zig +1 -1
  48. package/src/tigerbeetle/src/testing/reply_sequence.zig +139 -0
  49. package/src/tigerbeetle/src/{test → testing}/state_machine.zig +3 -1
  50. package/src/tigerbeetle/src/testing/storage.zig +757 -0
  51. package/src/tigerbeetle/src/{test → testing}/table.zig +21 -0
  52. package/src/tigerbeetle/src/{test → testing}/time.zig +0 -0
  53. package/src/tigerbeetle/src/tigerbeetle.zig +2 -0
  54. package/src/tigerbeetle/src/tracer.zig +3 -3
  55. package/src/tigerbeetle/src/unit_tests.zig +4 -4
  56. package/src/tigerbeetle/src/vopr.zig +2 -2
  57. package/src/tigerbeetle/src/vsr/client.zig +5 -2
  58. package/src/tigerbeetle/src/vsr/clock.zig +93 -53
  59. package/src/tigerbeetle/src/vsr/journal.zig +109 -98
  60. package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +2 -2
  61. package/src/tigerbeetle/src/vsr/replica.zig +1983 -1430
  62. package/src/tigerbeetle/src/vsr/replica_format.zig +13 -13
  63. package/src/tigerbeetle/src/vsr/superblock.zig +240 -142
  64. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -7
  65. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +1 -1
  66. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
  67. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +49 -14
  68. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +38 -19
  69. package/src/tigerbeetle/src/vsr/superblock_quorums.zig +48 -48
  70. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +51 -51
  71. package/src/tigerbeetle/src/vsr.zig +99 -33
  72. package/src/tigerbeetle/src/demo.zig +0 -132
  73. package/src/tigerbeetle/src/demo_01_create_accounts.zig +0 -35
  74. package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +0 -7
  75. package/src/tigerbeetle/src/demo_03_create_transfers.zig +0 -37
  76. package/src/tigerbeetle/src/demo_04_create_pending_transfers.zig +0 -61
  77. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +0 -37
  78. package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +0 -24
  79. package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +0 -7
  80. package/src/tigerbeetle/src/test/cluster.zig +0 -352
  81. package/src/tigerbeetle/src/test/conductor.zig +0 -366
  82. package/src/tigerbeetle/src/test/packet_simulator.zig +0 -398
  83. package/src/tigerbeetle/src/test/state_checker.zig +0 -169
  84. package/src/tigerbeetle/src/test/storage.zig +0 -864
  85. package/src/tigerbeetle/src/test/storage_checker.zig +0 -204
@@ -3,6 +3,7 @@ const Allocator = std.mem.Allocator;
3
3
  const assert = std.debug.assert;
4
4
 
5
5
  const constants = @import("../constants.zig");
6
+ const stdx = @import("../stdx.zig");
6
7
 
7
8
  const StaticAllocator = @import("../static_allocator.zig");
8
9
  const GridType = @import("../lsm/grid.zig").GridType;
@@ -24,26 +25,8 @@ const tracer = @import("../tracer.zig");
24
25
  pub const Status = enum {
25
26
  normal,
26
27
  view_change,
27
- // Recovery (for replica_count > 1):
28
- //
29
- // 1. Open the replica:
30
- // a. At replica start: `status=recovering`.
31
- // b. Recover the WAL. Mark questionable entries as faulty.
32
- // c. If the WAL has no entries (besides the initial commit), skip to step 3 with view 0.
33
- // 2. Run VSR recovery protocol:
34
- // a. Send a `recovery` message to every replica (except self).
35
- // b. Wait for f+1 `recovery_response` messages from replicas in `normal` status.
36
- // Each `recovery_response` includes the current view number.
37
- // Each `recovery_response` must include a nonce matching the `recovery` message.
38
- // c. Wait for a `recovery_response` from the primary of the highest known view.
39
- // 3. Transition to `status=normal` with the discovered view number:
40
- // * Set `op` to the highest op in the primary's recovery response.
41
- // * Repair faulty messages.
42
- // * Commit through to the discovered `commit_max`.
43
- // * Set `state_machine.prepare_timeout` to the current op's timestamp.
44
- //
45
- // TODO Document state transfer in this progression.
46
28
  recovering,
29
+ recovering_head,
47
30
  };
48
31
 
49
32
  const Nonce = u128;
@@ -59,27 +42,17 @@ const Prepare = struct {
59
42
  ok_quorum_received: bool = false,
60
43
  };
61
44
 
45
+ const Request = struct {
46
+ message: *Message, // header.command == .request
47
+ realtime: i64,
48
+ };
49
+
62
50
  const QuorumMessages = [constants.replicas_max]?*Message;
63
51
  const quorum_messages_null = [_]?*Message{null} ** constants.replicas_max;
64
52
 
65
53
  const QuorumCounter = std.StaticBitSet(constants.replicas_max);
66
54
  const quorum_counter_null = QuorumCounter.initEmpty();
67
55
 
68
- // CRITICAL: The number of prepare headers to include in the body:
69
- // We must provide enough headers to cover all uncommitted headers so that the new
70
- // primary (if we are in a view change) can decide whether to discard uncommitted headers
71
- // that cannot be repaired because they are gaps, and this must be relative to the
72
- // cluster as a whole (not relative to the difference between our op and commit number)
73
- // as otherwise we would break correctness.
74
- const view_change_headers_count = constants.pipeline_max;
75
-
76
- comptime {
77
- assert(view_change_headers_count > 0);
78
- assert(view_change_headers_count >= constants.pipeline_max);
79
- assert(view_change_headers_count <=
80
- @divFloor(constants.message_size_max - @sizeOf(Header), @sizeOf(Header)));
81
- }
82
-
83
56
  pub fn ReplicaType(
84
57
  comptime StateMachine: type,
85
58
  comptime MessageBus: type,
@@ -129,22 +102,44 @@ pub fn ReplicaType(
129
102
  /// For executing service up-calls after an operation has been committed:
130
103
  state_machine: StateMachine,
131
104
 
132
- // TODO Document.
105
+ /// Durably store VSR state, the "root" of the LSM tree, and other replica metadata.
133
106
  superblock: SuperBlock,
107
+
108
+ /// Context for SuperBlock.open() and .checkpoint().
134
109
  superblock_context: SuperBlock.Context = undefined,
110
+ /// Context for SuperBlock.view_change().
111
+ superblock_context_view_change: SuperBlock.Context = undefined,
112
+
135
113
  grid: Grid,
136
114
  opened: bool,
137
115
 
138
- /// The current view, initially 0:
116
+ /// The current view.
117
+ /// Initialized from the superblock's VSRState.
118
+ ///
119
+ /// Invariants:
120
+ /// * `replica.view = replica.log_view` when status=normal
121
+ /// * `replica.view ≥ replica.log_view`
122
+ /// * `replica.view ≥ replica.view_durable`
123
+ /// * `replica.view = 0` when replica_count=1.
139
124
  view: u32,
140
125
 
141
- /// The latest view, in which the replica's status was normal.
142
- view_normal: u32,
126
+ /// The latest view where
127
+ /// - the replica was a primary and acquired a DVC quorum, or
128
+ /// - the replica was a backup and processed a SV message.
129
+ /// i.e. the latest view in which this replica changed its head message.
130
+ ///
131
+ /// Initialized from the superblock's VSRState.
132
+ ///
133
+ /// Invariants (see `view` for others):
134
+ /// * `replica.log_view ≥ replica.log_view_durable`
135
+ /// * `replica.log_view = 0` when replica_count=1.
136
+ log_view: u32,
143
137
 
144
138
  /// The current status, either normal, view_change, or recovering:
145
139
  status: Status = .recovering,
146
140
 
147
141
  /// The op number assigned to the most recently prepared operation.
142
+ /// This op is sometimes referred to as the replica's "head" or "head op".
148
143
  ///
149
144
  /// Invariants (not applicable during status=recovering):
150
145
  /// * `replica.op` exists in the Journal.
@@ -159,10 +154,6 @@ pub fn ReplicaType(
159
154
  // Also verify that a corresponding header exists in the WAL.
160
155
  op: u64,
161
156
 
162
- /// The op of the highest checkpointed message.
163
- // TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
164
- op_checkpoint: u64,
165
-
166
157
  /// The op number of the latest committed and executed operation (according to the replica):
167
158
  /// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
168
159
  ///
@@ -190,15 +181,20 @@ pub fn ReplicaType(
190
181
  /// * checkpointing
191
182
  committing: bool = false,
192
183
 
193
- /// Whether we are reading a prepare from storage in order to push to the pipeline.
194
- repairing_pipeline: bool = false,
195
-
196
- /// The primary's pipeline of inflight prepares waiting to commit in FIFO order.
197
- /// This allows us to pipeline without the complexity of out-of-order commits.
198
- ///
199
- /// After a view change, the old primary's pipeline is left untouched so that it is able to
200
- /// help the new primary repair, even in the face of local storage faults.
201
- pipeline: RingBuffer(Prepare, constants.pipeline_max, .array) = .{},
184
+ /// Whether we are reading a prepare from storage to construct the pipeline.
185
+ pipeline_repairing: bool = false,
186
+
187
+ /// The pipeline is a queue for a replica which is the primary and in status=normal.
188
+ /// At all other times the pipeline is a cache.
189
+ pipeline: union(enum) {
190
+ /// The primary's pipeline of inflight prepares waiting to commit in FIFO order,
191
+ /// with a tail of pending requests which have not begun to prepare.
192
+ /// This allows us to pipeline without the complexity of out-of-order commits.
193
+ queue: PipelineQueue,
194
+ /// Prepares in the cache may be committed or uncommitted, and may not belong to the
195
+ /// current view.
196
+ cache: PipelineCache,
197
+ },
202
198
 
203
199
  /// In some cases, a replica may send a message to itself. We do not submit these messages
204
200
  /// to the message bus but rather queue them here for guaranteed immediate delivery, which
@@ -214,9 +210,6 @@ pub fn ReplicaType(
214
210
  /// Unique nack_prepare messages for the same view from OTHER replicas (excluding ourself).
215
211
  nack_prepare_from_other_replicas: QuorumCounter = quorum_counter_null,
216
212
 
217
- /// Unique recovery_response messages from OTHER replicas (excluding ourself).
218
- recovery_response_from_other_replicas: QuorumMessages = quorum_messages_null,
219
-
220
213
  /// Whether a replica has received a quorum of start_view_change messages for the view change:
221
214
  start_view_change_quorum: bool = false,
222
215
 
@@ -254,9 +247,6 @@ pub fn ReplicaType(
254
247
  /// The number of ticks before repairing missing/disconnected headers and/or dirty entries:
255
248
  repair_timeout: Timeout,
256
249
 
257
- /// The number of ticks before attempting to send another set of `recovery` messages.
258
- recovery_timeout: Timeout,
259
-
260
250
  /// The nonce of the `recovery` messages.
261
251
  recovery_nonce: Nonce,
262
252
 
@@ -268,6 +258,7 @@ pub fn ReplicaType(
268
258
  /// Seeded with the replica's index number.
269
259
  prng: std.rand.DefaultPrng,
270
260
 
261
+ context: ?*anyopaque = null,
271
262
  /// Simulator hooks.
272
263
  on_change_state: ?fn (replica: *const Self) void = null,
273
264
  /// Called immediately after a compaction.
@@ -355,33 +346,83 @@ pub fn ReplicaType(
355
346
  // Open the (Forest inside) StateMachine:
356
347
  self.opened = false;
357
348
  self.state_machine.open(state_machine_open_callback);
358
- while (!self.opened) {
359
- // self.grid.tick();
360
- self.superblock.storage.tick();
361
- }
349
+ while (!self.opened) self.superblock.storage.tick();
362
350
 
363
351
  self.opened = false;
364
352
  self.journal.recover(journal_recover_callback);
365
353
  while (!self.opened) self.superblock.storage.tick();
366
354
 
367
- if (self.journal.is_empty()) {
368
- // The data file is brand new — no messages have ever been written.
369
- // Transition to normal status; no need to run the VSR recovery protocol.
370
- assert(self.journal.dirty.count == 0);
371
- assert(self.journal.faulty.count == 0);
372
- assert(self.commit_min == 0);
373
- assert(self.commit_max == 0);
374
- assert(self.op_checkpoint == 0);
375
- assert(self.op == 0);
376
- assert(self.view == 0);
377
-
378
- log.debug("{}: open: empty data file", .{self.replica});
379
- self.transition_to_normal_from_recovering_status(0);
380
- assert(self.status == .normal);
381
- } else if (self.replica_count == 1) {
382
- if (self.journal.faulty.count != 0) @panic("journal is corrupt");
355
+ const vsr_headers = self.superblock.working.vsr_headers();
356
+ var op_head: u64 = vsr_headers.slice[0].op;
357
+ for (self.journal.headers) |*header| {
358
+ if (header.command == .prepare and header.op > op_head) {
359
+ assert(self.log_view >= header.view);
360
+ assert(self.log_view == self.view);
361
+
362
+ op_head = header.op;
363
+ }
364
+ }
365
+
366
+ self.op = op_head;
367
+ for (vsr_headers.slice) |*header| {
368
+ const slot = .{ .index = header.op % constants.journal_slot_count };
369
+ if (self.journal.has(header)) {
370
+ // Header is already in the WAL.
371
+ assert(!self.journal.dirty.bit(slot));
372
+ assert(!self.journal.faulty.bit(slot));
373
+ } else if (self.journal.header_for_op(header.op)) |journal_header| {
374
+ assert(!self.journal.dirty.bit(slot));
375
+ assert(!self.journal.faulty.bit(slot));
376
+
377
+ if (header.op < journal_header.op) {
378
+ // Don't overwrite a newer op.
379
+ // (This must be a SV message because a DVC would not have a newer op).
380
+ assert(self.log_view == self.view);
381
+ } else {
382
+ self.journal.set_header_as_dirty(header);
383
+ }
384
+ } else {
385
+ assert(self.journal.dirty.bit(slot) == self.journal.faulty.bit(slot));
386
+
387
+ self.journal.headers[slot.index] = header.*;
388
+ self.journal.dirty.set(slot);
389
+ // Don't touch faulty — if it is set, we don't want to unset it. The WAL slot
390
+ // may contain a corrupt version of this op, and we don't want to incorrectly
391
+ // nack it. (This is why we do not call replace_header()/set_header_as_dirty()
392
+ // here.)
393
+ }
394
+ }
395
+
396
+ const header_head = self.journal.header_with_op(self.op).?;
397
+ assert(header_head.view <= self.superblock.working.vsr_state.log_view);
398
+
399
+ if (self.replica_count == 1) {
400
+ if (self.journal.faulty.count > 0) {
401
+ @panic("journal is corrupt");
402
+ }
403
+ assert(self.op_head_certain());
404
+
405
+ if (self.commit_min < self.op) {
406
+ self.commit_journal(self.op);
407
+ } else {
408
+ self.transition_to_normal_from_recovering_status();
409
+ }
383
410
  } else {
384
- assert(self.status == .recovering);
411
+ // Even if op_head_certain() returns false, a DVC always has a certain head op.
412
+ if (self.log_view < self.view or self.op_head_certain()) {
413
+ if (self.log_view == self.view) {
414
+ if (self.primary_index(self.view) == self.replica) {
415
+ self.transition_to_view_change_status(self.view + 1);
416
+ } else {
417
+ self.transition_to_normal_from_recovering_status();
418
+ }
419
+ } else {
420
+ assert(self.view > self.log_view);
421
+ self.transition_to_view_change_status(self.view);
422
+ }
423
+ } else {
424
+ self.transition_to_recovering_head();
425
+ }
385
426
  }
386
427
  }
387
428
 
@@ -504,11 +545,11 @@ pub fn ReplicaType(
504
545
  .grid = self.grid,
505
546
  .opened = self.opened,
506
547
  .view = self.superblock.working.vsr_state.view,
507
- .view_normal = self.superblock.working.vsr_state.view_normal,
548
+ .log_view = self.superblock.working.vsr_state.log_view,
508
549
  .op = 0,
509
- .op_checkpoint = self.superblock.working.vsr_state.commit_min,
510
550
  .commit_min = self.superblock.working.vsr_state.commit_min,
511
551
  .commit_max = self.superblock.working.vsr_state.commit_max,
552
+ .pipeline = .{ .cache = .{} },
512
553
  .ping_timeout = Timeout{
513
554
  .name = "ping_timeout",
514
555
  .id = replica_index,
@@ -544,11 +585,6 @@ pub fn ReplicaType(
544
585
  .id = replica_index,
545
586
  .after = 50,
546
587
  },
547
- .recovery_timeout = Timeout{
548
- .name = "recovery_timeout",
549
- .id = replica_index,
550
- .after = 200,
551
- },
552
588
  .recovery_nonce = recovery_nonce,
553
589
  .prng = std.rand.DefaultPrng.init(replica_index),
554
590
  };
@@ -586,7 +622,11 @@ pub fn ReplicaType(
586
622
  self.grid.deinit(allocator);
587
623
  defer self.message_bus.deinit(allocator);
588
624
 
589
- while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
625
+ // TODO(Zig) 0.10: inline-switch.
626
+ switch (self.pipeline) {
627
+ .queue => |*pipeline| pipeline.deinit(self.message_bus.pool),
628
+ .cache => |*pipeline| pipeline.deinit(self.message_bus.pool),
629
+ }
590
630
 
591
631
  if (self.loopback_queue) |loopback_message| {
592
632
  assert(loopback_message.next == null);
@@ -606,10 +646,6 @@ pub fn ReplicaType(
606
646
  for (self.do_view_change_from_all_replicas) |message| {
607
647
  if (message) |m| self.message_bus.unref(m);
608
648
  }
609
-
610
- for (self.recovery_response_from_other_replicas) |message| {
611
- if (message) |m| self.message_bus.unref(m);
612
- }
613
649
  }
614
650
 
615
651
  /// The client table records for each client the latest session and the latest committed reply.
@@ -629,36 +665,8 @@ pub fn ReplicaType(
629
665
 
630
666
  // TODO Replica owns Time; should it tick() here instead of Clock?
631
667
  self.clock.tick();
632
- // self.grid.tick();
633
668
  self.message_bus.tick();
634
669
 
635
- if (self.status == .recovering) {
636
- if (self.recovery_timeout.ticking) {
637
- // Continue running the VSR recovery protocol.
638
- self.recovery_timeout.tick();
639
- if (self.recovery_timeout.fired()) self.on_recovery_timeout();
640
- } else if (self.replica_count == 1) {
641
- // A cluster-of-one does not run the VSR recovery protocol.
642
- if (self.committing) return;
643
- assert(self.journal.faulty.count == 0);
644
- assert(self.op == 0);
645
- // TODO Assert that this path isn't taken more than once.
646
- self.op = self.journal.op_maximum();
647
- assert(self.op >= self.commit_min);
648
- assert(self.op >= self.op_checkpoint);
649
- assert(self.op <= self.op_checkpoint_trigger());
650
- assert(self.journal.header_with_op(self.op) != null);
651
- self.commit_journal(self.op);
652
- // The recovering→normal transition is deferred until all ops are committed.
653
- } else {
654
- // The journal just finished recovery.
655
- // Now try to learn the current view via the VSR recovery protocol.
656
- self.recovery_timeout.start();
657
- self.recover();
658
- }
659
- return;
660
- }
661
-
662
670
  self.ping_timeout.tick();
663
671
  self.prepare_timeout.tick();
664
672
  self.commit_timeout.tick();
@@ -725,8 +733,6 @@ pub fn ReplicaType(
725
733
  .start_view_change => self.on_start_view_change(message),
726
734
  .do_view_change => self.on_do_view_change(message),
727
735
  .start_view => self.on_start_view(message),
728
- .recovery => self.on_recovery(message),
729
- .recovery_response => self.on_recovery_response(message),
730
736
  .request_start_view => self.on_request_start_view(message),
731
737
  .request_prepare => self.on_request_prepare(message),
732
738
  .request_headers => self.on_request_headers(message),
@@ -807,18 +813,22 @@ pub fn ReplicaType(
807
813
  self.clock.learn(message.header.replica, m0, t1, m2);
808
814
  }
809
815
 
810
- /// The primary advances op-number, adds the request to the end of the log, and updates the
811
- /// information for this client in the client-table to contain the new request number, s.
812
- /// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the
813
- /// current view-number, m is the message it received from the client, n is the op-number
814
- /// it assigned to the request, and k is the commit-number.
816
+ /// When there is free space in the pipeline's prepare queue:
817
+ /// The primary advances op-number, adds the request to the end of the log, and updates the
818
+ /// information for this client in the client-table to contain the new request number, s.
819
+ /// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the
820
+ /// current view-number, m is the message it received from the client, n is the op-number
821
+ /// it assigned to the request, and k is the commit-number.
822
+ /// Otherwise, when there is room in the pipeline's request queue:
823
+ /// The request is queued, and will be dequeued & prepared when the pipeline head commits.
824
+ /// Otherwise, drop the request.
815
825
  fn on_request(self: *Self, message: *Message) void {
816
826
  if (self.ignore_request_message(message)) return;
817
827
 
818
828
  assert(self.status == .normal);
819
829
  assert(self.primary());
820
830
  assert(self.commit_min == self.commit_max);
821
- assert(self.commit_max + self.pipeline.count == self.op);
831
+ assert(self.commit_max + self.pipeline.queue.prepare_queue.count == self.op);
822
832
 
823
833
  assert(message.header.command == .request);
824
834
  assert(message.header.view <= self.view); // The client's view may be behind ours.
@@ -828,59 +838,16 @@ pub fn ReplicaType(
828
838
  return;
829
839
  };
830
840
 
831
- log.debug("{}: on_request: request {}", .{ self.replica, message.header.checksum });
832
-
833
- // Guard against the wall clock going backwards by taking the max with timestamps issued:
834
- self.state_machine.prepare_timestamp = std.math.max(
835
- // The cluster `commit_timestamp` may be ahead of our `prepare_timestamp` because this
836
- // may be our first prepare as a recently elected primary:
837
- std.math.max(
838
- self.state_machine.prepare_timestamp,
839
- self.state_machine.commit_timestamp,
840
- ) + 1,
841
- @intCast(u64, realtime),
842
- );
843
- assert(self.state_machine.prepare_timestamp > self.state_machine.commit_timestamp);
844
-
845
- const prepare_timestamp = self.state_machine.prepare(
846
- message.header.operation.cast(StateMachine),
847
- message.body(),
848
- );
849
-
850
- const latest_entry = self.journal.header_with_op(self.op).?;
851
- message.header.parent = latest_entry.checksum;
852
- message.header.context = message.header.checksum;
853
- message.header.view = self.view;
854
- message.header.op = self.op + 1;
855
- message.header.commit = self.commit_max;
856
- message.header.timestamp = prepare_timestamp;
857
- message.header.replica = self.replica;
858
- message.header.command = .prepare;
859
-
860
- message.header.set_checksum_body(message.body());
861
- message.header.set_checksum();
862
-
863
- log.debug("{}: on_request: prepare {}", .{ self.replica, message.header.checksum });
864
-
865
- self.pipeline.push_assume_capacity(.{ .message = message.ref() });
866
- assert(self.pipeline.count >= 1);
841
+ const request = .{
842
+ .message = message.ref(),
843
+ .realtime = realtime,
844
+ };
867
845
 
868
- if (self.pipeline.count == 1) {
869
- // This is the only prepare in the pipeline, start the timeout:
870
- assert(!self.prepare_timeout.ticking);
871
- self.prepare_timeout.start();
846
+ if (self.pipeline.queue.prepare_queue.full()) {
847
+ self.pipeline.queue.push_request(request);
872
848
  } else {
873
- // Do not restart the prepare timeout as it is already ticking for another prepare.
874
- assert(self.prepare_timeout.ticking);
875
- const previous = self.pipeline.get_ptr(self.pipeline.count - 2).?;
876
- assert(previous.message.header.checksum == message.header.parent);
849
+ self.primary_pipeline_prepare(request);
877
850
  }
878
-
879
- self.on_prepare(message);
880
-
881
- // We expect `on_prepare()` to increment `self.op` to match the primary's latest prepare:
882
- // This is critical to ensure that pipelined prepares do not receive the same op number.
883
- assert(self.op == message.header.op);
884
851
  }
885
852
 
886
853
  /// Replication is simple, with a single code path for the primary and backups.
@@ -937,7 +904,7 @@ pub fn ReplicaType(
937
904
  log.debug("{}: on_prepare: ignoring op={} (too far ahead, checkpoint={})", .{
938
905
  self.replica,
939
906
  message.header.op,
940
- self.op_checkpoint,
907
+ self.op_checkpoint(),
941
908
  });
942
909
  // When we are the primary, `on_request` enforces this invariant.
943
910
  assert(self.backup());
@@ -948,7 +915,7 @@ pub fn ReplicaType(
948
915
  assert(message.header.view == self.view);
949
916
  assert(self.primary() or self.backup());
950
917
  assert(message.header.replica == self.primary_index(message.header.view));
951
- assert(message.header.op > self.op_checkpoint);
918
+ assert(message.header.op > self.op_checkpoint());
952
919
  assert(message.header.op > self.op);
953
920
  assert(message.header.op > self.commit_min);
954
921
  assert(message.header.op <= self.op_checkpoint_trigger());
@@ -998,11 +965,20 @@ pub fn ReplicaType(
998
965
  assert(message.header.view == self.view);
999
966
  assert(self.primary());
1000
967
 
1001
- const prepare = self.pipeline_prepare_for_prepare_ok(message) orelse return;
968
+ const prepare = self.pipeline.queue.prepare_by_prepare_ok(message) orelse {
969
+ // This can be normal, for example, if an old prepare_ok is replayed.
970
+ log.debug("{}: on_prepare_ok: not preparing ok={} checksum={}", .{
971
+ self.replica,
972
+ message.header.op,
973
+ message.header.context,
974
+ });
975
+ return;
976
+ };
1002
977
 
1003
978
  assert(prepare.message.header.checksum == message.header.context);
1004
979
  assert(prepare.message.header.op >= self.commit_max + 1);
1005
- assert(prepare.message.header.op <= self.commit_max + self.pipeline.count);
980
+ assert(prepare.message.header.op <= self.commit_max +
981
+ self.pipeline.queue.prepare_queue.count);
1006
982
  assert(prepare.message.header.op <= self.op);
1007
983
 
1008
984
  // Wait until we have `f + 1` prepare_ok messages (including ourself) for quorum:
@@ -1010,7 +986,7 @@ pub fn ReplicaType(
1010
986
  // TODO: When Block recover & state transfer are implemented, this can be removed.
1011
987
  const threshold =
1012
988
  if (prepare.message.header.op == self.op_checkpoint_trigger() or
1013
- prepare.message.header.op == self.op_checkpoint + constants.lsm_batch_multiple + 1)
989
+ prepare.message.header.op == self.op_checkpoint() + constants.lsm_batch_multiple + 1)
1014
990
  self.replica_count
1015
991
  else
1016
992
  self.quorum_replication;
@@ -1199,28 +1175,6 @@ pub fn ReplicaType(
1199
1175
  /// informs the other replicas of the completion of the view change by sending
1200
1176
  /// ⟨start_view v, l, n, k⟩ messages to the other replicas, where l is the new log, n is the
1201
1177
  /// op number, and k is the commit number.
1202
- ///
1203
- /// For each DVC in the quorum:
1204
- ///
1205
- /// * The headers must all belong to the same hash chain. (Gaps are allowed).
1206
- /// (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
1207
- /// loaded into the new primary with `replace_header()`, not `repair_header()`).
1208
- ///
1209
- /// Across all DVCs in the quorum:
1210
- ///
1211
- /// * The headers of every DVC with the same view_normal must agree. In other words:
1212
- /// dvc₁.headers[i].op == dvc₂.headers[j].op implies
1213
- /// dvc₁.headers[i].checksum == dvc₂.headers[j].checksum.
1214
- /// (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
1215
- /// loaded into the new primary with `replace_header()`, not `repair_header()`).
1216
- ///
1217
- /// Perhaps unintuitively, it is safe to advertise a header before its message is prepared
1218
- /// (e.g. the write is still queued). The header is either:
1219
- ///
1220
- /// * committed — so another replica in the quorum must have a copy, according to the quorum
1221
- /// intersection property. Or,
1222
- /// * uncommitted — if the header is chosen, but cannot be recovered from any replica, then
1223
- /// it will be discarded by the nack protocol.
1224
1178
  fn on_do_view_change(self: *Self, message: *Message) void {
1225
1179
  if (self.ignore_view_change_message(message)) return;
1226
1180
 
@@ -1255,6 +1209,7 @@ pub fn ReplicaType(
1255
1209
 
1256
1210
  assert(count == threshold);
1257
1211
  assert(self.do_view_change_from_all_replicas[self.replica] != null);
1212
+ DVCQuorum.verify(self.do_view_change_from_all_replicas);
1258
1213
  log.debug("{}: on_do_view_change: view={} quorum received", .{
1259
1214
  self.replica,
1260
1215
  self.view,
@@ -1265,6 +1220,13 @@ pub fn ReplicaType(
1265
1220
  self.do_view_change_quorum = true;
1266
1221
 
1267
1222
  self.primary_set_log_from_do_view_change_messages();
1223
+ // We aren't status=normal yet, but our headers from our prior log_view may have been
1224
+ // replaced. If we participate in another DVC (before reaching status=normal, which
1225
+ // would update our log_view), we must disambiguate our (new) headers from the
1226
+ // headers of any other replica with the same log_view so that the next primary can
1227
+ // identify an unambiguous set of canonical headers.
1228
+ self.log_view = self.view;
1229
+
1268
1230
  assert(self.op >= self.commit_max);
1269
1231
  assert(self.state_machine.prepare_timestamp >=
1270
1232
  self.journal.header_with_op(self.op).?.timestamp);
@@ -1295,7 +1257,9 @@ pub fn ReplicaType(
1295
1257
  unreachable;
1296
1258
  }
1297
1259
 
1298
- assert(self.status == .view_change or self.status == .normal);
1260
+ assert(self.status == .view_change or
1261
+ self.status == .normal or
1262
+ self.status == .recovering_head);
1299
1263
  assert(message.header.view >= self.view);
1300
1264
  assert(message.header.replica != self.replica);
1301
1265
  assert(message.header.replica == self.primary_index(message.header.view));
@@ -1307,13 +1271,23 @@ pub fn ReplicaType(
1307
1271
  assert(message.header.op == op_highest(message_body_as_headers(message)));
1308
1272
 
1309
1273
  self.set_op_and_commit_max(message.header.op, message.header.commit, "on_start_view");
1310
- self.replace_headers(message_body_as_headers(message));
1274
+ for (message_body_as_headers_chain_consecutive(message)) |*header| {
1275
+ self.replace_header(header);
1276
+ }
1311
1277
 
1312
1278
  assert(self.op == message.header.op);
1313
1279
 
1314
- if (self.status == .view_change) {
1315
- self.transition_to_normal_from_view_change_status(message.header.view);
1316
- self.send_prepare_oks_after_view_change();
1280
+ switch (self.status) {
1281
+ .normal => {},
1282
+ .view_change => {
1283
+ self.transition_to_normal_from_view_change_status(message.header.view);
1284
+ self.send_prepare_oks_after_view_change();
1285
+ },
1286
+ .recovering_head => {
1287
+ self.transition_to_normal_from_recovering_status();
1288
+ self.send_prepare_oks_after_view_change();
1289
+ },
1290
+ .recovering => unreachable,
1317
1291
  }
1318
1292
 
1319
1293
  assert(self.status == .normal);
@@ -1329,6 +1303,7 @@ pub fn ReplicaType(
1329
1303
  if (self.ignore_repair_message(message)) return;
1330
1304
 
1331
1305
  assert(self.status == .normal);
1306
+ assert(self.view == self.log_view);
1332
1307
  assert(message.header.view == self.view);
1333
1308
  assert(message.header.replica != self.replica);
1334
1309
  assert(self.primary());
@@ -1345,391 +1320,90 @@ pub fn ReplicaType(
1345
1320
  self.send_message_to_replica(message.header.replica, start_view);
1346
1321
  }
1347
1322
 
1348
- fn on_recovery(self: *Self, message: *const Message) void {
1349
- assert(self.replica_count > 1);
1350
-
1351
- if (self.status != .normal) {
1352
- log.debug("{}: on_recovery: ignoring ({})", .{ self.replica, self.status });
1353
- return;
1354
- }
1355
-
1356
- if (message.header.replica == self.replica) {
1357
- log.warn("{}: on_recovery: ignoring (self)", .{self.replica});
1358
- return;
1359
- }
1360
-
1361
- const response = self.message_bus.get_message();
1362
- defer self.message_bus.unref(response);
1323
+ /// If the requested prepare has been guaranteed by this replica:
1324
+ /// * Read the prepare from storage, and forward it to the replica that requested it.
1325
+ /// * Otherwise send no reply — it isn't safe to nack.
1326
+ /// If the requested prepare has *not* been guaranteed by this replica, then send a nack.
1327
+ ///
1328
+ /// A prepare is considered "guaranteed" by a replica if that replica has acknowledged it
1329
+ /// to the cluster. The cluster sees the replica as an underwriter of a guaranteed
1330
+ /// prepare. If a guaranteed prepare is found to by faulty, the replica must repair it
1331
+ /// to restore durability.
1332
+ fn on_request_prepare(self: *Self, message: *const Message) void {
1333
+ if (self.ignore_repair_message(message)) return;
1363
1334
 
1364
- log.debug("{}: on_recovery: view={} op={} commit={} nonce={}", .{
1365
- self.replica,
1366
- self.view,
1367
- self.op,
1368
- self.commit_max,
1369
- message.header.context,
1370
- });
1335
+ assert(self.replica_count > 1);
1336
+ assert(self.status == .normal or self.status == .view_change);
1337
+ assert(message.header.view == self.view);
1338
+ assert(message.header.replica != self.replica);
1371
1339
 
1372
- response.header.* = .{
1373
- .command = .recovery_response,
1374
- .cluster = self.cluster,
1375
- .context = message.header.context, // Echo the request's nonce.
1376
- .replica = self.replica,
1377
- .view = self.view,
1378
- .op = self.op,
1379
- .commit = self.commit_max,
1340
+ const op = message.header.op;
1341
+ const slot = self.journal.slot_for_op(op);
1342
+ const checksum: ?u128 = switch (message.header.timestamp) {
1343
+ 0 => null,
1344
+ 1 => message.header.context,
1345
+ else => unreachable,
1380
1346
  };
1381
1347
 
1382
- // A recovery response attaches at least as many headers as a DVC message attaches.
1383
- // To understand why, consider this scenario, where:
1384
- //
1385
- // replica_count 3
1386
- // do_view_change.headers.len 3 (= pipeline_max)
1387
- // recovery_response.headers.len 2 (!)
1388
- // replica 0 log 3, 4a, 5a, 6a, 7a, 8a (status=normal, primary)
1389
- // replica 1 log 3, 4a, 5a, --, --, -- (status=normal, backup)
1390
- // replica 2 log 3, 4b, 5b, --, --, -- (status=recovering)
1391
- //
1392
- // 1. Replica 2 receives a recovery_response quorum.
1393
- // 2. Replica 2 sets `replica.op` to 8a.
1394
- // 3. Replica 2 sets its headers from the primary's recovery_response (8a, 7a)
1395
- // (via `replace_header()`).
1396
- // 4. Replica 2 transitions to status=normal.
1397
- // 5. Replica 0 fails (before replica 2 has a chance to repair its hash chain.)
1398
- // 6. Replica 1 initiates a view change.
1399
- // 7. Replica 1 collects a DVC quorum:
1400
- // replica 1: 3, 4a, 5a (view_normal=latest)
1401
- // replica 2: 5b, 7a, 8a (view_normal=latest)
1402
- // Replicas 1 and 2 share the highest view_normal, so both sets of headers are canonical.
1403
- // 8. Replica 1 loads the canonical headers (via `replace_header()`) from both DVCs.
1404
- // Messages 8a and 7a will be dropped via `do_view_change_op_max()` (due to the
1405
- // gap at op 6). But there is a conflict at op=5. For correctness, replica 1 must
1406
- // pick 5a — 5a may be committed by replica 0.
1407
- // Without replica 0's assistance, replica 1 has no way to pick between 5a/5b.
1408
- //
1409
- // Including at least as many headers in the recovery response as the DVC maintains the
1410
- // invariant: DVCs with the same view_normal must never disagree on the identity of a
1411
- // message.
1412
- //
1413
- // (DVCs can still safely include gaps — but they must be of the form [4a,__,6a],
1414
- // not [4a,__,6b]).
1415
- const count = self.copy_latest_headers_and_set_size(
1416
- 0,
1417
- self.op,
1418
- view_change_headers_count,
1419
- response,
1420
- );
1421
- assert(count > 0); // We expect that self.op always exists.
1422
- assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
1423
-
1424
- response.header.set_checksum_body(response.body());
1425
- response.header.set_checksum();
1426
-
1427
- assert(self.status == .normal);
1428
- // The checksum for a recovery message is deterministic, and cannot be used as a nonce:
1429
- assert(response.header.context != message.header.checksum);
1430
-
1431
- self.send_message_to_replica(message.header.replica, response);
1432
- }
1433
-
1434
- fn on_recovery_response(self: *Self, message: *Message) void {
1435
- assert(self.replica_count > 1);
1348
+ // Only the primary may respond to `request_prepare` messages without a checksum.
1349
+ assert(checksum != null or self.primary_index(self.view) == self.replica);
1436
1350
 
1437
- if (self.status != .recovering) {
1438
- log.debug("{}: on_recovery_response: ignoring ({})", .{
1351
+ // Try to serve the message directly from the pipeline.
1352
+ // This saves us from going to disk. And we don't need to worry that the WAL's copy
1353
+ // of an uncommitted prepare is lost/corrupted.
1354
+ if (self.pipeline_prepare_by_op_and_checksum(op, checksum)) |prepare| {
1355
+ log.debug("{}: on_request_prepare: op={} checksum={} reply from pipeline", .{
1439
1356
  self.replica,
1440
- self.status,
1357
+ op,
1358
+ checksum,
1441
1359
  });
1360
+ self.send_message_to_replica(message.header.replica, prepare);
1442
1361
  return;
1443
1362
  }
1444
1363
 
1445
- if (message.header.replica == self.replica) {
1446
- log.warn("{}: on_recovery_response: ignoring (self)", .{self.replica});
1447
- return;
1448
- }
1449
-
1450
- if (message.header.context != self.recovery_nonce) {
1451
- log.warn("{}: on_recovery_response: ignoring (different nonce)", .{self.replica});
1452
- return;
1453
- }
1454
-
1455
- var responses: *QuorumMessages = &self.recovery_response_from_other_replicas;
1456
- if (responses[message.header.replica]) |existing| {
1457
- assert(message.header.replica == existing.header.replica);
1458
-
1459
- if (message.header.checksum == existing.header.checksum) {
1460
- // The response was replayed by the network; ignore it.
1461
- log.debug("{}: on_recovery_response: ignoring (duplicate message)", .{
1364
+ if (self.journal.prepare_inhabited[slot.index]) {
1365
+ const prepare_checksum = self.journal.prepare_checksums[slot.index];
1366
+ // Consult `journal.prepare_checksums` (rather than `journal.headers`):
1367
+ // the former may have the prepare we want — even if journal recovery marked the
1368
+ // slot as faulty and left the in-memory header as reserved.
1369
+ if (checksum == null or checksum.? == prepare_checksum) {
1370
+ log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
1462
1371
  self.replica,
1372
+ op,
1373
+ checksum,
1463
1374
  });
1464
- return;
1465
- }
1466
1375
 
1467
- // We received a second (distinct) response from a replica. Possible causes:
1468
- // * We retried the `recovery` message, because we had not yet received a quorum.
1469
- // * The `recovery` message was duplicated/misdirected by the network, and the
1470
- // receiver's state changed in the mean time.
1471
-
1472
- log.debug(
1473
- "{}: on_recovery_response: replacing response replica={} view={}..{} op={}..{} commit={}..{}",
1474
- .{
1475
- self.replica,
1476
- existing.header.replica,
1477
- existing.header.view,
1478
- message.header.view,
1479
- existing.header.op,
1480
- message.header.op,
1481
- existing.header.commit,
1482
- message.header.commit,
1483
- },
1484
- );
1376
+ // Improve availability by calling `read_prepare_with_op_and_checksum` instead
1377
+ // of `read_prepare` even if `journal.headers` contains the target message.
1378
+ // The latter skips the read when the target prepare is present but dirty (e.g.
1379
+ // it was recovered with decision=fix).
1380
+ // TODO Do not reissue the read if we are already reading in order to send to
1381
+ // this particular destination replica.
1382
+ self.journal.read_prepare_with_op_and_checksum(
1383
+ on_request_prepare_read,
1384
+ op,
1385
+ prepare_checksum,
1386
+ message.header.replica,
1387
+ );
1485
1388
 
1486
- if (message.header.view < existing.header.view or
1487
- (message.header.view == existing.header.view and
1488
- message.header.op < existing.header.op) or
1489
- (message.header.view == existing.header.view and
1490
- message.header.op == existing.header.op and
1491
- message.header.commit < existing.header.commit))
1492
- {
1493
- // The second message is older than the first one (reordered packets).
1494
- log.debug("{}: on_recovery_response: ignoring (older)", .{self.replica});
1389
+ // We have guaranteed the prepare (not safe to nack).
1390
+ // Our copy may or may not be valid, but we will try to read & forward it.
1495
1391
  return;
1496
1392
  }
1393
+ }
1497
1394
 
1498
- // The second message is newer than the first one.
1499
- assert(message.header.view >= existing.header.view);
1500
- // The op number may regress if an uncommitted op was discarded in a higher view.
1501
- assert(message.header.op >= existing.header.op or
1502
- message.header.view > existing.header.view);
1503
- assert(message.header.commit >= existing.header.commit);
1504
-
1505
- self.message_bus.unref(existing);
1506
- responses[message.header.replica] = null;
1507
- } else {
1508
- log.debug(
1509
- "{}: on_recovery_response: replica={} view={} op={} commit={}",
1510
- .{
1511
- self.replica,
1512
- message.header.replica,
1513
- message.header.view,
1514
- message.header.op,
1515
- message.header.commit,
1516
- },
1517
- );
1518
- }
1519
-
1520
- assert(responses[message.header.replica] == null);
1521
- responses[message.header.replica] = message.ref();
1522
-
1523
- // Wait until we have:
1524
- // * at least `f + 1` messages for quorum (not including ourself), and
1525
- // * a response from the primary of the highest discovered view.
1526
- const count = self.count_quorum(responses, .recovery_response, self.recovery_nonce);
1527
- assert(count <= self.replica_count - 1);
1528
-
1529
- const threshold = self.quorum_view_change;
1530
- if (count < threshold) {
1531
- log.debug("{}: on_recovery_response: waiting for quorum ({}/{})", .{
1532
- self.replica,
1533
- count,
1534
- threshold,
1535
- });
1536
- return;
1537
- }
1538
-
1539
- const view = blk: { // The latest known view.
1540
- var view: u32 = 0;
1541
- for (self.recovery_response_from_other_replicas) |received, replica| {
1542
- if (received) |response| {
1543
- assert(replica != self.replica);
1544
- assert(response.header.replica == replica);
1545
- assert(response.header.context == self.recovery_nonce);
1546
-
1547
- view = std.math.max(view, response.header.view);
1548
- }
1549
- }
1550
- break :blk view;
1551
- };
1552
-
1553
- const primary_response = responses[self.primary_index(view)];
1554
- if (primary_response == null) {
1555
- log.debug(
1556
- "{}: on_recovery_response: ignoring (awaiting response from primary of view={})",
1557
- .{
1558
- self.replica,
1559
- view,
1560
- },
1561
- );
1562
- return;
1563
- }
1564
-
1565
- if (primary_response.?.header.view != view) {
1566
- // The primary (according to the view quorum) isn't the primary (according to itself).
1567
- // The `recovery_timeout` will retry shortly with another round.
1568
- log.debug(
1569
- "{}: on_recovery_response: ignoring (primary view={} != quorum view={})",
1570
- .{
1571
- self.replica,
1572
- primary_response.?.header.view,
1573
- view,
1574
- },
1575
- );
1576
- return;
1577
- }
1578
-
1579
- // This recovering→normal status transition occurs exactly once.
1580
- // All further `recovery_response` messages are ignored.
1581
-
1582
- // TODO When the view is recovered from the superblock (instead of via the VSR recovery
1583
- // protocol), if the view number indicates that this replica is a primary, it must
1584
- // transition to status=view_change instead of status=normal.
1585
-
1586
- const primary_headers = message_body_as_headers(primary_response.?);
1587
- assert(primary_headers.len > 0);
1588
-
1589
- const commit = primary_response.?.header.commit;
1590
- {
1591
- const op = op_highest(primary_headers);
1592
- assert(op == primary_response.?.header.op);
1593
-
1594
- self.set_op_and_commit_max(op, commit, "on_recovery_response");
1595
-
1596
- // TODO If the view's primary is >1 WAL ahead of us, these headers could cause
1597
- // problems. We don't want to jump this far ahead to repair, but we still need to
1598
- // use the hash chain to figure out which headers to request. Maybe include our
1599
- // `op_checkpoint` in the recovery (request) message so that the response can give
1600
- // more useful (i.e. older) headers.
1601
- self.replace_headers(primary_headers);
1602
-
1603
- if (self.op < constants.journal_slot_count) {
1604
- if (self.journal.header_with_op(0)) |header| {
1605
- assert(header.command == .prepare);
1606
- assert(header.operation == .root);
1607
- } else {
1608
- // This is the first wrap of the log, and the root prepare is corrupt.
1609
- // Repair the root repair. This is necessary to maintain the invariant that
1610
- // the op=commit_min exists in-memory.
1611
- //
1612
- // op=0 wouldn't have been repaired by replace_headers above, because it is
1613
- // already "checkpointed".
1614
- const header = Header.root_prepare(self.cluster);
1615
- self.journal.set_header_as_dirty(&header);
1616
- log.debug("{}: on_recovery_response: repair root op", .{self.replica});
1617
- }
1618
- }
1619
-
1620
- assert(self.op == op);
1621
- assert(self.journal.header_with_op(self.op) != null);
1622
- }
1623
-
1624
- assert(self.status == .recovering);
1625
- self.transition_to_normal_from_recovering_status(view);
1626
- assert(self.status == .normal);
1627
- assert(self.backup());
1628
-
1629
- log.info("{}: on_recovery_response: recovery done responses={} view={} headers={}..{}" ++
1630
- " commit={} dirty={} faulty={}", .{
1631
- self.replica,
1632
- count,
1633
- view,
1634
- primary_headers[primary_headers.len - 1].op,
1635
- primary_headers[0].op,
1636
- commit,
1637
- self.journal.dirty.count,
1638
- self.journal.faulty.count,
1639
- });
1640
-
1641
- self.state_machine.prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
1642
- // `state_machine.commit_timestamp` is updated as messages are committed.
1643
-
1644
- self.reset_quorum_recovery_response();
1645
- self.commit_journal(commit);
1646
- self.repair();
1647
- }
1648
-
1649
- /// If the requested prepare has been guaranteed by this replica:
1650
- /// * Read the prepare from storage, and forward it to the replica that requested it.
1651
- /// * Otherwise send no reply — it isn't safe to nack.
1652
- /// If the requested prepare has *not* been guaranteed by this replica, then send a nack.
1653
- ///
1654
- /// A prepare is considered "guaranteed" by a replica if that replica has acknowledged it
1655
- /// to the cluster. The cluster sees the replica as an underwriter of a guaranteed
1656
- /// prepare. If a guaranteed prepare is found to by faulty, the replica must repair it
1657
- /// to restore durability.
1658
- fn on_request_prepare(self: *Self, message: *const Message) void {
1659
- if (self.ignore_repair_message(message)) return;
1660
-
1661
- assert(self.replica_count > 1);
1662
- assert(self.status == .normal or self.status == .view_change);
1663
- assert(message.header.view == self.view);
1664
- assert(message.header.replica != self.replica);
1665
-
1666
- const op = message.header.op;
1667
- const slot = self.journal.slot_for_op(op);
1668
- const checksum: ?u128 = switch (message.header.timestamp) {
1669
- 0 => null,
1670
- 1 => message.header.context,
1671
- else => unreachable,
1672
- };
1673
-
1674
- // Only the primary may respond to `request_prepare` messages without a checksum.
1675
- assert(checksum != null or self.primary_index(self.view) == self.replica);
1676
-
1677
- // Try to serve the message directly from the pipeline.
1678
- // This saves us from going to disk. And we don't need to worry that the WAL's copy
1679
- // of an uncommitted prepare is lost/corrupted.
1680
- if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
1681
- log.debug("{}: on_request_prepare: op={} checksum={} reply from pipeline", .{
1682
- self.replica,
1683
- op,
1684
- checksum,
1685
- });
1686
- self.send_message_to_replica(message.header.replica, prepare.message);
1687
- return;
1688
- }
1689
-
1690
- if (self.journal.prepare_inhabited[slot.index]) {
1691
- const prepare_checksum = self.journal.prepare_checksums[slot.index];
1692
- // Consult `journal.prepare_checksums` (rather than `journal.headers`):
1693
- // the former may have the prepare we want — even if journal recovery marked the
1694
- // slot as faulty and left the in-memory header as reserved.
1695
- if (checksum == null or checksum.? == prepare_checksum) {
1696
- log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
1697
- self.replica,
1698
- op,
1699
- checksum,
1700
- });
1701
-
1702
- // Improve availability by calling `read_prepare_with_op_and_checksum` instead
1703
- // of `read_prepare` — even if `journal.headers` contains the target message.
1704
- // The latter skips the read when the target prepare is present but dirty (e.g.
1705
- // it was recovered with decision=fix).
1706
- // TODO Do not reissue the read if we are already reading in order to send to
1707
- // this particular destination replica.
1708
- self.journal.read_prepare_with_op_and_checksum(
1709
- on_request_prepare_read,
1710
- op,
1711
- prepare_checksum,
1712
- message.header.replica,
1713
- );
1714
-
1715
- // We have guaranteed the prepare (not safe to nack).
1716
- // Our copy may or may not be valid, but we will try to read & forward it.
1717
- return;
1718
- }
1719
- }
1720
-
1721
- {
1722
- // We may have guaranteed the prepare but our copy is faulty (not safe to nack).
1723
- if (self.journal.faulty.bit(slot)) return;
1724
- if (self.journal.header_with_op_and_checksum(message.header.op, checksum)) |_| {
1725
- if (self.journal.dirty.bit(slot)) {
1726
- // We know of the prepare but have yet to write it (safe to nack).
1727
- // Continue through below...
1728
- } else {
1729
- // We have guaranteed the prepare and our copy is clean (not safe to nack).
1730
- return;
1731
- }
1732
- }
1395
+ {
1396
+ // We may have guaranteed the prepare but our copy is faulty (not safe to nack).
1397
+ if (self.journal.faulty.bit(slot)) return;
1398
+ if (self.journal.header_with_op_and_checksum(message.header.op, checksum)) |_| {
1399
+ if (self.journal.dirty.bit(slot)) {
1400
+ // We know of the prepare but have yet to write it (safe to nack).
1401
+ // Continue through below...
1402
+ } else {
1403
+ // We have guaranteed the prepare and our copy is clean (not safe to nack).
1404
+ return;
1405
+ }
1406
+ }
1733
1407
  }
1734
1408
 
1735
1409
  // Protocol-Aware Recovery's CTRL protocol only runs during the view change, when the
@@ -1970,8 +1644,9 @@ pub fn ReplicaType(
1970
1644
  assert(self.status == .normal);
1971
1645
  assert(self.primary());
1972
1646
 
1973
- const prepare = self.pipeline.head_ptr().?;
1647
+ const prepare = self.pipeline.queue.prepare_queue.head_ptr().?;
1974
1648
  assert(prepare.message.header.command == .prepare);
1649
+ assert(prepare.message.header.op == self.commit_min + 1);
1975
1650
 
1976
1651
  if (prepare.ok_quorum_received) {
1977
1652
  self.prepare_timeout.reset();
@@ -2017,10 +1692,10 @@ pub fn ReplicaType(
2017
1692
  // We may be slow and waiting for the write to complete.
2018
1693
  //
2019
1694
  // We may even have maxed out our IO depth and been unable to initiate the write,
2020
- // which can happen if `constants.pipeline_max` exceeds `constants.journal_iops_write_max`.
2021
- // This can lead to deadlock for a cluster of one or two (if we do not retry here),
2022
- // since there is no other way for the primary to repair the dirty op because no
2023
- // other replica has it.
1695
+ // which can happen if `constants.pipeline_prepare_queue_max` exceeds
1696
+ // `constants.journal_iops_write_max`. This can lead to deadlock for a cluster of
1697
+ // one or two (if we do not retry here), since there is no other way for the primary
1698
+ // to repair the dirty op because no other replica has it.
2024
1699
  //
2025
1700
  // Retry the write through `on_repair()` which will work out which is which.
2026
1701
  // We do expect that the op would have been run through `on_prepare()` already.
@@ -2107,13 +1782,6 @@ pub fn ReplicaType(
2107
1782
  self.repair();
2108
1783
  }
2109
1784
 
2110
- fn on_recovery_timeout(self: *Self) void {
2111
- assert(self.status == .recovering);
2112
- assert(self.replica_count > 1);
2113
- self.recovery_timeout.reset();
2114
- self.recover();
2115
- }
2116
-
2117
1785
  fn reference_message_and_receive_quorum_exactly_once(
2118
1786
  self: *Self,
2119
1787
  messages: *QuorumMessages,
@@ -2301,7 +1969,7 @@ pub fn ReplicaType(
2301
1969
  assert(message.header.view == self.view);
2302
1970
  assert(message.header.op == self.op);
2303
1971
 
2304
- if (self.replica_count == 1 and self.pipeline.count > 1) {
1972
+ if (self.replica_count == 1 and self.pipeline.queue.prepare_queue.count > 1) {
2305
1973
  // In a cluster-of-one, the prepares must always be written to the WAL sequentially
2306
1974
  // (never concurrently). This ensures that there will be no gaps in the WAL during
2307
1975
  // crash recovery.
@@ -2364,10 +2032,9 @@ pub fn ReplicaType(
2364
2032
  /// A function which calls `commit_journal()` to set `commit_max` must first call
2365
2033
  /// `view_jump()`. Otherwise, we may fork the log.
2366
2034
  fn commit_journal(self: *Self, commit: u64) void {
2367
- // TODO Restrict `view_change` status only to the primary purely as defense-in-depth.
2368
- // Be careful of concurrency when doing this, as successive view changes can happen quickly.
2369
2035
  assert(self.status == .normal or self.status == .view_change or
2370
2036
  (self.status == .recovering and self.replica_count == 1));
2037
+ assert(!(self.status == .normal and self.primary()));
2371
2038
  assert(self.commit_min <= self.commit_max);
2372
2039
  assert(self.commit_min <= self.op);
2373
2040
  assert(self.commit_max <= self.op or self.commit_max > self.op);
@@ -2392,6 +2059,7 @@ pub fn ReplicaType(
2392
2059
  log.debug("{}: commit_journal: already committing...", .{self.replica});
2393
2060
  return;
2394
2061
  }
2062
+ assert(!(self.status == .normal and self.primary()));
2395
2063
 
2396
2064
  // We check the hash chain before we read each op, rather than once upfront, because
2397
2065
  // it's possible for `commit_max` to change while we read asynchronously, after we
@@ -2413,6 +2081,8 @@ pub fn ReplicaType(
2413
2081
  assert(self.committing);
2414
2082
  assert(self.status == .normal or self.status == .view_change or
2415
2083
  (self.status == .recovering and self.replica_count == 1));
2084
+ assert(!(self.status == .normal and self.primary()));
2085
+ assert(self.pipeline == .cache);
2416
2086
  assert(self.commit_min <= self.commit_max);
2417
2087
  assert(self.commit_min <= self.op);
2418
2088
 
@@ -2427,8 +2097,23 @@ pub fn ReplicaType(
2427
2097
  // Even a naive state transfer may fail to correct for this.
2428
2098
  if (self.commit_min < self.commit_max and self.commit_min < self.op) {
2429
2099
  const op = self.commit_min + 1;
2430
- const checksum = self.journal.header_with_op(op).?.checksum;
2431
- self.journal.read_prepare(commit_journal_next_callback, op, checksum, null);
2100
+ const header = self.journal.header_with_op(op).?;
2101
+
2102
+ if (self.pipeline.cache.prepare_by_op_and_checksum(op, header.checksum)) |prepare| {
2103
+ log.debug("{}: commit_journal_next: cached prepare op={} checksum={}", .{
2104
+ self.replica,
2105
+ op,
2106
+ header.checksum,
2107
+ });
2108
+ self.commit_journal_next_callback(prepare, null);
2109
+ } else {
2110
+ self.journal.read_prepare(
2111
+ commit_journal_next_callback,
2112
+ op,
2113
+ header.checksum,
2114
+ null,
2115
+ );
2116
+ }
2432
2117
  } else {
2433
2118
  self.commit_ops_done();
2434
2119
  // This is an optimization to expedite the view change before the `repair_timeout`:
@@ -2438,7 +2123,7 @@ pub fn ReplicaType(
2438
2123
  assert(self.replica_count == 1);
2439
2124
  assert(self.commit_min == self.commit_max);
2440
2125
  assert(self.commit_min == self.op);
2441
- self.transition_to_normal_from_recovering_status(0);
2126
+ self.transition_to_normal_from_recovering_status();
2442
2127
  } else {
2443
2128
  // We expect that a cluster-of-one only calls commit_journal() in recovering status.
2444
2129
  assert(self.replica_count > 1);
@@ -2457,14 +2142,6 @@ pub fn ReplicaType(
2457
2142
  return;
2458
2143
  }
2459
2144
 
2460
- const slot = self.journal.slot_with_op_and_checksum(
2461
- prepare.?.header.op,
2462
- prepare.?.header.checksum,
2463
- ).?;
2464
- assert(self.journal.prepare_inhabited[slot.index]);
2465
- assert(self.journal.prepare_checksums[slot.index] == prepare.?.header.checksum);
2466
- assert(self.journal.has(prepare.?.header));
2467
-
2468
2145
  switch (self.status) {
2469
2146
  .normal => {},
2470
2147
  .view_change => {
@@ -2484,6 +2161,7 @@ pub fn ReplicaType(
2484
2161
  assert(self.replica_count == 1);
2485
2162
  assert(self.primary_index(self.view) == self.replica);
2486
2163
  },
2164
+ .recovering_head => unreachable,
2487
2165
  }
2488
2166
 
2489
2167
  const op = self.commit_min + 1;
@@ -2497,7 +2175,15 @@ pub fn ReplicaType(
2497
2175
  assert(self.commit_min <= self.commit_max);
2498
2176
  assert(self.commit_min <= self.op);
2499
2177
 
2500
- self.commit_journal_next();
2178
+ if (self.status == .normal and self.primary()) {
2179
+ if (self.pipeline.queue.prepare_queue.empty()) {
2180
+ self.commit_ops_done();
2181
+ } else {
2182
+ self.commit_pipeline_next();
2183
+ }
2184
+ } else {
2185
+ self.commit_journal_next();
2186
+ }
2501
2187
  }
2502
2188
 
2503
2189
  /// Begin the commit path that is common between `commit_pipeline` and `commit_journal`:
@@ -2551,8 +2237,14 @@ pub fn ReplicaType(
2551
2237
  assert(self.commit_min <= self.commit_max);
2552
2238
 
2553
2239
  if (self.status == .normal and self.primary()) {
2554
- const prepare = self.pipeline.pop().?;
2240
+ const prepare = self.pipeline.queue.pop_prepare().?;
2241
+ if (self.pipeline.queue.pop_request()) |request| {
2242
+ // Start preparing the next request in the queue (if any).
2243
+ self.primary_pipeline_prepare(request);
2244
+ }
2245
+
2555
2246
  assert(self.commit_min == self.commit_max);
2247
+ assert(prepare.message.header.command == .prepare);
2556
2248
  assert(prepare.message.header.checksum == self.commit_prepare.?.header.checksum);
2557
2249
  assert(prepare.message.header.op == self.commit_min);
2558
2250
  assert(prepare.message.header.op == self.commit_max);
@@ -2560,7 +2252,7 @@ pub fn ReplicaType(
2560
2252
 
2561
2253
  self.message_bus.unref(prepare.message);
2562
2254
 
2563
- if (self.pipeline.head_ptr()) |next| {
2255
+ if (self.pipeline.queue.prepare_queue.head_ptr()) |next| {
2564
2256
  assert(next.message.header.op == self.commit_min + 1);
2565
2257
  assert(next.message.header.op == self.commit_prepare.?.header.op + 1);
2566
2258
 
@@ -2588,8 +2280,8 @@ pub fn ReplicaType(
2588
2280
  const self = @fieldParentPtr(Self, "state_machine", state_machine);
2589
2281
  assert(self.committing);
2590
2282
  assert(self.commit_callback != null);
2591
- assert(self.op_checkpoint == self.superblock.staging.vsr_state.commit_min);
2592
- assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
2283
+ assert(self.op_checkpoint() == self.superblock.staging.vsr_state.commit_min);
2284
+ assert(self.op_checkpoint() == self.superblock.working.vsr_state.commit_min);
2593
2285
 
2594
2286
  const op = self.commit_prepare.?.header.op;
2595
2287
  assert(op == self.commit_min);
@@ -2604,7 +2296,7 @@ pub fn ReplicaType(
2604
2296
  "(op={} current_checkpoint={} next_checkpoint={})", .{
2605
2297
  self.replica,
2606
2298
  self.op,
2607
- self.op_checkpoint,
2299
+ self.op_checkpoint(),
2608
2300
  self.op_checkpoint_next(),
2609
2301
  });
2610
2302
  tracer.start(
@@ -2638,19 +2330,15 @@ pub fn ReplicaType(
2638
2330
  // Therefore, only ops "A..D" are committed to disk.
2639
2331
  // Thus, the SuperBlock's `commit_min` is set to 7-2=5.
2640
2332
  const vsr_state_commit_min = self.op_checkpoint_next();
2641
- const vsr_state_new = .{
2642
- .commit_min_checksum = self.journal.header_with_op(vsr_state_commit_min).?.checksum,
2643
- .commit_min = vsr_state_commit_min,
2644
- .commit_max = self.commit_max,
2645
- .view_normal = self.view_normal,
2646
- .view = self.view,
2647
- };
2648
- assert(self.superblock.working.vsr_state.monotonic(vsr_state_new));
2649
2333
 
2650
2334
  self.superblock.checkpoint(
2651
2335
  commit_op_checkpoint_superblock_callback,
2652
2336
  &self.superblock_context,
2653
- vsr_state_new,
2337
+ .{
2338
+ .commit_min_checksum = self.journal.header_with_op(vsr_state_commit_min).?.checksum,
2339
+ .commit_min = vsr_state_commit_min,
2340
+ .commit_max = self.commit_max,
2341
+ },
2654
2342
  );
2655
2343
  }
2656
2344
 
@@ -2661,15 +2349,14 @@ pub fn ReplicaType(
2661
2349
  assert(self.commit_prepare.?.header.op == self.op);
2662
2350
  assert(self.commit_prepare.?.header.op == self.commit_min);
2663
2351
 
2664
- self.op_checkpoint = self.op_checkpoint_next();
2665
- assert(self.op_checkpoint == self.commit_min - constants.lsm_batch_multiple);
2666
- assert(self.op_checkpoint == self.superblock.staging.vsr_state.commit_min);
2667
- assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
2352
+ assert(self.op_checkpoint() == self.commit_min - constants.lsm_batch_multiple);
2353
+ assert(self.op_checkpoint() == self.superblock.staging.vsr_state.commit_min);
2354
+ assert(self.op_checkpoint() == self.superblock.working.vsr_state.commit_min);
2668
2355
 
2669
2356
  log.debug("{}: commit_op_compact_callback: checkpoint done (op={} new_checkpoint={})", .{
2670
2357
  self.replica,
2671
2358
  self.op,
2672
- self.op_checkpoint,
2359
+ self.op_checkpoint(),
2673
2360
  });
2674
2361
  tracer.end(
2675
2362
  &self.tracer_slot_checkpoint,
@@ -2720,7 +2407,7 @@ pub fn ReplicaType(
2720
2407
  // this commit.
2721
2408
 
2722
2409
  assert(self.journal.has(prepare.header));
2723
- if (self.op_checkpoint == self.commit_min) {
2410
+ if (self.op_checkpoint() == self.commit_min) {
2724
2411
  // op_checkpoint's slot may have been overwritten in the WAL — but we can
2725
2412
  // always use the VSRState to anchor the hash chain.
2726
2413
  assert(prepare.header.parent ==
@@ -2752,6 +2439,7 @@ pub fn ReplicaType(
2752
2439
  const reply_body_size = @intCast(u32, self.state_machine.commit(
2753
2440
  prepare.header.client,
2754
2441
  prepare.header.op,
2442
+ prepare.header.timestamp,
2755
2443
  prepare.header.operation.cast(StateMachine),
2756
2444
  prepare.buffer[@sizeOf(Header)..prepare.header.size],
2757
2445
  reply.buffer[@sizeOf(Header)..],
@@ -2788,7 +2476,7 @@ pub fn ReplicaType(
2788
2476
  if (self.superblock.working.vsr_state.op_compacted(prepare.header.op)) {
2789
2477
  // We are recovering from a checkpoint. Prior to the crash, the client table was
2790
2478
  // updated with entries for one bar beyond the op_checkpoint.
2791
- assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
2479
+ assert(self.op_checkpoint() == self.superblock.working.vsr_state.commit_min);
2792
2480
  if (self.client_table().get(prepare.header.client)) |entry| {
2793
2481
  assert(entry.reply.header.command == .reply);
2794
2482
  assert(entry.reply.header.op >= prepare.header.op);
@@ -2799,7 +2487,7 @@ pub fn ReplicaType(
2799
2487
  log.debug("{}: commit_op: skip client table update: prepare.op={} checkpoint={}", .{
2800
2488
  self.replica,
2801
2489
  prepare.header.op,
2802
- self.op_checkpoint,
2490
+ self.op_checkpoint(),
2803
2491
  });
2804
2492
  } else {
2805
2493
  if (reply.header.operation == .register) {
@@ -2821,7 +2509,7 @@ pub fn ReplicaType(
2821
2509
  fn commit_pipeline(self: *Self) void {
2822
2510
  assert(self.status == .normal);
2823
2511
  assert(self.primary());
2824
- assert(self.pipeline.count > 0);
2512
+ assert(self.pipeline.queue.prepare_queue.count > 0);
2825
2513
 
2826
2514
  // Guard against multiple concurrent invocations of commit_journal()/commit_pipeline():
2827
2515
  if (self.committing) {
@@ -2838,10 +2526,10 @@ pub fn ReplicaType(
2838
2526
  assert(self.status == .normal);
2839
2527
  assert(self.primary());
2840
2528
 
2841
- if (self.pipeline.head_ptr()) |prepare| {
2529
+ if (self.pipeline.queue.prepare_queue.head_ptr()) |prepare| {
2842
2530
  assert(self.commit_min == self.commit_max);
2843
2531
  assert(self.commit_min + 1 == prepare.message.header.op);
2844
- assert(self.commit_min + self.pipeline.count == self.op);
2532
+ assert(self.commit_min + self.pipeline.queue.prepare_queue.count == self.op);
2845
2533
  assert(self.journal.has(prepare.message.header));
2846
2534
 
2847
2535
  if (!prepare.ok_quorum_received) {
@@ -2867,9 +2555,6 @@ pub fn ReplicaType(
2867
2555
  assert(self.commit_min <= self.op);
2868
2556
 
2869
2557
  if (self.status == .normal and self.primary()) {
2870
- if (self.pipeline.head_ptr()) |pipeline_head| {
2871
- assert(pipeline_head.message.header.op == self.commit_min + 1);
2872
- }
2873
2558
  self.commit_pipeline_next();
2874
2559
  } else {
2875
2560
  self.commit_ops_done();
@@ -2890,10 +2575,7 @@ pub fn ReplicaType(
2890
2575
  ) usize {
2891
2576
  assert(op_max >= op_min);
2892
2577
  assert(count_max == null or count_max.? > 0);
2893
- assert(message.header.command == .do_view_change or
2894
- message.header.command == .start_view or
2895
- message.header.command == .headers or
2896
- message.header.command == .recovery_response);
2578
+ assert(message.header.command == .headers);
2897
2579
 
2898
2580
  const body_size_max = @sizeOf(Header) * std.math.min(
2899
2581
  @divExact(message.buffer.len - @sizeOf(Header), @sizeOf(Header)),
@@ -2934,7 +2616,6 @@ pub fn ReplicaType(
2934
2616
  assert(m.header.replica == replica);
2935
2617
  switch (command) {
2936
2618
  .do_view_change => assert(m.header.view == self.view),
2937
- .recovery_response => assert(m.header.replica != self.replica),
2938
2619
  else => unreachable,
2939
2620
  }
2940
2621
  count += 1;
@@ -3021,17 +2702,30 @@ pub fn ReplicaType(
3021
2702
  assert(self.client_table().count() <= constants.clients_max);
3022
2703
  }
3023
2704
 
2705
+ /// Construct a SV/DVC message, including attached headers from the current log_view.
2706
+ ///
3024
2707
  /// The caller owns the returned message, if any, which has exactly 1 reference.
3025
2708
  fn create_view_change_message(self: *Self, command: Command) *Message {
3026
- assert(command == .do_view_change or command == .start_view);
3027
-
3028
2709
  // We may send a start_view message in normal status to resolve a backup's view jump:
3029
2710
  assert(self.status == .normal or self.status == .view_change);
2711
+ assert((self.status == .normal) == (command == .start_view));
2712
+ assert((self.status == .view_change) == (command == .do_view_change));
2713
+ assert(self.view >= self.log_view);
2714
+ assert(self.view >= self.view_durable());
2715
+ assert(self.log_view >= self.log_view_durable());
2716
+
2717
+ assert(command != .do_view_change or self.log_view < self.view);
2718
+ assert(command != .start_view or self.log_view == self.view);
3030
2719
 
3031
2720
  const message = self.message_bus.get_message();
3032
2721
  defer self.message_bus.unref(message);
3033
2722
 
2723
+ const headers = self.create_view_change_headers();
2724
+ assert(headers.len > 0);
2725
+ assert(headers.get(0).op == self.op);
2726
+
3034
2727
  message.header.* = .{
2728
+ .size = @intCast(u32, @sizeOf(Header) * (1 + headers.len)),
3035
2729
  .command = command,
3036
2730
  .cluster = self.cluster,
3037
2731
  .replica = self.replica,
@@ -3040,33 +2734,167 @@ pub fn ReplicaType(
3040
2734
  // number contained in the prepare headers we include in the body. The former shows
3041
2735
  // how recent a view change the replica participated in, which may be much higher.
3042
2736
  // We use the `timestamp` field to send this in addition to the current view number:
3043
- .timestamp = if (command == .do_view_change) self.view_normal else 0,
2737
+ .timestamp = if (command == .do_view_change) self.log_view else 0,
3044
2738
  .op = self.op,
3045
2739
  // See the comment in `on_do_view_change()` for why `commit_min` is crucial:
3046
2740
  .commit = if (command == .do_view_change) self.commit_min else self.commit_max,
3047
2741
  };
3048
2742
 
3049
- const count = self.copy_latest_headers_and_set_size(
3050
- 0,
3051
- self.op,
3052
- view_change_headers_count,
3053
- message,
2743
+ stdx.copy_disjoint(
2744
+ .exact,
2745
+ Header,
2746
+ std.mem.bytesAsSlice(Header, message.body()),
2747
+ headers.constSlice(),
3054
2748
  );
3055
- assert(count > 0); // We expect that self.op always exists.
3056
- assert(@divExact(message.header.size, @sizeOf(Header)) == 1 + count);
3057
-
3058
2749
  message.header.set_checksum_body(message.body());
3059
2750
  message.header.set_checksum();
3060
2751
 
3061
2752
  return message.ref();
3062
2753
  }
3063
2754
 
2755
+ fn create_view_change_headers(self: *const Self) vsr.ViewChangeHeaders.BoundedArray {
2756
+ assert(self.status == .normal or self.status == .view_change);
2757
+ assert(self.view >= self.log_view);
2758
+ assert(self.view >= self.view_durable());
2759
+ assert(self.log_view >= self.log_view_durable());
2760
+
2761
+ var headers = vsr.ViewChangeHeaders.BoundedArray{ .buffer = undefined };
2762
+
2763
+ // Always include the head message.
2764
+ headers.appendAssumeCapacity(self.journal.header_with_op(self.op).?.*);
2765
+
2766
+ if (self.view == self.log_view) {
2767
+ // Construct SV message headers. (On the backup, these are only stored in the
2768
+ // superblock).
2769
+ if (self.primary_index(self.view) == self.replica and self.status == .normal) {
2770
+ assert(self.op >= self.commit_max);
2771
+
2772
+ // The primary starting a new view has a pristine log suffix.
2773
+ //
2774
+ // +1 because commit_min may have been overwritten (and not repaired) if it
2775
+ // falls on a checkpoint boundary.
2776
+ var op = self.op;
2777
+ while (op > self.commit_min + 1) : (op -= 1) {
2778
+ const header_next = self.journal.header_with_op(op).?;
2779
+ const header_prev = self.journal.header_with_op(op - 1).?;
2780
+ assert(header_prev.checksum == header_next.parent);
2781
+
2782
+ headers.append(header_prev.*) catch break;
2783
+ }
2784
+ } else {
2785
+ // Either:
2786
+ // - The primary started a new view but has not finished repair.
2787
+ // - The backup joining a new view has a pristine log suffix — it just
2788
+ // loaded a SV.
2789
+ //
2790
+ // In each case we send as much of a suffix as is available (fallthrough).
2791
+ }
2792
+ } else {
2793
+ // Construct DVC message headers.
2794
+ assert(self.view > self.log_view);
2795
+
2796
+ if (self.log_view_durable() == self.log_view) {
2797
+ const headers_durable = self.superblock.working.vsr_headers().slice;
2798
+ assert(headers_durable[0].op <= self.op);
2799
+
2800
+ if (self.log_view_durable() < self.view_durable()) {
2801
+ // Ensure that if we started a DVC before a crash, that we will resume
2802
+ // sending the exact same DVC after recovery.
2803
+ // (An alternative implementation would be to load the superblock's DVC
2804
+ // headers (including gaps) into the journal during open(), but that is more
2805
+ // complicated to implement correctly).
2806
+ assert(headers_durable[0].op == self.op);
2807
+ assert(headers_durable[0].checksum == headers.get(0).checksum);
2808
+
2809
+ for (headers_durable[1..]) |*header| headers.appendAssumeCapacity(header.*);
2810
+ } else {
2811
+ // Durable SV anchor. See Example 4.
2812
+ assert(self.log_view_durable() == self.view_durable());
2813
+
2814
+ var op = self.op;
2815
+ while (op > headers_durable[headers_durable.len - 1].op) : (op -= 1) {
2816
+ const header_prev = self.journal.header_with_op(op - 1) orelse continue;
2817
+ const header_next = self.journal.header_with_op(op);
2818
+ assert(header_next == null or header_prev.checksum == header_next.?.parent);
2819
+
2820
+ headers.append(header_prev.*) catch break;
2821
+ }
2822
+ }
2823
+ return headers;
2824
+ }
2825
+
2826
+ // The DVC anchor: Within the log suffix following the anchor, we have additional
2827
+ // guarantees about the state of the log headers which allow us to tolerate certain
2828
+ // gaps (by locally guaranteeing that the gap does not hide a break).
2829
+ // See Example 2/3 for more detail.
2830
+ const op_dvc_anchor = std.math.max(
2831
+ self.commit_min,
2832
+ // +1: We can have a full pipeline, but not yet have performed any repair.
2833
+ // In such a case, we want to send those pipeline_prepare_queue_max headers in
2834
+ // the DVC, but not the preceding op (which may belong to a different chain).
2835
+ // This satisfies the DVC invariant because the first op in the pipeline is
2836
+ // "connected" to the canonical chain (via its "parent" checksum).
2837
+ //
2838
+ // For example, as a follower, we might have received pipeline_prepare_queue_max
2839
+ // headers in the SV message, but not done any repair before the next view
2840
+ // change.
2841
+ 1 + self.op -| constants.pipeline_prepare_queue_max,
2842
+ );
2843
+
2844
+ if (self.primary_index(self.log_view) == self.replica) {
2845
+ // Retired primary: see Example 2a.
2846
+ var op = self.op;
2847
+ while (op > op_dvc_anchor) : (op -= 1) {
2848
+ const header_next = self.journal.header_with_op(op).?;
2849
+ // Exclude gaps since we cannot distinguish the gap from a break.
2850
+ const header_prev = self.journal.header_with_op(op - 1) orelse break;
2851
+ if (header_prev.checksum != header_next.parent) break;
2852
+
2853
+ headers.append(header_prev.*) catch break;
2854
+ }
2855
+ } else {
2856
+ // Retired backup: see Example 2b.
2857
+ var op = self.op;
2858
+ while (op > self.commit_min) : (op -= 1) {
2859
+ const header_prev = self.journal.header_with_op(op - 1) orelse continue;
2860
+ const header_next = self.journal.header_with_op(op);
2861
+ assert(header_next == null or header_prev.checksum == header_next.?.parent);
2862
+
2863
+ headers.append(header_prev.*) catch break;
2864
+
2865
+ // Stop once we connect to the anchor.
2866
+ if (header_prev.op <= op_dvc_anchor + 1) break;
2867
+ } else {
2868
+ assert(self.commit_min == self.op);
2869
+ }
2870
+ }
2871
+ }
2872
+
2873
+ // Include as many extra headers as possible, but with no additional gaps (since they
2874
+ // cannot be differentiated from breaks).
2875
+ // - This reduces the number of headers that the new primary will need to repair.
2876
+ // - More importantly, this ensures that a replica which re-sends its DVC does not
2877
+ // alter the DVC's headers, even if the replica finished a commit (updating
2878
+ // commit_min, possibly modifying the suffix anchor) in the mean time.
2879
+ // (This is not required for correctness, but enables additional verification
2880
+ // in on_do_view_change().)
2881
+ var op = headers.get(headers.len - 1).op;
2882
+ while (op > 0 and headers.len < constants.view_change_headers_max) : (op -= 1) {
2883
+ const header_next = self.journal.header_with_op(op).?;
2884
+ const header_prev = self.journal.header_with_op(op - 1) orelse break;
2885
+ if (header_prev.checksum != header_next.parent) break;
2886
+
2887
+ headers.appendAssumeCapacity(header_prev.*);
2888
+ }
2889
+
2890
+ vsr.ViewChangeHeaders.verify(headers.constSlice());
2891
+ return headers;
2892
+ }
2893
+
3064
2894
  /// The caller owns the returned message, if any, which has exactly 1 reference.
3065
2895
  fn create_message_from_header(self: *Self, header: Header) *Message {
3066
2896
  assert(header.replica == self.replica);
3067
- assert(header.view == self.view or
3068
- header.command == .request_start_view or
3069
- header.command == .recovery);
2897
+ assert(header.view == self.view or header.command == .request_start_view);
3070
2898
  assert(header.size == @sizeOf(Header));
3071
2899
 
3072
2900
  const message = self.message_bus.pool.get_message();
@@ -3079,67 +2907,6 @@ pub fn ReplicaType(
3079
2907
  return message.ref();
3080
2908
  }
3081
2909
 
3082
- /// Returns the op of the highest canonical message, according to this replica (the new
3083
- /// primary) prior to loading the current view change's DVC quorum headers.
3084
- /// When this replica participated in the last `view_normal`, this is just `replica.op`.
3085
- ///
3086
- /// - A *canonical* message was part of the last view_normal.
3087
- /// - An *uncanonical* message may have been removed/changed by a prior view.
3088
- /// - Canonical messages do not necessarily survive into the new view, but they take
3089
- /// precedence over uncanonical messages.
3090
- /// - Canonical messages may be committed or uncommitted.
3091
- ///
3092
- /// Consider these logs:
3093
- ///
3094
- /// replica 0: 4, 5, 6b, 7b, 8b (commit_min=6b, primary, status=normal, view=X)
3095
- /// replica 1: 4, 5, 6b, --, -- (commit_min=5, backup, status=normal, view=X)
3096
- /// replica 2: 4, 5, 6a, --, 8b (view<X)
3097
- ///
3098
- /// 1. Replica 0 crashes immediately after committing 6b.
3099
- /// 2. Replicas 1 and 2 must determine the new chain HEAD.
3100
- /// 3. 8b is discarded due to the gap in 7.
3101
- /// 4. To distinguish between 6a and 6b (and safely discard 6a), the new primary trusts ops
3102
- /// from the DVC(s) with the greatest `view_normal`.
3103
- fn primary_op_canonical_max(self: *const Self, view_normal_canonical: u64) usize {
3104
- assert(self.replica_count > 1);
3105
- assert(self.status == .view_change);
3106
- assert(self.primary_index(self.view) == self.replica);
3107
- assert(self.do_view_change_quorum);
3108
- assert(!self.repair_timeout.ticking);
3109
- assert(self.journal.header_with_op(self.op) != null);
3110
- assert(self.view_normal <= view_normal_canonical);
3111
-
3112
- if (self.view_normal == view_normal_canonical) return self.op;
3113
-
3114
- const uncanonical_op_count = std.math.min(
3115
- // Do not reset any ops that we have already committed.
3116
- self.op - self.commit_min,
3117
- // The number of uncommitted ops cannot be more than the length of the pipeline.
3118
- // Do not reset any ops that we did not include in our do_view_change message.
3119
- constants.pipeline_max,
3120
- );
3121
-
3122
- assert(uncanonical_op_count <= constants.pipeline_max);
3123
- if (uncanonical_op_count == 0) return self.op;
3124
-
3125
- // * When uncanonical_op_count = self.op - self.commit_min,
3126
- // self.op - uncanonical_op_count = self.commit_min.
3127
- // * When uncanonical_op_count = constants.pipeline_max,
3128
- // constants.pipeline_max < self.op - self.commit_min holds.
3129
- const canonical_op_max = self.op - uncanonical_op_count;
3130
-
3131
- log.debug("{}: on_do_view_change: not canonical ops={}..{}", .{
3132
- self.replica,
3133
- canonical_op_max + 1,
3134
- self.op,
3135
- });
3136
-
3137
- assert(canonical_op_max <= self.op);
3138
- assert(canonical_op_max >= self.commit_min);
3139
- assert(canonical_op_max + constants.pipeline_max >= self.op);
3140
- return canonical_op_max;
3141
- }
3142
-
3143
2910
  /// Discards uncommitted ops during a view change from after and including `op`.
3144
2911
  /// This is required to maximize availability in the presence of storage faults.
3145
2912
  /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
@@ -3192,8 +2959,8 @@ pub fn ReplicaType(
3192
2959
  // asynchronous prepare_ok to itself.
3193
2960
  // 3. In on_start_view_change(), after receiving a quorum of start_view_change
3194
2961
  // messages, the new primary sends a synchronous do_view_change to itself.
3195
- // 4. In start_view_as_the_new_primary(), the new primary sends itself a prepare_ok
3196
- // message for each uncommitted message.
2962
+ // 4. In primary_start_view_as_the_new_primary(), the new primary sends itself a
2963
+ // prepare_ok message for each uncommitted message.
3197
2964
  if (self.loopback_queue) |message| {
3198
2965
  defer self.message_bus.unref(message);
3199
2966
 
@@ -3278,7 +3045,8 @@ pub fn ReplicaType(
3278
3045
  log.warn("{}: on_{s}: ignoring (no context)", .{ self.replica, command });
3279
3046
  return true;
3280
3047
  },
3281
- else => {},
3048
+ .headers, .request_headers => {},
3049
+ else => unreachable,
3282
3050
  }
3283
3051
  }
3284
3052
 
@@ -3344,13 +3112,14 @@ pub fn ReplicaType(
3344
3112
  if (self.ignore_request_message_duplicate(message)) return true;
3345
3113
  if (self.ignore_request_message_preparing(message)) return true;
3346
3114
 
3347
- // Verify that the new request will fit in the WAL.
3348
- // The message's op hasn't been assigned yet, but it will be `self.op + 1`.
3349
- if (self.op == self.op_checkpoint_trigger()) {
3115
+ // Don't accept more requests than will fit in the current checkpoint.
3116
+ // (The request's op hasn't been assigned yet, but it will be `self.op + 1`
3117
+ // when primary_pipeline_next() converts the request to a prepare.)
3118
+ if (self.op + self.pipeline.queue.request_queue.count == self.op_checkpoint_trigger()) {
3350
3119
  log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint_trigger={})", .{
3351
3120
  self.replica,
3352
3121
  self.op + 1,
3353
- self.op_checkpoint,
3122
+ self.op_checkpoint(),
3354
3123
  });
3355
3124
  return true;
3356
3125
  }
@@ -3419,7 +3188,7 @@ pub fn ReplicaType(
3419
3188
  } else if (message.header.operation == .register) {
3420
3189
  log.debug("{}: on_request: new session", .{self.replica});
3421
3190
  return false;
3422
- } else if (self.pipeline_prepare_for_client(message.header.client)) |_| {
3191
+ } else if (self.pipeline.queue.message_by_client(message.header.client)) |_| {
3423
3192
  // The client registered with the previous primary, which committed and replied back
3424
3193
  // to the client before the view change, after which the register operation was
3425
3194
  // reloaded into the pipeline to be driven to completion by the new primary, which
@@ -3491,21 +3260,31 @@ pub fn ReplicaType(
3491
3260
  assert(message.header.client > 0);
3492
3261
  assert(message.header.view <= self.view); // See ignore_request_message_backup().
3493
3262
 
3494
- if (self.pipeline_prepare_for_client(message.header.client)) |prepare| {
3495
- assert(prepare.message.header.command == .prepare);
3496
- assert(prepare.message.header.client == message.header.client);
3497
- assert(prepare.message.header.op > self.commit_max);
3263
+ if (self.pipeline.queue.message_by_client(message.header.client)) |pipeline_message| {
3264
+ assert(pipeline_message.header.client == message.header.client);
3265
+ assert(pipeline_message.header.command == .request or
3266
+ pipeline_message.header.command == .prepare);
3498
3267
 
3499
- if (message.header.checksum == prepare.message.header.context) {
3500
- log.debug("{}: on_request: ignoring (already preparing)", .{self.replica});
3268
+ if (pipeline_message.header.command == .request and
3269
+ pipeline_message.header.checksum == message.header.checksum)
3270
+ {
3271
+ log.debug("{}: on_request: ignoring (already queued)", .{self.replica});
3501
3272
  return true;
3502
- } else {
3503
- log.err("{}: on_request: ignoring (client forked)", .{self.replica});
3273
+ }
3274
+
3275
+ if (pipeline_message.header.command == .prepare and
3276
+ pipeline_message.header.context == message.header.checksum)
3277
+ {
3278
+ assert(pipeline_message.header.op > self.commit_max);
3279
+ log.debug("{}: on_request: ignoring (already preparing)", .{self.replica});
3504
3280
  return true;
3505
3281
  }
3282
+
3283
+ log.err("{}: on_request: ignoring (client forked)", .{self.replica});
3284
+ return true;
3506
3285
  }
3507
3286
 
3508
- if (self.pipeline.full()) {
3287
+ if (self.pipeline.queue.full()) {
3509
3288
  log.debug("{}: on_request: ignoring (pipeline full)", .{self.replica});
3510
3289
  return true;
3511
3290
  }
@@ -3521,7 +3300,10 @@ pub fn ReplicaType(
3521
3300
 
3522
3301
  const command: []const u8 = @tagName(message.header.command);
3523
3302
 
3524
- // 4.3 Recovery
3303
+ if (self.status == .recovering_head and message.header.command != .start_view) {
3304
+ return true;
3305
+ }
3306
+
3525
3307
  // While a replica's status is recovering it does not participate in either the request
3526
3308
  // processing protocol or the view change protocol.
3527
3309
  // This is critical for correctness (to avoid data loss):
@@ -3614,28 +3396,7 @@ pub fn ReplicaType(
3614
3396
  assert(self.journal.header_with_op(self.op) == null);
3615
3397
  }
3616
3398
 
3617
- fn message_body_as_headers(message: *const Message) []const Header {
3618
- assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
3619
- assert(message.header.command == .do_view_change or
3620
- message.header.command == .start_view or
3621
- message.header.command == .headers or
3622
- message.header.command == .recovery_response);
3623
-
3624
- const headers = std.mem.bytesAsSlice(
3625
- Header,
3626
- message.buffer[@sizeOf(Header)..message.header.size],
3627
- );
3628
-
3629
- for (headers[0 .. headers.len - 1]) |header, index| {
3630
- // Headers must be provided in reverse order for the sake of `repair_header()`.
3631
- // Otherwise, headers may never be repaired where the hash chain never connects.
3632
- assert(header.op > headers[index + 1].op);
3633
- }
3634
-
3635
- return headers;
3636
- }
3637
-
3638
- /// Returns whether the highest known op is certain.
3399
+ /// Returns whether the head op is certain.
3639
3400
  ///
3640
3401
  /// After recovering the WAL, there are 2 possible outcomes:
3641
3402
  /// * All entries valid. The highest op is certain, and safe to set as `replica.op`.
@@ -3663,41 +3424,34 @@ pub fn ReplicaType(
3663
3424
  /// * ` ✓ ✗ o `: View change is safe.
3664
3425
  /// * ` ✓ = o `: View change is unsafe if any slots are faulty.
3665
3426
  /// (`replica.op_checkpoint` == `replica.op`).
3666
- // TODO Use this function once we switch from recovery protocol to the superblock.
3667
- // If there is an "unsafe" fault, we will need to request a start_view from the primary to
3668
- // learn the op.
3669
- fn op_certain(self: *const Self) bool {
3427
+ fn op_head_certain(self: *const Self) bool {
3670
3428
  assert(self.status == .recovering);
3671
- assert(self.op_checkpoint <= self.op);
3429
+ assert(self.op_checkpoint() <= self.op);
3672
3430
 
3673
- const slot_op_checkpoint = self.journal.slot_for_op(self.op_checkpoint).index;
3674
- const slot_op = self.journal.slot_with_op(self.op).?.index;
3431
+ const slot_op_checkpoint = self.journal.slot_for_op(self.op_checkpoint());
3432
+ const slot_op_head = self.journal.slot_with_op(self.op).?;
3675
3433
  const slot_known_range = vsr.SlotRange{
3676
3434
  .head = slot_op_checkpoint,
3677
- .tail = slot_op,
3435
+ .tail = slot_op_head,
3678
3436
  };
3679
3437
 
3680
3438
  var iterator = self.journal.faulty.bits.iterator(.{ .kind = .set });
3681
3439
  while (iterator.next()) |slot| {
3682
- // The command is `reserved` when the entry was found faulty during WAL recovery.
3683
- // Faults found after WAL recovery are not relevant, because we know their op.
3684
- if (self.journal.headers[slot.index].command == .reserved) {
3685
- if (slot_op_checkpoint == slot_op or
3686
- !slot_known_range.contains(slot))
3687
- {
3688
- log.warn("{}: op_certain: op not known (faulty_slot={}, op={}, op_checkpoint={})", .{
3689
- self.replica,
3690
- slot.index,
3691
- self.op,
3692
- self.op_checkpoint,
3693
- });
3694
- return false;
3695
- }
3440
+ if (slot_op_checkpoint.index == slot_op_head.index or
3441
+ !slot_known_range.contains(.{ .index = slot }))
3442
+ {
3443
+ return false;
3696
3444
  }
3697
3445
  }
3698
3446
  return true;
3699
3447
  }
3700
3448
 
3449
+ /// The op of the highest checkpointed message.
3450
+ // TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
3451
+ pub fn op_checkpoint(self: *const Self) u64 {
3452
+ return self.superblock.working.vsr_state.commit_min;
3453
+ }
3454
+
3701
3455
  /// Returns the op that will be `op_checkpoint` after the next checkpoint.
3702
3456
  ///
3703
3457
  /// For a replica with journal_slot_count=8 and lsm_batch_multiple=2:
@@ -3722,21 +3476,21 @@ pub fn ReplicaType(
3722
3476
  /// % op_checkpoint_trigger
3723
3477
  ///
3724
3478
  fn op_checkpoint_next(self: *const Self) u64 {
3725
- assert(self.op_checkpoint <= self.commit_min);
3726
- assert(self.op_checkpoint <= self.op);
3727
- assert(self.op_checkpoint == 0 or
3728
- (self.op_checkpoint + 1) % constants.lsm_batch_multiple == 0);
3479
+ assert(self.op_checkpoint() <= self.commit_min);
3480
+ assert(self.op_checkpoint() <= self.op);
3481
+ assert(self.op_checkpoint() == 0 or
3482
+ (self.op_checkpoint() + 1) % constants.lsm_batch_multiple == 0);
3729
3483
 
3730
- const op = if (self.op_checkpoint == 0)
3484
+ const op = if (self.op_checkpoint() == 0)
3731
3485
  // First wrap: op_checkpoint_next = 8-2-1 = 5
3732
3486
  constants.journal_slot_count - constants.lsm_batch_multiple - 1
3733
3487
  else
3734
3488
  // Second wrap: op_checkpoint_next = 5+8-2 = 11
3735
3489
  // Third wrap: op_checkpoint_next = 11+8-2 = 17
3736
- self.op_checkpoint + constants.journal_slot_count - constants.lsm_batch_multiple;
3490
+ self.op_checkpoint() + constants.journal_slot_count - constants.lsm_batch_multiple;
3737
3491
  assert((op + 1) % constants.lsm_batch_multiple == 0);
3738
3492
  // The checkpoint always advances.
3739
- assert(op > self.op_checkpoint);
3493
+ assert(op > self.op_checkpoint());
3740
3494
 
3741
3495
  return op;
3742
3496
  }
@@ -3790,110 +3544,94 @@ pub fn ReplicaType(
3790
3544
  }
3791
3545
  }
3792
3546
 
3793
- /// Searches the pipeline for a prepare for a given op and checksum.
3794
- /// When `checksum` is `null`, match any checksum.
3795
- fn pipeline_prepare_for_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Prepare {
3796
- assert(self.status == .normal or self.status == .view_change);
3797
-
3798
- // To optimize the search, we can leverage the fact that the pipeline is ordered and
3799
- // continuous.
3800
- if (self.pipeline.count == 0) return null;
3801
- const head_op = self.pipeline.head_ptr().?.message.header.op;
3802
- const tail_op = self.pipeline.tail_ptr().?.message.header.op;
3803
- if (op < head_op) return null;
3804
- if (op > tail_op) return null;
3805
-
3806
- const pipeline_prepare = self.pipeline.get_ptr(op - head_op).?;
3807
- assert(pipeline_prepare.message.header.op == op);
3808
-
3809
- if (checksum == null or pipeline_prepare.message.header.checksum == checksum.?) {
3810
- return pipeline_prepare;
3811
- } else {
3812
- return null;
3813
- }
3814
- }
3815
-
3816
- /// Searches the pipeline for a prepare for a given client.
3817
- fn pipeline_prepare_for_client(self: *Self, client: u128) ?*Prepare {
3547
+ fn primary_pipeline_prepare(self: *Self, request: Request) void {
3818
3548
  assert(self.status == .normal);
3819
3549
  assert(self.primary());
3820
3550
  assert(self.commit_min == self.commit_max);
3551
+ assert(self.commit_max + self.pipeline.queue.prepare_queue.count == self.op);
3552
+ assert(!self.pipeline.queue.prepare_queue.full());
3553
+ self.pipeline.queue.verify();
3821
3554
 
3822
- var op = self.commit_max + 1;
3823
- var parent = self.journal.header_with_op(self.commit_max).?.checksum;
3824
- var iterator = self.pipeline.iterator_mutable();
3825
- while (iterator.next_ptr()) |prepare| {
3826
- assert(prepare.message.header.command == .prepare);
3827
- assert(prepare.message.header.op == op);
3828
- assert(prepare.message.header.parent == parent);
3829
-
3830
- // A client may have multiple requests in the pipeline if these were committed by
3831
- // the previous primary and were reloaded into the pipeline after a view change.
3832
- if (prepare.message.header.client == client) return prepare;
3555
+ const message = request.message;
3556
+ assert(!self.ignore_request_message(message));
3833
3557
 
3834
- parent = prepare.message.header.checksum;
3835
- op += 1;
3836
- }
3558
+ log.debug("{}: primary_pipeline_next: request checksum={} client={}", .{
3559
+ self.replica,
3560
+ message.header.checksum,
3561
+ message.header.client,
3562
+ });
3837
3563
 
3838
- assert(self.pipeline.count <= constants.pipeline_max);
3839
- assert(self.commit_max + self.pipeline.count == op - 1);
3840
- assert(self.commit_max + self.pipeline.count == self.op);
3564
+ // Guard against the wall clock going backwards by taking the max with timestamps issued:
3565
+ self.state_machine.prepare_timestamp = std.math.max(
3566
+ // The cluster `commit_timestamp` may be ahead of our `prepare_timestamp` because this
3567
+ // may be our first prepare as a recently elected primary:
3568
+ std.math.max(
3569
+ self.state_machine.prepare_timestamp,
3570
+ self.state_machine.commit_timestamp,
3571
+ ) + 1,
3572
+ @intCast(u64, request.realtime),
3573
+ );
3574
+ assert(self.state_machine.prepare_timestamp > self.state_machine.commit_timestamp);
3841
3575
 
3842
- return null;
3843
- }
3576
+ const prepare_timestamp = self.state_machine.prepare(
3577
+ message.header.operation.cast(StateMachine),
3578
+ message.body(),
3579
+ );
3844
3580
 
3845
- /// Searches the pipeline for a prepare for a given client and checksum.
3846
- /// Passing the prepare_ok message prevents these u128s from being accidentally swapped.
3847
- /// Asserts that the returned prepare, if any, exactly matches the prepare_ok.
3848
- fn pipeline_prepare_for_prepare_ok(self: *Self, ok: *const Message) ?*Prepare {
3849
- assert(ok.header.command == .prepare_ok);
3581
+ const latest_entry = self.journal.header_with_op(self.op).?;
3582
+ message.header.parent = latest_entry.checksum;
3583
+ message.header.context = message.header.checksum;
3584
+ message.header.view = self.view;
3585
+ message.header.op = self.op + 1;
3586
+ message.header.commit = self.commit_max;
3587
+ message.header.timestamp = prepare_timestamp;
3588
+ message.header.replica = self.replica;
3589
+ message.header.command = .prepare;
3850
3590
 
3851
- assert(self.status == .normal);
3852
- assert(self.primary());
3591
+ message.header.set_checksum_body(message.body());
3592
+ message.header.set_checksum();
3853
3593
 
3854
- const prepare = self.pipeline_prepare_for_client(ok.header.client) orelse {
3855
- log.debug("{}: pipeline_prepare_for_prepare_ok: not preparing", .{self.replica});
3856
- return null;
3857
- };
3594
+ log.debug("{}: primary_pipeline_next: prepare {}", .{ self.replica, message.header.checksum });
3858
3595
 
3859
- if (ok.header.context != prepare.message.header.checksum) {
3860
- // This can be normal, for example, if an old prepare_ok is replayed.
3861
- log.debug("{}: pipeline_prepare_for_prepare_ok: preparing a different client op", .{
3862
- self.replica,
3863
- });
3864
- return null;
3596
+ if (self.pipeline.queue.prepare_queue.tail_ptr()) |previous| {
3597
+ // Do not restart the prepare timeout as it is already ticking for another prepare.
3598
+ assert(self.prepare_timeout.ticking);
3599
+ assert(previous.message.header.checksum == message.header.parent);
3600
+ } else {
3601
+ // We are about to add the first prepare to the pipeline, so start the timeout.
3602
+ assert(!self.prepare_timeout.ticking);
3603
+ self.prepare_timeout.start();
3865
3604
  }
3605
+ self.pipeline.queue.push_prepare(message);
3606
+ self.on_prepare(message);
3866
3607
 
3867
- assert(prepare.message.header.parent == ok.header.parent);
3868
- assert(prepare.message.header.client == ok.header.client);
3869
- assert(prepare.message.header.request == ok.header.request);
3870
- assert(prepare.message.header.cluster == ok.header.cluster);
3871
- assert(prepare.message.header.epoch == ok.header.epoch);
3872
- // A prepare may be committed in the same view or in a newer view:
3873
- assert(prepare.message.header.view <= ok.header.view);
3874
- assert(prepare.message.header.op == ok.header.op);
3875
- assert(prepare.message.header.commit == ok.header.commit);
3876
- assert(prepare.message.header.timestamp == ok.header.timestamp);
3877
- assert(prepare.message.header.operation == ok.header.operation);
3878
-
3879
- return prepare;
3608
+ // We expect `on_prepare()` to increment `self.op` to match the primary's latest prepare:
3609
+ // This is critical to ensure that pipelined prepares do not receive the same op number.
3610
+ assert(self.op == message.header.op);
3880
3611
  }
3881
3612
 
3882
- fn recover(self: *Self) void {
3883
- assert(self.status == .recovering);
3884
- assert(self.replica_count > 1);
3885
-
3886
- log.debug("{}: recover: sending recovery messages nonce={}", .{
3887
- self.replica,
3888
- self.recovery_nonce,
3889
- });
3613
+ fn pipeline_prepare_by_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Message {
3614
+ assert(self.status == .normal or self.status == .view_change);
3615
+ assert(self.replica == self.primary_index(self.view) or checksum != null);
3890
3616
 
3891
- self.send_header_to_other_replicas(.{
3892
- .command = .recovery,
3893
- .cluster = self.cluster,
3894
- .context = self.recovery_nonce,
3895
- .replica = self.replica,
3896
- });
3617
+ if (checksum == null) {
3618
+ // The PipelineCache may hold messages that have been discarded, so we must be
3619
+ // careful not to access it unless we can verify the entry's checksum.
3620
+ //
3621
+ // Only on_request_prepare() queries the pipeline with checksum=null.
3622
+ // And primaries ignore request_prepare messages during their view change
3623
+ // (during which time the pipeline is not yet repaired, and so is untrusted).
3624
+ assert(self.primary());
3625
+ assert(self.pipeline == .queue);
3626
+ }
3627
+
3628
+ return switch (self.pipeline) {
3629
+ .cache => |*cache| cache.prepare_by_op_and_checksum(op, checksum.?),
3630
+ .queue => |*queue| if (queue.prepare_by_op_and_checksum(op, checksum)) |prepare|
3631
+ prepare.message
3632
+ else
3633
+ null,
3634
+ };
3897
3635
  }
3898
3636
 
3899
3637
  /// Starting from the latest journal entry, backfill any missing or disconnected headers.
@@ -3911,8 +3649,8 @@ pub fn ReplicaType(
3911
3649
  assert(self.status == .normal or self.status == .view_change);
3912
3650
  assert(self.repairs_allowed());
3913
3651
 
3914
- assert(self.op_checkpoint <= self.op);
3915
- assert(self.op_checkpoint <= self.commit_min);
3652
+ assert(self.op_checkpoint() <= self.op);
3653
+ assert(self.op_checkpoint() <= self.commit_min);
3916
3654
  assert(self.commit_min <= self.op);
3917
3655
  assert(self.commit_min <= self.commit_max);
3918
3656
  assert(self.journal.header_with_op(self.op) != null);
@@ -3954,36 +3692,43 @@ pub fn ReplicaType(
3954
3692
  }
3955
3693
 
3956
3694
  // Request any missing or disconnected headers:
3957
- // TODO Snapshots: Ensure that self.commit_min op always exists in the journal.
3958
- var broken = self.journal.find_latest_headers_break_between(self.commit_min, self.op);
3959
- if (broken) |range| {
3960
- log.debug("{}: repair: break: view={} op_min={} op_max={} (commit={}..{} op={})", .{
3961
- self.replica,
3962
- self.view,
3963
- range.op_min,
3964
- range.op_max,
3965
- self.commit_min,
3966
- self.commit_max,
3695
+ if (self.commit_min != self.op) {
3696
+ var broken = self.journal.find_latest_headers_break_between(
3697
+ self.commit_min + 1,
3967
3698
  self.op,
3968
- });
3969
- assert(range.op_min > self.commit_min);
3970
- assert(range.op_max < self.op);
3971
- // A range of `op_min=0` or `op_max=0` should be impossible as a header break:
3972
- // This is the root op that is prepared when the cluster is initialized.
3973
- assert(range.op_min > 0);
3974
- assert(range.op_max > 0);
3975
-
3976
- if (self.choose_any_other_replica()) |replica| {
3977
- self.send_header_to_replica(replica, .{
3978
- .command = .request_headers,
3979
- .cluster = self.cluster,
3980
- .replica = self.replica,
3981
- .view = self.view,
3982
- .commit = range.op_min,
3983
- .op = range.op_max,
3984
- });
3699
+ );
3700
+ if (broken) |range| {
3701
+ log.debug(
3702
+ "{}: repair: break: view={} op_min={} op_max={} (commit={}..{} op={})",
3703
+ .{
3704
+ self.replica,
3705
+ self.view,
3706
+ range.op_min,
3707
+ range.op_max,
3708
+ self.commit_min,
3709
+ self.commit_max,
3710
+ self.op,
3711
+ },
3712
+ );
3713
+ assert(range.op_min > self.commit_min);
3714
+ assert(range.op_max < self.op);
3715
+ // A range of `op_min=0` or `op_max=0` should be impossible as a header break:
3716
+ // This is the root op that is prepared when the cluster is initialized.
3717
+ assert(range.op_min > 0);
3718
+ assert(range.op_max > 0);
3719
+
3720
+ if (self.choose_any_other_replica()) |replica| {
3721
+ self.send_header_to_replica(replica, .{
3722
+ .command = .request_headers,
3723
+ .cluster = self.cluster,
3724
+ .replica = self.replica,
3725
+ .view = self.view,
3726
+ .commit = range.op_min,
3727
+ .op = range.op_max,
3728
+ });
3729
+ }
3730
+ return;
3985
3731
  }
3986
- return;
3987
3732
  }
3988
3733
 
3989
3734
  // Assert that all headers are now present and connected with a perfect hash chain:
@@ -4003,9 +3748,12 @@ pub fn ReplicaType(
4003
3748
  }
4004
3749
 
4005
3750
  if (self.status == .view_change and self.primary_index(self.view) == self.replica) {
4006
- if (self.primary_repair_pipeline_op() != null) return self.primary_repair_pipeline();
4007
- // Start the view as the new primary:
4008
- self.start_view_as_the_new_primary();
3751
+ // Repair the pipeline, which may discover faulty prepares and drive more repairs.
3752
+ switch (self.primary_repair_pipeline()) {
3753
+ // primary_repair_pipeline() is already working.
3754
+ .busy => {},
3755
+ .done => self.primary_start_view_as_the_new_primary(),
3756
+ }
4009
3757
  }
4010
3758
  }
4011
3759
 
@@ -4073,8 +3821,8 @@ pub fn ReplicaType(
4073
3821
  return false;
4074
3822
  }
4075
3823
 
4076
- if (header.op <= self.op_checkpoint) {
4077
- if (header.op == 0 and self.op_checkpoint == 0) {
3824
+ if (header.op <= self.op_checkpoint()) {
3825
+ if (header.op == 0 and self.op_checkpoint() == 0) {
4078
3826
  // Repairing the root op is allowed until the first checkpoint.
4079
3827
  } else {
4080
3828
  // It is critical that we do not repair checkpointed ops; their slots now belong
@@ -4082,7 +3830,7 @@ pub fn ReplicaType(
4082
3830
  // correctness violation.
4083
3831
  log.debug("{}: repair_header: false (precedes self.op_checkpoint={})", .{
4084
3832
  self.replica,
4085
- self.op_checkpoint,
3833
+ self.op_checkpoint(),
4086
3834
  });
4087
3835
  return false;
4088
3836
  }
@@ -4200,175 +3948,173 @@ pub fn ReplicaType(
4200
3948
  }
4201
3949
 
4202
3950
  /// Reads prepares into the pipeline (before we start the view as the new primary).
4203
- fn primary_repair_pipeline(self: *Self) void {
3951
+ fn primary_repair_pipeline(self: *Self) enum { done, busy } {
4204
3952
  assert(self.status == .view_change);
4205
3953
  assert(self.primary_index(self.view) == self.replica);
4206
- assert(self.commit_max < self.op);
3954
+ assert(self.commit_max == self.commit_min);
3955
+ assert(self.commit_max <= self.op);
4207
3956
  assert(self.journal.dirty.count == 0);
3957
+ assert(self.pipeline == .cache);
4208
3958
 
4209
- if (self.repairing_pipeline) {
3959
+ if (self.pipeline_repairing) {
4210
3960
  log.debug("{}: primary_repair_pipeline: already repairing...", .{self.replica});
4211
- return;
3961
+ return .busy;
4212
3962
  }
4213
3963
 
4214
- log.debug("{}: primary_repair_pipeline: repairing", .{self.replica});
4215
-
4216
- assert(!self.repairing_pipeline);
4217
- self.repairing_pipeline = true;
3964
+ if (self.primary_repair_pipeline_op()) |_| {
3965
+ log.debug("{}: primary_repair_pipeline: repairing", .{self.replica});
3966
+ assert(!self.pipeline_repairing);
3967
+ self.pipeline_repairing = true;
3968
+ self.primary_repair_pipeline_read();
3969
+ return .busy;
3970
+ }
4218
3971
 
4219
- self.primary_repair_pipeline_read();
3972
+ // All prepares needed to reconstruct the pipeline queue are now available in the cache.
3973
+ return .done;
4220
3974
  }
4221
3975
 
4222
- /// Discard messages from the prepare pipeline.
4223
- /// Retain uncommitted messages that belong in the current view to maximize durability.
4224
- fn primary_repair_pipeline_diff(self: *Self) void {
3976
+ fn primary_repair_pipeline_done(self: *Self) PipelineQueue {
4225
3977
  assert(self.status == .view_change);
4226
3978
  assert(self.primary_index(self.view) == self.replica);
3979
+ assert(self.commit_max == self.commit_min);
3980
+ assert(self.commit_max <= self.op);
3981
+ assert(self.journal.dirty.count == 0);
3982
+ assert(self.valid_hash_chain_between(self.commit_min, self.op));
3983
+ assert(self.pipeline == .cache);
3984
+ assert(!self.pipeline_repairing);
3985
+ assert(self.primary_repair_pipeline() == .done);
3986
+ assert(self.commit_max + constants.pipeline_prepare_queue_max >= self.op);
4227
3987
 
4228
- // Discard messages from the front of the pipeline that committed since we were primary.
4229
- while (self.pipeline.head_ptr()) |prepare| {
4230
- if (prepare.message.header.op > self.commit_max) break;
4231
-
4232
- self.message_bus.unref(self.pipeline.pop().?.message);
4233
- }
4234
-
4235
- // Discard the whole pipeline if it is now disconnected from the WAL's hash chain.
4236
- if (self.pipeline.head_ptr()) |pipeline_head| {
4237
- const parent = self.journal.header_with_op_and_checksum(
4238
- pipeline_head.message.header.op - 1,
4239
- pipeline_head.message.header.parent,
4240
- );
4241
- if (parent == null) {
4242
- while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
4243
- assert(self.pipeline.count == 0);
4244
- }
4245
- }
3988
+ var pipeline_queue = PipelineQueue{};
3989
+ var op = self.commit_max + 1;
3990
+ var parent = self.journal.header_with_op(self.commit_max).?.checksum;
3991
+ while (op <= self.op) : (op += 1) {
3992
+ const journal_header = self.journal.header_with_op(op).?;
3993
+ assert(journal_header.op == op);
3994
+ assert(journal_header.parent == parent);
4246
3995
 
4247
- // Discard messages from the back of the pipeline that are not part of this view.
4248
- while (self.pipeline.tail_ptr()) |prepare| {
4249
- if (self.journal.has(prepare.message.header)) break;
3996
+ const prepare =
3997
+ self.pipeline.cache.prepare_by_op_and_checksum(op, journal_header.checksum).?;
3998
+ assert(prepare.header.op == op);
3999
+ assert(prepare.header.op <= self.op);
4000
+ assert(prepare.header.checksum == journal_header.checksum);
4001
+ assert(prepare.header.parent == parent);
4002
+ assert(self.journal.has(prepare.header));
4250
4003
 
4251
- self.message_bus.unref(self.pipeline.pop_tail().?.message);
4004
+ pipeline_queue.push_prepare(prepare.ref());
4005
+ parent = prepare.header.checksum;
4252
4006
  }
4007
+ assert(self.commit_max + pipeline_queue.prepare_queue.count == self.op);
4253
4008
 
4254
- log.debug("{}: primary_repair_pipeline_diff: {} prepare(s)", .{
4255
- self.replica,
4256
- self.pipeline.count,
4257
- });
4258
-
4259
- self.verify_pipeline();
4260
-
4261
- // Do not reset `repairing_pipeline` here as this must be reset by the read callback.
4262
- // Otherwise, we would be making `primary_repair_pipeline()` reentrant.
4009
+ pipeline_queue.verify();
4010
+ return pipeline_queue;
4263
4011
  }
4264
4012
 
4265
4013
  /// Returns the next `op` number that needs to be read into the pipeline.
4266
- fn primary_repair_pipeline_op(self: *Self) ?u64 {
4014
+ /// Returns null when all necessary prepares are in the pipeline cache.
4015
+ fn primary_repair_pipeline_op(self: *const Self) ?u64 {
4267
4016
  assert(self.status == .view_change);
4268
4017
  assert(self.primary_index(self.view) == self.replica);
4018
+ assert(self.commit_max == self.commit_min);
4019
+ assert(self.commit_max <= self.op);
4020
+ assert(self.pipeline == .cache);
4269
4021
 
4270
- // We cannot rely on `pipeline.count` below unless the pipeline has first been diffed.
4271
- self.primary_repair_pipeline_diff();
4272
-
4273
- const op = self.commit_max + self.pipeline.count + 1;
4274
- if (op <= self.op) return op;
4275
-
4276
- assert(self.commit_max + self.pipeline.count == self.op);
4022
+ var op = self.commit_max + 1;
4023
+ while (op <= self.op) : (op += 1) {
4024
+ const op_header = self.journal.header_with_op(op).?;
4025
+ if (!self.pipeline.cache.contains_header(op_header)) {
4026
+ return op;
4027
+ }
4028
+ }
4277
4029
  return null;
4278
4030
  }
4279
4031
 
4280
4032
  fn primary_repair_pipeline_read(self: *Self) void {
4281
- assert(self.repairing_pipeline);
4282
4033
  assert(self.status == .view_change);
4283
4034
  assert(self.primary_index(self.view) == self.replica);
4035
+ assert(self.commit_max == self.commit_min);
4036
+ assert(self.commit_max <= self.op);
4037
+ assert(self.pipeline == .cache);
4038
+ assert(self.pipeline_repairing);
4284
4039
 
4285
- if (self.primary_repair_pipeline_op()) |op| {
4286
- assert(op > self.commit_max);
4287
- assert(op <= self.op);
4288
- assert(self.commit_max + self.pipeline.count + 1 == op);
4289
-
4290
- const checksum = self.journal.header_with_op(op).?.checksum;
4291
-
4292
- log.debug("{}: primary_repair_pipeline_read: op={} checksum={}", .{
4293
- self.replica,
4294
- op,
4295
- checksum,
4296
- });
4297
-
4298
- self.journal.read_prepare(repair_pipeline_push, op, checksum, null);
4299
- } else {
4300
- log.debug("{}: primary_repair_pipeline_read: repaired", .{self.replica});
4301
- self.repairing_pipeline = false;
4302
- self.repair();
4303
- }
4040
+ const op = self.primary_repair_pipeline_op().?;
4041
+ const op_checksum = self.journal.header_with_op(op).?.checksum;
4042
+ log.debug("{}: primary_repair_pipeline_read: op={} checksum={}", .{
4043
+ self.replica,
4044
+ op,
4045
+ op_checksum,
4046
+ });
4047
+ self.journal.read_prepare(repair_pipeline_read_callback, op, op_checksum, null);
4304
4048
  }
4305
4049
 
4306
- fn repair_pipeline_push(
4050
+ fn repair_pipeline_read_callback(
4307
4051
  self: *Self,
4308
4052
  prepare: ?*Message,
4309
4053
  destination_replica: ?u8,
4310
4054
  ) void {
4311
4055
  assert(destination_replica == null);
4312
4056
 
4313
- assert(self.repairing_pipeline);
4314
- self.repairing_pipeline = false;
4057
+ assert(self.pipeline_repairing);
4058
+ self.pipeline_repairing = false;
4315
4059
 
4316
4060
  if (prepare == null) {
4317
- log.debug("{}: repair_pipeline_push: prepare == null", .{self.replica});
4061
+ log.debug("{}: repair_pipeline_read_callback: prepare == null", .{self.replica});
4318
4062
  return;
4319
4063
  }
4320
4064
 
4321
4065
  // Our state may have advanced significantly while we were reading from disk.
4322
4066
  if (self.status != .view_change) {
4323
- log.debug("{}: repair_pipeline_push: no longer in view change status", .{
4067
+ assert(self.primary_index(self.view) != self.replica);
4068
+
4069
+ log.debug("{}: repair_pipeline_read_callback: no longer in view change status", .{
4324
4070
  self.replica,
4325
4071
  });
4326
4072
  return;
4327
4073
  }
4328
4074
 
4329
4075
  if (self.primary_index(self.view) != self.replica) {
4330
- log.debug("{}: repair_pipeline_push: no longer primary", .{self.replica});
4076
+ log.debug("{}: repair_pipeline_read_callback: no longer primary", .{self.replica});
4331
4077
  return;
4332
4078
  }
4333
4079
 
4334
4080
  // We may even be several views ahead and may now have a completely different pipeline.
4335
4081
  const op = self.primary_repair_pipeline_op() orelse {
4336
- log.debug("{}: repair_pipeline_push: pipeline changed", .{self.replica});
4082
+ log.debug("{}: repair_pipeline_read_callback: pipeline changed", .{self.replica});
4337
4083
  return;
4338
4084
  };
4339
4085
 
4340
4086
  assert(op > self.commit_max);
4341
4087
  assert(op <= self.op);
4342
- assert(self.commit_max + self.pipeline.count + 1 == op);
4343
4088
 
4344
4089
  if (prepare.?.header.op != op) {
4345
- log.debug("{}: repair_pipeline_push: op changed", .{self.replica});
4090
+ log.debug("{}: repair_pipeline_read_callback: op changed", .{self.replica});
4346
4091
  return;
4347
4092
  }
4348
4093
 
4349
4094
  if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
4350
- log.debug("{}: repair_pipeline_push: checksum changed", .{self.replica});
4095
+ log.debug("{}: repair_pipeline_read_callback: checksum changed", .{self.replica});
4351
4096
  return;
4352
4097
  }
4353
4098
 
4354
4099
  assert(self.status == .view_change);
4355
4100
  assert(self.primary_index(self.view) == self.replica);
4356
4101
 
4357
- log.debug("{}: repair_pipeline_push: op={} checksum={}", .{
4102
+ log.debug("{}: repair_pipeline_read_callback: op={} checksum={}", .{
4358
4103
  self.replica,
4359
4104
  prepare.?.header.op,
4360
4105
  prepare.?.header.checksum,
4361
4106
  });
4362
4107
 
4363
- if (self.pipeline.tail_ptr()) |parent| {
4364
- assert(prepare.?.header.parent == parent.message.header.checksum);
4365
- }
4366
-
4367
- self.pipeline.push_assume_capacity(.{ .message = prepare.?.ref() });
4368
- assert(self.pipeline.count >= 1);
4108
+ const prepare_evicted = self.pipeline.cache.insert(prepare.?.ref());
4109
+ if (prepare_evicted) |message_evicted| self.message_bus.unref(message_evicted);
4369
4110
 
4370
- self.repairing_pipeline = true;
4371
- self.primary_repair_pipeline_read();
4111
+ if (self.primary_repair_pipeline_op()) |_| {
4112
+ assert(!self.pipeline_repairing);
4113
+ self.pipeline_repairing = true;
4114
+ self.primary_repair_pipeline_read();
4115
+ } else {
4116
+ self.repair();
4117
+ }
4372
4118
  }
4373
4119
 
4374
4120
  fn repair_prepares(self: *Self) void {
@@ -4376,7 +4122,7 @@ pub fn ReplicaType(
4376
4122
  assert(self.repairs_allowed());
4377
4123
  assert(self.journal.dirty.count > 0);
4378
4124
  assert(self.op >= self.commit_min);
4379
- assert(self.op - self.commit_min + 1 <= constants.journal_slot_count);
4125
+ assert(self.op - self.commit_min <= constants.journal_slot_count);
4380
4126
 
4381
4127
  // Request enough prepares to utilize our max IO depth:
4382
4128
  var budget = self.journal.writes.available();
@@ -4434,7 +4180,7 @@ pub fn ReplicaType(
4434
4180
  // belong) to a newer op, from the new WAL wrap. Additionally, we may not
4435
4181
  // still have access to its surrounding commits to verify the hash chain.
4436
4182
  assert(op <= self.commit_min);
4437
- assert(op <= self.op_checkpoint);
4183
+ assert(op <= self.op_checkpoint());
4438
4184
  assert(self.journal.faulty.bit(slot));
4439
4185
 
4440
4186
  log.debug("{}: repair_prepares: remove slot={} " ++
@@ -4516,9 +4262,9 @@ pub fn ReplicaType(
4516
4262
  //
4517
4263
  // Using the pipeline to repair is faster than a `request_prepare`.
4518
4264
  // Also, messages in the pipeline are never corrupt.
4519
- if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
4520
- assert(prepare.message.header.op == op);
4521
- assert(prepare.message.header.checksum == checksum);
4265
+ if (self.pipeline_prepare_by_op_and_checksum(op, checksum)) |prepare| {
4266
+ assert(prepare.header.op == op);
4267
+ assert(prepare.header.checksum == checksum);
4522
4268
 
4523
4269
  if (self.replica_count == 1) {
4524
4270
  // This op won't start writing until all ops in the pipeline preceding it have
@@ -4528,7 +4274,8 @@ pub fn ReplicaType(
4528
4274
  op,
4529
4275
  checksum,
4530
4276
  });
4531
- assert(op > self.pipeline.head_ptr().?.message.header.op);
4277
+ const pipeline_head = self.pipeline.queue.prepare_queue.head_ptr().?;
4278
+ assert(pipeline_head.message.header.op < op);
4532
4279
  return false;
4533
4280
  }
4534
4281
 
@@ -4537,7 +4284,7 @@ pub fn ReplicaType(
4537
4284
  op,
4538
4285
  checksum,
4539
4286
  });
4540
- self.write_prepare(prepare.message, .pipeline);
4287
+ self.write_prepare(prepare, .pipeline);
4541
4288
  return true;
4542
4289
  }
4543
4290
 
@@ -4638,29 +4385,10 @@ pub fn ReplicaType(
4638
4385
  }
4639
4386
  }
4640
4387
 
4641
- /// The caller must ensure that the headers are trustworthy.
4642
- ///
4643
- /// Asserts that sequential ops are hash-chained. (Gaps are permitted).
4644
- fn replace_headers(self: *Self, headers: []const Header) void {
4645
- for (headers) |*header, i| {
4646
- if (i > 0) {
4647
- const next = &headers[i - 1];
4648
- assert(next.view >= header.view);
4649
- if (next.op == header.op + 1) {
4650
- assert(next.parent == header.checksum);
4651
- } else {
4652
- assert(next.op > header.op);
4653
- }
4654
- }
4655
-
4656
- self.replace_header(header);
4657
- }
4658
- }
4659
-
4660
4388
  /// Replaces the header if the header is different and not already committed.
4661
4389
  /// The caller must ensure that the header is trustworthy.
4662
4390
  fn replace_header(self: *Self, header: *const Header) void {
4663
- assert(self.op_checkpoint <= self.commit_min);
4391
+ assert(self.op_checkpoint() <= self.commit_min);
4664
4392
  assert(header.command == .prepare);
4665
4393
  assert(header.op <= self.op); // Never advance the op.
4666
4394
  assert(header.op <= self.op_checkpoint_trigger());
@@ -4670,7 +4398,7 @@ pub fn ReplicaType(
4670
4398
  assert(existing_header.checksum == header.checksum);
4671
4399
  return;
4672
4400
  } else {
4673
- if (header.op <= self.op_checkpoint) {
4401
+ if (header.op <= self.op_checkpoint()) {
4674
4402
  // Never replace a checkpointed op — those slots are needed by the following
4675
4403
  // WAL wrap.
4676
4404
  return;
@@ -4769,35 +4497,11 @@ pub fn ReplicaType(
4769
4497
  self.nack_prepare_op = null;
4770
4498
  }
4771
4499
 
4772
- fn reset_quorum_prepare_ok(self: *Self) void {
4773
- // "prepare_ok"s from previous views are not valid, even if the pipeline entry is reused
4774
- // after a cycle of view changes. In other words, when a view change cycles around, so
4775
- // that the original primary becomes a primary of a new view, pipeline entries may be
4776
- // reused. However, the pipeline's prepare_ok quorums must not be reused, since the
4777
- // replicas that sent them may have swapped them out during a previous view change.
4778
- var iterator = self.pipeline.iterator_mutable();
4779
- while (iterator.next_ptr()) |prepare| {
4780
- prepare.ok_quorum_received = false;
4781
- prepare.ok_from_all_replicas = quorum_counter_null;
4782
- assert(prepare.ok_from_all_replicas.count() == 0);
4783
- }
4784
- }
4785
-
4786
4500
  fn reset_quorum_start_view_change(self: *Self) void {
4787
4501
  self.reset_quorum_counter(&self.start_view_change_from_other_replicas);
4788
4502
  self.start_view_change_quorum = false;
4789
4503
  }
4790
4504
 
4791
- fn reset_quorum_recovery_response(self: *Self) void {
4792
- for (self.recovery_response_from_other_replicas) |*received, replica| {
4793
- if (received.*) |message| {
4794
- assert(replica != self.replica);
4795
- self.message_bus.unref(message);
4796
- received.* = null;
4797
- }
4798
- }
4799
- }
4800
-
4801
4505
  fn send_prepare_ok(self: *Self, header: *const Header) void {
4802
4506
  assert(header.command == .prepare);
4803
4507
  assert(header.cluster == self.cluster);
@@ -4920,6 +4624,7 @@ pub fn ReplicaType(
4920
4624
  // operations after the highest `commit_min` may yet have been committed before the old
4921
4625
  // primary crashed. The new primary will use the NACK protocol to be sure of a discard.
4922
4626
  assert(message.header.commit == self.commit_min);
4627
+ DVCQuorum.verify_message(message);
4923
4628
 
4924
4629
  self.send_message_to_replica(self.primary_index(self.view), message);
4925
4630
  }
@@ -5051,18 +4756,6 @@ pub fn ReplicaType(
5051
4756
  },
5052
4757
  else => unreachable,
5053
4758
  },
5054
- .recovery => {
5055
- assert(self.status == .recovering);
5056
- assert(message.header.replica == self.replica);
5057
- assert(message.header.replica != replica);
5058
- assert(message.header.context == self.recovery_nonce);
5059
- },
5060
- .recovery_response => {
5061
- assert(self.status == .normal);
5062
- assert(message.header.view == self.view);
5063
- assert(message.header.replica == self.replica);
5064
- assert(message.header.replica != replica);
5065
- },
5066
4759
  .headers => {
5067
4760
  assert(self.status == .normal or self.status == .view_change);
5068
4761
  assert(message.header.view == self.view);
@@ -5111,6 +4804,42 @@ pub fn ReplicaType(
5111
4804
  },
5112
4805
  }
5113
4806
 
4807
+ if (replica != self.replica) {
4808
+ // Critical: Do not advertise a view/log_view before it is durable.
4809
+ // See view_durable()/log_view_durable().
4810
+ if (message.header.view > self.view_durable() and
4811
+ message.header.command != .request_start_view)
4812
+ {
4813
+ log.debug("{}: send_message_to_replica: dropped {s} " ++
4814
+ "(view_durable={} message.view={})", .{
4815
+ self.replica,
4816
+ @tagName(message.header.command),
4817
+ self.view_durable(),
4818
+ message.header.view,
4819
+ });
4820
+ return;
4821
+ }
4822
+
4823
+ if (message.header.command == .do_view_change) {
4824
+ const message_log_view = message.header.timestamp;
4825
+ if (self.log_view_durable() < message_log_view) {
4826
+ log.debug("{}: send_message_to_replica: dropped {s} " ++
4827
+ "(log_view_durable={} message.log_view={})", .{
4828
+ self.replica,
4829
+ @tagName(message.header.command),
4830
+ self.log_view_durable(),
4831
+ message_log_view,
4832
+ });
4833
+ return;
4834
+ }
4835
+ assert(std.mem.eql(
4836
+ u8,
4837
+ message.body(),
4838
+ std.mem.sliceAsBytes(self.superblock.working.vsr_headers().slice),
4839
+ ));
4840
+ }
4841
+ }
4842
+
5114
4843
  if (replica == self.replica) {
5115
4844
  assert(self.loopback_queue == null);
5116
4845
  self.loopback_queue = message.ref();
@@ -5119,6 +4848,142 @@ pub fn ReplicaType(
5119
4848
  }
5120
4849
  }
5121
4850
 
4851
+ /// The highest durable view.
4852
+ /// A replica must not advertise a view higher than its durable view.
4853
+ ///
4854
+ /// The advertised `view` must never backtrack after a crash.
4855
+ /// This ensures the old primary is isolated — if a backup's view backtracks, it could
4856
+ /// ack a prepare to the old primary, forking the log. See VRR §8.2 for more detail.
4857
+ ///
4858
+ /// Equivalent to `superblock.working.vsr_state.view`.
4859
+ fn view_durable(self: *const Self) u32 {
4860
+ return self.superblock.working.vsr_state.view;
4861
+ }
4862
+
4863
+ /// The highest durable log_view.
4864
+ /// A replica must not advertise a log_view (in a DVC) higher than its durable log_view.
4865
+ ///
4866
+ /// A replica's advertised `log_view` must never backtrack after a crash.
4867
+ /// (`log_view` is only advertised within DVC messages).
4868
+ ///
4869
+ /// To understand why, consider the following replica logs, where:
4870
+ ///
4871
+ /// - numbers in replica rows denote the version of the op, and
4872
+ /// - a<b<c denotes the view in which the op was prepared.
4873
+ ///
4874
+ /// Replica 0 prepares some ops, but they never arrive at replica 1/2:
4875
+ ///
4876
+ /// view=a
4877
+ /// op │ 0 1 2
4878
+ /// replica 0 │ 1a 2a 3a (log_view=a, leader)
4879
+ /// replica 1 │ - - - (log_view=a, follower — but never receives any prepares)
4880
+ /// (replica 2) │ - - - (log_view=_, partitioned)
4881
+ ///
4882
+ /// After a view change, replica 1 prepares some ops, but they never arrive at replica 0/2:
4883
+ ///
4884
+ /// view=b
4885
+ /// op │ 0 1 2
4886
+ /// (replica 0) │ 1a 2a 3a (log_view=a, partitioned)
4887
+ /// replica 1 │ 4b 5b 6b (log_view=b, leader)
4888
+ /// replica 2 │ - - - (log_view=b, follower — but never receives any prepares)
4889
+ ///
4890
+ /// After another view change, replica 2 loads replica 1's ops:
4891
+ ///
4892
+ /// view=c
4893
+ /// op │ 0 1 2
4894
+ /// replica 0 │ 1a 2a 3a (log_view=c, follower)
4895
+ /// (replica 1) │ 4b 5b 6b (log_view=b, partitioned)
4896
+ /// replica 2 │ 1c 2c 3c (log_view=c, leader)
4897
+ ///
4898
+ /// Suppose replica 0 crashes and its log_view regresses to a.
4899
+ /// If replica 2 is partitioned, replicas 0 and 1 start view d with the DVCs:
4900
+ ///
4901
+ /// replica 0 │ 1a 2a 3a (log_view=a, log_view backtracked!)
4902
+ /// replica 1 │ 4b 5b 6b (log_view=b)
4903
+ ///
4904
+ /// Replica 1's higher log_view is canonical, so 4b/5b/6b replace 1a/2a/3a even though
4905
+ /// the latter may have been committed during view c. The log has forked.
4906
+ ///
4907
+ /// Therefore, a replica's log_view must never regress.
4908
+ ///
4909
+ /// Equivalent to `superblock.working.vsr_state.log_view`.
4910
+ fn log_view_durable(self: *const Self) u32 {
4911
+ return self.superblock.working.vsr_state.log_view;
4912
+ }
4913
+
4914
+ fn view_durable_updating(self: *const Self) bool {
4915
+ return self.superblock.view_change_in_progress();
4916
+ }
4917
+
4918
+ /// Persist the current view and log_view to the superblock.
4919
+ /// `view_durable` and `log_view_durable` will update asynchronously, when their respective
4920
+ /// updates are durable.
4921
+ fn view_durable_update(self: *Self) void {
4922
+ assert(self.status == .normal or self.status == .view_change);
4923
+ assert(self.view >= self.log_view);
4924
+ assert(self.view >= self.view_durable());
4925
+ assert(self.log_view >= self.log_view_durable());
4926
+ assert(self.log_view > self.log_view_durable() or self.view > self.view_durable());
4927
+ // The primary must only persist the SV headers after repairs are done.
4928
+ // Otherwise headers could be nacked, truncated, then restored after a crash.
4929
+ assert(self.log_view < self.view or self.replica != self.primary_index(self.view) or
4930
+ self.status == .normal);
4931
+
4932
+ if (self.view_durable_updating()) return;
4933
+
4934
+ log.debug("{}: view_durable_update: view_durable={}..{} log_view_durable={}..{}", .{
4935
+ self.replica,
4936
+ self.view_durable(),
4937
+ self.view,
4938
+ self.log_view_durable(),
4939
+ self.log_view,
4940
+ });
4941
+
4942
+ self.superblock.view_change(
4943
+ view_durable_update_callback,
4944
+ &self.superblock_context_view_change,
4945
+ .{
4946
+ .commit_max = self.commit_max,
4947
+ .view = self.view,
4948
+ .log_view = self.log_view,
4949
+ .headers = self.create_view_change_headers(),
4950
+ },
4951
+ );
4952
+ assert(self.view_durable_updating());
4953
+ }
4954
+
4955
+ fn view_durable_update_callback(context: *SuperBlock.Context) void {
4956
+ const self = @fieldParentPtr(Self, "superblock_context_view_change", context);
4957
+ assert(self.status == .normal or self.status == .view_change);
4958
+ assert(!self.view_durable_updating());
4959
+ assert(self.superblock.working.vsr_state.view <= self.view);
4960
+ assert(self.superblock.working.vsr_state.log_view <= self.log_view);
4961
+ assert(self.superblock.working.vsr_state.commit_min <= self.commit_min);
4962
+ assert(self.superblock.working.vsr_state.commit_max <= self.commit_max);
4963
+
4964
+ log.debug("{}: view_durable_update_callback: " ++
4965
+ "(view_durable={} log_view_durable={})", .{
4966
+ self.replica,
4967
+ self.view_durable(),
4968
+ self.log_view_durable(),
4969
+ });
4970
+
4971
+ assert(self.view_durable() <= self.view);
4972
+ assert(self.log_view_durable() <= self.view_durable());
4973
+ assert(self.log_view_durable() <= self.log_view);
4974
+
4975
+ // The view/log_view incremented while the previous view-change update was being saved.
4976
+ const update = self.log_view_durable() < self.log_view or
4977
+ self.view_durable() < self.view;
4978
+
4979
+ const update_dvc = update and self.log_view < self.view;
4980
+ const update_sv = update and self.log_view == self.view and
4981
+ (self.replica != self.primary_index(self.view) or self.status == .normal);
4982
+ assert(!(update_dvc and update_sv));
4983
+
4984
+ if (update_dvc or update_sv) self.view_durable_update();
4985
+ }
4986
+
5122
4987
  fn set_op_and_commit_max(self: *Self, op: u64, commit_max: u64, method: []const u8) void {
5123
4988
  assert(self.status == .view_change or self.status == .recovering);
5124
4989
 
@@ -5130,6 +4995,7 @@ pub fn ReplicaType(
5130
4995
  // It will be set shortly, when we transition to normal status.
5131
4996
  assert(self.view == 0);
5132
4997
  },
4998
+ .recovering_head => unreachable,
5133
4999
  }
5134
5000
 
5135
5001
  // Uncommitted ops may not survive a view change so we must assert `op` against
@@ -5156,9 +5022,7 @@ pub fn ReplicaType(
5156
5022
  });
5157
5023
  }
5158
5024
 
5159
- assert(commit_max >=
5160
- self.commit_max - std.math.min(constants.pipeline_max, self.commit_max));
5161
-
5025
+ assert(commit_max >= self.commit_max -| constants.pipeline_prepare_queue_max);
5162
5026
  assert(self.commit_min <= self.commit_max);
5163
5027
  assert(self.op >= self.commit_max or self.op < self.commit_max);
5164
5028
 
@@ -5201,48 +5065,84 @@ pub fn ReplicaType(
5201
5065
  /// where the new primary's headers depends on which of replica 1 and 2's DVC is used
5202
5066
  /// for repair before the other (i.e. whether they repair op 6 or 7 first).
5203
5067
  ///
5204
- /// For the above case to occur, replicas 0, 1, and 2 must all share the highest `view_normal`.
5205
- /// And since they share the latest `view_normal`, ops 5,6,7 were just installed by
5068
+ /// For the above case to occur, replicas 0, 1, and 2 must all share the highest `log_view`.
5069
+ /// And since they share the latest `log_view`, ops 5,6,7 were just installed by
5206
5070
  /// `replace_header`, which is order-independent (it doesn't use the hash chain).
5207
5071
  ///
5208
- /// (If replica 0's view_normal was greater than 1/2's, then replica 0 must have all
5072
+ /// (If replica 0's log_view was greater than 1/2's, then replica 0 must have all
5209
5073
  /// headers from previous views. Which means 6,7 are from the current view. But since
5210
- /// replica 0 doesn't have 6/7, then replica 1/2 must share the latest view_normal. ∎)
5074
+ /// replica 0 doesn't have 6/7, then replica 1/2 must share the latest log_view. ∎)
5211
5075
  fn primary_set_log_from_do_view_change_messages(self: *Self) void {
5212
5076
  assert(self.status == .view_change);
5213
5077
  assert(self.primary_index(self.view) == self.replica);
5214
5078
  assert(self.replica_count > 1);
5215
5079
  assert(self.start_view_change_quorum);
5216
5080
  assert(self.do_view_change_quorum);
5081
+ assert(self.do_view_change_from_all_replicas[self.replica] != null);
5082
+ DVCQuorum.verify(self.do_view_change_from_all_replicas);
5217
5083
 
5218
- const do_view_change_head = self.do_view_change_quorum_head();
5219
- assert(do_view_change_head.view_normal >= self.view_normal);
5220
- assert(do_view_change_head.op >= self.commit_min);
5221
- assert(do_view_change_head.op >= do_view_change_head.commit_min_max);
5222
- assert(do_view_change_head.commit_min_max >= self.commit_min);
5084
+ const dvcs_all = DVCQuorum.dvcs_all(self.do_view_change_from_all_replicas);
5085
+ assert(dvcs_all.len == self.quorum_view_change);
5223
5086
 
5224
- // The `prepare_timestamp` prevents a primary's own clock from running backwards.
5225
- // Therefore, `prepare_timestamp`:
5087
+ const dvcs_canonical = DVCQuorum.dvcs_canonical(self.do_view_change_from_all_replicas);
5088
+ assert(dvcs_canonical.len > 0);
5089
+
5090
+ for (dvcs_all.constSlice()) |message| {
5091
+ log.debug(
5092
+ "{}: on_do_view_change: dvc: " ++
5093
+ "replica={} log_view={} op={} commit_min={}",
5094
+ .{
5095
+ self.replica,
5096
+ message.header.replica,
5097
+ @intCast(u32, message.header.timestamp),
5098
+ message.header.op,
5099
+ message.header.commit, // The `commit_min` of the replica.
5100
+ },
5101
+ );
5102
+ }
5103
+
5104
+ for (dvcs_canonical.constSlice()) |message| {
5105
+ for (message_body_as_headers_chain_disjoint(message)) |*header| {
5106
+ log.debug(
5107
+ "{}: on_do_view_change: canonical: replica={} op={} checksum={}",
5108
+ .{
5109
+ self.replica,
5110
+ message.header.replica,
5111
+ header.op,
5112
+ header.checksum,
5113
+ },
5114
+ );
5115
+ }
5116
+ }
5117
+
5118
+ const do_view_change_commit_min_max = DVCQuorum.commit_min_max(
5119
+ self.do_view_change_from_all_replicas,
5120
+ .{
5121
+ .replica = self.replica,
5122
+ .commit_min = self.commit_min,
5123
+ },
5124
+ );
5125
+ assert(do_view_change_commit_min_max >= self.commit_min);
5126
+
5127
+ // The `prepare_timestamp` prevents a primary's own clock from running backwards.
5128
+ // Therefore, `prepare_timestamp`:
5226
5129
  // 1. is advanced if behind the cluster, but never reset if ahead of the cluster, i.e.
5227
5130
  // 2. may not always reflect the timestamp of the latest prepared op, and
5228
5131
  // 3. should be advanced before discarding the timestamps of any uncommitted headers.
5229
- if (self.state_machine.prepare_timestamp < do_view_change_head.timestamp) {
5230
- self.state_machine.prepare_timestamp = do_view_change_head.timestamp;
5132
+ const timestamp_max = DVCQuorum.timestamp_max(self.do_view_change_from_all_replicas);
5133
+ if (self.state_machine.prepare_timestamp < timestamp_max) {
5134
+ self.state_machine.prepare_timestamp = timestamp_max;
5231
5135
  }
5232
5136
 
5233
- const view_normal_canonical = do_view_change_head.view_normal;
5234
- // `op_canonical` must be computed before calling `set_op_and_commit_max()`, since
5235
- // that may change `replica.op`.
5236
- //
5237
- // Don't remove the uncanonical headers yet — even though the removed headers are
5238
- // a subset of the DVC headers, removing and then adding them back would cause clean
5239
- // headers to become dirty.
5240
- const op_canonical = self.primary_op_canonical_max(view_normal_canonical);
5241
- assert(op_canonical <= self.op);
5242
- assert(op_canonical >= self.op -| constants.pipeline_max);
5243
- assert(op_canonical >= self.commit_min);
5244
-
5245
- if (do_view_change_head.op > self.op_checkpoint_trigger()) {
5137
+ var headers_canonical = DVCQuorum.headers_canonical(self.do_view_change_from_all_replicas);
5138
+ const header_head = headers_canonical.next().?;
5139
+ assert(header_head.op == header_head.op);
5140
+ assert(header_head.op >= do_view_change_commit_min_max);
5141
+ assert(header_head.op >= self.op_checkpoint());
5142
+ assert(header_head.op >= self.commit_min);
5143
+ assert(header_head.op >= self.commit_max);
5144
+
5145
+ if (header_head.op > self.op_checkpoint_trigger()) {
5246
5146
  // This replica is too far behind, i.e. the new `self.op` is too far ahead of the
5247
5147
  // last checkpoint. If we wrap now, we overwrite un-checkpointed transfers in the WAL,
5248
5148
  // precluding recovery.
@@ -5253,32 +5153,40 @@ pub fn ReplicaType(
5253
5153
  }
5254
5154
 
5255
5155
  self.set_op_and_commit_max(
5256
- do_view_change_head.op,
5257
- // `set_op_and_commit_max()` expects the highest commit_max that we know of.
5258
- // But DVCs include replica's `commit_min`, not `commit_max`.
5156
+ header_head.op,
5259
5157
  std.math.max(
5260
5158
  self.commit_max,
5261
- do_view_change_head.commit_min_max,
5159
+ std.math.max(
5160
+ // `set_op_and_commit_max()` expects the highest commit_max that we know of.
5161
+ // But DVCs include replica's `commit_min`, not `commit_max`.
5162
+ do_view_change_commit_min_max,
5163
+ // An op cannot be uncommitted if it is definitely outside the pipeline.
5164
+ // Use `do_view_change_op_head` instead of `replica.op` since the former is
5165
+ // about to become the new `replica.op`.
5166
+ header_head.op -| constants.pipeline_prepare_queue_max,
5167
+ ),
5262
5168
  ),
5263
5169
  "on_do_view_change",
5264
5170
  );
5265
- // "`replica.op` exists" invariant may be broken until after the canonical DVC headers
5266
- // are installed.
5267
-
5268
- // First, set all the canonical headers from the replica(s) with highest `view_normal`:
5269
- for (self.do_view_change_from_all_replicas) |received| {
5270
- if (received) |message| {
5271
- const view_normal = @intCast(u32, message.header.timestamp);
5272
- // The view in which this replica's status was normal must be before this view.
5273
- assert(view_normal < message.header.view);
5171
+ // "`replica.op` exists" invariant may be broken briefly between set_op_and_commit_max()
5172
+ // and replace_header().
5173
+ self.replace_header(&header_head);
5174
+ assert(self.journal.header_with_op(self.op) != null);
5274
5175
 
5275
- if (view_normal < view_normal_canonical) continue;
5276
- assert(view_normal == view_normal_canonical);
5176
+ while (headers_canonical.next()) |header| {
5177
+ assert(header.op < header_head.op);
5178
+ self.replace_header(&header);
5179
+ }
5277
5180
 
5278
- const message_headers = message_body_as_headers(message);
5279
- for (message_headers) |*header| {
5181
+ const dvcs_uncanonical =
5182
+ DVCQuorum.dvcs_uncanonical(self.do_view_change_from_all_replicas);
5183
+ for (dvcs_uncanonical.constSlice()) |message| {
5184
+ for (message_body_as_headers_chain_disjoint(message)) |*header| {
5185
+ // We must trust headers that other replicas have committed, because
5186
+ // repair_header() will not repair a header if the hash chain has a gap.
5187
+ if (header.op <= message.header.commit) {
5280
5188
  log.debug(
5281
- "{}: on_do_view_change: canonical: replica={} op={} checksum={}",
5189
+ "{}: on_do_view_change: committed: replica={} op={} checksum={}",
5282
5190
  .{
5283
5191
  self.replica,
5284
5192
  message.header.replica,
@@ -5286,295 +5194,98 @@ pub fn ReplicaType(
5286
5194
  header.checksum,
5287
5195
  },
5288
5196
  );
5289
- }
5290
- self.replace_headers(message_headers);
5291
- }
5292
- }
5293
-
5294
- // Since we used do_view_change_head to set the replica.op, it must have been loaded
5295
- // into the headers (if it wasn't present already).
5296
- assert(self.journal.header_with_op(self.op) != null);
5297
-
5298
- // Now that the canonical headers are all in place, repair any other headers:
5299
- for (self.do_view_change_from_all_replicas) |received| {
5300
- if (received) |message| {
5301
- const view_normal = @intCast(u32, message.header.timestamp);
5302
- assert(view_normal < message.header.view);
5303
-
5304
- if (view_normal == view_normal_canonical) continue;
5305
- assert(view_normal < view_normal_canonical);
5306
-
5307
- for (message_body_as_headers(message)) |*header| {
5308
- // We must trust headers that other replicas have committed, because
5309
- // repair_header() will not repair a header if the hash chain has a gap.
5310
- if (header.op <= message.header.commit) {
5311
- log.debug(
5312
- "{}: on_do_view_change: committed: replica={} op={} checksum={}",
5313
- .{
5314
- self.replica,
5315
- message.header.replica,
5316
- header.op,
5317
- header.checksum,
5318
- },
5319
- );
5320
- self.replace_header(header);
5321
- } else {
5322
- _ = self.repair_header(header);
5323
- }
5197
+ self.replace_header(header);
5198
+ } else {
5199
+ _ = self.repair_header(header);
5324
5200
  }
5325
5201
  }
5326
5202
  }
5327
-
5328
- const op_max = self.do_view_change_op_max(op_canonical);
5329
- assert(op_max <= self.op);
5330
- assert(op_max >= self.commit_min);
5331
- if (op_max != self.op) {
5332
- log.debug("{}: primary_set_log_from_do_view_change_messages: discard op={}..{}", .{
5333
- self.replica,
5334
- op_max + 1,
5335
- self.op,
5336
- });
5337
- self.journal.remove_entries_from(op_max + 1);
5338
- self.op = op_max;
5339
- }
5340
- assert(self.journal.header_with_op(self.op) != null);
5341
5203
  }
5342
5204
 
5343
- fn do_view_change_quorum_head(self: *const Self) struct {
5344
- /// The highest `view_normal` of any DVC.
5345
- ///
5346
- /// The headers bundled with DVCs with the highest `view_normal` are canonical, since
5347
- /// the replica has knowledge of previous view changes in which headers were replaced.
5348
- view_normal: u32,
5349
- /// The highest `commit_min` from any DVC (this is not a `commit_max`).
5350
- commit_min_max: u64,
5351
- /// The highest `op` from a DVC with the highest `view_normal`.
5352
- op: u64,
5353
- /// The higest timestamp from any DVC.
5354
- timestamp: u64,
5355
- } {
5205
+ fn primary_start_view_as_the_new_primary(self: *Self) void {
5356
5206
  assert(self.status == .view_change);
5357
5207
  assert(self.primary_index(self.view) == self.replica);
5358
- assert(self.replica_count > 1);
5359
- assert(self.start_view_change_quorum);
5208
+ assert(self.view == self.log_view);
5360
5209
  assert(self.do_view_change_quorum);
5361
- assert(self.do_view_change_from_all_replicas[self.replica] != null);
5362
-
5363
- var v: ?u32 = null; // The highest `view_normal` from any replica.
5364
- var n: ?u64 = null; // The highest `op` for the highest `view_normal` from any replica.
5365
- var k: ?u64 = null; // The highest `commit_min` from any replica.
5366
- var t: ?u64 = null; // The highest `timestamp` from any replica.
5367
-
5368
- for (self.do_view_change_from_all_replicas) |received, replica| {
5369
- if (received) |message| {
5370
- assert(message.header.command == .do_view_change);
5371
- assert(message.header.cluster == self.cluster);
5372
- assert(message.header.replica == replica);
5373
- assert(message.header.view == self.view);
5374
- assert(message.header.op >= message.header.commit);
5375
- assert(message.header.op - message.header.commit <= constants.journal_slot_count);
5376
-
5377
- // The view when this replica was last in normal status, which:
5378
- // * may be higher than the view in any of the prepare headers.
5379
- // * must be lower than the view of this view change.
5380
- const view_normal = @intCast(u32, message.header.timestamp);
5381
- assert(view_normal < message.header.view);
5382
-
5383
- if (replica == self.replica) {
5384
- assert(view_normal == self.view_normal);
5385
- assert(message.header.op == self.op);
5386
- // We may have a newer commit than our DVC due to async commits (see below).
5387
- assert(message.header.commit <= self.commit_min);
5388
- }
5210
+ assert(!self.pipeline_repairing);
5211
+ assert(self.primary_repair_pipeline() == .done);
5389
5212
 
5390
- log.debug(
5391
- "{}: on_do_view_change: " ++
5392
- "replica={} view_normal={} op={} commit_min={}",
5393
- .{
5394
- self.replica,
5395
- message.header.replica,
5396
- view_normal,
5397
- message.header.op,
5398
- message.header.commit, // The `commit_min` of the replica.
5399
- },
5400
- );
5213
+ assert(self.commit_min == self.commit_max);
5214
+ assert(self.journal.dirty.count == 0);
5215
+ assert(self.journal.faulty.count == 0);
5216
+ assert(self.nack_prepare_op == null);
5217
+ assert(self.valid_hash_chain_between(self.commit_min, self.op));
5401
5218
 
5402
- if (v == null or view_normal > v.?) {
5403
- v = view_normal;
5404
- n = message.header.op;
5405
- } else if (view_normal == v.? and message.header.op > n.?) {
5406
- n = message.header.op;
5407
- }
5219
+ {
5220
+ const pipeline_queue = self.primary_repair_pipeline_done();
5221
+ assert(pipeline_queue.request_queue.empty());
5222
+ assert(pipeline_queue.prepare_queue.count + self.commit_max == self.op);
5223
+ if (!pipeline_queue.prepare_queue.empty()) {
5224
+ const prepares = &pipeline_queue.prepare_queue;
5225
+ assert(prepares.head_ptr_const().?.message.header.op == self.commit_max + 1);
5226
+ assert(prepares.tail_ptr_const().?.message.header.op == self.op);
5227
+ }
5408
5228
 
5409
- if (k == null or message.header.commit > k.?) k = message.header.commit;
5229
+ var pipeline_prepares = pipeline_queue.prepare_queue.iterator();
5230
+ while (pipeline_prepares.next()) |prepare| {
5231
+ assert(self.journal.has(prepare.message.header));
5232
+ assert(!prepare.ok_quorum_received);
5233
+ assert(prepare.ok_from_all_replicas.count() == 0);
5410
5234
 
5411
- const message_headers = message_body_as_headers(message);
5412
- if (t == null or t.? < message_headers[0].timestamp) {
5413
- t = message_headers[0].timestamp;
5414
- }
5235
+ log.debug("{}: start_view_as_the_new_primary: pipeline " ++
5236
+ "(op={} checksum={x} parent={x})", .{
5237
+ self.replica,
5238
+ prepare.message.header.op,
5239
+ prepare.message.header.checksum,
5240
+ prepare.message.header.parent,
5241
+ });
5415
5242
  }
5416
- }
5417
5243
 
5418
- // Consider the case:
5419
- // 1. Start committing op=N…M.
5420
- // 2. Send `do_view_change` to self.
5421
- // 3. Finish committing op=N…M.
5422
- // 4. Remaining `do_view_change` messages arrive, completing the quorum.
5423
- // In this scenario, our own DVC's commit is `N-1`, but `commit_min=M`.
5424
- // Don't let the commit backtrack.
5425
- if (k.? < self.commit_min) {
5426
- assert(self.commit_min >
5427
- self.do_view_change_from_all_replicas[self.replica].?.header.commit);
5428
- log.debug("{}: on_do_view_change: bump commit_min view={} commit={}..{}", .{
5429
- self.replica,
5430
- self.view,
5431
- k.?,
5432
- self.commit_min,
5433
- });
5434
- k = self.commit_min;
5244
+ self.pipeline.cache.deinit(self.message_bus.pool);
5245
+ self.pipeline = .{ .queue = pipeline_queue };
5246
+ self.pipeline.queue.verify();
5435
5247
  }
5436
5248
 
5437
- assert(v.? >= self.view_normal);
5438
- assert(k.? >= self.commit_min);
5439
-
5440
- return .{
5441
- .view_normal = v.?,
5442
- .commit_min_max = k.?,
5443
- .op = n.?,
5444
- .timestamp = t.?,
5445
- };
5446
- }
5447
-
5448
- /// Identify headers to discard during a view change before the primary starts the view.
5449
- /// This is required to maximize availability in the presence of storage faults.
5450
- /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
5451
- ///
5452
- /// Returns the highest op that:
5453
- /// - precedes any hash chain breaks in the uncanonical headers, and
5454
- /// - precedes any gaps in the uncommitted headers.
5455
- ///
5456
- /// Breaks
5457
- ///
5458
- /// If there is a hash chain break, none of the headers from the canonical DVCs replaced
5459
- /// the broken (leftover uncanonical) op.
5460
- /// Removing these is necessary for correctness and liveness, to ensure that
5461
- /// disconnected headers do not remain in place in lieu of gaps.
5462
- ///
5463
- /// Gaps
5464
- ///
5465
- /// It is possible for the new primary to have done an op jump in a previous view, and
5466
- /// introduced a header gap for an op, which may have then been discarded by another primary
5467
- /// during a view change, before surviving into this view as a gap because our latest op was
5468
- /// set as the latest op for the quorum.
5469
- ///
5470
- /// In this case, it may be impossible for the new primary to repair the missing header as
5471
- /// the rest of the cluster may have already discarded it. We therefore iterate over our
5472
- /// uncommitted header gaps to discard any that may be impossible to repair.
5473
- ///
5474
- /// For example, if the old primary replicates ops=7,8,9 (all uncommitted) but only op=9 is
5475
- /// prepared on another replica before the old primary crashes, then this function finds a
5476
- /// gap for ops=7,8 and will attempt to discard ops 7,8,9.
5477
- fn do_view_change_op_max(self: *const Self, op_canonical: u64) u64 {
5478
- assert(self.replica_count > 1);
5479
- assert(self.status == .view_change);
5480
- assert(self.primary_index(self.view) == self.replica);
5481
- assert(self.do_view_change_quorum);
5482
- assert(!self.repair_timeout.ticking);
5483
- assert(self.op >= self.commit_max);
5484
- // At least one replica in the new quorum committed in the new replica.op's WAL wrap —
5485
- // wrapping implies a checkpoint (which implies a commit).
5486
- assert(self.op - self.commit_max <= constants.journal_slot_count);
5487
- assert(self.op - self.commit_min <= constants.journal_slot_count);
5488
-
5489
- assert(op_canonical <= self.op);
5490
- assert(op_canonical >= self.commit_min);
5491
-
5492
- // Any uncanonical ops remaining either:
5493
- // * Connect to the hash chain on the right.
5494
- // * Do not connect on the right (hash chain break).
5495
- //
5496
- // If there is a hash chain break, none of the headers from the canonical DVCs replaced
5497
- // the broken op. It is truncated like a gap.
5498
- //
5499
- // Removing these is necessary for correctness and liveness, to ensure that
5500
- // disconnected headers do not remain in place in lieu of gaps.
5501
- const op_before_break = blk: {
5502
- var op: u64 = op_canonical;
5503
- while (op < self.op) : (op += 1) {
5504
- if (self.journal.header_with_op(op)) |header| {
5505
- if (self.journal.header_with_op(op + 1)) |next| {
5506
- // Broken hash chain.
5507
- if (header.checksum != next.parent) break :blk op;
5508
- }
5509
- }
5510
- } else break :blk self.op;
5511
- };
5512
-
5513
- // Find the beginning of the lowest gap.
5514
- //
5515
- // While iterating > commit_max does not in itself guarantee that an op is uncommitted
5516
- // (the old primary may have committed the op shortly before crashing), nevertheless,
5517
- // if it was committed it would have survived into the new view as a header not a gap.
5518
- const op_before_gap = blk: {
5519
- // An op cannot be uncommitted if it is definitely outside the pipeline.
5520
- const op_committed = std.math.max(self.commit_max, self.op -| constants.pipeline_max);
5521
- assert(op_committed <= self.op);
5522
-
5523
- var op = op_committed;
5524
- while (op < self.op) : (op += 1) {
5525
- if (self.journal.header_with_op(op + 1) == null) break :blk op;
5526
- } else break :blk self.op;
5527
- };
5528
-
5529
- return std.math.min(op_before_break, op_before_gap);
5530
- }
5531
-
5532
- fn start_view_as_the_new_primary(self: *Self) void {
5533
- assert(self.status == .view_change);
5534
- assert(self.primary_index(self.view) == self.replica);
5535
- assert(self.do_view_change_quorum);
5536
- assert(!self.repairing_pipeline);
5537
-
5538
- assert(self.commit_min == self.commit_max);
5539
- assert(self.primary_repair_pipeline_op() == null);
5540
- self.verify_pipeline();
5541
- assert(self.commit_max + self.pipeline.count == self.op);
5542
- assert(self.valid_hash_chain_between(self.commit_min, self.op));
5543
-
5544
- assert(self.journal.dirty.count == 0);
5545
- assert(self.journal.faulty.count == 0);
5546
- assert(self.nack_prepare_op == null);
5547
-
5548
- const start_view = self.create_view_change_message(.start_view);
5549
- defer self.message_bus.unref(start_view);
5550
-
5551
5249
  self.transition_to_normal_from_view_change_status(self.view);
5552
- // Detect if the transition to normal status above accidentally resets the pipeline:
5553
- assert(self.commit_max + self.pipeline.count == self.op);
5250
+ self.view_durable_update();
5554
5251
 
5555
5252
  assert(self.status == .normal);
5556
5253
  assert(self.primary());
5557
5254
 
5558
- assert(start_view.references == 1);
5559
- assert(start_view.header.command == .start_view);
5560
- assert(start_view.header.view == self.view);
5561
- assert(start_view.header.op == self.op);
5562
- assert(start_view.header.commit == self.commit_max);
5563
-
5564
5255
  // Send prepare_ok messages to ourself to contribute to the pipeline.
5565
5256
  self.send_prepare_oks_after_view_change();
5566
5257
 
5567
- self.send_message_to_other_replicas(start_view);
5258
+ // SVs will be sent out (via timeout) after the view_durable update completes.
5259
+ assert(self.view_durable_updating());
5260
+ assert(self.log_view > self.log_view_durable());
5568
5261
  }
5569
5262
 
5570
- fn transition_to_normal_from_recovering_status(self: *Self, new_view: u32) void {
5263
+ fn transition_to_recovering_head(self: *Self) void {
5571
5264
  assert(self.status == .recovering);
5572
- assert(self.view == 0);
5265
+ assert(self.view == self.log_view);
5266
+ assert(self.op >= self.commit_min);
5267
+ assert(!self.committing);
5268
+ assert(self.replica_count > 1);
5269
+ assert(self.journal.header_with_op(self.op) != null);
5270
+ assert(self.pipeline == .cache);
5271
+
5272
+ self.status = .recovering_head;
5273
+
5274
+ log.warn("{}: transition_to_recovering_head: op_checkpoint={} op_head={}", .{
5275
+ self.replica,
5276
+ self.op_checkpoint(),
5277
+ self.op,
5278
+ });
5279
+ }
5280
+
5281
+ fn transition_to_normal_from_recovering_status(self: *Self) void {
5282
+ assert(self.status == .recovering or self.status == .recovering_head);
5283
+ assert(self.view == self.log_view);
5573
5284
  assert(!self.committing);
5574
- assert(self.replica_count > 1 or new_view == 0);
5285
+ assert(self.replica_count > 1 or self.commit_min == self.op);
5575
5286
  assert(self.journal.header_with_op(self.op) != null);
5576
- self.view = new_view;
5577
- self.view_normal = new_view;
5287
+ assert(self.pipeline == .cache);
5288
+
5578
5289
  self.status = .normal;
5579
5290
 
5580
5291
  if (self.primary()) {
@@ -5586,7 +5297,7 @@ pub fn ReplicaType(
5586
5297
  },
5587
5298
  );
5588
5299
 
5589
- assert(self.journal.is_empty() or self.replica_count == 1);
5300
+ assert(self.replica_count == 1);
5590
5301
  assert(!self.prepare_timeout.ticking);
5591
5302
  assert(!self.normal_status_timeout.ticking);
5592
5303
  assert(!self.view_change_status_timeout.ticking);
@@ -5595,7 +5306,9 @@ pub fn ReplicaType(
5595
5306
  self.ping_timeout.start();
5596
5307
  self.commit_timeout.start();
5597
5308
  self.repair_timeout.start();
5598
- self.recovery_timeout.stop();
5309
+
5310
+ self.pipeline.cache.deinit(self.message_bus.pool);
5311
+ self.pipeline = .{ .queue = .{} };
5599
5312
  } else {
5600
5313
  log.debug(
5601
5314
  "{}: transition_to_normal_from_recovering_status: view={} backup",
@@ -5613,31 +5326,30 @@ pub fn ReplicaType(
5613
5326
  self.ping_timeout.start();
5614
5327
  self.normal_status_timeout.start();
5615
5328
  self.repair_timeout.start();
5616
- self.recovery_timeout.stop();
5617
5329
  }
5618
5330
  }
5619
5331
 
5620
- fn transition_to_normal_from_view_change_status(self: *Self, new_view: u32) void {
5332
+ fn transition_to_normal_from_view_change_status(self: *Self, view_new: u32) void {
5621
5333
  // In the VRR paper it's possible to transition from normal to normal for the same view.
5622
5334
  // For example, this could happen after a state transfer triggered by an op jump.
5623
5335
  assert(self.status == .view_change);
5624
- assert(new_view >= self.view);
5336
+ assert(view_new >= self.view);
5625
5337
  assert(self.journal.header_with_op(self.op) != null);
5626
- self.view = new_view;
5627
- self.view_normal = new_view;
5338
+
5628
5339
  self.status = .normal;
5629
5340
 
5630
5341
  if (self.primary()) {
5631
5342
  log.debug(
5632
- "{}: transition_to_normal_from_view_change_status: view={} primary",
5633
- .{
5634
- self.replica,
5635
- self.view,
5636
- },
5343
+ "{}: transition_to_normal_from_view_change_status: view={}..{} primary",
5344
+ .{ self.replica, self.view, view_new },
5637
5345
  );
5638
5346
 
5639
5347
  assert(!self.prepare_timeout.ticking);
5640
- assert(!self.recovery_timeout.ticking);
5348
+ assert(!self.pipeline_repairing);
5349
+ assert(self.pipeline == .queue);
5350
+ assert(self.view == view_new);
5351
+ assert(self.log_view == view_new);
5352
+ assert(self.commit_min == self.commit_max);
5641
5353
 
5642
5354
  self.ping_timeout.start();
5643
5355
  self.commit_timeout.start();
@@ -5647,15 +5359,25 @@ pub fn ReplicaType(
5647
5359
  self.repair_timeout.start();
5648
5360
 
5649
5361
  // Do not reset the pipeline as there may be uncommitted ops to drive to completion.
5650
- if (self.pipeline.count > 0) self.prepare_timeout.start();
5362
+ if (self.pipeline.queue.prepare_queue.count > 0) self.prepare_timeout.start();
5651
5363
  } else {
5652
- log.debug("{}: transition_to_normal_from_view_change_status: view={} backup", .{
5364
+ log.debug("{}: transition_to_normal_from_view_change_status: view={}..{} backup", .{
5653
5365
  self.replica,
5654
5366
  self.view,
5367
+ view_new,
5655
5368
  });
5656
5369
 
5657
5370
  assert(!self.prepare_timeout.ticking);
5658
- assert(!self.recovery_timeout.ticking);
5371
+ assert(self.pipeline == .cache);
5372
+
5373
+ if (self.log_view == view_new and self.view == view_new) {
5374
+ // We recovered into the same view we crashed in, with a detour through
5375
+ // status=recovering_head.
5376
+ } else {
5377
+ self.view = view_new;
5378
+ self.log_view = view_new;
5379
+ self.view_durable_update();
5380
+ }
5659
5381
 
5660
5382
  self.ping_timeout.start();
5661
5383
  self.commit_timeout.stop();
@@ -5668,7 +5390,6 @@ pub fn ReplicaType(
5668
5390
  self.reset_quorum_start_view_change();
5669
5391
  self.reset_quorum_do_view_change();
5670
5392
  self.reset_quorum_nack_prepare();
5671
- self.reset_quorum_prepare_ok();
5672
5393
 
5673
5394
  assert(self.start_view_change_quorum == false);
5674
5395
  assert(self.do_view_change_quorum == false);
@@ -5680,17 +5401,34 @@ pub fn ReplicaType(
5680
5401
  /// where v identifies the new view. A replica notices the need for a view change either
5681
5402
  /// based on its own timer, or because it receives a start_view_change or do_view_change
5682
5403
  /// message for a view with a larger number than its own view.
5683
- fn transition_to_view_change_status(self: *Self, new_view: u32) void {
5404
+ fn transition_to_view_change_status(self: *Self, view_new: u32) void {
5684
5405
  log.debug("{}: transition_to_view_change_status: view={}..{}", .{
5685
5406
  self.replica,
5686
5407
  self.view,
5687
- new_view,
5408
+ view_new,
5688
5409
  });
5689
- assert(self.status == .normal or self.status == .view_change);
5690
- assert(new_view > self.view);
5691
- self.view = new_view;
5410
+ assert(self.status == .normal or
5411
+ self.status == .view_change or
5412
+ self.status == .recovering or
5413
+ self.status == .recovering_head);
5414
+
5415
+ const status_before = self.status;
5692
5416
  self.status = .view_change;
5693
5417
 
5418
+ if (self.view == view_new) {
5419
+ assert(status_before == .recovering or status_before == .recovering_head);
5420
+ } else {
5421
+ assert(view_new > self.view);
5422
+ self.view = view_new;
5423
+ self.view_durable_update();
5424
+ }
5425
+
5426
+ if (self.pipeline == .queue) {
5427
+ var queue = self.pipeline.queue;
5428
+ self.pipeline = .{ .cache = PipelineCache.init_from_queue(&queue) };
5429
+ queue.deinit(self.message_bus.pool);
5430
+ }
5431
+
5694
5432
  self.ping_timeout.stop();
5695
5433
  self.commit_timeout.stop();
5696
5434
  self.normal_status_timeout.stop();
@@ -5698,7 +5436,6 @@ pub fn ReplicaType(
5698
5436
  self.view_change_message_timeout.start();
5699
5437
  self.repair_timeout.stop();
5700
5438
  self.prepare_timeout.stop();
5701
- assert(!self.recovery_timeout.ticking);
5702
5439
 
5703
5440
  // Do not reset quorum counters only on entering a view, assuming that the view will be
5704
5441
  // followed only by a single subsequent view change to the next view, because multiple
@@ -5708,7 +5445,6 @@ pub fn ReplicaType(
5708
5445
  self.reset_quorum_start_view_change();
5709
5446
  self.reset_quorum_do_view_change();
5710
5447
  self.reset_quorum_nack_prepare();
5711
- self.reset_quorum_prepare_ok();
5712
5448
 
5713
5449
  assert(self.start_view_change_quorum == false);
5714
5450
  assert(self.do_view_change_quorum == false);
@@ -5785,7 +5521,7 @@ pub fn ReplicaType(
5785
5521
  fn valid_hash_chain_between(self: *const Self, op_min: u64, op_max: u64) bool {
5786
5522
  assert(op_min <= op_max);
5787
5523
  // Headers with ops preceding the checkpoint may be unavailable due to a WAL wrap.
5788
- assert(op_min >= self.op_checkpoint);
5524
+ assert(op_min >= self.op_checkpoint());
5789
5525
 
5790
5526
  // If we use anything less than self.op then we may commit ops for a forked hash chain
5791
5527
  // that have since been reordered by a new primary.
@@ -5796,7 +5532,7 @@ pub fn ReplicaType(
5796
5532
  while (op > op_min) {
5797
5533
  op -= 1;
5798
5534
 
5799
- if (self.op_checkpoint == op) {
5535
+ if (self.op_checkpoint() == op) {
5800
5536
  // op_checkpoint's slot may have been overwritten in the WAL — but we can
5801
5537
  // always use the VSRState to anchor the hash chain.
5802
5538
  assert(op == op_min);
@@ -5807,7 +5543,7 @@ pub fn ReplicaType(
5807
5543
  log.debug("{}: valid_hash_chain_between: break A: {} (checkpoint={})", .{
5808
5544
  self.replica,
5809
5545
  self.superblock.working.vsr_state.commit_min_checksum,
5810
- self.op_checkpoint,
5546
+ self.op_checkpoint(),
5811
5547
  });
5812
5548
  log.debug("{}: valid_hash_chain_between: break B: {}", .{
5813
5549
  self.replica,
@@ -5836,37 +5572,6 @@ pub fn ReplicaType(
5836
5572
  return true;
5837
5573
  }
5838
5574
 
5839
- fn verify_pipeline(self: *Self) void {
5840
- assert(self.status == .view_change);
5841
-
5842
- var op = self.commit_max + 1;
5843
- var parent = self.journal.header_with_op(self.commit_max).?.checksum;
5844
-
5845
- var iterator = self.pipeline.iterator();
5846
- while (iterator.next_ptr()) |prepare| {
5847
- assert(prepare.message.header.command == .prepare);
5848
- assert(!prepare.ok_quorum_received);
5849
- assert(prepare.ok_from_all_replicas.count() == 0);
5850
-
5851
- log.debug("{}: verify_pipeline: op={} checksum={x} parent={x}", .{
5852
- self.replica,
5853
- prepare.message.header.op,
5854
- prepare.message.header.checksum,
5855
- prepare.message.header.parent,
5856
- });
5857
-
5858
- assert(self.journal.has(prepare.message.header));
5859
- assert(prepare.message.header.op == op);
5860
- assert(prepare.message.header.op <= self.op);
5861
- assert(prepare.message.header.parent == parent);
5862
-
5863
- parent = prepare.message.header.checksum;
5864
- op += 1;
5865
- }
5866
- assert(self.pipeline.count <= constants.pipeline_max);
5867
- assert(self.commit_max + self.pipeline.count == op - 1);
5868
- }
5869
-
5870
5575
  fn view_jump(self: *Self, header: *const Header) void {
5871
5576
  const to: Status = switch (header.command) {
5872
5577
  .prepare, .commit => .normal,
@@ -5874,7 +5579,10 @@ pub fn ReplicaType(
5874
5579
  else => unreachable,
5875
5580
  };
5876
5581
 
5877
- if (self.status != .normal and self.status != .view_change) return;
5582
+ switch (self.status) {
5583
+ .normal, .view_change, .recovering_head => {},
5584
+ .recovering => return,
5585
+ }
5878
5586
 
5879
5587
  if (header.view < self.view) return;
5880
5588
 
@@ -5898,18 +5606,20 @@ pub fn ReplicaType(
5898
5606
  .view_change => if (header.view == self.view) return,
5899
5607
  else => unreachable,
5900
5608
  },
5609
+ .recovering_head => {},
5901
5610
  else => unreachable,
5902
5611
  }
5903
5612
 
5904
5613
  switch (to) {
5905
5614
  .normal => {
5906
5615
  if (header.view == self.view) {
5907
- assert(self.status == .view_change);
5616
+ assert(self.status == .view_change or self.status == .recovering_head);
5908
5617
 
5909
5618
  log.debug("{}: view_jump: waiting to exit view change", .{self.replica});
5910
5619
  } else {
5911
5620
  assert(header.view > self.view);
5912
- assert(self.status == .view_change or self.status == .normal);
5621
+ assert(self.status == .view_change or self.status == .recovering_head or
5622
+ self.status == .normal);
5913
5623
 
5914
5624
  log.debug("{}: view_jump: waiting to jump to newer view", .{self.replica});
5915
5625
  }
@@ -5924,8 +5634,10 @@ pub fn ReplicaType(
5924
5634
  });
5925
5635
  },
5926
5636
  .view_change => {
5927
- assert(header.view > self.view);
5928
- assert(self.status == .view_change or self.status == .normal);
5637
+ assert(self.status == .recovering_head or header.view > self.view);
5638
+ assert(self.status != .recovering_head or header.command == .start_view);
5639
+ assert(self.status == .recovering_head or self.status == .view_change or
5640
+ self.status == .normal);
5929
5641
 
5930
5642
  if (header.view == self.view + 1) {
5931
5643
  log.debug("{}: view_jump: jumping to view change", .{self.replica});
@@ -5944,10 +5656,10 @@ pub fn ReplicaType(
5944
5656
  assert(message.header.view <= self.view);
5945
5657
  assert(message.header.op <= self.op);
5946
5658
 
5947
- if (message.header.op == self.op_checkpoint) {
5659
+ if (message.header.op == self.op_checkpoint()) {
5948
5660
  assert(message.header.op == 0);
5949
5661
  } else {
5950
- assert(message.header.op > self.op_checkpoint);
5662
+ assert(message.header.op > self.op_checkpoint());
5951
5663
  }
5952
5664
 
5953
5665
  if (!self.journal.has(message.header)) {
@@ -5968,6 +5680,18 @@ pub fn ReplicaType(
5968
5680
  return;
5969
5681
  }
5970
5682
 
5683
+ // Criteria for caching:
5684
+ // - The primary does not update the cache since it is (or will be) reconstructing its
5685
+ // pipeline.
5686
+ // - Cache uncommitted ops, since it will avoid a WAL read in the common case.
5687
+ if (self.pipeline == .cache and
5688
+ self.replica != self.primary_index(self.view) and
5689
+ self.commit_min < message.header.op)
5690
+ {
5691
+ const prepare_evicted = self.pipeline.cache.insert(message.ref());
5692
+ if (prepare_evicted) |m| self.message_bus.unref(m);
5693
+ }
5694
+
5971
5695
  self.journal.write_prepare(write_prepare_callback, message, trigger);
5972
5696
  }
5973
5697
 
@@ -5993,3 +5717,832 @@ pub fn ReplicaType(
5993
5717
  }
5994
5718
  };
5995
5719
  }
5720
+
5721
+ /// A do-view-change:
5722
+ /// - selects the view's head
5723
+ /// - discards uncommitted ops (to maximize availability in the presence of storage faults)
5724
+ /// - retains all committed ops
5725
+ /// - retains all possibly-committed ops (because they might be committed — we can't tell)
5726
+ /// (Some of these may be discarded during repair, via the nack protocol).
5727
+ /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
5728
+ ///
5729
+ /// Terminology:
5730
+ ///
5731
+ /// - The *head* message (of a view) is the message (committed or uncommitted) within that view with
5732
+ /// the highest op.
5733
+ ///
5734
+ /// - *gap*: There is a header for op X and X+n (n>1), but no header at op X+1.
5735
+ /// - *break*/*chain break*: The header for op X is not the parent of the header for op X+1.
5736
+ /// - *fork*: A correctness bug in which a committed (or possibly committed) message is discarded.
5737
+ ///
5738
+ /// - An *uncanonical* message may have been removed/changed during a prior view.
5739
+ /// - A *canonical* message was part of the most recent log_view.
5740
+ /// - Canonical messages do not necessarily survive into the new view, but they take
5741
+ /// precedence over uncanonical messages.
5742
+ /// - Canonical messages may be committed or uncommitted.
5743
+ ///
5744
+ /// - *DVC* refers to a command=do_view_change message.
5745
+ /// - *SV* refers to a command=start_view message.
5746
+ /// - The *pipeline suffix* is the last pipeline_prepare_queue_max messages of the log (counting
5747
+ /// backwards from the head op). For example, when pipeline_prepare_queue_max=3,
5748
+ ///
5749
+ /// - the pipeline suffix of log "1,2,3,4,5" is "3,4,5".
5750
+ /// - the pipeline suffix of log "1,2,3,5" is "3,5".
5751
+ ///
5752
+ ///
5753
+ /// Invariants:
5754
+ ///
5755
+ /// For each DVC message:
5756
+ ///
5757
+ /// - The headers all belong to the same hash chain.
5758
+ /// - Reason: If multiple replicas with the same canonical log_view disagree about an op, the new
5759
+ /// primary could not determine which is correct.
5760
+ /// - Gaps are permitted, but the DVC-sender is responsible for ensuring they do not conceal
5761
+ /// chain breaks.
5762
+ /// - For example,
5763
+ /// - a DVC of 6a,8a is valid (6a/8a belong to the same chain).
5764
+ /// - a DVC of 6b,8a is invalid (the gap at 7 conceal a chain break).
5765
+ /// - a DVC of 6b,7b,8a is invalid (7b/8a is a chain break)..
5766
+ ///
5767
+ /// - The headers must connect to the cluster's committed ops (the "DVC anchor").
5768
+ /// This means that either:
5769
+ /// - the DVC includes the op=C header, or
5770
+ /// - the DVC includes the op=C+1 header (where C+1's parent is C).
5771
+ /// (Where `C = "DVC anchor" = max(replica.commit_min, replica.op -| pipeline_prepare_queue_max)`).
5772
+ /// - Reason: The new primary may truncate the entire pipeline (6-9) due to a gap (6),
5773
+ /// but afterwards it still requires a head op to repair/chain backward from.
5774
+ /// (According to the intersection property, a gap in the pipeline indicates an
5775
+ /// uncommitted op).
5776
+ /// - For example, given pipeline_prepare_queue_max=3:
5777
+ /// - a DVC of 7,8 is invalid if replica.commit_min=5.
5778
+ /// - a DVC of 7,8 is valid if replica.commit_min=6.
5779
+ /// - a DVC of 5,7,8 is valid. (5,_,7,8)
5780
+ /// - a DVC of 5,8 is valid. (5,_,_,8)
5781
+ /// - a DVC of 0,1 is valid.
5782
+ ///
5783
+ /// Across all DVCs in the quorum:
5784
+ ///
5785
+ /// - The headers of every DVC with the same log_view must not conflict.
5786
+ /// - In other words:
5787
+ /// dvc₁.headers[i].op == dvc₂.headers[j].op implies
5788
+ /// dvc₁.headers[i].checksum == dvc₂.headers[j].checksum.
5789
+ /// - Reason: the headers bundled with the DVC(s) with the highest log_view will be
5790
+ /// loaded into the new primary with `replace_header()`, not `repair_header()`.
5791
+ ///
5792
+ /// Perhaps unintuitively, it is safe to advertise a header before its message is prepared
5793
+ /// (e.g. the write is still queued). The header is either:
5794
+ ///
5795
+ /// - committed — so another replica in the quorum must have a copy, according to the quorum
5796
+ /// intersection property. Or,
5797
+ /// - uncommitted — if the header is chosen, but cannot be recovered from any replica, then
5798
+ /// it will be discarded by the nack protocol.
5799
+ ///
5800
+ ///
5801
+ /// Examples
5802
+ ///
5803
+ /// In these examples:
5804
+ /// - pipeline_prepare_queue_max=3
5805
+ /// - Brackets denote the suffix of the replica's log that is actually included in the DVC headers.
5806
+ /// - Parenthesis denote a replica that did not participate in the DVC (for example, because it is
5807
+ /// partitioned).
5808
+ ///
5809
+ /// Example 1: No gap in canonical headers
5810
+ ///
5811
+ /// Consider a view change with DVCs:
5812
+ ///
5813
+ /// replica headers log_view
5814
+ /// 0 1 [2 3 4b] 4 (new primary)
5815
+ /// 1 1 2 3 4a 5 6 [7 8 9] 5
5816
+ /// 2 (1 2 3 4a 5 6 7 8 9) 5 (partitioned)
5817
+ ///
5818
+ /// Replica 1's headers are canonical, so replica 0 constructs the log:
5819
+ ///
5820
+ /// 1 2 3 4b 7 8 9
5821
+ ///
5822
+ /// The 5/6 gap conceals a hash break — 4b should be 4a.
5823
+ /// The view must initially keeps all of these headers, and after the DVC quorum is handled, repairs
5824
+ /// backwards from 7. (If it instead discarded at the gap (5…9), the log would fork (4a→4b).)
5825
+ ///
5826
+ ///
5827
+ /// Example 2: Gap in pipeline suffix
5828
+ ///
5829
+ /// Consider a set of replicas performing a DVC:
5830
+ ///
5831
+ /// replica headers log_view
5832
+ /// 0 1 [2 3 4b] 4 (new primary)
5833
+ /// 1 1 2 3 4a 5 6 8 9 5
5834
+ /// 2 (1 2 3 ? ? ? ? ? ?) 5 (partitioned)
5835
+ ///
5836
+ /// Which headers should replica 1 include in its DVC?
5837
+ /// The cases are be distinguished by `log_view % replica_count`.
5838
+ ///
5839
+ /// (These examples are still applicable if the gap is not in the first op of the pipeline suffix).
5840
+ ///
5841
+ ///
5842
+ /// Example 2a: Gap in the pipeline suffix of a retired primary
5843
+ ///
5844
+ /// The replica was a primary during its retired log_view.
5845
+ /// It may have gaps or breaks in its pipeline suffix iff:
5846
+ /// - it didn't finish repairs before the next view change, and
5847
+ /// - some uncommitted ops were truncated during the DVC (since this "moves" the suffix backwards).
5848
+ ///
5849
+ /// We cannot send op 6 in the DVC because if repairs did not complete, it may be the wrong message.
5850
+ ///
5851
+ /// However, even though we may not have a full unbroken suffix of pipeline_prepare_queue_max
5852
+ /// messages, we know that our unbroken suffix (however long it may be) includes all
5853
+ /// possibly-committed messages, since otherwise the retired log_view would not have started.
5854
+ ///
5855
+ /// Therefore, the retired primary sends a DVC with only the unbroken log suffix:
5856
+ ///
5857
+ /// replica headers
5858
+ /// 1 1 2 3 4a 5 6 [8 9] (retired primary)
5859
+ ///
5860
+ ///
5861
+ /// Example 2b: Gap in the pipeline suffix of a retired follower
5862
+ ///
5863
+ /// The replica was a follower during its retired log_view.
5864
+ /// Followers always load a full suffix of headers from the view's SV message.
5865
+ /// If there is now a gap in it the follower's suffix, this must be due to missed prepares.
5866
+ ///
5867
+ /// Therefore, ops to the left of the gap (where the gap is within the suffix) are part of the
5868
+ /// suffix's hash chain, even though we cannot test this by chaining checksum/parent.
5869
+ ///
5870
+ /// Therefore, the retired follower sends the DVC:
5871
+ ///
5872
+ /// replica headers
5873
+ /// 1 1 2 3 4a 5 [6 8 9] (retired follower)
5874
+ ///
5875
+ ///
5876
+ /// Example 3: Break in pipeline suffix
5877
+ ///
5878
+ /// Consider a set of replicas performing a DVC:
5879
+ ///
5880
+ /// replica headers log_view
5881
+ /// 0 1 [2 3 4b] 4 (new primary)
5882
+ /// 1 1 2 3 4b 5a 6a 7a [8b 9b] 5
5883
+ /// 2 (1 2 3 4b 5b 7b 7b 8b 9b) 5 (partitioned)
5884
+ ///
5885
+ /// (Note the chain break at replica 1's 7a/8b.)
5886
+ /// This scenario is exactly analogous to Example 2, except that it can only occur on a retired
5887
+ /// primary, never a retired follower.
5888
+ ///
5889
+ /// The retired primary sends a DVC with only the unbroken log suffix:
5890
+ ///
5891
+ /// replica headers
5892
+ /// 1 1 2 3 4a 5 6 7a [8 9] (retired primary)
5893
+ ///
5894
+ ///
5895
+ /// Example 4: Gap in retiring primary suffix after recovery
5896
+ ///
5897
+ /// Suppose that replica 1 starts a view as the primary of view 4, with the suffix:
5898
+ ///
5899
+ /// log_view 4
5900
+ /// view 4
5901
+ /// journal 1 2 3
5902
+ /// head 3
5903
+ ///
5904
+ /// During this view, it prepares several ops:
5905
+ ///
5906
+ /// log_view 4
5907
+ /// view 4
5908
+ /// journal 1 2 3 4 5 6 7
5909
+ /// head 7
5910
+ ///
5911
+ /// However, the WAL writes are reordered — ops 4,5,7 writes finish before op=6's write has begun:
5912
+ ///
5913
+ /// log_view 4
5914
+ /// view 4
5915
+ /// journal 1 2 3 4 5 6 7
5916
+ /// wal 1 2 3 4 5 _ 7
5917
+ /// head 7
5918
+ ///
5919
+ /// Replica 1 crashes and recovers, and immediately begins sending a DVC for view=5.
5920
+ /// Under normal circumstances, the retired primary cannot distinguish between a gap and a break
5921
+ /// due to the possibility that its did not complete repair (see Example 2a).
5922
+ /// In this instance though, the gap is safe to skip over because it is to the right of the durable
5923
+ /// SV's head (op=3).
5924
+ ///
5925
+ /// log_view 4
5926
+ /// view 5
5927
+ /// journal 1 2 3 [4 5 _ 7]
5928
+ /// head 7
5929
+ ///
5930
+ const DVCQuorum = struct {
5931
+ const DVCArray = std.BoundedArray(*const Message, constants.replicas_max);
5932
+
5933
+ fn verify(dvc_quorum: QuorumMessages) void {
5934
+ const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
5935
+ assert(dvcs.len >= 2);
5936
+ for (dvcs.constSlice()) |message| verify_message(message);
5937
+
5938
+ var log_views_all = std.BoundedArray(u32, constants.replicas_max){ .buffer = undefined };
5939
+ for (dvcs.constSlice()) |message| {
5940
+ const log_view = @intCast(u32, message.header.timestamp);
5941
+ if (std.mem.count(u32, log_views_all.constSlice(), &.{log_view}) == 0) {
5942
+ log_views_all.appendAssumeCapacity(log_view);
5943
+ }
5944
+ }
5945
+
5946
+ // Verify that DVCs with the same log_view do not conflict.
5947
+ for (log_views_all.constSlice()) |log_view| {
5948
+ const view_dvcs = dvcs_with_log_view(dvc_quorum, log_view);
5949
+ var view_headers = HeaderIterator.init(view_dvcs, null);
5950
+ while (view_headers.next()) |_| {}
5951
+ }
5952
+ }
5953
+
5954
+ fn verify_message(message: *const Message) void {
5955
+ assert(message.header.command == .do_view_change);
5956
+ assert(message.header.op >= message.header.commit);
5957
+ assert(message.header.op - message.header.commit <= constants.journal_slot_count);
5958
+
5959
+ // The log_view:
5960
+ // * may be higher than the view in any of the prepare headers.
5961
+ // * must be lower than the view of this view change.
5962
+ const log_view = @intCast(u32, message.header.timestamp);
5963
+ assert(log_view < message.header.view);
5964
+
5965
+ // Ignore the headers, but perform the validation.
5966
+ _ = message_body_as_headers_chain_disjoint(message);
5967
+ }
5968
+
5969
+ fn dvcs_all(dvc_quorum: QuorumMessages) DVCArray {
5970
+ var array = DVCArray{ .buffer = undefined };
5971
+ for (dvc_quorum) |received, replica| {
5972
+ if (received) |message| {
5973
+ assert(message.header.command == .do_view_change);
5974
+ assert(message.header.replica == replica);
5975
+
5976
+ array.appendAssumeCapacity(message);
5977
+ }
5978
+ }
5979
+ return array;
5980
+ }
5981
+
5982
+ fn dvcs_canonical(dvc_quorum: QuorumMessages) DVCArray {
5983
+ return dvcs_with_log_view(dvc_quorum, DVCQuorum.log_view_max(dvc_quorum));
5984
+ }
5985
+
5986
+ fn dvcs_with_log_view(dvc_quorum: QuorumMessages, log_view: u32) DVCArray {
5987
+ var array = DVCArray{ .buffer = undefined };
5988
+ const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
5989
+ for (dvcs.constSlice()) |message| {
5990
+ const message_log_view = @intCast(u32, message.header.timestamp);
5991
+ if (message_log_view == log_view) {
5992
+ array.appendAssumeCapacity(message);
5993
+ }
5994
+ }
5995
+ return array;
5996
+ }
5997
+
5998
+ fn dvcs_uncanonical(dvc_quorum: QuorumMessages) DVCArray {
5999
+ const log_view_max_ = DVCQuorum.log_view_max(dvc_quorum);
6000
+ var array = DVCArray{ .buffer = undefined };
6001
+ const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
6002
+ for (dvcs.constSlice()) |message| {
6003
+ const log_view = @intCast(u32, message.header.timestamp);
6004
+ assert(log_view <= log_view_max_);
6005
+
6006
+ if (log_view < log_view_max_) {
6007
+ array.appendAssumeCapacity(message);
6008
+ }
6009
+ }
6010
+ return array;
6011
+ }
6012
+
6013
+ /// Returns the highest `log_view` of any DVC.
6014
+ ///
6015
+ /// The headers bundled with DVCs with the highest `log_view` are canonical, since
6016
+ /// the replica has knowledge of previous view changes in which headers were replaced.
6017
+ fn log_view_max(dvc_quorum: QuorumMessages) u32 {
6018
+ var log_view_max_: ?u32 = null;
6019
+ const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
6020
+ for (dvcs.constSlice()) |message| {
6021
+ // The view when this replica was last in normal status, which:
6022
+ // * may be higher than the view in any of the prepare headers.
6023
+ // * must be lower than the view of this view change.
6024
+ const log_view = @intCast(u32, message.header.timestamp);
6025
+ assert(log_view < message.header.view);
6026
+
6027
+ if (log_view_max_ == null or log_view_max_.? < log_view) {
6028
+ log_view_max_ = log_view;
6029
+ }
6030
+ }
6031
+ return log_view_max_.?;
6032
+ }
6033
+
6034
+ /// Returns the highest `commit_min` from any DVC (this is not a `commit_max`).
6035
+ fn commit_min_max(dvc_quorum: QuorumMessages, local: struct {
6036
+ replica: u64,
6037
+ commit_min: u64,
6038
+ }) u64 {
6039
+ assert(dvc_quorum[local.replica].?.header.commit <= local.commit_min);
6040
+
6041
+ var commit_min_max_: ?u64 = null;
6042
+ const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
6043
+ for (dvcs.constSlice()) |message| {
6044
+ if (commit_min_max_ == null or commit_min_max_.? < message.header.commit) {
6045
+ commit_min_max_ = message.header.commit;
6046
+ }
6047
+ }
6048
+
6049
+ // Consider the case:
6050
+ // 1. Start committing op=N…M.
6051
+ // 2. Send `do_view_change` to self.
6052
+ // 3. Finish committing op=N…M.
6053
+ // 4. Remaining `do_view_change` messages arrive, completing the quorum.
6054
+ // In this scenario, our own DVC's commit is `N-1`, but `commit_min=M`.
6055
+ // Don't let the commit backtrack.
6056
+ if (commit_min_max_.? < local.commit_min) {
6057
+ const dvc_old = dvc_quorum[local.replica].?;
6058
+ assert(dvc_old.header.commit < local.commit_min);
6059
+ assert(dvc_old.header.commit <= commit_min_max_.?);
6060
+
6061
+ log.debug("{}: on_do_view_change: bump commit_min commit={}..{}", .{
6062
+ local.replica,
6063
+ commit_min_max_.?,
6064
+ local.commit_min,
6065
+ });
6066
+ commit_min_max_ = local.commit_min;
6067
+ }
6068
+
6069
+ assert(commit_min_max_.? >= local.commit_min);
6070
+ return commit_min_max_.?;
6071
+ }
6072
+
6073
+ /// Returns the highest `timestamp` from any replica.
6074
+ fn timestamp_max(dvc_quorum: QuorumMessages) u64 {
6075
+ var timestamp_max_: ?u64 = null;
6076
+ const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
6077
+ for (dvcs.constSlice()) |message| {
6078
+ const message_headers = message_body_as_headers_chain_disjoint(message);
6079
+ if (timestamp_max_ == null or timestamp_max_.? < message_headers[0].timestamp) {
6080
+ timestamp_max_ = message_headers[0].timestamp;
6081
+ }
6082
+ }
6083
+ return timestamp_max_.?;
6084
+ }
6085
+
6086
+ fn op_max_canonical(dvc_quorum: QuorumMessages) u64 {
6087
+ var op_max: ?u64 = null;
6088
+ const dvcs = DVCQuorum.dvcs_canonical(dvc_quorum);
6089
+ for (dvcs.constSlice()) |message| {
6090
+ if (op_max == null or op_max.? < message.header.op) {
6091
+ op_max = message.header.op;
6092
+ }
6093
+ }
6094
+ return op_max.?;
6095
+ }
6096
+
6097
+ /// Return an iterator over the canonical DVC's headers, from high-to-low op.
6098
+ /// The first header returned is the new head message.
6099
+ fn headers_canonical(dvc_quorum: QuorumMessages) HeaderIterator {
6100
+ const dvcs = DVCQuorum.dvcs_canonical(dvc_quorum);
6101
+
6102
+ const op_head_max = op_max_canonical(dvc_quorum);
6103
+ // The number of uncommitted ops cannot be more than the length of the pipeline.
6104
+ const op_suffix_min = op_head_max -| constants.pipeline_prepare_queue_max;
6105
+ assert(op_suffix_min <= op_head_max);
6106
+
6107
+ var op_head_min = op_suffix_min;
6108
+ var ops_in_suffix = std.StaticBitSet(constants.pipeline_prepare_queue_max).initEmpty();
6109
+ for (dvcs.constSlice()) |message| {
6110
+ const message_headers = message_body_as_headers_chain_disjoint(message);
6111
+ for (message_headers) |header| {
6112
+ if (header.op > op_suffix_min) {
6113
+ ops_in_suffix.set((header.op - op_suffix_min) - 1);
6114
+ }
6115
+ }
6116
+ op_head_min = std.math.max(op_head_min, message_headers[message_headers.len - 1].op);
6117
+ }
6118
+ assert(op_head_max == 0 or ops_in_suffix.isSet((op_head_max - op_suffix_min) - 1));
6119
+ assert(op_head_min >= op_suffix_min);
6120
+ assert(op_head_min <= op_head_max);
6121
+
6122
+ const op_head = blk: {
6123
+ var op = op_head_min + 1;
6124
+ while (op < op_head_max) : (op += 1) {
6125
+ if (!ops_in_suffix.isSet((op - op_suffix_min) - 1)) {
6126
+ break :blk op - 1;
6127
+ }
6128
+ } else {
6129
+ break :blk op_head_max;
6130
+ }
6131
+ };
6132
+ assert(op_head >= op_head_min);
6133
+ assert(op_head <= op_head_max);
6134
+
6135
+ return HeaderIterator.init(dvcs, op_head);
6136
+ }
6137
+
6138
+ /// Iterate the headers of a set of (same-log_view) DVCs, from high-to-low op.
6139
+ const HeaderIterator = struct {
6140
+ dvcs: DVCArray,
6141
+ dvcs_offsets: std.BoundedArray(usize, constants.replicas_max),
6142
+
6143
+ child: ?struct {
6144
+ op: u64,
6145
+ parent: u128,
6146
+ } = null,
6147
+
6148
+ fn init(dvcs: DVCArray, op_head: ?u64) HeaderIterator {
6149
+ assert(dvcs.len > 0);
6150
+
6151
+ var dvcs_log_view: ?u32 = null;
6152
+ for (dvcs.constSlice()) |message| {
6153
+ const log_view = @intCast(u32, message.header.timestamp);
6154
+ if (dvcs_log_view) |view| {
6155
+ assert(view == log_view);
6156
+ } else {
6157
+ dvcs_log_view = log_view;
6158
+ }
6159
+ }
6160
+
6161
+ var dvcs_offsets = std.BoundedArray(usize, constants.replicas_max){
6162
+ .buffer = undefined,
6163
+ };
6164
+
6165
+ if (op_head) |op_head_| {
6166
+ // Skip over discarded headers.
6167
+ for (dvcs.constSlice()) |message| {
6168
+ const offset = for (message_body_as_headers_chain_disjoint(message)) |header, i| {
6169
+ if (header.op <= op_head_) break i;
6170
+ } else 0;
6171
+ dvcs_offsets.appendAssumeCapacity(offset);
6172
+ }
6173
+ } else {
6174
+ for (dvcs.constSlice()) |_| dvcs_offsets.appendAssumeCapacity(0);
6175
+ }
6176
+ assert(dvcs.len == dvcs_offsets.len);
6177
+
6178
+ return .{
6179
+ .dvcs = dvcs,
6180
+ .dvcs_offsets = dvcs_offsets,
6181
+ };
6182
+ }
6183
+
6184
/// Returns the next header to consider, or null once every DVC's headers are exhausted.
/// Headers are returned in strictly descending op order (enforced by the `iterator.child`
/// asserts below). When several DVCs hold a header with the same op, those headers are
/// asserted identical (same checksum) and all of their offsets are advanced together.
fn next(iterator: *HeaderIterator) ?Header {
    const ReplicaSet = std.StaticBitSet(constants.replicas_max);
    // The candidate header with the greatest op seen so far in this pass.
    var next_header: ?*const Header = null;
    // The set of DVC indexes whose current header matches `next_header` (op and checksum).
    var next_advance = ReplicaSet.initEmpty();

    for (iterator.dvcs.constSlice()) |message, i| {
        const message_headers = message_body_as_headers_chain_disjoint(message);
        const message_headers_offset = iterator.dvcs_offsets.get(i);
        // This DVC's headers are exhausted; skip it.
        if (message_headers_offset == message_headers.len) continue;

        const header = &message_headers[message_headers_offset];
        if (next_header == null or
            next_header.?.op < header.op)
        {
            // Found a greater op: adopt it and restart the agreement set.
            next_header = header;
            next_advance = ReplicaSet.initEmpty();
        }
        // Two DVC headers with the same op must be the very same prepare.
        assert((next_header.?.op == header.op) ==
            (next_header.?.checksum == header.checksum));

        if (next_header.?.op == header.op) {
            next_advance.set(i);
        }
    }
    assert((next_advance.count() == 0) == (next_header == null));

    // Advance every DVC whose current header was just consumed.
    var next_advance_iterator = next_advance.iterator(.{});
    while (next_advance_iterator.next()) |i| {
        iterator.dvcs_offsets.slice()[i] += 1;
    }

    if (next_header) |header| {
        if (iterator.child) |child| {
            // Ops strictly descend across successive calls; the hash chain must connect
            // exactly when the ops are consecutive.
            assert(child.op > header.op);
            assert((child.op == header.op + 1) == (child.parent == header.checksum));
        }
        iterator.child = .{ .op = header.op, .parent = header.parent };
        return header.*;
    } else {
        return null;
    }
}
6226
+ };
6227
+ };
6228
+
6229
/// Interprets the message body as a slice of prepare headers, validating each one.
/// Asserts that the headers are in descending op order.
/// The headers may contain gaps and/or breaks.
fn message_body_as_headers(message: *const Message) []const Header {
    // The body must contain at least one header.
    assert(message.header.size > @sizeOf(Header));
    assert(message.header.command == .do_view_change or
        message.header.command == .start_view or
        message.header.command == .headers);

    const headers = std.mem.bytesAsSlice(
        Header,
        message.buffer[@sizeOf(Header)..message.header.size],
    );

    for (headers) |*header, index| {
        assert(!constants.verify or header.valid_checksum());
        assert(header.cluster == message.header.cluster);
        assert(header.command == .prepare);
        assert(header.view <= message.header.view);

        if (index > 0) {
            // Headers must be provided in reverse order for the sake of `repair_header()`.
            // Otherwise, headers may never be repaired where the hash chain never connects.
            assert(header.op < headers[index - 1].op);
        }
    }

    return headers;
}
6259
+
6260
/// Interprets the message body as prepare headers and validates chain consistency.
/// Asserts that the headers are in descending op order, and there are no breaks.
/// The headers may contain gaps.
fn message_body_as_headers_chain_disjoint(message: *const Message) []const Header {
    assert(message.header.command == .do_view_change or message.header.command == .start_view);

    const message_headers = message_body_as_headers(message);
    assert(message_headers.len > 0);
    // The first (newest) header matches the message's own op.
    assert(message_headers[0].op == message.header.op);

    for (message_headers) |*header, index| {
        assert(header.op <= message.header.op);

        if (index > 0) {
            const child_header = &message_headers[index - 1];
            // Views never increase as ops decrease.
            assert(header.view <= child_header.view);
            // The hash chain connects exactly when ops are consecutive (no breaks).
            assert((header.op + 1 == child_header.op) == (header.checksum == child_header.parent));
            assert(header.timestamp < child_header.timestamp);
        }
    }
    return message_headers;
}
6282
+
6283
/// Interprets the message body as prepare headers forming an unbroken, gapless chain.
/// Asserts that the headers are in descending op order, and there are no gaps or breaks.
fn message_body_as_headers_chain_consecutive(message: *const Message) []const Header {
    assert(message.header.command == .start_view);

    const message_headers = message_body_as_headers_chain_disjoint(message);
    for (message_headers) |*header, index| {
        if (index > 0) {
            const child_header = &message_headers[index - 1];
            // Ops are consecutive and the hash chain connects every adjacent pair.
            assert(header.op + 1 == child_header.op);
            assert(header.checksum == child_header.parent);
        }
    }
    return message_headers;
}
6298
+
6299
/// The PipelineQueue belongs to a normal-status primary. It consists of two queues:
/// - A prepare queue, containing all messages currently being prepared.
/// - A request queue, containing all messages which are waiting to begin preparing.
///
/// Invariants:
/// - prepare_queue contains only messages with command=prepare.
/// - prepare_queue's messages have sequential, increasing ops.
/// - prepare_queue's messages are hash-chained.
/// - request_queue contains only messages with command=request.
/// - If request_queue is not empty, then prepare_queue is full OR 1-less than full.
///   (The caller is responsible for maintaining this invariant. If the caller removes an entry
///   from `prepare_queue`, an entry from request_queue should be moved over promptly.)
///
/// Note: The prepare queue may contain multiple prepares from a single client, but the request
/// queue may not (see message_by_client()).
const PipelineQueue = struct {
    const PrepareQueue = RingBuffer(Prepare, constants.pipeline_prepare_queue_max, .array);
    const RequestQueue = RingBuffer(Request, constants.pipeline_request_queue_max, .array);

    /// Messages that are preparing (uncommitted, being written to the WAL (may already be written
    /// to the WAL) and replicated (may just be waiting for acks)).
    prepare_queue: PrepareQueue = .{},
    /// Messages that are accepted from the client, but not yet preparing.
    /// When `pipeline_prepare_queue_max + pipeline_request_queue_max = clients_max`, the request
    /// queue guards against clients starving one another.
    request_queue: RequestQueue = .{},

    /// Drains both queues, releasing every message back to the pool.
    fn deinit(pipeline: *PipelineQueue, message_pool: *MessagePool) void {
        while (pipeline.request_queue.pop()) |r| message_pool.unref(r.message);
        while (pipeline.prepare_queue.pop()) |p| message_pool.unref(p.message);
    }

    /// Asserts all of the struct-level invariants documented above.
    fn verify(pipeline: PipelineQueue) void {
        assert(pipeline.request_queue.count <= constants.pipeline_request_queue_max);
        assert(pipeline.prepare_queue.count <= constants.pipeline_prepare_queue_max);
        assert(pipeline.request_queue.empty() or
            constants.pipeline_prepare_queue_max == pipeline.prepare_queue.count or
            constants.pipeline_prepare_queue_max == pipeline.prepare_queue.count + 1);

        if (pipeline.prepare_queue.head_ptr_const()) |head| {
            // Walk the prepare queue verifying sequential ops and an unbroken hash chain.
            var op = head.message.header.op;
            var parent = head.message.header.parent;
            var prepare_iterator = pipeline.prepare_queue.iterator();
            while (prepare_iterator.next_ptr()) |prepare| {
                assert(prepare.message.header.command == .prepare);
                assert(prepare.message.header.op == op);
                assert(prepare.message.header.parent == parent);

                parent = prepare.message.header.checksum;
                op += 1;
            }
        }

        var request_iterator = pipeline.request_queue.iterator();
        while (request_iterator.next()) |request| {
            assert(request.message.header.command == .request);
        }
    }

    /// Returns whether the pipeline has no room for another request.
    fn full(pipeline: PipelineQueue) bool {
        if (pipeline.prepare_queue.full()) {
            return pipeline.request_queue.full();
        } else {
            // Requests may only queue while the prepare queue is (nearly) full.
            assert(pipeline.request_queue.empty() or
                pipeline.prepare_queue.count + 1 == constants.pipeline_prepare_queue_max);
            return false;
        }
    }

    /// Searches the pipeline for a prepare for a given op and checksum.
    /// When `checksum` is `null`, match any checksum.
    fn prepare_by_op_and_checksum(pipeline: *PipelineQueue, op: u64, checksum: ?u128) ?*Prepare {
        if (pipeline.prepare_queue.empty()) return null;

        // To optimize the search, we can leverage the fact that the pipeline's entries are
        // ordered and consecutive.
        const head_op = pipeline.prepare_queue.head_ptr().?.message.header.op;
        const tail_op = pipeline.prepare_queue.tail_ptr().?.message.header.op;
        if (op < head_op) return null;
        if (op > tail_op) return null;

        // Direct index: op offset from the head is the queue position.
        const prepare = pipeline.prepare_queue.get_ptr(op - head_op).?;
        assert(prepare.message.header.op == op);

        if (checksum == null) return prepare;
        if (checksum.? == prepare.message.header.checksum) return prepare;
        return null;
    }

    /// Searches the pipeline for a prepare matching the given ack.
    /// Asserts that the returned prepare corresponds to the prepare_ok.
    fn prepare_by_prepare_ok(pipeline: *PipelineQueue, ok: *const Message) ?*Prepare {
        assert(ok.header.command == .prepare_ok);

        // A prepare_ok's `context` carries the prepare's checksum.
        const prepare = pipeline.prepare_by_op_and_checksum(
            ok.header.op,
            ok.header.context,
        ) orelse return null;
        assert(prepare.message.header.command == .prepare);
        assert(prepare.message.header.parent == ok.header.parent);
        assert(prepare.message.header.client == ok.header.client);
        assert(prepare.message.header.request == ok.header.request);
        assert(prepare.message.header.cluster == ok.header.cluster);
        assert(prepare.message.header.epoch == ok.header.epoch);
        // A prepare may be committed in the same view or in a newer view:
        assert(prepare.message.header.view <= ok.header.view);
        assert(prepare.message.header.op == ok.header.op);
        assert(prepare.message.header.commit == ok.header.commit);
        assert(prepare.message.header.timestamp == ok.header.timestamp);
        assert(prepare.message.header.operation == ok.header.operation);

        return prepare;
    }

    /// Search the pipeline (both request & prepare queues) for a message from the given client.
    /// - A client may have multiple prepares in the pipeline if these were committed by the
    ///   previous primary and were reloaded into the pipeline after a view change.
    /// - A client may have at most one request in the pipeline.
    /// If there are multiple messages in the pipeline from the client, the *latest* message is
    /// returned (to help the caller identify bad client behavior).
    fn message_by_client(pipeline: PipelineQueue, client_id: u128) ?*const Message {
        var message: ?*const Message = null;
        var prepare_iterator = pipeline.prepare_queue.iterator();
        while (prepare_iterator.next_ptr()) |prepare| {
            if (prepare.message.header.client == client_id) message = prepare.message;
        }

        // The request queue is checked second so that a queued request wins over any prepare.
        var request_iterator = pipeline.request_queue.iterator();
        while (request_iterator.next()) |request| {
            if (request.message.header.client == client_id) message = request.message;
        }
        return message;
    }

    /// Warning: This temporarily violates the prepare/request queue count invariant.
    /// After invocation, call pop_request→push_prepare to begin preparing the next request.
    fn pop_prepare(pipeline: *PipelineQueue) ?Prepare {
        if (pipeline.prepare_queue.pop()) |prepare| {
            assert(pipeline.request_queue.empty() or
                pipeline.prepare_queue.count + 1 == constants.pipeline_prepare_queue_max);
            return prepare;
        } else {
            // An empty prepare queue implies an empty request queue (see struct invariants).
            assert(pipeline.request_queue.empty());
            return null;
        }
    }

    /// Removes and returns the oldest queued request, if any.
    fn pop_request(pipeline: *PipelineQueue) ?Request {
        return pipeline.request_queue.pop();
    }

    /// Appends a request; asserts the at-most-one-request-per-client invariant.
    fn push_request(pipeline: *PipelineQueue, request: Request) void {
        assert(request.message.header.command == .request);
        var queue_iterator = pipeline.request_queue.iterator();
        while (queue_iterator.next()) |queue_request| {
            assert(queue_request.message.header.client != request.message.header.client);
        }

        pipeline.request_queue.push_assume_capacity(request);
        if (constants.verify) pipeline.verify();
    }

    /// Appends a prepare; asserts that it extends the queue's op sequence and hash chain.
    fn push_prepare(pipeline: *PipelineQueue, message: *Message) void {
        assert(message.header.command == .prepare);
        if (pipeline.prepare_queue.tail()) |tail| {
            assert(message.header.op == tail.message.header.op + 1);
            assert(message.header.parent == tail.message.header.checksum);
            assert(message.header.view >= tail.message.header.view);
        } else {
            assert(pipeline.request_queue.empty());
        }

        pipeline.prepare_queue.push_assume_capacity(.{ .message = message });
        if (constants.verify) pipeline.verify();
    }
};
6475
+
6476
/// Prepares in the cache may be committed or uncommitted, and may not belong to the current view.
///
/// Invariants:
/// - The cache contains only messages with command=prepare.
/// - If a message with op X is in the cache, it is in `prepares[X % prepares.len]`.
const PipelineCache = struct {
    const prepares_max =
        constants.pipeline_prepare_queue_max +
        constants.pipeline_request_queue_max;

    // Direct-mapped cache slots, keyed by `op % prepares_max`.
    prepares: [prepares_max]?*Message = [_]?*Message{null} ** prepares_max,

    /// Converting a PipelineQueue to a PipelineCache discards all accumulated acks.
    /// "prepare_ok"s from previous views are not valid, even if the pipeline entry is reused
    /// after a cycle of view changes. In other words, when a view change cycles around, so
    /// that the original primary becomes a primary of a new view, pipeline entries may be
    /// reused. However, the pipeline's prepare_ok quorums must not be reused, since the
    /// replicas that sent them may have swapped them out during a previous view change.
    fn init_from_queue(queue: *PipelineQueue) PipelineCache {
        var cache = PipelineCache{};
        var prepares = queue.prepare_queue.iterator();
        while (prepares.next()) |prepare| {
            // The queue's ops are consecutive, so no two of them can collide in the cache.
            const prepare_evicted = cache.insert(prepare.message.ref());
            assert(prepare_evicted == null);
            assert(prepare.message.header.command == .prepare);
        }
        return cache;
    }

    /// Releases every cached message back to the pool and clears all slots.
    fn deinit(pipeline: *PipelineCache, message_pool: *MessagePool) void {
        for (pipeline.prepares) |*entry| {
            if (entry.*) |m| {
                message_pool.unref(m);
                entry.* = null;
            }
        }
    }

    /// Returns whether the cache holds no prepares at all.
    fn empty(pipeline: *const PipelineCache) bool {
        // Fix: the previous implementation captured on `*entry` (a non-optional pointer,
        // so the payload capture was invalid) and returned `true` upon finding an occupied
        // slot — the inverse of what "empty" means. A single occupied slot means not empty.
        for (pipeline.prepares) |entry| {
            if (entry) |_| return false;
        }
        return true;
    }

    /// Returns whether the cache holds a prepare matching this exact header (op and checksum).
    fn contains_header(pipeline: *const PipelineCache, header: *const Header) bool {
        assert(header.command == .prepare);

        const slot = header.op % prepares_max;
        const prepare = pipeline.prepares[slot] orelse return false;
        return prepare.header.op == header.op and prepare.header.checksum == header.checksum;
    }

    /// Unlike the PipelineQueue, cached messages may not belong to the current view.
    /// Thus, a matching checksum is required.
    fn prepare_by_op_and_checksum(pipeline: *PipelineCache, op: u64, checksum: u128) ?*Message {
        const slot = op % prepares_max;
        const prepare = pipeline.prepares[slot] orelse return null;
        if (prepare.header.op != op) return null;
        if (prepare.header.checksum != checksum) return null;
        return prepare;
    }

    /// Inserts a prepare into its direct-mapped slot.
    /// Returns the message evicted from the cache, if any.
    /// Note: the caller is responsible for unref'ing any evicted message.
    fn insert(pipeline: *PipelineCache, prepare: *Message) ?*Message {
        assert(prepare.header.command == .prepare);

        const slot = prepare.header.op % prepares_max;
        const prepare_evicted = pipeline.prepares[slot];
        pipeline.prepares[slot] = prepare;
        return prepare_evicted;
    }
};