tigerbeetle-node 0.8.0 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,24 @@ const log = std.log.scoped(.replica);
18
18
  pub const Status = enum {
19
19
  normal,
20
20
  view_change,
21
+ // Recovery (for replica_count > 1):
22
+ //
23
+ // 1. At replica start: `status=recovering` and `journal.recovered=false`
24
+ // 2. Load the WAL. Mark questionable entries as faulty.
25
+ // 3. If the WAL has no entries (besides the initial commit), skip to step 5 with view 0.
26
+ // 4. Run VSR recovery protocol:
27
+ // a. Send a `recovery` message to every replica (except self).
28
+ // b. Wait for f+1 `recovery_response` messages from replicas in `normal` status.
29
+ // Each `recovery_response` includes the current view number.
30
+ // Each `recovery_response` must include a nonce matching the `recovery` message.
31
+ // c. Wait for a `recovery_response` from the leader of the highest known view.
32
+ // 5. Transition to `status=normal` with the discovered view number:
33
+ // * Set `op` to the highest op in the leader's recovery response.
34
+ // * Repair faulty messages.
35
+ // * Commit through to the discovered `commit_max`.
36
+ // * Set `state_machine.prepare_timestamp` to the current op's timestamp.
37
+ //
38
+ // TODO document snapshot recovery in this progression
21
39
  recovering,
22
40
  };
23
41
 
@@ -47,22 +65,24 @@ const ClientTableEntry = struct {
47
65
  reply: *Message,
48
66
  };
49
67
 
68
+ const Nonce = u128;
69
+
50
70
  const Prepare = struct {
51
71
  /// The current prepare message (used to cross-check prepare_ok messages, and for resending).
52
72
  message: *Message,
53
73
 
54
74
  /// Unique prepare_ok messages for the same view, op number and checksum from ALL replicas.
55
- ok_from_all_replicas: QuorumCounter = QuorumCounterNull,
75
+ ok_from_all_replicas: QuorumCounter = quorum_counter_null,
56
76
 
57
77
  /// Whether a quorum of prepare_ok messages has been received for this prepare.
58
78
  ok_quorum_received: bool = false,
59
79
  };
60
80
 
61
81
  const QuorumMessages = [config.replicas_max]?*Message;
62
- const QuorumMessagesNull = [_]?*Message{null} ** config.replicas_max;
82
+ const quorum_messages_null = [_]?*Message{null} ** config.replicas_max;
63
83
 
64
84
  const QuorumCounter = std.StaticBitSet(config.replicas_max);
65
- const QuorumCounterNull = QuorumCounter.initEmpty();
85
+ const quorum_counter_null = QuorumCounter.initEmpty();
66
86
 
67
87
  pub fn Replica(
68
88
  comptime StateMachine: type,
@@ -114,12 +134,17 @@ pub fn Replica(
114
134
  view_normal: u32,
115
135
 
116
136
  /// The current status, either normal, view_change, or recovering:
117
- /// TODO Don't default to normal, set the starting status according to the journal's health.
118
- status: Status = .normal,
137
+ status: Status = .recovering,
119
138
 
120
139
  /// The op number assigned to the most recently prepared operation:
121
140
  op: u64,
122
141
 
142
+ /// The op of the highest checkpointed message.
143
+ // TODO Update this to use LSM storage.
144
+ // TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
145
+ // TODO Enforce invariant op≥op_checkpoint.
146
+ op_checkpoint: u64 = 0,
147
+
123
148
  /// The op number of the latest committed and executed operation (according to the replica):
124
149
  /// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
125
150
  commit_min: u64,
@@ -136,6 +161,9 @@ pub fn Replica(
136
161
 
137
162
  /// The leader's pipeline of inflight prepares waiting to commit in FIFO order.
138
163
  /// This allows us to pipeline without the complexity of out-of-order commits.
164
+ ///
165
+ /// After a view change, the old leader's pipeline is left untouched so that it is able to
166
+ /// help the new leader repair, even in the face of local storage faults.
139
167
  pipeline: RingBuffer(Prepare, config.pipeline_max) = .{},
140
168
 
141
169
  /// In some cases, a replica may send a message to itself. We do not submit these messages
@@ -144,13 +172,16 @@ pub fn Replica(
144
172
  loopback_queue: ?*Message = null,
145
173
 
146
174
  /// Unique start_view_change messages for the same view from OTHER replicas (excluding ourself).
147
- start_view_change_from_other_replicas: QuorumCounter = QuorumCounterNull,
175
+ start_view_change_from_other_replicas: QuorumCounter = quorum_counter_null,
148
176
 
149
177
  /// Unique do_view_change messages for the same view from ALL replicas (including ourself).
150
- do_view_change_from_all_replicas: QuorumMessages = QuorumMessagesNull,
178
+ do_view_change_from_all_replicas: QuorumMessages = quorum_messages_null,
151
179
 
152
180
  /// Unique nack_prepare messages for the same view from OTHER replicas (excluding ourself).
153
- nack_prepare_from_other_replicas: QuorumCounter = QuorumCounterNull,
181
+ nack_prepare_from_other_replicas: QuorumCounter = quorum_counter_null,
182
+
183
+ /// Unique recovery_response messages from OTHER replicas (excluding ourself).
184
+ recovery_response_from_other_replicas: QuorumMessages = quorum_messages_null,
154
185
 
155
186
  /// Whether a replica has received a quorum of start_view_change messages for the view change:
156
187
  start_view_change_quorum: bool = false,
@@ -189,6 +220,12 @@ pub fn Replica(
189
220
  /// The number of ticks before repairing missing/disconnected headers and/or dirty entries:
190
221
  repair_timeout: Timeout,
191
222
 
223
+ /// The number of ticks before attempting to send another set of `recovery` messages.
224
+ recovery_timeout: Timeout,
225
+
226
+ /// The nonce of the `recovery` messages.
227
+ recovery_nonce: Nonce,
228
+
192
229
  /// Used to provide deterministic entropy to `choose_any_other_replica()`.
193
230
  /// Incremented whenever `choose_any_other_replica()` is called.
194
231
  choose_any_other_replica_ticks: u64 = 0,
@@ -245,25 +282,27 @@ pub fn Replica(
245
282
  try client_table.ensureTotalCapacity(allocator, @intCast(u32, config.clients_max));
246
283
  assert(client_table.capacity() >= config.clients_max);
247
284
 
248
- var init_prepare = Header{
249
- .parent = 0,
250
- .client = 0,
251
- .context = 0,
252
- .request = 0,
253
- .cluster = cluster,
254
- .epoch = 0,
255
- .view = 0,
256
- .op = 0,
257
- .commit = 0,
258
- .offset = 0,
259
- .size = @sizeOf(Header),
260
- .replica = 0,
261
- .command = .prepare,
262
- .operation = .init,
263
- .version = Version,
285
+ const root_prepare = Header.root_prepare(cluster);
286
+
287
+ var clock = try Clock.init(
288
+ allocator,
289
+ replica_count,
290
+ replica,
291
+ time,
292
+ );
293
+ errdefer clock.deinit(allocator);
294
+
295
+ const journal = try Journal.init(allocator, storage, replica);
296
+ errdefer journal.deinit(allocator);
297
+
298
+ const recovery_nonce = blk: {
299
+ var nonce: [@sizeOf(Nonce)]u8 = undefined;
300
+ var hash = std.crypto.hash.Blake3.init(.{});
301
+ hash.update(std.mem.asBytes(&clock.monotonic()));
302
+ hash.update(&[_]u8{replica});
303
+ hash.final(&nonce);
304
+ break :blk @bitCast(Nonce, nonce);
264
305
  };
265
- init_prepare.set_checksum_body(&[0]u8{});
266
- init_prepare.set_checksum();
267
306
 
268
307
  var self = Self{
269
308
  .cluster = cluster,
@@ -271,28 +310,16 @@ pub fn Replica(
271
310
  .replica = replica,
272
311
  .quorum_replication = quorum_replication,
273
312
  .quorum_view_change = quorum_view_change,
274
- .clock = try Clock.init(
275
- allocator,
276
- replica_count,
277
- replica,
278
- time,
279
- ),
280
- .journal = try Journal.init(
281
- allocator,
282
- storage,
283
- replica,
284
- config.journal_size_max,
285
- config.journal_headers_max,
286
- &init_prepare,
287
- ),
313
+ .clock = clock,
314
+ .journal = journal,
288
315
  .message_bus = message_bus,
289
316
  .state_machine = state_machine,
290
317
  .client_table = client_table,
291
- .view = init_prepare.view,
292
- .view_normal = init_prepare.view,
293
- .op = init_prepare.op,
294
- .commit_min = init_prepare.commit,
295
- .commit_max = init_prepare.commit,
318
+ .view = root_prepare.view,
319
+ .view_normal = root_prepare.view,
320
+ .op = root_prepare.op,
321
+ .commit_min = root_prepare.commit,
322
+ .commit_max = root_prepare.commit,
296
323
  .ping_timeout = Timeout{
297
324
  .name = "ping_timeout",
298
325
  .id = replica,
@@ -328,6 +355,12 @@ pub fn Replica(
328
355
  .id = replica,
329
356
  .after = 50,
330
357
  },
358
+ .recovery_timeout = Timeout{
359
+ .name = "recovery_timeout",
360
+ .id = replica,
361
+ .after = 200,
362
+ },
363
+ .recovery_nonce = recovery_nonce,
331
364
  .prng = std.rand.DefaultPrng.init(replica),
332
365
  };
333
366
 
@@ -346,20 +379,7 @@ pub fn Replica(
346
379
  config.clients_max,
347
380
  });
348
381
 
349
- // We must initialize timeouts here, not in tick() on the first tick, because on_message()
350
- // can race with tick()... before timeouts have been initialized:
351
- assert(self.status == .normal);
352
- if (self.leader()) {
353
- log.debug("{}: init: leader", .{self.replica});
354
- self.ping_timeout.start();
355
- self.commit_timeout.start();
356
- self.repair_timeout.start();
357
- } else {
358
- log.debug("{}: init: follower", .{self.replica});
359
- self.ping_timeout.start();
360
- self.normal_status_timeout.start();
361
- self.repair_timeout.start();
362
- }
382
+ assert(self.status == .recovering);
363
383
 
364
384
  return self;
365
385
  }
@@ -378,12 +398,7 @@ pub fn Replica(
378
398
  self.client_table.deinit(allocator);
379
399
  }
380
400
 
381
- {
382
- var it = self.pipeline.iterator();
383
- while (it.next()) |prepare| {
384
- self.message_bus.unref(prepare.message);
385
- }
386
- }
401
+ while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
387
402
 
388
403
  if (self.loopback_queue) |loopback_message| {
389
404
  assert(loopback_message.next == null);
@@ -394,6 +409,10 @@ pub fn Replica(
394
409
  for (self.do_view_change_from_all_replicas) |message| {
395
410
  if (message) |m| self.message_bus.unref(m);
396
411
  }
412
+
413
+ for (self.recovery_response_from_other_replicas) |message| {
414
+ if (message) |m| self.message_bus.unref(m);
415
+ }
397
416
  }
398
417
 
399
418
  /// Time is measured in logical ticks that are incremented on every call to tick().
@@ -408,12 +427,40 @@ pub fn Replica(
408
427
  self.clock.tick();
409
428
 
410
429
  if (!self.journal.recovered) {
411
- self.journal.recover();
430
+ if (!self.journal.recovering) self.journal.recover();
412
431
  return;
413
432
  } else {
414
433
  assert(!self.journal.recovering);
415
434
  }
416
435
 
436
+ if (self.status == .recovering) {
437
+ if (self.recovery_timeout.ticking) {
438
+ // Continue running the VSR recovery protocol.
439
+ self.recovery_timeout.tick();
440
+ if (self.recovery_timeout.fired()) self.on_recovery_timeout();
441
+ } else if (self.journal.is_empty()) {
442
+ // The data file is brand new — no messages have ever been written.
443
+ // Transition to normal status; no need to run the VSR recovery protocol.
444
+ assert(self.journal.faulty.count == 0);
445
+ self.transition_to_normal_from_recovering_status(0);
446
+ assert(self.status == .normal);
447
+ } else if (self.replica_count == 1) {
448
+ // A cluster-of-one does not run the VSR recovery protocol.
449
+ if (self.journal.faulty.count != 0) @panic("journal is corrupt");
450
+ if (self.committing) return;
451
+ assert(self.op == 0);
452
+ self.op = self.journal.op_maximum();
453
+ self.commit_ops(self.op);
454
+ // The recovering→normal transition is deferred until all ops are committed.
455
+ } else {
456
+ // The journal just finished recovery.
457
+ // Now try to learn the current view via the VSR recovery protocol.
458
+ self.recovery_timeout.start();
459
+ self.recover();
460
+ }
461
+ return;
462
+ }
463
+
417
464
  self.ping_timeout.tick();
418
465
  self.prepare_timeout.tick();
419
466
  self.commit_timeout.tick();
@@ -437,11 +484,12 @@ pub fn Replica(
437
484
  /// Called by the MessageBus to deliver a message to the replica.
438
485
  pub fn on_message(self: *Self, message: *Message) void {
439
486
  assert(self.loopback_queue == null);
487
+ assert(message.references > 0);
440
488
 
441
- log.debug("{}: on_message: view={} status={s} {}", .{
489
+ log.debug("{}: on_message: view={} status={} {}", .{
442
490
  self.replica,
443
491
  self.view,
444
- @tagName(self.status),
492
+ self.status,
445
493
  message.header,
446
494
  });
447
495
 
@@ -463,7 +511,6 @@ pub fn Replica(
463
511
  }
464
512
 
465
513
  if (!self.journal.recovered) {
466
- self.journal.recover();
467
514
  log.debug("{}: on_message: waiting for journal to recover", .{self.replica});
468
515
  return;
469
516
  } else {
@@ -482,7 +529,7 @@ pub fn Replica(
482
529
  .do_view_change => self.on_do_view_change(message),
483
530
  .start_view => self.on_start_view(message),
484
531
  .recovery => self.on_recovery(message),
485
- .recovery_response => return, // TODO
532
+ .recovery_response => self.on_recovery_response(message),
486
533
  .request_start_view => self.on_request_start_view(message),
487
534
  .request_prepare => self.on_request_prepare(message),
488
535
  .request_headers => self.on_request_headers(message),
@@ -542,7 +589,7 @@ pub fn Replica(
542
589
  } else {
543
590
  // Copy the ping's monotonic timestamp to our pong and add our wall clock sample:
544
591
  pong.op = message.header.op;
545
- pong.offset = @bitCast(u64, self.clock.realtime());
592
+ pong.timestamp = @bitCast(u64, self.clock.realtime());
546
593
  self.send_header_to_replica(message.header.replica, pong);
547
594
  }
548
595
  }
@@ -552,7 +599,7 @@ pub fn Replica(
552
599
  if (message.header.replica == self.replica) return;
553
600
 
554
601
  const m0 = message.header.op;
555
- const t1 = @bitCast(i64, message.header.offset);
602
+ const t1 = @bitCast(i64, message.header.timestamp);
556
603
  const m2 = self.clock.monotonic();
557
604
 
558
605
  self.clock.learn(message.header.replica, m0, t1, m2);
@@ -560,9 +607,9 @@ pub fn Replica(
560
607
 
561
608
  /// The primary advances op-number, adds the request to the end of the log, and updates the
562
609
  /// information for this client in the client-table to contain the new request number, s.
563
- /// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the current
564
- /// view-number, m is the message it received from the client, n is the op-number it assigned to
565
- /// the request, and k is the commit-number.
610
+ /// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the
611
+ /// current view-number, m is the message it received from the client, n is the op-number
612
+ /// it assigned to the request, and k is the commit-number.
566
613
  fn on_request(self: *Self, message: *Message) void {
567
614
  if (self.ignore_request_message(message)) return;
568
615
 
@@ -581,19 +628,30 @@ pub fn Replica(
581
628
 
582
629
  log.debug("{}: on_request: request {}", .{ self.replica, message.header.checksum });
583
630
 
584
- self.state_machine.prepare(
585
- realtime,
631
+ // Guard against the wall clock going backwards by taking the max with timestamps issued:
632
+ self.state_machine.prepare_timestamp = std.math.max(
633
+ // The cluster `commit_timestamp` may be ahead of our `prepare_timestamp` because this
634
+ // may be our first prepare as a recently elected leader:
635
+ std.math.max(
636
+ self.state_machine.prepare_timestamp,
637
+ self.state_machine.commit_timestamp,
638
+ ) + 1,
639
+ @intCast(u64, realtime),
640
+ );
641
+ assert(self.state_machine.prepare_timestamp > self.state_machine.commit_timestamp);
642
+
643
+ const prepare_timestamp = self.state_machine.prepare(
586
644
  message.header.operation.cast(StateMachine),
587
645
  message.body(),
588
646
  );
589
647
 
590
- var latest_entry = self.journal.entry_for_op_exact(self.op).?;
648
+ const latest_entry = self.journal.header_with_op(self.op).?;
591
649
  message.header.parent = latest_entry.checksum;
592
650
  message.header.context = message.header.checksum;
593
651
  message.header.view = self.view;
594
652
  message.header.op = self.op + 1;
595
653
  message.header.commit = self.commit_max;
596
- message.header.offset = Journal.next_offset(latest_entry);
654
+ message.header.timestamp = prepare_timestamp;
597
655
  message.header.replica = self.replica;
598
656
  message.header.command = .prepare;
599
657
 
@@ -612,6 +670,8 @@ pub fn Replica(
612
670
  } else {
613
671
  // Do not restart the prepare timeout as it is already ticking for another prepare.
614
672
  assert(self.prepare_timeout.ticking);
673
+ const previous = self.pipeline.get_ptr(self.pipeline.count - 2).?;
674
+ assert(previous.message.header.checksum == message.header.parent);
615
675
  }
616
676
 
617
677
  self.on_prepare(message);
@@ -625,22 +685,23 @@ pub fn Replica(
625
685
  ///
626
686
  /// The leader starts by sending a prepare message to itself.
627
687
  ///
628
- /// Each replica (including the leader) then forwards this prepare message to the next replica
629
- /// in the configuration, in parallel to writing to its own journal, closing the circle until
630
- /// the next replica is back to the leader, in which case the replica does not forward.
688
+ /// Each replica (including the leader) then forwards this prepare message to the next
689
+ /// replica in the configuration, in parallel to writing to its own journal, closing the
690
+ /// circle until the next replica is back to the leader, in which case the replica does not
691
+ /// forward.
631
692
  ///
632
693
  /// This keeps the leader's outgoing bandwidth limited (one-for-one) to incoming bandwidth,
633
- /// since the leader need only replicate to the next replica. Otherwise, the leader would need
634
- /// to replicate to multiple followers, dividing available bandwidth.
694
+ /// since the leader need only replicate to the next replica. Otherwise, the leader would
695
+ /// need to replicate to multiple followers, dividing available bandwidth.
635
696
  ///
636
- /// This does not impact latency, since with Flexible Paxos we need only one remote prepare_ok.
637
- /// It is ideal if this synchronous replication to one remote replica is to the next replica,
638
- /// since that is the replica next in line to be leader, which will need to be up-to-date before
639
- /// it can start the next view.
697
+ /// This does not impact latency, since with Flexible Paxos we need only one remote
698
+ /// prepare_ok. It is ideal if this synchronous replication to one remote replica is to the
699
+ /// next replica, since that is the replica next in line to be leader, which will need to
700
+ /// be up-to-date before it can start the next view.
640
701
  ///
641
- /// At the same time, asynchronous replication keeps going, so that if our local disk is slow,
642
- /// then any latency spike will be masked by more remote prepare_ok messages as they come in.
643
- /// This gives automatic tail latency tolerance for storage latency spikes.
702
+ /// At the same time, asynchronous replication keeps going, so that if our local disk is
703
+ /// slow, then any latency spike will be masked by more remote prepare_ok messages as they
704
+ /// come in. This gives automatic tail latency tolerance for storage latency spikes.
644
705
  ///
645
706
  /// The remaining problem then is tail latency tolerance for network latency spikes.
646
707
  /// If the next replica is down or partitioned, then the leader's prepare timeout will fire,
@@ -669,12 +730,26 @@ pub fn Replica(
669
730
  return;
670
731
  }
671
732
 
733
+ // Verify that the new request will fit in the WAL.
734
+ if (message.header.op >= self.op_checkpoint + config.journal_slot_count) {
735
+ log.debug("{}: on_prepare: ignoring op={} (too far ahead, checkpoint={})", .{
736
+ self.replica,
737
+ message.header.op,
738
+ self.op_checkpoint,
739
+ });
740
+ // When we are the leader, `on_request` enforces this invariant.
741
+ assert(self.follower());
742
+ return;
743
+ }
744
+
672
745
  assert(self.status == .normal);
673
746
  assert(message.header.view == self.view);
674
747
  assert(self.leader() or self.follower());
675
748
  assert(message.header.replica == self.leader_index(message.header.view));
749
+ assert(message.header.op > self.op_checkpoint);
676
750
  assert(message.header.op > self.op);
677
751
  assert(message.header.op > self.commit_min);
752
+ assert(message.header.op < self.op_checkpoint + config.journal_slot_count);
678
753
 
679
754
  if (self.follower()) self.normal_status_timeout.reset();
680
755
 
@@ -685,7 +760,7 @@ pub fn Replica(
685
760
 
686
761
  if (self.journal.previous_entry(message.header)) |previous| {
687
762
  // Any previous entry may be a whole journal's worth of ops behind due to wrapping.
688
- // We therefore do not do any further op, offset or checksum assertions beyond this:
763
+ // We therefore do not do any further op or checksum assertions beyond this:
689
764
  self.panic_if_hash_chain_would_break_in_the_same_view(previous, message.header);
690
765
  }
691
766
 
@@ -700,7 +775,7 @@ pub fn Replica(
700
775
  });
701
776
  assert(message.header.op == self.op + 1);
702
777
  self.op = message.header.op;
703
- self.journal.set_entry_as_dirty(message.header);
778
+ self.journal.set_header_as_dirty(message.header);
704
779
 
705
780
  self.replicate(message);
706
781
  self.append(message);
@@ -780,7 +855,7 @@ pub fn Replica(
780
855
  assert(message.header.replica == self.leader_index(message.header.view));
781
856
 
782
857
  // We may not always have the latest commit entry but if we do our checksum must match:
783
- if (self.journal.entry_for_op_exact(message.header.commit)) |commit_entry| {
858
+ if (self.journal.header_with_op(message.header.commit)) |commit_entry| {
784
859
  if (commit_entry.checksum == message.header.context) {
785
860
  log.debug("{}: on_commit: checksum verified", .{self.replica});
786
861
  } else if (self.valid_hash_chain("on_commit")) {
@@ -792,7 +867,6 @@ pub fn Replica(
792
867
  }
793
868
 
794
869
  self.normal_status_timeout.reset();
795
-
796
870
  self.commit_ops(message.header.commit);
797
871
  }
798
872
 
@@ -951,7 +1025,7 @@ pub fn Replica(
951
1025
 
952
1026
  var v: ?u32 = null;
953
1027
  var k: ?u64 = null;
954
- var latest = Header.reserved();
1028
+ var latest = Header.reserved(self.cluster, 0);
955
1029
 
956
1030
  for (self.do_view_change_from_all_replicas) |received, replica| {
957
1031
  if (received) |m| {
@@ -962,10 +1036,10 @@ pub fn Replica(
962
1036
 
963
1037
  // The latest normal view experienced by this replica:
964
1038
  // This may be higher than the view in any of the prepare headers.
965
- var replica_view_normal = @intCast(u32, m.header.offset);
1039
+ var replica_view_normal = @intCast(u32, m.header.timestamp);
966
1040
  assert(replica_view_normal < m.header.view);
967
1041
 
968
- var replica_latest = Header.reserved();
1042
+ var replica_latest = Header.reserved(self.cluster, 0);
969
1043
  set_latest_op(self.message_body_as_headers(m), &replica_latest);
970
1044
  assert(replica_latest.op == m.header.op);
971
1045
 
@@ -1005,7 +1079,7 @@ pub fn Replica(
1005
1079
  }
1006
1080
 
1007
1081
  // Verify that the repairs above have not replaced or advanced the latest op:
1008
- assert(self.journal.entry_for_op_exact(self.op).?.checksum == latest.checksum);
1082
+ assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
1009
1083
 
1010
1084
  assert(self.start_view_change_quorum);
1011
1085
  assert(!self.do_view_change_quorum);
@@ -1013,7 +1087,11 @@ pub fn Replica(
1013
1087
 
1014
1088
  self.discard_uncommitted_headers();
1015
1089
  assert(self.op >= self.commit_max);
1016
- assert(self.journal.entry_for_op_exact(self.op) != null);
1090
+
1091
+ const prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
1092
+ if (self.state_machine.prepare_timestamp < prepare_timestamp) {
1093
+ self.state_machine.prepare_timestamp = prepare_timestamp;
1094
+ }
1017
1095
 
1018
1096
  // Start repairs according to the CTRL protocol:
1019
1097
  assert(!self.repair_timeout.ticking);
@@ -1041,7 +1119,7 @@ pub fn Replica(
1041
1119
  assert(self.status == .view_change);
1042
1120
  assert(message.header.view == self.view);
1043
1121
 
1044
- var latest = Header.reserved();
1122
+ var latest = Header.reserved(self.cluster, 0);
1045
1123
  set_latest_op(self.message_body_as_headers(message), &latest);
1046
1124
  assert(latest.op == message.header.op);
1047
1125
 
@@ -1053,10 +1131,10 @@ pub fn Replica(
1053
1131
  }
1054
1132
 
1055
1133
  // Verify that the repairs above have not replaced or advanced the latest op:
1056
- assert(self.journal.entry_for_op_exact(self.op).?.checksum == latest.checksum);
1134
+ assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
1057
1135
 
1058
1136
  if (self.status == .view_change) {
1059
- self.transition_to_normal_status(message.header.view);
1137
+ self.transition_to_normal_from_view_change_status(message.header.view);
1060
1138
  self.send_prepare_oks_after_view_change();
1061
1139
  }
1062
1140
 
@@ -1089,8 +1167,9 @@ pub fn Replica(
1089
1167
  self.send_message_to_replica(message.header.replica, start_view);
1090
1168
  }
1091
1169
 
1092
- /// TODO This is a work in progress (out of scope for the bounty)
1093
1170
  fn on_recovery(self: *Self, message: *const Message) void {
1171
+ assert(self.replica_count > 1);
1172
+
1094
1173
  if (self.status != .normal) {
1095
1174
  log.debug("{}: on_recovery: ignoring ({})", .{ self.replica, self.status });
1096
1175
  return;
@@ -1104,34 +1183,28 @@ pub fn Replica(
1104
1183
  const response = self.message_bus.get_message();
1105
1184
  defer self.message_bus.unref(response);
1106
1185
 
1186
+ log.debug("{}: on_recovery: view={} op={} commit={} nonce={}", .{
1187
+ self.replica,
1188
+ self.view,
1189
+ self.op,
1190
+ self.commit_max,
1191
+ message.header.context,
1192
+ });
1193
+
1107
1194
  response.header.* = .{
1108
1195
  .command = .recovery_response,
1109
1196
  .cluster = self.cluster,
1110
- .context = message.header.context,
1197
+ .context = message.header.context, // Echo the request's nonce.
1111
1198
  .replica = self.replica,
1112
1199
  .view = self.view,
1113
1200
  .op = self.op,
1114
1201
  .commit = self.commit_max,
1115
1202
  };
1116
1203
 
1117
- const count_max = 8; // The number of prepare headers to include in the body.
1118
-
1119
- const size_max = @sizeOf(Header) * std.math.min(
1120
- std.math.max(@divFloor(response.buffer.len, @sizeOf(Header)), 2),
1121
- 1 + count_max,
1122
- );
1123
- assert(size_max > @sizeOf(Header));
1124
-
1125
- const count = self.journal.copy_latest_headers_between(
1126
- 0,
1127
- self.op,
1128
- std.mem.bytesAsSlice(Header, response.buffer[@sizeOf(Header)..size_max]),
1129
- );
1130
-
1131
- // We expect that self.op always exists.
1132
- assert(count > 0);
1133
-
1134
- response.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
1204
+ const count_max = 8; // The maximum number of prepare headers to include in the body.
1205
+ const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, response);
1206
+ assert(count > 0); // We expect that self.op always exists.
1207
+ assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
1135
1208
 
1136
1209
  response.header.set_checksum_body(response.body());
1137
1210
  response.header.set_checksum();
@@ -1143,68 +1216,339 @@ pub fn Replica(
1143
1216
  self.send_message_to_replica(message.header.replica, response);
1144
1217
  }
1145
1218
 
1146
- /// TODO This is a work in progress (out of scope for the bounty)
1147
1219
  fn on_recovery_response(self: *Self, message: *Message) void {
1148
- _ = self;
1149
- _ = message;
1220
+ assert(self.replica_count > 1);
1221
+
1222
+ if (self.status != .recovering) {
1223
+ log.debug("{}: on_recovery_response: ignoring ({})", .{
1224
+ self.replica,
1225
+ self.status,
1226
+ });
1227
+ return;
1228
+ }
1229
+
1230
+ if (message.header.replica == self.replica) {
1231
+ log.warn("{}: on_recovery_response: ignoring (self)", .{self.replica});
1232
+ return;
1233
+ }
1234
+
1235
+ if (message.header.context != self.recovery_nonce) {
1236
+ log.warn("{}: on_recovery_response: ignoring (different nonce)", .{self.replica});
1237
+ return;
1238
+ }
1239
+
1240
+ // Recovery messages with our nonce are not sent until after the journal is recovered.
1241
+ assert(self.journal.recovered);
1242
+
1243
+ var responses: *QuorumMessages = &self.recovery_response_from_other_replicas;
1244
+ if (responses[message.header.replica]) |existing| {
1245
+ assert(message.header.replica == existing.header.replica);
1246
+
1247
+ if (message.header.checksum == existing.header.checksum) {
1248
+ // The response was replayed by the network; ignore it.
1249
+ log.debug("{}: on_recovery_response: ignoring (duplicate message)", .{
1250
+ self.replica,
1251
+ });
1252
+ return;
1253
+ }
1254
+
1255
+ // We received a second (distinct) response from a replica. Possible causes:
1256
+ // * We retried the `recovery` message, because we had not yet received a quorum.
1257
+ // * The `recovery` message was duplicated/misdirected by the network, and the
1258
+ // receiver's state changed in the meantime.
1259
+
1260
+ log.debug(
1261
+ "{}: on_recovery_response: replica={} view={}..{} op={}..{} commit={}..{}",
1262
+ .{
1263
+ self.replica,
1264
+ existing.header.replica,
1265
+ existing.header.view,
1266
+ message.header.view,
1267
+ existing.header.op,
1268
+ message.header.op,
1269
+ existing.header.commit,
1270
+ message.header.commit,
1271
+ },
1272
+ );
1273
+
1274
+ if (message.header.view < existing.header.view or
1275
+ (message.header.view == existing.header.view and
1276
+ message.header.op < existing.header.op) or
1277
+ (message.header.view == existing.header.view and
1278
+ message.header.op == existing.header.op and
1279
+ message.header.commit < existing.header.commit))
1280
+ {
1281
+ // The second message is older than the first one (reordered packets).
1282
+ log.debug("{}: on_recovery_response: ignoring (older)", .{self.replica});
1283
+ return;
1284
+ }
1285
+
1286
+ // The second message is newer than the first one.
1287
+ assert(message.header.view >= existing.header.view);
1288
+ // The op number may regress if an uncommitted op was discarded in a higher view.
1289
+ assert(message.header.op >= existing.header.op or
1290
+ message.header.view > existing.header.view);
1291
+ assert(message.header.commit >= existing.header.commit);
1292
+
1293
+ self.message_bus.unref(existing);
1294
+ responses[message.header.replica] = null;
1295
+ } else {
1296
+ log.debug(
1297
+ "{}: on_recovery_response: replica={} view={} op={} commit={}",
1298
+ .{
1299
+ self.replica,
1300
+ message.header.replica,
1301
+ message.header.view,
1302
+ message.header.op,
1303
+ message.header.commit,
1304
+ },
1305
+ );
1306
+ }
1307
+
1308
+ assert(responses[message.header.replica] == null);
1309
+ responses[message.header.replica] = message.ref();
1310
+
1311
+ // Wait until we have:
1312
+ // * at least `f + 1` messages for quorum (not including ourself), and
1313
+ // * a response from the leader of the highest discovered view.
1314
+ const count = self.count_quorum(responses, .recovery_response, self.recovery_nonce);
1315
+ assert(count <= self.replica_count - 1);
1316
+
1317
+ const threshold = self.quorum_view_change;
1318
+ if (count < threshold) {
1319
+ log.debug("{}: on_recovery_response: waiting for quorum ({}/{})", .{
1320
+ self.replica,
1321
+ count,
1322
+ threshold,
1323
+ });
1324
+ return;
1325
+ }
1326
+
1327
+ const view = blk: { // The latest known view.
1328
+ var view: u32 = 0;
1329
+ for (self.recovery_response_from_other_replicas) |received, replica| {
1330
+ if (received) |response| {
1331
+ assert(replica != self.replica);
1332
+ assert(response.header.replica == replica);
1333
+ assert(response.header.context == self.recovery_nonce);
1334
+
1335
+ view = std.math.max(view, response.header.view);
1336
+ }
1337
+ }
1338
+ break :blk view;
1339
+ };
1340
+
1341
+ const leader_response = responses[self.leader_index(view)];
1342
+ if (leader_response == null) {
1343
+ log.debug(
1344
+ "{}: on_recovery_response: ignoring (awaiting response from leader of view={})",
1345
+ .{
1346
+ self.replica,
1347
+ view,
1348
+ },
1349
+ );
1350
+ return;
1351
+ }
1352
+
1353
+ if (leader_response.?.header.view != view) {
1354
+ // The leader (according to the view quorum) isn't the leader (according to itself).
1355
+ // The `recovery_timeout` will retry shortly with another round.
1356
+ log.debug(
1357
+ "{}: on_recovery_response: ignoring (leader view={} != quorum view={})",
1358
+ .{
1359
+ self.replica,
1360
+ leader_response.?.header.view,
1361
+ view,
1362
+ },
1363
+ );
1364
+ return;
1365
+ }
1366
+
1367
+ // This recovering→normal status transition occurs exactly once.
1368
+ // All further `recovery_response` messages are ignored.
1369
+
1370
+ // TODO When the view is recovered from the superblock (instead of via the VSR recovery
1371
+ // protocol), if the view number indicates that this replica is a leader, it must
1372
+ // transition to status=view_change instead of status=normal.
1373
+
1374
+ const leader_headers = self.message_body_as_headers(leader_response.?);
1375
+ assert(leader_headers.len > 0);
1376
+
1377
+ const commit = leader_response.?.header.commit;
1378
+ {
1379
+ var latest = Header.reserved(self.cluster, 0);
1380
+ set_latest_op(leader_headers, &latest);
1381
+ assert(latest.op == leader_response.?.header.op);
1382
+
1383
+ self.set_latest_op_and_k(&latest, commit, "on_recovery_response");
1384
+ assert(self.op == latest.op);
1385
+ assert(self.journal.header_with_op(self.op) != null);
1386
+ }
1387
+
1388
+ assert(self.status == .recovering);
1389
+ self.transition_to_normal_from_recovering_status(view);
1390
+ assert(self.status == .normal);
1391
+ assert(self.follower());
1392
+
1393
+ // TODO If the view's primary is >1 WAL ahead of us, these headers could cause
1394
+ // problems. We don't want to jump this far ahead to repair, but we still need to use
1395
+ // the hash chain to figure out which headers to request. Maybe include our
1396
+ // `op_checkpoint` in the recovery (request) message so that the response can give more
1397
+ // useful (i.e. older) headers.
1398
+ for (leader_headers) |*header| {
1399
+ _ = self.repair_header(header);
1400
+ }
1401
+
1402
+ if (self.op < config.journal_slot_count) {
1403
+ if (self.journal.header_with_op(0)) |header| {
1404
+ assert(header.command == .prepare);
1405
+ assert(header.operation == .root);
1406
+ } else {
1407
+ // This is the first wrap of the log, and the root prepare is corrupt.
1408
+ // Repair the root prepare. This is necessary to maintain the invariant that the
1409
+ // op=commit_min exists in-memory.
1410
+ const header = Header.root_prepare(self.cluster);
1411
+ self.journal.set_header_as_dirty(&header);
1412
+ log.debug("{}: on_recovery_response: repair root op", .{self.replica});
1413
+ }
1414
+ }
1415
+
1416
+ log.debug("{}: on_recovery_response: responses={} view={} headers={}..{}" ++
1417
+ " commit={} dirty={} faulty={}", .{
1418
+ self.replica,
1419
+ count,
1420
+ view,
1421
+ leader_headers[leader_headers.len - 1].op,
1422
+ leader_headers[0].op,
1423
+ commit,
1424
+ self.journal.dirty.count,
1425
+ self.journal.faulty.count,
1426
+ });
1427
+
1428
+ self.state_machine.prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
1429
+ // `state_machine.commit_timestamp` is updated as messages are committed.
1430
+
1431
+ self.reset_quorum_recovery_response();
1432
+ self.commit_ops(commit);
1433
+ self.repair();
1150
1434
  }
1151
1435
 
1436
+ /// If the requested prepare has been guaranteed by this replica:
1437
+ /// * Read the prepare from storage, and forward it to the replica that requested it.
1438
+ /// * Otherwise send no reply — it isn't safe to nack.
1439
+ /// If the requested prepare has *not* been guaranteed by this replica, then send a nack.
1440
+ ///
1441
+ /// A prepare is considered "guaranteed" by a replica if that replica has acknowledged it
1442
+ /// to the cluster. The cluster sees the replica as an underwriter of a guaranteed
1443
+ /// prepare. If a guaranteed prepare is found to be faulty, the replica must repair it
1444
+ /// to restore durability.
1152
1445
  fn on_request_prepare(self: *Self, message: *const Message) void {
1153
1446
  if (self.ignore_repair_message(message)) return;
1154
1447
 
1448
+ assert(self.replica_count > 1);
1155
1449
  assert(self.status == .normal or self.status == .view_change);
1156
1450
  assert(message.header.view == self.view);
1157
1451
  assert(message.header.replica != self.replica);
1158
1452
 
1159
1453
  const op = message.header.op;
1160
- var checksum: ?u128 = message.header.context;
1161
- if (self.leader_index(self.view) == self.replica and checksum.? == 0) checksum = null;
1454
+ const slot = self.journal.slot_for_op(op);
1455
+ const checksum: ?u128 = switch (message.header.timestamp) {
1456
+ 0 => null,
1457
+ 1 => message.header.context,
1458
+ else => unreachable,
1459
+ };
1162
1460
 
1163
- if (self.journal.entry_for_op_exact_with_checksum(op, checksum)) |entry| {
1164
- assert(entry.op == op);
1165
- assert(checksum == null or entry.checksum == checksum.?);
1461
+ // Only the leader may respond to `request_prepare` messages without a checksum.
1462
+ assert(checksum != null or self.leader_index(self.view) == self.replica);
1166
1463
 
1167
- if (!self.journal.dirty.bit(op)) {
1168
- assert(!self.journal.faulty.bit(op));
1464
+ // Try to serve the message directly from the pipeline.
1465
+ // This saves us from going to disk. And we don't need to worry that the WAL's copy
1466
+ // of an uncommitted prepare is lost/corrupted.
1467
+ if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
1468
+ log.debug("{}: on_request_prepare: op={} checksum={} reply from pipeline", .{
1469
+ self.replica,
1470
+ op,
1471
+ checksum,
1472
+ });
1473
+ self.send_message_to_replica(message.header.replica, prepare.message);
1474
+ return;
1475
+ }
1169
1476
 
1477
+ if (self.journal.prepare_inhabited[slot.index]) {
1478
+ const prepare_checksum = self.journal.prepare_checksums[slot.index];
1479
+ // Consult `journal.prepare_checksums` (rather than `journal.headers`):
1480
+ // the former may have the prepare we want — even if journal recovery marked the
1481
+ // slot as faulty and left the in-memory header as reserved.
1482
+ if (checksum == null or checksum.? == prepare_checksum) {
1170
1483
  log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
1171
1484
  self.replica,
1172
1485
  op,
1173
1486
  checksum,
1174
1487
  });
1175
1488
 
1176
- // TODO Do not reissue the read if we are already reading in order to send to
1177
- // this particular destination replica.
1178
-
1179
- self.journal.read_prepare(
1180
- on_request_prepare_read,
1181
- op,
1182
- entry.checksum,
1183
- message.header.replica,
1184
- );
1185
-
1186
- // We have guaranteed the prepare and our copy is clean (not safe to nack).
1187
- return;
1188
- } else if (self.journal.faulty.bit(op)) {
1189
- log.debug("{}: on_request_prepare: op={} checksum={} faulty", .{
1190
- self.replica,
1191
- op,
1192
- checksum,
1193
- });
1489
+ if (self.journal.header_with_op_and_checksum(op, checksum)) |_| {
1490
+ // The header for the target prepare is already in-memory.
1491
+ // This is preferable to the `else` case since we have the prepare's
1492
+ // `header.size` in-memory, so the read can be (potentially) shorter.
1493
+ // TODO Do not reissue the read if we are already reading in order to send
1494
+ // to this particular destination replica.
1495
+ self.journal.read_prepare(
1496
+ on_request_prepare_read,
1497
+ op,
1498
+ prepare_checksum,
1499
+ message.header.replica,
1500
+ );
1501
+ } else {
1502
+ // TODO Do not reissue the read if we are already reading in order to send to
1503
+ // this particular destination replica.
1504
+ self.journal.read_prepare_with_op_and_checksum(
1505
+ on_request_prepare_read,
1506
+ op,
1507
+ prepare_checksum,
1508
+ message.header.replica,
1509
+ );
1510
+ }
1194
1511
 
1195
- // We have gauranteed the prepare but our copy is faulty (not safe to nack).
1512
+ // We have guaranteed the prepare (not safe to nack).
1513
+ // Our copy may or may not be valid, but we will try to read & forward it.
1196
1514
  return;
1197
1515
  }
1516
+ }
1198
1517
 
1199
- // We know of the prepare but we have yet to write or guarantee it (safe to nack).
1200
- // Continue through below...
1518
+ {
1519
+ // We may have guaranteed the prepare but our copy is faulty (not safe to nack).
1520
+ if (self.journal.faulty.bit(slot)) return;
1521
+ if (self.journal.header_with_op_and_checksum(message.header.op, checksum)) |_| {
1522
+ if (self.journal.dirty.bit(slot)) {
1523
+ // We know of the prepare but have yet to write it (safe to nack).
1524
+ // Continue through below...
1525
+ } else {
1526
+ // We have guaranteed the prepare and our copy is clean (not safe to nack).
1527
+ return;
1528
+ }
1529
+ }
1201
1530
  }
1202
1531
 
1532
+ // Protocol-Aware Recovery's CTRL protocol only runs during the view change, when the
1533
+ // new primary needs to repair its own WAL before starting the new view.
1534
+ //
1535
+ // This branch is only where the backup doesn't have the prepare and could possibly
1536
+ // send a nack as part of the CTRL protocol. Nacks only get sent during a view change
1537
+ // to help the new primary trim uncommitted ops that couldn't otherwise be repaired.
1538
+ // Without doing this, the cluster would become permanently unavailable. So backups
1539
+ // shouldn't respond to the `request_prepare` if the new view has already started,
1540
+ // they should also be in view change status, waiting for the new primary to start
1541
+ // the view.
1203
1542
  if (self.status == .view_change) {
1204
1543
  assert(message.header.replica == self.leader_index(self.view));
1205
1544
  assert(checksum != null);
1206
- if (self.journal.entry_for_op_exact_with_checksum(op, checksum) != null) {
1207
- assert(self.journal.dirty.bit(op) and !self.journal.faulty.bit(op));
1545
+
1546
+ if (self.journal.header_with_op_and_checksum(op, checksum)) |_| {
1547
+ assert(self.journal.dirty.bit(slot) and !self.journal.faulty.bit(slot));
1548
+ }
1549
+
1550
+ if (self.journal.prepare_inhabited[slot.index]) {
1551
+ assert(self.journal.prepare_checksums[slot.index] != checksum.?);
1208
1552
  }
1209
1553
 
1210
1554
  log.debug("{}: on_request_prepare: op={} checksum={} nacking", .{
@@ -1264,21 +1608,9 @@ pub fn Replica(
1264
1608
  const op_max = message.header.op;
1265
1609
  assert(op_max >= op_min);
1266
1610
 
1267
- // We must add 1 because op_max and op_min are both inclusive:
1268
- const count_max = @intCast(u32, std.math.min(64, op_max - op_min + 1));
1269
- assert(count_max > 0);
1270
-
1271
- const size_max = @sizeOf(Header) * std.math.min(
1272
- std.math.max(@divFloor(message.buffer.len, @sizeOf(Header)), 2),
1273
- 1 + count_max,
1274
- );
1275
- assert(size_max > @sizeOf(Header));
1276
-
1277
- const count = self.journal.copy_latest_headers_between(
1278
- op_min,
1279
- op_max,
1280
- std.mem.bytesAsSlice(Header, response.buffer[@sizeOf(Header)..size_max]),
1281
- );
1611
+ const count = self.copy_latest_headers_and_set_size(op_min, op_max, null, response);
1612
+ assert(count >= 0);
1613
+ assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
1282
1614
 
1283
1615
  if (count == 0) {
1284
1616
  log.debug("{}: on_request_headers: ignoring (op={}..{}, no headers)", .{
@@ -1289,8 +1621,6 @@ pub fn Replica(
1289
1621
  return;
1290
1622
  }
1291
1623
 
1292
- response.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
1293
-
1294
1624
  response.header.set_checksum_body(response.body());
1295
1625
  response.header.set_checksum();
1296
1626
 
@@ -1313,7 +1643,8 @@ pub fn Replica(
1313
1643
  }
1314
1644
 
1315
1645
  const op = self.nack_prepare_op.?;
1316
- const checksum = self.journal.entry_for_op_exact(op).?.checksum;
1646
+ const checksum = self.journal.header_with_op(op).?.checksum;
1647
+ const slot = self.journal.slot_with_op(op).?;
1317
1648
 
1318
1649
  if (message.header.op != op) {
1319
1650
  log.debug("{}: on_nack_prepare: ignoring (repairing another op)", .{self.replica});
@@ -1348,14 +1679,14 @@ pub fn Replica(
1348
1679
  // Otherwise, if we know we do not have the op, then we can exclude ourselves.
1349
1680
  assert(self.replica_count > 1);
1350
1681
 
1351
- const threshold = if (self.journal.faulty.bit(op))
1682
+ const threshold = if (self.journal.faulty.bit(slot))
1352
1683
  self.replica_count - self.quorum_replication + 1
1353
1684
  else
1354
1685
  self.replica_count - self.quorum_replication;
1355
1686
 
1356
1687
  if (threshold == 0) {
1357
1688
  assert(self.replica_count == 2);
1358
- assert(!self.journal.faulty.bit(op));
1689
+ assert(!self.journal.faulty.bit(slot));
1359
1690
 
1360
1691
  // This is a special case for a cluster-of-two, handled in `repair_prepare()`.
1361
1692
  log.debug("{}: on_nack_prepare: ignoring (cluster-of-two, not faulty)", .{
@@ -1364,10 +1695,11 @@ pub fn Replica(
1364
1695
  return;
1365
1696
  }
1366
1697
 
1367
- log.debug("{}: on_nack_prepare: quorum_replication={} threshold={}", .{
1698
+ log.debug("{}: on_nack_prepare: quorum_replication={} threshold={} op={}", .{
1368
1699
  self.replica,
1369
1700
  self.quorum_replication,
1370
1701
  threshold,
1702
+ op,
1371
1703
  });
1372
1704
 
1373
1705
  // We should never expect to receive a nack from ourselves:
@@ -1383,7 +1715,7 @@ pub fn Replica(
1383
1715
 
1384
1716
  assert(count == threshold);
1385
1717
  assert(!self.nack_prepare_from_other_replicas.isSet(self.replica));
1386
- log.debug("{}: on_nack_prepare: quorum received", .{self.replica});
1718
+ log.debug("{}: on_nack_prepare: quorum received op={}", .{ self.replica, op });
1387
1719
 
1388
1720
  self.discard_uncommitted_ops_from(op, checksum);
1389
1721
  self.reset_quorum_nack_prepare();
@@ -1512,7 +1844,10 @@ pub fn Replica(
1512
1844
  const replica = waiting[self.prepare_timeout.attempts % waiting_len];
1513
1845
  assert(replica != self.replica);
1514
1846
 
1515
- log.debug("{}: on_prepare_timeout: replicating to replica {}", .{ self.replica, replica });
1847
+ log.debug("{}: on_prepare_timeout: replicating to replica {}", .{
1848
+ self.replica,
1849
+ replica,
1850
+ });
1516
1851
  self.send_message_to_replica(replica, prepare.message);
1517
1852
  }
1518
1853
 
@@ -1524,7 +1859,7 @@ pub fn Replica(
1524
1859
  assert(self.commit_min == self.commit_max);
1525
1860
 
1526
1861
  // TODO Snapshots: Use snapshot checksum if commit is no longer in journal.
1527
- const latest_committed_entry = self.journal.entry_for_op_exact(self.commit_max).?;
1862
+ const latest_committed_entry = self.journal.header_with_op(self.commit_max).?;
1528
1863
 
1529
1864
  self.send_header_to_other_replicas(.{
1530
1865
  .command = .commit,
@@ -1569,6 +1904,13 @@ pub fn Replica(
1569
1904
  self.repair();
1570
1905
  }
1571
1906
 
1907
+ fn on_recovery_timeout(self: *Self) void {
1908
+ assert(self.status == .recovering);
1909
+ assert(self.replica_count > 1);
1910
+ self.recovery_timeout.reset();
1911
+ self.recover();
1912
+ }
1913
+
1572
1914
  fn reference_message_and_receive_quorum_exactly_once(
1573
1915
  self: *Self,
1574
1916
  messages: *QuorumMessages,
@@ -1625,7 +1967,10 @@ pub fn Replica(
1625
1967
 
1626
1968
  // This is not the first time we have had quorum, the state transition has already happened:
1627
1969
  if (count > threshold) {
1628
- log.debug("{}: on_{s}: ignoring (quorum received already)", .{ self.replica, command });
1970
+ log.debug("{}: on_{s}: ignoring (quorum received already)", .{
1971
+ self.replica,
1972
+ command,
1973
+ });
1629
1974
  return null;
1630
1975
  }
1631
1976
 
@@ -1674,7 +2019,11 @@ pub fn Replica(
1674
2019
 
1675
2020
  // Do not allow duplicate messages to trigger multiple passes through a state transition:
1676
2021
  if (counter.isSet(message.header.replica)) {
1677
- log.debug("{}: on_{s}: ignoring (duplicate message)", .{ self.replica, command });
2022
+ log.debug("{}: on_{s}: ignoring (duplicate message replica={})", .{
2023
+ self.replica,
2024
+ command,
2025
+ message.header.replica,
2026
+ });
1678
2027
  return null;
1679
2028
  }
1680
2029
 
@@ -1695,7 +2044,10 @@ pub fn Replica(
1695
2044
 
1696
2045
  // This is not the first time we have had quorum, the state transition has already happened:
1697
2046
  if (count > threshold) {
1698
- log.debug("{}: on_{s}: ignoring (quorum received already)", .{ self.replica, command });
2047
+ log.debug("{}: on_{s}: ignoring (quorum received already)", .{
2048
+ self.replica,
2049
+ command,
2050
+ });
1699
2051
  return null;
1700
2052
  }
1701
2053
 
@@ -1709,8 +2061,15 @@ pub fn Replica(
1709
2061
  assert(message.header.view == self.view);
1710
2062
  assert(message.header.op == self.op);
1711
2063
 
1712
- log.debug("{}: append: appending to journal", .{self.replica});
1713
- self.write_prepare(message, .append);
2064
+ if (self.replica_count == 1 and self.pipeline.count > 1) {
2065
+ // In a cluster-of-one, the prepares must always be written to the WAL sequentially
2066
+ // (never concurrently). This ensures that there will be no gaps in the WAL during
2067
+ // crash recovery.
2068
+ log.debug("{}: append: serializing append op={}", .{ self.replica, message.header.op });
2069
+ } else {
2070
+ log.debug("{}: append: appending to journal", .{self.replica});
2071
+ self.write_prepare(message, .append);
2072
+ }
1714
2073
  }
1715
2074
 
1716
2075
  /// Returns whether `b` succeeds `a` by having a newer view or same view and newer op.
@@ -1761,7 +2120,8 @@ pub fn Replica(
1761
2120
  fn commit_ops(self: *Self, commit: u64) void {
1762
2121
  // TODO Restrict `view_change` status only to the leader purely as defense-in-depth.
1763
2122
  // Be careful of concurrency when doing this, as successive view changes can happen quickly.
1764
- assert(self.status == .normal or self.status == .view_change);
2123
+ assert(self.status == .normal or self.status == .view_change or
2124
+ (self.status == .recovering and self.replica_count == 1));
1765
2125
  assert(self.commit_min <= self.commit_max);
1766
2126
  assert(self.commit_min <= self.op);
1767
2127
  assert(self.commit_max <= self.op or self.commit_max > self.op);
@@ -1805,12 +2165,14 @@ pub fn Replica(
1805
2165
 
1806
2166
  fn commit_ops_read(self: *Self) void {
1807
2167
  assert(self.committing);
1808
- assert(self.status == .normal or self.status == .view_change);
2168
+ assert(self.status == .normal or self.status == .view_change or
2169
+ (self.status == .recovering and self.replica_count == 1));
1809
2170
  assert(self.commit_min <= self.commit_max);
1810
2171
  assert(self.commit_min <= self.op);
1811
2172
 
1812
2173
  if (!self.valid_hash_chain("commit_ops_read")) {
1813
2174
  self.committing = false;
2175
+ assert(self.replica_count > 1);
1814
2176
  return;
1815
2177
  }
1816
2178
  assert(self.op >= self.commit_max);
@@ -1819,12 +2181,22 @@ pub fn Replica(
1819
2181
  // Even a naive state transfer may fail to correct for this.
1820
2182
  if (self.commit_min < self.commit_max and self.commit_min < self.op) {
1821
2183
  const op = self.commit_min + 1;
1822
- const checksum = self.journal.entry_for_op_exact(op).?.checksum;
2184
+ const checksum = self.journal.header_with_op(op).?.checksum;
1823
2185
  self.journal.read_prepare(commit_ops_commit, op, checksum, null);
1824
2186
  } else {
1825
2187
  self.committing = false;
1826
2188
  // This is an optimization to expedite the view change before the `repair_timeout`:
1827
2189
  if (self.status == .view_change and self.repairs_allowed()) self.repair();
2190
+
2191
+ if (self.status == .recovering) {
2192
+ assert(self.replica_count == 1);
2193
+ assert(self.commit_min == self.commit_max);
2194
+ assert(self.commit_min == self.op);
2195
+ self.transition_to_normal_from_recovering_status(0);
2196
+ } else {
2197
+ // We expect that a cluster-of-one only calls commit_ops() in recovering status.
2198
+ assert(self.replica_count > 1);
2199
+ }
1828
2200
  }
1829
2201
  }
1830
2202
 
@@ -1836,31 +2208,39 @@ pub fn Replica(
1836
2208
 
1837
2209
  if (prepare == null) {
1838
2210
  log.debug("{}: commit_ops_commit: prepare == null", .{self.replica});
2211
+ if (self.replica_count == 1) @panic("cannot recover corrupt prepare");
1839
2212
  return;
1840
2213
  }
1841
2214
 
1842
- if (self.status == .view_change) {
1843
- if (self.leader_index(self.view) != self.replica) {
1844
- log.debug("{}: commit_ops_commit: no longer leader", .{self.replica});
1845
- return;
1846
- }
2215
+ switch (self.status) {
2216
+ .normal => {},
2217
+ .view_change => {
2218
+ if (self.leader_index(self.view) != self.replica) {
2219
+ log.debug("{}: commit_ops_commit: no longer leader", .{self.replica});
2220
+ assert(self.replica_count > 1);
2221
+ return;
2222
+ }
1847
2223
 
1848
- // Only the leader may commit during a view change before starting the new view.
1849
- // Fall through if this is indeed the case.
1850
- } else if (self.status != .normal) {
1851
- log.debug("{}: commit_ops_commit: no longer in normal status", .{self.replica});
1852
- return;
2224
+ // Only the leader may commit during a view change before starting the new view.
2225
+ // Fall through if this is indeed the case.
2226
+ },
2227
+ .recovering => {
2228
+ assert(self.replica_count == 1);
2229
+ assert(self.leader_index(self.view) == self.replica);
2230
+ },
1853
2231
  }
1854
2232
 
1855
2233
  const op = self.commit_min + 1;
1856
2234
 
1857
2235
  if (prepare.?.header.op != op) {
1858
2236
  log.debug("{}: commit_ops_commit: op changed", .{self.replica});
2237
+ assert(self.replica_count > 1);
1859
2238
  return;
1860
2239
  }
1861
2240
 
1862
- if (prepare.?.header.checksum != self.journal.entry_for_op_exact(op).?.checksum) {
2241
+ if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
1863
2242
  log.debug("{}: commit_ops_commit: checksum changed", .{self.replica});
2243
+ assert(self.replica_count > 1);
1864
2244
  return;
1865
2245
  }
1866
2246
 
@@ -1876,9 +2256,10 @@ pub fn Replica(
1876
2256
 
1877
2257
  fn commit_op(self: *Self, prepare: *const Message) void {
1878
2258
  // TODO Can we add more checks around allowing commit_op() during a view change?
1879
- assert(self.status == .normal or self.status == .view_change);
2259
+ assert(self.status == .normal or self.status == .view_change or
2260
+ (self.status == .recovering and self.replica_count == 1));
1880
2261
  assert(prepare.header.command == .prepare);
1881
- assert(prepare.header.operation != .init);
2262
+ assert(prepare.header.operation != .root);
1882
2263
  assert(prepare.header.op == self.commit_min + 1);
1883
2264
  assert(prepare.header.op <= self.op);
1884
2265
 
@@ -1886,7 +2267,7 @@ pub fn Replica(
1886
2267
  // happened since we last checked in `commit_ops_read()`. However, this would relate to
1887
2268
  // subsequent ops, since by now we have already verified the hash chain for this commit.
1888
2269
 
1889
- assert(self.journal.entry_for_op_exact(self.commit_min).?.checksum ==
2270
+ assert(self.journal.header_with_op(self.commit_min).?.checksum ==
1890
2271
  prepare.header.parent);
1891
2272
 
1892
2273
  log.debug("{}: commit_op: executing view={} {} op={} checksum={} ({s})", .{
@@ -1901,6 +2282,8 @@ pub fn Replica(
1901
2282
  const reply = self.message_bus.get_message();
1902
2283
  defer self.message_bus.unref(reply);
1903
2284
 
2285
+ assert(self.state_machine.commit_timestamp < prepare.header.timestamp);
2286
+
1904
2287
  const reply_body_size = @intCast(u32, self.state_machine.commit(
1905
2288
  prepare.header.client,
1906
2289
  prepare.header.operation.cast(StateMachine),
@@ -1908,6 +2291,9 @@ pub fn Replica(
1908
2291
  reply.buffer[@sizeOf(Header)..],
1909
2292
  ));
1910
2293
 
2294
+ assert(self.state_machine.commit_timestamp <= prepare.header.timestamp);
2295
+ self.state_machine.commit_timestamp = prepare.header.timestamp;
2296
+
1911
2297
  self.commit_min += 1;
1912
2298
  assert(self.commit_min == prepare.header.op);
1913
2299
  if (self.commit_min > self.commit_max) self.commit_max = self.commit_min;
@@ -1927,10 +2313,10 @@ pub fn Replica(
1927
2313
  .commit = prepare.header.op,
1928
2314
  .size = @sizeOf(Header) + reply_body_size,
1929
2315
  };
1930
- assert(reply.header.offset == 0);
2316
+ assert(reply.header.timestamp == 0);
1931
2317
  assert(reply.header.epoch == 0);
1932
2318
 
1933
- reply.header.set_checksum_body(reply.buffer[@sizeOf(Header)..reply.header.size]);
2319
+ reply.header.set_checksum_body(reply.body());
1934
2320
  reply.header.set_checksum();
1935
2321
 
1936
2322
  if (reply.header.operation == .register) {
@@ -1974,8 +2360,16 @@ pub fn Replica(
1974
2360
  assert(self.commit_min == self.commit_max);
1975
2361
  assert(self.commit_max == prepare.message.header.op);
1976
2362
 
1977
- self.message_bus.unref(prepare.message);
1978
- assert(self.pipeline.pop() != null);
2363
+ self.message_bus.unref(self.pipeline.pop().?.message);
2364
+
2365
+ if (self.replica_count == 1) {
2366
+ if (self.pipeline.head_ptr()) |head| {
2367
+ // Write the next message in the queue.
2368
+ // A cluster-of-one writes prepares sequentially to avoid gaps in the WAL.
2369
+ self.write_prepare(head.message, .append);
2370
+ // The loop will wrap around and exit when `!ok_quorum_received`.
2371
+ }
2372
+ }
1979
2373
  }
1980
2374
 
1981
2375
  assert(self.prepare_timeout.ticking);
@@ -1983,6 +2377,39 @@ pub fn Replica(
1983
2377
  if (self.pipeline.count == 0) self.prepare_timeout.stop();
1984
2378
  }
1985
2379
 
2380
+ fn copy_latest_headers_and_set_size(
2381
+ self: *const Self,
2382
+ op_min: u64,
2383
+ op_max: u64,
2384
+ count_max: ?usize,
2385
+ message: *Message,
2386
+ ) usize {
2387
+ assert(op_max >= op_min);
2388
+ assert(count_max == null or count_max.? > 0);
2389
+ assert(message.header.command == .do_view_change or
2390
+ message.header.command == .start_view or
2391
+ message.header.command == .headers or
2392
+ message.header.command == .recovery_response);
2393
+
2394
+ const body_size_max = @sizeOf(Header) * std.math.min(
2395
+ @divExact(message.buffer.len - @sizeOf(Header), @sizeOf(Header)),
2396
+ // We must add 1 because op_max and op_min are both inclusive:
2397
+ count_max orelse std.math.min(64, op_max - op_min + 1),
2398
+ );
2399
+ assert(body_size_max >= @sizeOf(Header));
2400
+ assert(count_max == null or body_size_max == count_max.? * @sizeOf(Header));
2401
+
2402
+ const count = self.journal.copy_latest_headers_between(
2403
+ op_min,
2404
+ op_max,
2405
+ std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..][0..body_size_max]),
2406
+ );
2407
+
2408
+ message.header.size = @intCast(u32, @sizeOf(Header) * (1 + count));
2409
+
2410
+ return count;
2411
+ }
2412
+
1986
2413
  fn count_quorum(
1987
2414
  self: *Self,
1988
2415
  messages: *QuorumMessages,
@@ -2004,6 +2431,7 @@ pub fn Replica(
2004
2431
  assert(m.header.view == self.view);
2005
2432
  },
2006
2433
  .do_view_change => assert(m.header.view == self.view),
2434
+ .recovery_response => assert(m.header.replica != self.replica),
2007
2435
  .nack_prepare => {
2008
2436
  // TODO See if we can restrict this branch further.
2009
2437
  assert(m.header.replica != self.replica);
@@ -2032,7 +2460,8 @@ pub fn Replica(
2032
2460
  const session = reply.header.commit; // The commit number becomes the session number.
2033
2461
  const request = reply.header.request;
2034
2462
 
2035
- assert(session > 0); // We reserved the `0` commit number for the cluster `.init` operation.
2463
+ // We reserved the `0` commit number for the cluster `.root` operation.
2464
+ assert(session > 0);
2036
2465
  assert(request == 0);
2037
2466
 
2038
2467
  // For correctness, it's critical that all replicas evict deterministically:
@@ -2113,8 +2542,8 @@ pub fn Replica(
2113
2542
  // The latest normal view (as specified in the 2012 paper) is different to the view
2114
2543
  // number contained in the prepare headers we include in the body. The former shows
2115
2544
  // how recent a view change the replica participated in, which may be much higher.
2116
- // We use the `offset` field to send this in addition to the current view number:
2117
- .offset = if (command == .do_view_change) self.view_normal else 0,
2545
+ // We use the `timestamp` field to send this in addition to the current view number:
2546
+ .timestamp = if (command == .do_view_change) self.view_normal else 0,
2118
2547
  .op = self.op,
2119
2548
  .commit = self.commit_max,
2120
2549
  };
@@ -2128,22 +2557,9 @@ pub fn Replica(
2128
2557
  const count_max = config.pipeline_max;
2129
2558
  assert(count_max > 0);
2130
2559
 
2131
- const size_max = @sizeOf(Header) * std.math.min(
2132
- std.math.max(@divFloor(message.buffer.len, @sizeOf(Header)), 2),
2133
- 1 + count_max,
2134
- );
2135
- assert(size_max > @sizeOf(Header));
2136
-
2137
- const count = self.journal.copy_latest_headers_between(
2138
- 0,
2139
- self.op,
2140
- std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..size_max]),
2141
- );
2142
-
2143
- // We expect that self.op always exists.
2144
- assert(count > 0);
2145
-
2146
- message.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
2560
+ const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, message);
2561
+ assert(count > 0); // We expect that self.op always exists.
2562
+ assert(@divExact(message.header.size, @sizeOf(Header)) == 1 + count);
2147
2563
 
2148
2564
  message.header.set_checksum_body(message.body());
2149
2565
  message.header.set_checksum();
@@ -2154,7 +2570,9 @@ pub fn Replica(
2154
2570
  /// The caller owns the returned message, if any, which has exactly 1 reference.
2155
2571
  fn create_message_from_header(self: *Self, header: Header) *Message {
2156
2572
  assert(header.replica == self.replica);
2157
- assert(header.view == self.view or header.command == .request_start_view);
2573
+ assert(header.view == self.view or
2574
+ header.command == .request_start_view or
2575
+ header.command == .recovery);
2158
2576
  assert(header.size == @sizeOf(Header));
2159
2577
 
2160
2578
  const message = self.message_bus.pool.get_message();
@@ -2181,6 +2599,12 @@ pub fn Replica(
2181
2599
  /// uncommitted header gaps and compare them with the quorum of do_view_change messages
2182
2600
  /// received from other replicas, before starting the new view, to discard any that may be
2183
2601
  /// impossible to repair.
2602
+ ///
2603
+ /// For example, if the old primary replicates ops=7,8,9 (all uncommitted) but only op=9 is
2604
+ /// prepared on another replica before the old primary crashes, then this function finds a
2605
+ /// gap for ops=7,8 and will attempt to discard ops 7,8,9.
2606
+ // TODO To improve availability, potentially call this before the local headers are
2607
+ // repaired during the view change, so that we can participate in nacking headers.
2184
2608
  fn discard_uncommitted_headers(self: *Self) void {
2185
2609
  assert(self.status == .view_change);
2186
2610
  assert(self.leader_index(self.view) == self.replica);
@@ -2188,6 +2612,7 @@ pub fn Replica(
2188
2612
  assert(!self.repair_timeout.ticking);
2189
2613
  assert(self.op >= self.commit_max);
2190
2614
  assert(self.replica_count > 1);
2615
+ assert(self.op - self.commit_max <= config.journal_slot_count);
2191
2616
 
2192
2617
  const threshold = self.replica_count - self.quorum_replication;
2193
2618
  if (threshold == 0) {
@@ -2195,9 +2620,13 @@ pub fn Replica(
2195
2620
  return;
2196
2621
  }
2197
2622
 
2623
+ // Iterating > commit_max does not in itself guarantee that the header is uncommitted.
2624
+ // We must also count nacks from the quorum, since the old primary may have committed
2625
+ // another op just before crashing, if there was sufficient quorum. Counting nacks
2626
+ // ensures that the old primary could not possibly have committed the header.
2198
2627
  var op = self.op;
2199
2628
  while (op > self.commit_max) : (op -= 1) {
2200
- if (self.journal.entry_for_op_exact(op) != null) continue;
2629
+ if (self.journal.header_with_op(op) != null) continue;
2201
2630
 
2202
2631
  log.debug("{}: discard_uncommitted_headers: op={} gap", .{ self.replica, op });
2203
2632
 
@@ -2208,14 +2637,30 @@ pub fn Replica(
2208
2637
  assert(m.header.cluster == self.cluster);
2209
2638
  assert(m.header.replica == replica);
2210
2639
  assert(m.header.view == self.view);
2640
+ assert(m.header.commit <= self.commit_max);
2211
2641
 
2212
2642
  if (replica != self.replica) {
2213
- if (m.header.op < op) nacks += 1;
2214
-
2215
- log.debug("{}: discard_uncommitted_headers: replica={} op={}", .{
2643
+ // Check for a gap in the uncommitted headers from this replica.
2644
+ const received_headers = self.message_body_as_headers(m);
2645
+ assert(received_headers.len >= 1);
2646
+
2647
+ const received_op_min = received_headers[received_headers.len - 1].op;
2648
+ const received_op_max = received_headers[0].op;
2649
+ assert(received_op_max >= received_op_min);
2650
+
2651
+ const nack = for (received_headers) |*h| {
2652
+ if (h.op == op) break false;
2653
+ } else nack: {
2654
+ // Don't nack ops that didn't fit in the message's attached headers.
2655
+ break :nack op >= received_op_min;
2656
+ };
2657
+
2658
+ if (nack) nacks += 1;
2659
+ log.debug("{}: discard_uncommitted_headers: replica={} op={} nack={}", .{
2216
2660
  self.replica,
2217
2661
  m.header.replica,
2218
- m.header.op,
2662
+ op,
2663
+ nack,
2219
2664
  });
2220
2665
  }
2221
2666
  }
@@ -2229,12 +2674,15 @@ pub fn Replica(
2229
2674
  });
2230
2675
 
2231
2676
  if (nacks >= threshold) {
2677
+ assert(op > self.commit_max);
2678
+
2232
2679
  self.journal.remove_entries_from(op);
2233
2680
  self.op = op - 1;
2234
2681
 
2235
- assert(self.journal.entry_for_op(op) == null);
2236
- assert(!self.journal.dirty.bit(op));
2237
- assert(!self.journal.faulty.bit(op));
2682
+ const slot = self.journal.slot_for_op(op);
2683
+ assert(self.journal.header_for_op(op) == null);
2684
+ assert(!self.journal.dirty.bit(slot));
2685
+ assert(!self.journal.faulty.bit(slot));
2238
2686
  }
2239
2687
  }
2240
2688
  }
@@ -2249,10 +2697,11 @@ pub fn Replica(
2249
2697
 
2250
2698
  assert(self.valid_hash_chain("discard_uncommitted_ops_from"));
2251
2699
 
2700
+ const slot = self.journal.slot_with_op(op).?;
2252
2701
  assert(op > self.commit_max);
2253
2702
  assert(op <= self.op);
2254
- assert(self.journal.entry_for_op_exact_with_checksum(op, checksum) != null);
2255
- assert(self.journal.dirty.bit(op));
2703
+ assert(self.journal.header_with_op_and_checksum(op, checksum) != null);
2704
+ assert(self.journal.dirty.bit(slot));
2256
2705
 
2257
2706
  log.debug("{}: discard_uncommitted_ops_from: ops={}..{} view={}", .{
2258
2707
  self.replica,
@@ -2264,13 +2713,13 @@ pub fn Replica(
2264
2713
  self.journal.remove_entries_from(op);
2265
2714
  self.op = op - 1;
2266
2715
 
2267
- assert(self.journal.entry_for_op(op) == null);
2268
- assert(!self.journal.dirty.bit(op));
2269
- assert(!self.journal.faulty.bit(op));
2716
+ assert(self.journal.header_for_op(op) == null);
2717
+ assert(!self.journal.dirty.bit(slot));
2718
+ assert(!self.journal.faulty.bit(slot));
2270
2719
 
2271
2720
  // We require that `self.op` always exists. Rewinding `self.op` could change that.
2272
2721
  // However, we do this only as the leader within a view change, with all headers intact.
2273
- assert(self.journal.entry_for_op_exact(self.op) != null);
2722
+ assert(self.journal.header_with_op(self.op) != null);
2274
2723
  }
2275
2724
 
2276
2725
  /// Returns whether the replica is a follower for the current view.
@@ -2370,7 +2819,7 @@ pub fn Replica(
2370
2819
  return true;
2371
2820
  },
2372
2821
  // Only the leader may answer a request for a prepare without a context:
2373
- .request_prepare => if (message.header.context == 0) {
2822
+ .request_prepare => if (message.header.timestamp == 0) {
2374
2823
  log.warn("{}: on_{s}: ignoring (no context)", .{ self.replica, command });
2375
2824
  return true;
2376
2825
  },
@@ -2439,6 +2888,18 @@ pub fn Replica(
2439
2888
  if (self.ignore_request_message_follower(message)) return true;
2440
2889
  if (self.ignore_request_message_duplicate(message)) return true;
2441
2890
  if (self.ignore_request_message_preparing(message)) return true;
2891
+
2892
+ // Verify that the new request will fit in the WAL.
2893
+ // The message's op hasn't been assigned yet, but it will be `self.op + 1`.
2894
+ if (self.op + 1 >= self.op_checkpoint + config.journal_slot_count) {
2895
+ log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint={})", .{
2896
+ self.replica,
2897
+ message.header.op,
2898
+ self.op_checkpoint,
2899
+ });
2900
+ return true;
2901
+ }
2902
+
2442
2903
  return false;
2443
2904
  }
2444
2905
 
@@ -2491,7 +2952,9 @@ pub fn Replica(
2491
2952
  return false;
2492
2953
  } else {
2493
2954
  // The client may have only one request inflight at a time.
2494
- log.err("{}: on_request: ignoring new request (client bug)", .{self.replica});
2955
+ log.err("{}: on_request: ignoring new request (client bug)", .{
2956
+ self.replica,
2957
+ });
2495
2958
  return true;
2496
2959
  }
2497
2960
  } else {
@@ -2642,7 +3105,71 @@ pub fn Replica(
2642
3105
  return false;
2643
3106
  }
2644
3107
 
2645
- fn is_repair(self: *Self, message: *const Message) bool {
3108
+ /// Returns whether the highest known op is certain.
3109
+ ///
3110
+ /// After recovering the WAL, there are 2 possible outcomes:
3111
+ /// * All entries valid. The highest op is certain, and safe to set as `replica.op`.
3112
+ /// * One or more entries are faulty. The highest op isn't certain — it may be one of the
3113
+ /// broken entries.
3114
+ ///
3115
+ /// The replica must refrain from repairing any faulty slots until the highest op is known.
3116
+ /// Otherwise, if we were to repair a slot while uncertain of `replica.op`:
3117
+ ///
3118
+ /// * we may nack an op that we shouldn't, or
3119
+ /// * we may replace a prepared op that we were guaranteeing for the primary, potentially
3120
+ /// forking the log.
3121
+ ///
3122
+ ///
3123
+ /// Test for a fault to the right of the current op. The fault might be our true op, and
3124
+ /// sharing our current `replica.op` might cause the cluster's op to likewise regress.
3125
+ ///
3126
+ /// Note that for our purposes here, we only care about entries that were faulty during
3127
+ /// WAL recovery, not ones that were found to be faulty after the fact (e.g. due to
3128
+ /// `request_prepare`).
3129
+ ///
3130
+ /// Cases (`✓`: `replica.op_checkpoint`, `✗`: faulty, `o`: `replica.op`):
3131
+ /// * ` ✓ o ✗ `: View change is unsafe.
3132
+ /// * ` ✗ ✓ o `: View change is unsafe.
3133
+ /// * ` ✓ ✗ o `: View change is safe.
3134
+ /// * ` ✓ = o `: View change is unsafe if any slots are faulty.
3135
+ /// (`replica.op_checkpoint` == `replica.op`).
3136
+ // TODO Use this function once we switch from recovery protocol to the superblock.
3137
+ // If there is an "unsafe" fault, we will need to request a start_view from the leader to
3138
+ // learn the op.
3139
+ fn op_certain(self: *const Self) bool {
3140
+ assert(self.status == .recovering);
3141
+ assert(self.journal.recovered);
3142
+ assert(self.op_checkpoint <= self.op);
3143
+
3144
+ const slot_op_checkpoint = self.journal.slot_for_op(self.op_checkpoint).index;
3145
+ const slot_op = self.journal.slot_with_op(self.op).?.index;
3146
+ const slot_known_range = vsr.SlotRange{
3147
+ .head = slot_op_checkpoint,
3148
+ .tail = slot_op,
3149
+ };
3150
+
3151
+ var iterator = self.journal.faulty.bits.iterator(.{ .kind = .set });
3152
+ while (iterator.next()) |slot| {
3153
+ // The command is `reserved` when the entry was found faulty during WAL recovery.
3154
+ // Faults found after WAL recovery are not relevant, because we know their op.
3155
+ if (self.journal.headers[slot.index].command == .reserved) {
3156
+ if (slot_op_checkpoint == slot_op or
3157
+ !slot_known_range.contains(slot))
3158
+ {
3159
+ log.warn("{}: op_certain: op not known (faulty_slot={}, op={}, op_checkpoint={})", .{
3160
+ self.replica,
3161
+ slot.index,
3162
+ self.op,
3163
+ self.op_checkpoint,
3164
+ });
3165
+ return false;
3166
+ }
3167
+ }
3168
+ }
3169
+ return true;
3170
+ }
3171
+
3172
+ fn is_repair(self: *const Self, message: *const Message) bool {
2646
3173
  assert(message.header.command == .prepare);
2647
3174
 
2648
3175
  if (self.status == .normal) {
@@ -2674,15 +3201,17 @@ pub fn Replica(
2674
3201
  assert(self.follower());
2675
3202
  assert(header.view == self.view);
2676
3203
  assert(header.op > self.op + 1);
2677
- // We may have learned of a higher `commit_max` through a commit message before jumping to a
2678
- // newer op that is less than `commit_max` but greater than `commit_min`:
3204
+ // We may have learned of a higher `commit_max` through a commit message before jumping
3205
+ // to a newer op that is less than `commit_max` but greater than `commit_min`:
2679
3206
  assert(header.op > self.commit_min);
3207
+ // Never overwrite an op that still needs to be checkpointed.
3208
+ assert(header.op - self.op_checkpoint < config.journal_slot_count);
2680
3209
 
2681
3210
  log.debug("{}: jump_to_newer_op: advancing: op={}..{} checksum={}..{}", .{
2682
3211
  self.replica,
2683
3212
  self.op,
2684
3213
  header.op - 1,
2685
- self.journal.entry_for_op_exact(self.op).?.checksum,
3214
+ self.journal.header_with_op(self.op).?.checksum,
2686
3215
  header.parent,
2687
3216
  });
2688
3217
 
@@ -2694,7 +3223,10 @@ pub fn Replica(
2694
3223
  fn message_body_as_headers(_: *Self, message: *const Message) []Header {
2695
3224
  // TODO Assert message commands that we expect this to be called for.
2696
3225
  assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
2697
- return std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..message.header.size]);
3226
+ return std.mem.bytesAsSlice(
3227
+ Header,
3228
+ message.buffer[@sizeOf(Header)..message.header.size],
3229
+ );
2698
3230
  }
2699
3231
 
2700
3232
  /// Panics if immediate neighbors in the same view would have a broken hash chain.
@@ -2716,6 +3248,29 @@ pub fn Replica(
2716
3248
  }
2717
3249
  }
2718
3250
 
3251
+ /// Searches the pipeline for a prepare for a given op and checksum.
3252
+ /// When `checksum` is `null`, match any checksum.
3253
+ fn pipeline_prepare_for_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Prepare {
3254
+ assert(self.status == .normal or self.status == .view_change);
3255
+
3256
+ // To optimize the search, we can leverage the fact that the pipeline is ordered and
3257
+ // contiguous.
3258
+ if (self.pipeline.count == 0) return null;
3259
+ const head_op = self.pipeline.head_ptr().?.message.header.op;
3260
+ const tail_op = self.pipeline.tail_ptr().?.message.header.op;
3261
+ if (op < head_op) return null;
3262
+ if (op > tail_op) return null;
3263
+
3264
+ const pipeline_prepare = self.pipeline.get_ptr(op - head_op).?;
3265
+ assert(pipeline_prepare.message.header.op == op);
3266
+
3267
+ if (checksum == null or pipeline_prepare.message.header.checksum == checksum.?) {
3268
+ return pipeline_prepare;
3269
+ } else {
3270
+ return null;
3271
+ }
3272
+ }
3273
+
2719
3274
  /// Searches the pipeline for a prepare for a given client.
2720
3275
  fn pipeline_prepare_for_client(self: *Self, client: u128) ?*Prepare {
2721
3276
  assert(self.status == .normal);
@@ -2723,7 +3278,7 @@ pub fn Replica(
2723
3278
  assert(self.commit_min == self.commit_max);
2724
3279
 
2725
3280
  var op = self.commit_max + 1;
2726
- var parent = self.journal.entry_for_op_exact(self.commit_max).?.checksum;
3281
+ var parent = self.journal.header_with_op(self.commit_max).?.checksum;
2727
3282
  var iterator = self.pipeline.iterator();
2728
3283
  while (iterator.next_ptr()) |prepare| {
2729
3284
  assert(prepare.message.header.command == .prepare);
@@ -2776,15 +3331,33 @@ pub fn Replica(
2776
3331
  assert(prepare.message.header.view <= ok.header.view);
2777
3332
  assert(prepare.message.header.op == ok.header.op);
2778
3333
  assert(prepare.message.header.commit == ok.header.commit);
2779
- assert(prepare.message.header.offset == ok.header.offset);
3334
+ assert(prepare.message.header.timestamp == ok.header.timestamp);
2780
3335
  assert(prepare.message.header.operation == ok.header.operation);
2781
3336
 
2782
3337
  return prepare;
2783
3338
  }
2784
3339
 
3340
+ fn recover(self: *Self) void {
3341
+ assert(self.status == .recovering);
3342
+ assert(self.replica_count > 1);
3343
+ assert(self.journal.recovered);
3344
+
3345
+ log.debug("{}: recover: sending recovery messages nonce={}", .{
3346
+ self.replica,
3347
+ self.recovery_nonce,
3348
+ });
3349
+
3350
+ self.send_header_to_other_replicas(.{
3351
+ .command = .recovery,
3352
+ .cluster = self.cluster,
3353
+ .context = self.recovery_nonce,
3354
+ .replica = self.replica,
3355
+ });
3356
+ }
3357
+
2785
3358
  /// Starting from the latest journal entry, backfill any missing or disconnected headers.
2786
- /// A header is disconnected if it breaks the hash chain with its newer neighbor to the right.
2787
- /// Since we work backwards from the latest entry, we should always be able to fix the chain.
3359
+ /// A header is disconnected if it breaks the chain with its newer neighbor to the right.
3360
+ /// Since we work back from the latest entry, we should always be able to fix the chain.
2788
3361
  /// Once headers are connected, backfill any dirty or faulty prepares.
2789
3362
  fn repair(self: *Self) void {
2790
3363
  if (!self.repair_timeout.ticking) {
@@ -2796,38 +3369,50 @@ pub fn Replica(
2796
3369
 
2797
3370
  assert(self.status == .normal or self.status == .view_change);
2798
3371
  assert(self.repairs_allowed());
3372
+
3373
+ assert(self.op_checkpoint <= self.op);
3374
+ assert(self.op_checkpoint <= self.commit_min);
2799
3375
  assert(self.commit_min <= self.op);
2800
3376
  assert(self.commit_min <= self.commit_max);
2801
3377
 
2802
- // We expect these always to exist:
2803
- assert(self.journal.entry_for_op_exact(self.commit_min) != null);
2804
- assert(self.journal.entry_for_op_exact(self.op) != null);
3378
+ assert(self.journal.header_with_op(self.commit_min) != null);
3379
+ assert(self.journal.header_with_op(self.op) != null);
3380
+
3381
+ // The replica repairs backwards from `commit_max`. But if `commit_max` is too high
3382
+ // (>1 WAL ahead), then bound it such that uncommitted WAL entries are not overwritten.
3383
+ const commit_max_limit = std.math.min(
3384
+ self.commit_max,
3385
+ self.op_checkpoint + config.journal_slot_count,
3386
+ );
2805
3387
 
2806
3388
  // Request outstanding committed prepares to advance our op number:
2807
3389
  // This handles the case of an idle cluster, where a follower will not otherwise advance.
2808
3390
  // This is not required for correctness, but for durability.
2809
- if (self.op < self.commit_max) {
3391
+ if (self.op < commit_max_limit) {
2810
3392
  // If the leader repairs during a view change, it will have already advanced
2811
3393
  // `self.op` to the latest op according to the quorum of `do_view_change` messages
2812
3394
  // received, so we must therefore be a follower in normal status:
2813
3395
  assert(self.status == .normal);
2814
3396
  assert(self.follower());
2815
- log.debug("{}: repair: op={} < commit_max={}", .{
3397
+ log.debug("{}: repair: op={} < commit_max_limit={}, commit_max={}", .{
2816
3398
  self.replica,
2817
3399
  self.op,
3400
+ commit_max_limit,
2818
3401
  self.commit_max,
2819
3402
  });
2820
3403
  // We need to advance our op number and therefore have to `request_prepare`,
2821
3404
  // since only `on_prepare()` can do this, not `repair_header()` in `on_headers()`.
2822
3405
  self.send_header_to_replica(self.leader_index(self.view), .{
2823
3406
  .command = .request_prepare,
2824
- // We cannot yet know the checksum of the prepare so we set the context to 0:
2825
- // Context is optional when requesting from the leader but required otherwise.
3407
+ // We cannot yet know the checksum of the prepare so we set the context and
3408
+ // timestamp to 0: Context is optional when requesting from the leader but
3409
+ // required otherwise.
2826
3410
  .context = 0,
3411
+ .timestamp = 0,
2827
3412
  .cluster = self.cluster,
2828
3413
  .replica = self.replica,
2829
3414
  .view = self.view,
2830
- .op = self.commit_max,
3415
+ .op = commit_max_limit,
2831
3416
  });
2832
3417
  return;
2833
3418
  }
@@ -2848,9 +3433,10 @@ pub fn Replica(
2848
3433
  assert(range.op_min > self.commit_min);
2849
3434
  assert(range.op_max < self.op);
2850
3435
  // A range of `op_min=0` or `op_max=0` should be impossible as a header break:
2851
- // This is the init op that is prepared when the cluster is initialized.
3436
+ // This is the root op that is prepared when the cluster is initialized.
2852
3437
  assert(range.op_min > 0);
2853
3438
  assert(range.op_max > 0);
3439
+
2854
3440
  if (self.choose_any_other_replica()) |replica| {
2855
3441
  self.send_header_to_replica(replica, .{
2856
3442
  .command = .request_headers,
@@ -2869,10 +3455,14 @@ pub fn Replica(
2869
3455
  assert(self.valid_hash_chain_between(self.commit_min, self.op));
2870
3456
 
2871
3457
  // Request and repair any dirty or faulty prepares:
2872
- if (self.journal.dirty.len > 0) return self.repair_prepares();
3458
+ if (self.journal.dirty.count > 0) return self.repair_prepares();
2873
3459
 
2874
3460
  // Commit ops, which may in turn discover faulty prepares and drive more repairs:
2875
- if (self.commit_min < self.commit_max) return self.commit_ops(self.commit_max);
3461
+ if (self.commit_min < self.commit_max) {
3462
+ assert(self.replica_count > 1);
3463
+ self.commit_ops(self.commit_max);
3464
+ return;
3465
+ }
2876
3466
 
2877
3467
  if (self.status == .view_change and self.leader_index(self.view) == self.replica) {
2878
3468
  if (self.repair_pipeline_op() != null) return self.repair_pipeline();
@@ -2927,10 +3517,13 @@ pub fn Replica(
2927
3517
  }
2928
3518
 
2929
3519
  if (header.op > self.op) {
2930
- log.debug("{}: repair_header: false (advances self.op)", .{self.replica});
3520
+ log.debug("{}: repair_header: false (advances self.op={})", .{
3521
+ self.replica,
3522
+ self.op,
3523
+ });
2931
3524
  return false;
2932
3525
  } else if (header.op == self.op) {
2933
- if (self.journal.entry_for_op_exact_with_checksum(self.op, header.checksum)) |_| {
3526
+ if (self.journal.header_with_op_and_checksum(self.op, header.checksum)) |_| {
2934
3527
  // Fall through below to check if self.op is uncommitted AND reordered,
2935
3528
  // which we would see by the presence of an earlier op with higher view number,
2936
3529
  // that breaks the chain with self.op. In this case, we must skip the repair to
@@ -2944,27 +3537,42 @@ pub fn Replica(
2944
3537
  }
2945
3538
  }
2946
3539
 
2947
- if (self.journal.entry(header)) |existing| {
3540
+ if (self.journal.header_for_entry(header)) |existing| {
3541
+ assert(existing.op == header.op);
3542
+
2948
3543
  // Do not replace any existing op lightly as doing so may impair durability and even
2949
3544
  // violate correctness by undoing a prepare already acknowledged to the leader:
2950
3545
  if (existing.checksum == header.checksum) {
2951
- if (!self.journal.dirty.bit(header.op)) {
2952
- log.debug("{}: repair_header: false (checksum clean)", .{self.replica});
3546
+ const slot = self.journal.slot_with_header(header).?;
3547
+ if (!self.journal.dirty.bit(slot)) {
3548
+ log.debug("{}: repair_header: op={} false (checksum clean)", .{
3549
+ self.replica,
3550
+ header.op,
3551
+ });
2953
3552
  return false;
2954
3553
  }
2955
3554
 
2956
- log.debug("{}: repair_header: exists, checksum dirty", .{self.replica});
3555
+ log.debug("{}: repair_header: op={} exists, checksum dirty", .{
3556
+ self.replica,
3557
+ header.op,
3558
+ });
2957
3559
  } else if (existing.view == header.view) {
2958
3560
  // The journal must have wrapped:
2959
3561
  // We expect that the same view and op will have the same checksum.
2960
3562
  assert(existing.op != header.op);
2961
3563
 
2962
3564
  if (existing.op > header.op) {
2963
- log.debug("{}: repair_header: false (view has newer op)", .{self.replica});
3565
+ log.debug("{}: repair_header: op={} false (view has newer op)", .{
3566
+ self.replica,
3567
+ header.op,
3568
+ });
2964
3569
  return false;
2965
3570
  }
2966
3571
 
2967
- log.debug("{}: repair_header: exists, view has older op", .{self.replica});
3572
+ log.debug("{}: repair_header: op={} exists, view has older op", .{
3573
+ self.replica,
3574
+ header.op,
3575
+ });
2968
3576
  } else {
2969
3577
  assert(existing.view != header.view);
2970
3578
  assert(existing.op == header.op or existing.op != header.op);
@@ -2972,38 +3580,37 @@ pub fn Replica(
2972
3580
  if (!self.repair_header_would_connect_hash_chain(header)) {
2973
3581
  // We cannot replace this op until we are sure that doing so would not
2974
3582
  // violate any prior commitments made to the leader.
2975
- log.debug("{}: repair_header: false (exists)", .{self.replica});
3583
+ log.debug("{}: repair_header: op={} false (exists)", .{
3584
+ self.replica,
3585
+ header.op,
3586
+ });
2976
3587
  return false;
2977
3588
  }
2978
3589
 
2979
- log.debug("{}: repair_header: exists, connects hash chain", .{self.replica});
3590
+ log.debug("{}: repair_header: op={} exists, connects hash chain", .{
3591
+ self.replica,
3592
+ header.op,
3593
+ });
2980
3594
  }
2981
3595
  } else {
2982
- log.debug("{}: repair_header: gap", .{self.replica});
3596
+ log.debug("{}: repair_header: op={} gap", .{ self.replica, header.op });
2983
3597
  }
2984
3598
 
2985
3599
  // Caveat: Do not repair an existing op or gap if doing so would break the hash chain:
2986
3600
  if (self.repair_header_would_break_hash_chain_with_next_entry(header)) {
2987
- log.debug("{}: repair_header: false (breaks hash chain)", .{self.replica});
3601
+ log.debug("{}: repair_header: op={} false (breaks hash chain)", .{
3602
+ self.replica,
3603
+ header.op,
3604
+ });
2988
3605
  return false;
2989
3606
  }
2990
3607
 
2991
- // Caveat: Do not repair an existing op or gap if doing so would overlap another:
2992
- if (self.repair_header_would_overlap_another(header)) {
2993
- if (!self.repair_header_would_connect_hash_chain(header)) {
2994
- log.debug("{}: repair_header: false (overlap)", .{self.replica});
2995
- return false;
2996
- }
2997
- // We may have to overlap previous entries in order to connect the hash chain:
2998
- log.debug("{}: repair_header: overlap, connects hash chain", .{self.replica});
2999
- }
3000
-
3001
3608
  // TODO Snapshots: Skip if this header is already snapshotted.
3002
3609
 
3003
3610
  assert(header.op < self.op or
3004
- self.journal.entry_for_op_exact(self.op).?.checksum == header.checksum);
3611
+ self.journal.header_with_op(self.op).?.checksum == header.checksum);
3005
3612
 
3006
- self.journal.set_entry_as_dirty(header);
3613
+ self.journal.set_header_as_dirty(header);
3007
3614
  return true;
3008
3615
  }
3009
3616
 
@@ -3024,10 +3631,12 @@ pub fn Replica(
3024
3631
  if (header.checksum == next.parent) {
3025
3632
  assert(header.view <= next.view);
3026
3633
  assert(header.op + 1 == next.op);
3027
- // We don't break with `next` but this is no guarantee that `next` does not break.
3634
+ // We don't break with `next` but this is no guarantee that `next` does not
3635
+ // break.
3028
3636
  return false;
3029
3637
  } else {
3030
- // If the journal has wrapped, then err in favor of a break regardless of op order:
3638
+ // If the journal has wrapped, then err in favor of a break regardless of op
3639
+ // order:
3031
3640
  return true;
3032
3641
  }
3033
3642
  }
@@ -3036,14 +3645,17 @@ pub fn Replica(
3036
3645
  return false;
3037
3646
  }
3038
3647
 
3039
- /// If we repair this header, then would this connect the hash chain through to the latest op?
3040
- /// This offers a strong guarantee that may be used to replace or overlap an existing op.
3648
+ /// If we repair this header, then would this connect the hash chain through to the latest
3649
+ /// op? This offers a strong guarantee that may be used to replace or overlap an existing
3650
+ /// op.
3041
3651
  ///
3042
3652
  /// Here is an example of what could go wrong if we did not check for complete connection:
3043
3653
  ///
3044
3654
  /// 1. We do a prepare that's going to be committed.
3045
- /// 2. We do a stale prepare to the right of this, ignoring the hash chain break to the left.
3046
- /// 3. We do another stale prepare that replaces the first op because it connects to the second.
3655
+ /// 2. We do a stale prepare to the right of this, ignoring the hash chain break to the
3656
+ /// left.
3657
+ /// 3. We do another stale prepare that replaces the first op because it connects to the
3658
+ /// second.
3047
3659
  ///
3048
3660
  /// This would violate our quorum replication commitment to the leader.
3049
3661
  /// The mistake in this example was not that we ignored the break to the left, which we must
@@ -3066,43 +3678,16 @@ pub fn Replica(
3066
3678
  }
3067
3679
 
3068
3680
  assert(entry.op == self.op);
3069
- assert(entry.checksum == self.journal.entry_for_op_exact(self.op).?.checksum);
3681
+ assert(entry.checksum == self.journal.header_with_op(self.op).?.checksum);
3070
3682
  return true;
3071
3683
  }
3072
3684
 
3073
- /// If we repair this header, then would this overlap and overwrite part of another batch?
3074
- /// Journal entries have variable-sized batches that may overlap if entries are disconnected.
3075
- fn repair_header_would_overlap_another(self: *Self, header: *const Header) bool {
3076
- // TODO Snapshots: Handle journal wrap around.
3077
- {
3078
- // Look behind this entry for any preceeding entry that this would overlap:
3079
- var op: u64 = header.op;
3080
- while (op > 0) {
3081
- op -= 1;
3082
- if (self.journal.entry_for_op(op)) |neighbor| {
3083
- if (Journal.next_offset(neighbor) > header.offset) return true;
3084
- break;
3085
- }
3086
- }
3087
- }
3088
- {
3089
- // Look beyond this entry for any succeeding entry that this would overlap:
3090
- var op: u64 = header.op + 1;
3091
- while (op <= self.op) : (op += 1) {
3092
- if (self.journal.entry_for_op(op)) |neighbor| {
3093
- if (Journal.next_offset(header) > neighbor.offset) return true;
3094
- break;
3095
- }
3096
- }
3097
- }
3098
- return false;
3099
- }
3100
-
3101
3685
  /// Reads prepares into the pipeline (before we start the view as the new leader).
3102
3686
  fn repair_pipeline(self: *Self) void {
3103
3687
  assert(self.status == .view_change);
3104
3688
  assert(self.leader_index(self.view) == self.replica);
3105
3689
  assert(self.commit_max < self.op);
3690
+ assert(self.journal.dirty.count == 0);
3106
3691
 
3107
3692
  if (self.repairing_pipeline) {
3108
3693
  log.debug("{}: repair_pipeline: already repairing...", .{self.replica});
@@ -3117,11 +3702,57 @@ pub fn Replica(
3117
3702
  self.repair_pipeline_read();
3118
3703
  }
3119
3704
 
3705
+ /// Discard messages from the prepare pipeline.
3706
+ /// Retain uncommitted messages that belong in the current view to maximize durability.
3707
+ fn repair_pipeline_diff(self: *Self) void {
3708
+ assert(self.status == .view_change);
3709
+ assert(self.leader_index(self.view) == self.replica);
3710
+
3711
+ // Discard messages from the front of the pipeline that committed since we were leader.
3712
+ while (self.pipeline.head_ptr()) |prepare| {
3713
+ if (prepare.message.header.op > self.commit_max) break;
3714
+
3715
+ self.message_bus.unref(self.pipeline.pop().?.message);
3716
+ }
3717
+
3718
+ // Discard the whole pipeline if it is now disconnected from the WAL's hash chain.
3719
+ if (self.pipeline.head_ptr()) |pipeline_head| {
3720
+ const parent = self.journal.header_with_op_and_checksum(
3721
+ pipeline_head.message.header.op - 1,
3722
+ pipeline_head.message.header.parent,
3723
+ );
3724
+ if (parent == null) {
3725
+ while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
3726
+ assert(self.pipeline.count == 0);
3727
+ }
3728
+ }
3729
+
3730
+ // Discard messages from the back of the pipeline that are not part of this view.
3731
+ while (self.pipeline.tail_ptr()) |prepare| {
3732
+ if (self.journal.has(prepare.message.header)) break;
3733
+
3734
+ self.message_bus.unref(self.pipeline.pop_tail().?.message);
3735
+ }
3736
+
3737
+ log.debug("{}: repair_pipeline_diff: {} prepare(s)", .{
3738
+ self.replica,
3739
+ self.pipeline.count,
3740
+ });
3741
+
3742
+ self.verify_pipeline();
3743
+
3744
+ // Do not reset `repairing_pipeline` here as this must be reset by the read callback.
3745
+ // Otherwise, we would be making `repair_pipeline()` reentrant.
3746
+ }
3747
+
3120
3748
  /// Returns the next `op` number that needs to be read into the pipeline.
3121
3749
  fn repair_pipeline_op(self: *Self) ?u64 {
3122
3750
  assert(self.status == .view_change);
3123
3751
  assert(self.leader_index(self.view) == self.replica);
3124
3752
 
3753
+ // We cannot rely on `pipeline.count` below unless the pipeline has first been diffed.
3754
+ self.repair_pipeline_diff();
3755
+
3125
3756
  const op = self.commit_max + self.pipeline.count + 1;
3126
3757
  if (op <= self.op) return op;
3127
3758
 
@@ -3139,7 +3770,7 @@ pub fn Replica(
3139
3770
  assert(op <= self.op);
3140
3771
  assert(self.commit_max + self.pipeline.count + 1 == op);
3141
3772
 
3142
- const checksum = self.journal.entry_for_op_exact(op).?.checksum;
3773
+ const checksum = self.journal.header_with_op(op).?.checksum;
3143
3774
 
3144
3775
  log.debug("{}: repair_pipeline_read: op={} checksum={}", .{
3145
3776
  self.replica,
@@ -3198,7 +3829,7 @@ pub fn Replica(
3198
3829
  return;
3199
3830
  }
3200
3831
 
3201
- if (prepare.?.header.checksum != self.journal.entry_for_op_exact(op).?.checksum) {
3832
+ if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
3202
3833
  log.debug("{}: repair_pipeline_push: checksum changed", .{self.replica});
3203
3834
  return;
3204
3835
  }
@@ -3212,6 +3843,10 @@ pub fn Replica(
3212
3843
  prepare.?.header.checksum,
3213
3844
  });
3214
3845
 
3846
+ if (self.pipeline.tail_ptr()) |parent| {
3847
+ assert(prepare.?.header.parent == parent.message.header.checksum);
3848
+ }
3849
+
3215
3850
  self.pipeline.push_assume_capacity(.{ .message = prepare.?.ref() });
3216
3851
  assert(self.pipeline.count >= 1);
3217
3852
 
@@ -3222,7 +3857,7 @@ pub fn Replica(
3222
3857
  fn repair_prepares(self: *Self) void {
3223
3858
  assert(self.status == .normal or self.status == .view_change);
3224
3859
  assert(self.repairs_allowed());
3225
- assert(self.journal.dirty.len > 0);
3860
+ assert(self.journal.dirty.count > 0);
3226
3861
 
3227
3862
  // Request enough prepares to utilize our max IO depth:
3228
3863
  var budget = self.journal.writes.available();
@@ -3231,11 +3866,34 @@ pub fn Replica(
3231
3866
  return;
3232
3867
  }
3233
3868
 
3869
+ if (self.op < config.journal_slot_count) {
3870
+ // The op is known, and this is the first WAL cycle.
3871
+ // Therefore, any faulty ops to the right of `replica.op` are corrupt reserved
3872
+ // entries from the initial format.
3873
+ var op: usize = self.op + 1;
3874
+ while (op < config.journal_slot_count) : (op += 1) {
3875
+ const slot = self.journal.slot_for_op(op);
3876
+ assert(slot.index == op);
3877
+
3878
+ if (self.journal.faulty.bit(slot)) {
3879
+ assert(self.journal.headers[op].command == .reserved);
3880
+ self.journal.dirty.clear(slot);
3881
+ self.journal.faulty.clear(slot);
3882
+ log.debug("{}: repair_prepares: op={} (op known, first cycle)", .{
3883
+ self.replica,
3884
+ op,
3885
+ });
3886
+ }
3887
+ }
3888
+ }
3889
+
3234
3890
  var op = self.op + 1;
3235
- while (op > 0) {
3891
+ const op_min = op -| config.journal_slot_count;
3892
+ while (op > op_min) {
3236
3893
  op -= 1;
3237
3894
 
3238
- if (self.journal.dirty.bit(op)) {
3895
+ const slot = self.journal.slot_for_op(op);
3896
+ if (self.journal.dirty.bit(slot)) {
3239
3897
  // If this is an uncommitted op, and we are the leader in `view_change` status,
3240
3898
  // then we will `request_prepare` from the cluster, set `nack_prepare_op`,
3241
3899
  // and stop repairing any further prepares:
@@ -3257,7 +3915,7 @@ pub fn Replica(
3257
3915
  }
3258
3916
  }
3259
3917
  } else {
3260
- assert(!self.journal.faulty.bit(op));
3918
+ assert(!self.journal.faulty.bit(slot));
3261
3919
  }
3262
3920
  }
3263
3921
  }
@@ -3279,16 +3937,17 @@ pub fn Replica(
3279
3937
  /// This is effectively "many-to-one" repair, where a single replica recovers using the
3280
3938
  /// resources of many replicas, for faster recovery.
3281
3939
  fn repair_prepare(self: *Self, op: u64) bool {
3940
+ const slot = self.journal.slot_with_op(op).?;
3941
+ const checksum = self.journal.header_with_op(op).?.checksum;
3942
+
3282
3943
  assert(self.status == .normal or self.status == .view_change);
3283
3944
  assert(self.repairs_allowed());
3284
- assert(self.journal.dirty.bit(op));
3285
-
3286
- const checksum = self.journal.entry_for_op_exact(op).?.checksum;
3945
+ assert(self.journal.dirty.bit(slot));
3287
3946
 
3288
3947
  // We may be appending to or repairing the journal concurrently.
3289
3948
  // We do not want to re-request any of these prepares unnecessarily.
3290
3949
  if (self.journal.writing(op, checksum)) {
3291
- log.debug("{}: repair_prepare: already writing op={} checksum={}", .{
3950
+ log.debug("{}: repair_prepare: op={} checksum={} (already writing)", .{
3292
3951
  self.replica,
3293
3952
  op,
3294
3953
  checksum,
@@ -3296,11 +3955,46 @@ pub fn Replica(
3296
3955
  return false;
3297
3956
  }
3298
3957
 
3958
+ // The message may be available in the local pipeline.
3959
+ // For example (replica_count=3):
3960
+ // 1. View=1: Replica 1 is leader, and prepares op 5. The local write fails.
3961
+ // 2. Time passes. The view changes (e.g. due to a timeout)…
3962
+ // 3. View=4: Replica 1 is leader again, and is repairing op 5
3963
+ // (which is still in the pipeline).
3964
+ //
3965
+ // Using the pipeline to repair is faster than a `request_prepare`.
3966
+ // Also, messages in the pipeline are never corrupt.
3967
+ if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
3968
+ assert(prepare.message.header.op == op);
3969
+ assert(prepare.message.header.checksum == checksum);
3970
+
3971
+ if (self.replica_count == 1) {
3972
+ // This op won't start writing until all ops in the pipeline preceding it have
3973
+ // been written.
3974
+ log.debug("{}: repair_prepare: op={} checksum={} (serializing append)", .{
3975
+ self.replica,
3976
+ op,
3977
+ checksum,
3978
+ });
3979
+ assert(op > self.pipeline.head_ptr().?.message.header.op);
3980
+ return false;
3981
+ }
3982
+
3983
+ log.debug("{}: repair_prepare: op={} checksum={} (from pipeline)", .{
3984
+ self.replica,
3985
+ op,
3986
+ checksum,
3987
+ });
3988
+ self.write_prepare(prepare.message, .pipeline);
3989
+ return true;
3990
+ }
3991
+
3299
3992
  const request_prepare = Header{
3300
3993
  .command = .request_prepare,
3301
- // If we request a prepare from a follower, as below, it is critical to pass a checksum:
3302
- // Otherwise we could receive different prepares for the same op number.
3994
+ // If we request a prepare from a follower, as below, it is critical to pass a
3995
+ // checksum: Otherwise we could receive different prepares for the same op number.
3303
3996
  .context = checksum,
3997
+ .timestamp = 1, // The checksum is included in context.
3304
3998
  .cluster = self.cluster,
3305
3999
  .replica = self.replica,
3306
4000
  .view = self.view,
@@ -3311,7 +4005,7 @@ pub fn Replica(
3311
4005
  // Only the leader is allowed to do repairs in a view change:
3312
4006
  assert(self.leader_index(self.view) == self.replica);
3313
4007
 
3314
- const reason = if (self.journal.faulty.bit(op)) "faulty" else "dirty";
4008
+ const reason = if (self.journal.faulty.bit(slot)) "faulty" else "dirty";
3315
4009
  log.debug(
3316
4010
  "{}: repair_prepare: op={} checksum={} (uncommitted, {s}, view_change)",
3317
4011
  .{
@@ -3322,7 +4016,7 @@ pub fn Replica(
3322
4016
  },
3323
4017
  );
3324
4018
 
3325
- if (self.replica_count == 2 and !self.journal.faulty.bit(op)) {
4019
+ if (self.replica_count == 2 and !self.journal.faulty.bit(slot)) {
3326
4020
  // This is required to avoid a liveness issue for a cluster-of-two where a new
3327
4021
  // leader learns of an op during a view change but where the op is faulty on
3328
4022
  // the old leader. We must immediately roll back the op since it could not have
@@ -3354,7 +4048,7 @@ pub fn Replica(
3354
4048
  self.send_header_to_other_replicas(request_prepare);
3355
4049
  } else {
3356
4050
  const nature = if (op > self.commit_max) "uncommitted" else "committed";
3357
- const reason = if (self.journal.faulty.bit(op)) "faulty" else "dirty";
4051
+ const reason = if (self.journal.faulty.bit(slot)) "faulty" else "dirty";
3358
4052
  log.debug("{}: repair_prepare: op={} checksum={} ({s}, {s})", .{
3359
4053
  self.replica,
3360
4054
  op,
@@ -3417,22 +4111,6 @@ pub fn Replica(
3417
4111
  self.send_message_to_replica(next, message);
3418
4112
  }
3419
4113
 
3420
- /// Empties the prepare pipeline, unreffing all prepare and prepare_ok messages.
3421
- /// Stops the prepare timeout and resets the timeouts counter.
3422
- fn reset_pipeline(self: *Self) void {
3423
- while (self.pipeline.pop()) |prepare| {
3424
- self.message_bus.unref(prepare.message);
3425
- }
3426
-
3427
- self.prepare_timeout.stop();
3428
-
3429
- assert(self.pipeline.count == 0);
3430
- assert(self.prepare_timeout.ticking == false);
3431
-
3432
- // Do not reset `repairing_pipeline` here as this must be reset by the read callback.
3433
- // Otherwise, we would be making `repair_pipeline()` reentrant.
3434
- }
3435
-
3436
4114
  fn reset_quorum_messages(self: *Self, messages: *QuorumMessages, command: Command) void {
3437
4115
  assert(messages.len == config.replicas_max);
3438
4116
  var view: ?u32 = null;
@@ -3457,7 +4135,12 @@ pub fn Replica(
3457
4135
  received.* = null;
3458
4136
  }
3459
4137
  assert(count <= self.replica_count);
3460
- log.debug("{}: reset {} {s} message(s)", .{ self.replica, count, @tagName(command) });
4138
+ log.debug("{}: reset {} {s} message(s) from view={}", .{
4139
+ self.replica,
4140
+ count,
4141
+ @tagName(command),
4142
+ view,
4143
+ });
3461
4144
  }
3462
4145
 
3463
4146
  fn reset_quorum_counter(self: *Self, counter: *QuorumCounter) void {
@@ -3466,7 +4149,7 @@ pub fn Replica(
3466
4149
  assert(replica < self.replica_count);
3467
4150
  }
3468
4151
 
3469
- counter.setIntersection(QuorumCounterNull);
4152
+ counter.setIntersection(quorum_counter_null);
3470
4153
  assert(counter.count() == 0);
3471
4154
 
3472
4155
  var replica: usize = 0;
@@ -3490,6 +4173,16 @@ pub fn Replica(
3490
4173
  self.start_view_change_quorum = false;
3491
4174
  }
3492
4175
 
4176
+ fn reset_quorum_recovery_response(self: *Self) void {
4177
+ for (self.recovery_response_from_other_replicas) |*received, replica| {
4178
+ if (received.*) |message| {
4179
+ assert(replica != self.replica);
4180
+ self.message_bus.unref(message);
4181
+ received.* = null;
4182
+ }
4183
+ }
4184
+ }
4185
+
3493
4186
  fn send_prepare_ok(self: *Self, header: *const Header) void {
3494
4187
  assert(header.command == .prepare);
3495
4188
  assert(header.cluster == self.cluster);
@@ -3549,7 +4242,7 @@ pub fn Replica(
3549
4242
  .view = self.view,
3550
4243
  .op = header.op,
3551
4244
  .commit = header.commit,
3552
- .offset = header.offset,
4245
+ .timestamp = header.timestamp,
3553
4246
  .operation = header.operation,
3554
4247
  });
3555
4248
  } else {
@@ -3567,7 +4260,7 @@ pub fn Replica(
3567
4260
  // * being able to send what we have will allow the pipeline to commit earlier, and
3568
4261
  // * the leader will drop any prepare_ok for a prepare not in the pipeline.
3569
4262
  // This is safe only because the leader can verify against the prepare checksum.
3570
- if (self.journal.entry_for_op_exact(op)) |header| {
4263
+ if (self.journal.header_with_op(op)) |header| {
3571
4264
  self.send_prepare_ok(header);
3572
4265
  defer self.flush_loopback_queue();
3573
4266
  }
@@ -3603,8 +4296,8 @@ pub fn Replica(
3603
4296
  assert(message.header.command == .do_view_change);
3604
4297
  assert(message.header.view == self.view);
3605
4298
  assert(message.header.op == self.op);
4299
+ assert(message.header.op == self.message_body_as_headers(message)[0].op);
3606
4300
  assert(message.header.commit == self.commit_max);
3607
- // TODO Assert that latest header in message body matches self.op.
3608
4301
 
3609
4302
  self.send_message_to_replica(self.leader_index(self.view), message);
3610
4303
  }
@@ -3679,6 +4372,7 @@ pub fn Replica(
3679
4372
 
3680
4373
  // TODO According to message.header.command, assert on the destination replica.
3681
4374
  switch (message.header.command) {
4375
+ .reserved => unreachable,
3682
4376
  .request => {
3683
4377
  // Do not assert message.header.replica because we forward .request messages.
3684
4378
  assert(self.status == .normal);
@@ -3731,6 +4425,16 @@ pub fn Replica(
3731
4425
  },
3732
4426
  else => unreachable,
3733
4427
  },
4428
+ .recovery => {
4429
+ assert(self.status == .recovering);
4430
+ assert(message.header.replica == self.replica);
4431
+ assert(message.header.context == self.recovery_nonce);
4432
+ },
4433
+ .recovery_response => {
4434
+ assert(self.status == .normal);
4435
+ assert(message.header.view == self.view);
4436
+ assert(message.header.replica == self.replica);
4437
+ },
3734
4438
  .headers => {
3735
4439
  assert(self.status == .normal or self.status == .view_change);
3736
4440
  assert(message.header.view == self.view);
@@ -3757,7 +4461,7 @@ pub fn Replica(
3757
4461
  .nack_prepare => {
3758
4462
  assert(message.header.view == self.view);
3759
4463
  assert(message.header.replica == self.replica);
3760
- assert(replica == self.leader_index(self.view));
4464
+ assert(self.leader_index(self.view) == replica);
3761
4465
  },
3762
4466
  else => {
3763
4467
  log.info("{}: send_message_to_replica: TODO {s}", .{
@@ -3776,8 +4480,8 @@ pub fn Replica(
3776
4480
  }
3777
4481
 
3778
4482
  /// Finds the header with the highest op number in a slice of headers from a replica.
3779
- /// Searches only by op number to find the highest `self.op for the replica.
3780
- fn set_latest_op(headers: []Header, latest: *Header) void {
4483
+ /// Searches only by op number to find the highest `self.op` for the replica.
4484
+ fn set_latest_op(headers: []const Header, latest: *Header) void {
3781
4485
  switch (latest.command) {
3782
4486
  .reserved, .prepare => assert(latest.valid_checksum()),
3783
4487
  else => unreachable,
@@ -3802,17 +4506,27 @@ pub fn Replica(
3802
4506
  k: u64,
3803
4507
  method: []const u8,
3804
4508
  ) void {
3805
- assert(self.status == .view_change);
3806
-
4509
+ assert(self.status == .view_change or self.status == .recovering);
4510
+ assert(self.journal.recovered);
3807
4511
  assert(latest.valid_checksum());
3808
4512
  assert(latest.invalid() == null);
3809
4513
  assert(latest.command == .prepare);
3810
4514
  assert(latest.cluster == self.cluster);
3811
4515
 
3812
- // The view may have started already, so we can have a prepare in the same view:
3813
- assert(latest.view <= self.view);
4516
+ switch (self.status) {
4517
+ .normal => unreachable,
4518
+ .view_change => {
4519
+ // The view may have started already, so we can have a prepare in the same view:
4520
+ assert(latest.view <= self.view);
4521
+ },
4522
+ .recovering => {
4523
+ // The replica's view hasn't been set yet.
4524
+ // It will be set shortly, when we transition to normal status.
4525
+ assert(self.view == 0);
4526
+ },
4527
+ }
3814
4528
 
3815
- log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={} offset={}", .{
4529
+ log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={}", .{
3816
4530
  self.replica,
3817
4531
  method,
3818
4532
  self.view,
@@ -3821,7 +4535,6 @@ pub fn Replica(
3821
4535
  self.commit_max,
3822
4536
  k,
3823
4537
  latest.checksum,
3824
- latest.offset,
3825
4538
  });
3826
4539
 
3827
4540
  // Uncommitted ops may not survive a view change so we must assert `latest.op` against
@@ -3863,15 +4576,15 @@ pub fn Replica(
3863
4576
  // Do not set the latest op as dirty if we already have it exactly:
3864
4577
  // Otherwise, this would trigger a repair and delay the view change, or worse, it would
3865
4578
  // prevent us from assisting another replica to recover when we do in fact have the op.
3866
- if (self.journal.entry_for_op_exact_with_checksum(latest.op, latest.checksum)) |_| {
4579
+ if (self.journal.has(latest)) {
3867
4580
  log.debug("{}: {s}: latest op exists exactly", .{ self.replica, method });
3868
4581
  } else {
3869
- self.journal.set_entry_as_dirty(latest);
4582
+ self.journal.set_header_as_dirty(latest);
3870
4583
  }
3871
4584
 
3872
4585
  assert(self.op == latest.op);
3873
4586
  self.journal.remove_entries_from(self.op + 1);
3874
- assert(self.journal.entry_for_op_exact(self.op).?.checksum == latest.checksum);
4587
+ assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
3875
4588
  }
3876
4589
 
3877
4590
  fn start_view_as_the_new_leader(self: *Self) void {
@@ -3884,31 +4597,18 @@ pub fn Replica(
3884
4597
 
3885
4598
  assert(self.commit_min == self.commit_max);
3886
4599
  assert(self.repair_pipeline_op() == null);
4600
+ self.verify_pipeline();
3887
4601
  assert(self.commit_max + self.pipeline.count == self.op);
3888
4602
  assert(self.valid_hash_chain_between(self.commit_min, self.op));
3889
4603
 
3890
- var pipeline_op = self.commit_max + 1;
3891
- var pipeline_parent = self.journal.entry_for_op_exact(self.commit_max).?.checksum;
3892
- var iterator = self.pipeline.iterator();
3893
- while (iterator.next_ptr()) |prepare| {
3894
- assert(prepare.message.header.command == .prepare);
3895
- assert(prepare.message.header.op == pipeline_op);
3896
- assert(prepare.message.header.parent == pipeline_parent);
3897
-
3898
- pipeline_parent = prepare.message.header.checksum;
3899
- pipeline_op += 1;
3900
- }
3901
- assert(self.pipeline.count <= config.pipeline_max);
3902
- assert(self.commit_max + self.pipeline.count == pipeline_op - 1);
3903
-
3904
- assert(self.journal.dirty.len == 0);
3905
- assert(self.journal.faulty.len == 0);
4604
+ assert(self.journal.dirty.count == 0);
4605
+ assert(self.journal.faulty.count == 0);
3906
4606
  assert(self.nack_prepare_op == null);
3907
4607
 
3908
4608
  const start_view = self.create_view_change_message(.start_view);
3909
4609
  defer self.message_bus.unref(start_view);
3910
4610
 
3911
- self.transition_to_normal_status(self.view);
4611
+ self.transition_to_normal_from_view_change_status(self.view);
3912
4612
  // Detect if the transition to normal status above accidentally resets the pipeline:
3913
4613
  assert(self.commit_max + self.pipeline.count == self.op);
3914
4614
 
@@ -3927,17 +4627,73 @@ pub fn Replica(
3927
4627
  self.send_message_to_other_replicas(start_view);
3928
4628
  }
3929
4629
 
3930
- fn transition_to_normal_status(self: *Self, new_view: u32) void {
3931
- log.debug("{}: transition_to_normal_status: view={}", .{ self.replica, new_view });
4630
+ fn transition_to_normal_from_recovering_status(self: *Self, new_view: u32) void {
4631
+ assert(self.status == .recovering);
4632
+ assert(self.view == 0);
4633
+ self.view = new_view;
4634
+ self.view_normal = new_view;
4635
+ self.status = .normal;
4636
+
4637
+ if (self.leader()) {
4638
+ log.debug(
4639
+ "{}: transition_to_normal_from_recovering_status: view={} leader",
4640
+ .{
4641
+ self.replica,
4642
+ self.view,
4643
+ },
4644
+ );
4645
+
4646
+ assert(self.journal.is_empty() or self.replica_count == 1);
4647
+ assert(!self.prepare_timeout.ticking);
4648
+ assert(!self.normal_status_timeout.ticking);
4649
+ assert(!self.view_change_status_timeout.ticking);
4650
+ assert(!self.view_change_message_timeout.ticking);
4651
+
4652
+ self.ping_timeout.start();
4653
+ self.commit_timeout.start();
4654
+ self.repair_timeout.start();
4655
+ self.recovery_timeout.stop();
4656
+ } else {
4657
+ log.debug(
4658
+ "{}: transition_to_normal_from_recovering_status: view={} follower",
4659
+ .{
4660
+ self.replica,
4661
+ self.view,
4662
+ },
4663
+ );
4664
+
4665
+ assert(!self.prepare_timeout.ticking);
4666
+ assert(!self.commit_timeout.ticking);
4667
+ assert(!self.view_change_status_timeout.ticking);
4668
+ assert(!self.view_change_message_timeout.ticking);
4669
+
4670
+ self.ping_timeout.start();
4671
+ self.normal_status_timeout.start();
4672
+ self.repair_timeout.start();
4673
+ self.recovery_timeout.stop();
4674
+ }
4675
+ }
4676
+
4677
+ fn transition_to_normal_from_view_change_status(self: *Self, new_view: u32) void {
3932
4678
  // In the VRR paper it's possible to transition from normal to normal for the same view.
3933
4679
  // For example, this could happen after a state transfer triggered by an op jump.
4680
+ assert(self.status == .view_change);
3934
4681
  assert(new_view >= self.view);
3935
4682
  self.view = new_view;
3936
4683
  self.view_normal = new_view;
3937
4684
  self.status = .normal;
3938
4685
 
3939
4686
  if (self.leader()) {
3940
- log.debug("{}: transition_to_normal_status: leader", .{self.replica});
4687
+ log.debug(
4688
+ "{}: transition_to_normal_from_view_change_status: view={} leader",
4689
+ .{
4690
+ self.replica,
4691
+ self.view,
4692
+ },
4693
+ );
4694
+
4695
+ assert(!self.prepare_timeout.ticking);
4696
+ assert(!self.recovery_timeout.ticking);
3941
4697
 
3942
4698
  self.ping_timeout.start();
3943
4699
  self.commit_timeout.start();
@@ -3947,12 +4703,15 @@ pub fn Replica(
3947
4703
  self.repair_timeout.start();
3948
4704
 
3949
4705
  // Do not reset the pipeline as there may be uncommitted ops to drive to completion.
3950
- if (self.pipeline.count > 0) {
3951
- assert(!self.prepare_timeout.ticking);
3952
- self.prepare_timeout.start();
3953
- }
4706
+ if (self.pipeline.count > 0) self.prepare_timeout.start();
3954
4707
  } else {
3955
- log.debug("{}: transition_to_normal_status: follower", .{self.replica});
4708
+ log.debug("{}: transition_to_normal_from_view_change_status: view={} follower", .{
4709
+ self.replica,
4710
+ self.view,
4711
+ });
4712
+
4713
+ assert(!self.prepare_timeout.ticking);
4714
+ assert(!self.recovery_timeout.ticking);
3956
4715
 
3957
4716
  self.ping_timeout.start();
3958
4717
  self.commit_timeout.stop();
@@ -3960,8 +4719,6 @@ pub fn Replica(
3960
4719
  self.view_change_status_timeout.stop();
3961
4720
  self.view_change_message_timeout.stop();
3962
4721
  self.repair_timeout.start();
3963
-
3964
- self.reset_pipeline();
3965
4722
  }
3966
4723
 
3967
4724
  self.reset_quorum_start_view_change();
@@ -3973,17 +4730,18 @@ pub fn Replica(
3973
4730
  assert(self.nack_prepare_op == null);
3974
4731
  }
3975
4732
 
3976
- /// A replica i that notices the need for a view change advances its view, sets its status to
3977
- /// view_change, and sends a ⟨start_view_change v, i⟩ message to all the other replicas,
3978
- /// where v identifies the new view. A replica notices the need for a view change either based
3979
- /// on its own timer, or because it receives a start_view_change or do_view_change message for
3980
- /// a view with a larger number than its own view.
4733
+ /// A replica i that notices the need for a view change advances its view, sets its status
4734
+ /// to view_change, and sends a ⟨start_view_change v, i⟩ message to all the other replicas,
4735
+ /// where v identifies the new view. A replica notices the need for a view change either
4736
+ /// based on its own timer, or because it receives a start_view_change or do_view_change
4737
+ /// message for a view with a larger number than its own view.
3981
4738
  fn transition_to_view_change_status(self: *Self, new_view: u32) void {
3982
4739
  log.debug("{}: transition_to_view_change_status: view={}..{}", .{
3983
4740
  self.replica,
3984
4741
  self.view,
3985
4742
  new_view,
3986
4743
  });
4744
+ assert(self.status == .normal or self.status == .view_change);
3987
4745
  assert(new_view > self.view);
3988
4746
  self.view = new_view;
3989
4747
  self.status = .view_change;
@@ -3994,13 +4752,14 @@ pub fn Replica(
3994
4752
  self.view_change_status_timeout.start();
3995
4753
  self.view_change_message_timeout.start();
3996
4754
  self.repair_timeout.stop();
4755
+ self.prepare_timeout.stop();
4756
+ assert(!self.recovery_timeout.ticking);
3997
4757
 
3998
4758
  // Do not reset quorum counters only on entering a view, assuming that the view will be
3999
4759
  // followed only by a single subsequent view change to the next view, because multiple
4000
4760
  // successive view changes can fail, e.g. after a view change timeout.
4001
- // We must therefore reset our counters here to avoid counting messages from an older view,
4002
- // which would violate the quorum intersection property essential for correctness.
4003
- self.reset_pipeline();
4761
+ // We must therefore reset our counters here to avoid counting messages from an older
4762
+ // view, which would violate the quorum intersection property essential for correctness.
4004
4763
  self.reset_quorum_start_view_change();
4005
4764
  self.reset_quorum_do_view_change();
4006
4765
  self.reset_quorum_nack_prepare();
@@ -4075,21 +4834,21 @@ pub fn Replica(
4075
4834
  return true;
4076
4835
  }
4077
4836
 
4078
- /// Returns true if all operations are present, correctly ordered and connected by hash chain,
4079
- /// between `op_min` and `op_max` (both inclusive).
4837
+ /// Returns true if all operations are present, correctly ordered and connected by hash
4838
+ /// chain, between `op_min` and `op_max` (both inclusive).
4080
4839
  fn valid_hash_chain_between(self: *Self, op_min: u64, op_max: u64) bool {
4081
4840
  assert(op_min <= op_max);
4082
4841
 
4083
- // If we use anything less than self.op then we may commit ops for a forked hash chain that
4084
- // have since been reordered by a new leader.
4842
+ // If we use anything less than self.op then we may commit ops for a forked hash chain
4843
+ // that have since been reordered by a new leader.
4085
4844
  assert(op_max == self.op);
4086
- var b = self.journal.entry_for_op_exact(op_max).?;
4845
+ var b = self.journal.header_with_op(op_max).?;
4087
4846
 
4088
4847
  var op = op_max;
4089
4848
  while (op > op_min) {
4090
4849
  op -= 1;
4091
4850
 
4092
- if (self.journal.entry_for_op_exact(op)) |a| {
4851
+ if (self.journal.header_with_op(op)) |a| {
4093
4852
  assert(a.op + 1 == b.op);
4094
4853
  if (a.checksum == b.parent) {
4095
4854
  assert(ascending_viewstamps(a, b));
@@ -4108,6 +4867,33 @@ pub fn Replica(
4108
4867
  return true;
4109
4868
  }
4110
4869
 
4870
+ fn verify_pipeline(self: *Self) void {
4871
+ var op = self.commit_max + 1;
4872
+ var parent = self.journal.header_with_op(self.commit_max).?.checksum;
4873
+
4874
+ var iterator = self.pipeline.iterator();
4875
+ while (iterator.next_ptr()) |prepare| {
4876
+ assert(prepare.message.header.command == .prepare);
4877
+
4878
+ log.debug("{}: verify_pipeline: op={} checksum={x} parent={x}", .{
4879
+ self.replica,
4880
+ prepare.message.header.op,
4881
+ prepare.message.header.checksum,
4882
+ prepare.message.header.parent,
4883
+ });
4884
+
4885
+ assert(self.journal.has(prepare.message.header));
4886
+ assert(prepare.message.header.op == op);
4887
+ assert(prepare.message.header.op <= self.op);
4888
+ assert(prepare.message.header.parent == parent);
4889
+
4890
+ parent = prepare.message.header.checksum;
4891
+ op += 1;
4892
+ }
4893
+ assert(self.pipeline.count <= config.pipeline_max);
4894
+ assert(self.commit_max + self.pipeline.count == op - 1);
4895
+ }
4896
+
4111
4897
  fn view_jump(self: *Self, header: *const Header) void {
4112
4898
  const to: Status = switch (header.command) {
4113
4899
  .prepare, .commit => .normal,
@@ -4203,10 +4989,10 @@ pub fn Replica(
4203
4989
  return;
4204
4990
  }
4205
4991
 
4206
- self.journal.write_prepare(write_prepare_on_write, message, trigger);
4992
+ self.journal.write_prepare(write_prepare_callback, message, trigger);
4207
4993
  }
4208
4994
 
4209
- fn write_prepare_on_write(
4995
+ fn write_prepare_callback(
4210
4996
  self: *Self,
4211
4997
  wrote: ?*Message,
4212
4998
  trigger: Journal.Write.Trigger,
@@ -4222,6 +5008,7 @@ pub fn Replica(
4222
5008
  // If this was a repair, continue immediately to repair the next prepare:
4223
5009
  // This is an optimization to eliminate waiting until the next repair timeout.
4224
5010
  .repair => self.repair(),
5011
+ .pipeline => self.repair(),
4225
5012
  }
4226
5013
  }
4227
5014
  };