tigerbeetle-node 0.11.12 → 0.11.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/README.md +212 -196
  2. package/dist/.client.node.sha256 +1 -1
  3. package/package.json +3 -2
  4. package/src/node.zig +1 -0
  5. package/src/tigerbeetle/scripts/benchmark.bat +9 -2
  6. package/src/tigerbeetle/scripts/benchmark.sh +1 -1
  7. package/src/tigerbeetle/scripts/fail_on_diff.sh +9 -0
  8. package/src/tigerbeetle/scripts/fuzz_loop_hash_log.sh +12 -0
  9. package/src/tigerbeetle/scripts/scripts/benchmark.bat +9 -2
  10. package/src/tigerbeetle/scripts/scripts/benchmark.sh +1 -1
  11. package/src/tigerbeetle/scripts/scripts/fail_on_diff.sh +9 -0
  12. package/src/tigerbeetle/scripts/scripts/fuzz_loop_hash_log.sh +12 -0
  13. package/src/tigerbeetle/src/benchmark.zig +253 -231
  14. package/src/tigerbeetle/src/config.zig +2 -3
  15. package/src/tigerbeetle/src/constants.zig +2 -10
  16. package/src/tigerbeetle/src/io/linux.zig +15 -6
  17. package/src/tigerbeetle/src/lsm/forest.zig +1 -0
  18. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +63 -14
  19. package/src/tigerbeetle/src/lsm/groove.zig +134 -70
  20. package/src/tigerbeetle/src/lsm/level_iterator.zig +2 -2
  21. package/src/tigerbeetle/src/lsm/manifest_level.zig +1 -0
  22. package/src/tigerbeetle/src/lsm/posted_groove.zig +7 -4
  23. package/src/tigerbeetle/src/lsm/segmented_array.zig +1 -0
  24. package/src/tigerbeetle/src/lsm/table.zig +29 -51
  25. package/src/tigerbeetle/src/lsm/table_immutable.zig +6 -17
  26. package/src/tigerbeetle/src/lsm/table_iterator.zig +2 -2
  27. package/src/tigerbeetle/src/lsm/table_mutable.zig +9 -26
  28. package/src/tigerbeetle/src/lsm/test.zig +1 -0
  29. package/src/tigerbeetle/src/lsm/tree.zig +2 -26
  30. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +7 -2
  31. package/src/tigerbeetle/src/message_bus.zig +1 -0
  32. package/src/tigerbeetle/src/simulator.zig +14 -3
  33. package/src/tigerbeetle/src/state_machine/auditor.zig +1 -0
  34. package/src/tigerbeetle/src/state_machine.zig +402 -184
  35. package/src/tigerbeetle/src/stdx.zig +9 -0
  36. package/src/tigerbeetle/src/testing/cluster.zig +1 -0
  37. package/src/tigerbeetle/src/testing/packet_simulator.zig +19 -9
  38. package/src/tigerbeetle/src/testing/state_machine.zig +1 -0
  39. package/src/tigerbeetle/src/unit_tests.zig +20 -22
  40. package/src/tigerbeetle/src/vsr/README.md +1 -1
  41. package/src/tigerbeetle/src/vsr/client.zig +4 -4
  42. package/src/tigerbeetle/src/vsr/clock.zig +2 -0
  43. package/src/tigerbeetle/src/vsr/journal.zig +2 -0
  44. package/src/tigerbeetle/src/vsr/replica.zig +481 -246
  45. package/src/tigerbeetle/src/vsr.zig +104 -31
@@ -208,8 +208,21 @@ pub fn ReplicaType(
208
208
  /// we require and assert in our protocol implementation.
209
209
  loopback_queue: ?*Message = null,
210
210
 
211
- /// Unique start_view_change messages for the same view from OTHER replicas (excluding ourself).
212
- start_view_change_from_other_replicas: QuorumCounter = quorum_counter_null,
211
+ /// The last timestamp received on a commit heartbeat.
212
+ /// The timestamp originates from the primary's monotonic clock. It is used to discard
213
+ /// delayed or duplicate heartbeat messages.
214
+ /// (status=normal backup)
215
+ heartbeat_timestamp: u64 = 0,
216
+
217
+ /// While set, don't send commit heartbeats.
218
+ /// Used when the primary believes that it is partitioned and needs to step down.
219
+ /// In particular, guards against a deadlock in the case where small messages (e.g.
220
+ /// heartbeats, pings/pongs) succeed, but large messages (e.g. prepares) fail.
221
+ /// (status=normal primary, pipeline has prepare with !ok_quorum_received)
222
+ primary_abdicating: bool = false,
223
+
224
+ /// Unique start_view_change messages for the same view from ALL replicas (including ourself).
225
+ start_view_change_from_all_replicas: QuorumCounter = quorum_counter_null,
213
226
 
214
227
  /// Unique do_view_change messages for the same view from ALL replicas (including ourself).
215
228
  do_view_change_from_all_replicas: QuorumMessages = quorum_messages_null,
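
The new heartbeat_timestamp field lets a backup discard late or duplicated commit heartbeats: only a strictly larger monotonic timestamp from the primary resets normal_heartbeat_timeout (see the on_commit hunk further down). A minimal sketch of that filter, using hypothetical names (Backup, on_heartbeat) rather than the replica's actual API:

const std = @import("std");

// Hypothetical stand-in for the backup-side filter: only a strictly newer
// monotonic timestamp counts as a fresh heartbeat; older or equal timestamps
// are dropped, so a delayed heartbeat cannot keep postponing a view change.
const Backup = struct {
    heartbeat_timestamp: u64 = 0,
    heartbeats_accepted: u32 = 0,

    fn on_heartbeat(backup: *Backup, timestamp: u64) void {
        if (backup.heartbeat_timestamp < timestamp) {
            backup.heartbeat_timestamp = timestamp;
            backup.heartbeats_accepted += 1; // The real code resets normal_heartbeat_timeout here.
        }
    }
};

test "delayed or duplicate heartbeats are ignored" {
    var backup = Backup{};
    backup.on_heartbeat(10);
    backup.on_heartbeat(10); // Duplicate.
    backup.on_heartbeat(7); // Delayed.
    backup.on_heartbeat(11);
    try std.testing.expectEqual(@as(u32, 2), backup.heartbeats_accepted);
    try std.testing.expectEqual(@as(u64, 11), backup.heartbeat_timestamp);
}
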
@@ -217,9 +230,6 @@ pub fn ReplicaType(
217
230
  /// Unique nack_prepare messages for the same view from OTHER replicas (excluding ourself).
218
231
  nack_prepare_from_other_replicas: QuorumCounter = quorum_counter_null,
219
232
 
220
- /// Whether a replica has received a quorum of start_view_change messages for the view change:
221
- start_view_change_quorum: bool = false,
222
-
223
233
  /// Whether the primary has received a quorum of do_view_change messages for the view change:
224
234
  /// Determines whether the primary may effect repairs according to the CTRL protocol.
225
235
  do_view_change_quorum: bool = false,
@@ -230,28 +240,54 @@ pub fn ReplicaType(
230
240
  /// The number of ticks before a primary or backup broadcasts a ping to other replicas.
231
241
  /// TODO Explain why we need this (MessageBus handshaking, leapfrogging faulty replicas,
232
242
  /// deciding whether starting a view change would be detrimental under some network partitions).
243
+ /// (status=normal replicas)
233
244
  ping_timeout: Timeout,
234
245
 
235
246
  /// The number of ticks without enough prepare_ok's before the primary resends a prepare.
247
+ /// (status=normal primary, pipeline has prepare with !ok_quorum_received)
236
248
  prepare_timeout: Timeout,
237
249
 
250
+ /// The number of ticks waiting for a prepare_ok.
251
+ /// When triggered, set primary_abdicating=true, which pauses outgoing commit heartbeats.
252
+ /// (status=normal primary, pipeline has prepare with !ok_quorum_received)
253
+ primary_abdicate_timeout: Timeout,
254
+
238
255
  /// The number of ticks before the primary sends a commit heartbeat:
239
256
  /// The primary always sends a commit heartbeat irrespective of when it last sent a prepare.
240
257
  /// This improves liveness when prepare messages cannot be replicated fully due to partitions.
241
- commit_timeout: Timeout,
242
-
243
- /// The number of ticks without hearing from the primary before starting a view change.
244
- /// This transitions from .normal status to .view_change status.
245
- normal_status_timeout: Timeout,
246
-
247
- /// The number of ticks before a view change is timed out:
248
- /// This transitions from `view_change` status to `view_change` status but for a newer view.
258
+ /// (status=normal primary)
259
+ commit_message_timeout: Timeout,
260
+
261
+ /// The number of ticks without a heartbeat.
262
+ /// Reset any time the backup receives a heartbeat from the primary.
263
+ /// Triggers SVC messages. If an SVC quorum is achieved, we will kick off a view-change.
264
+ /// (status=normal backup)
265
+ normal_heartbeat_timeout: Timeout,
266
+
267
+ /// The number of ticks before resetting the SVC quorum.
268
+ /// (status=normal|view-change, SVC quorum contains message from ANY OTHER replica)
269
+ start_view_change_window_timeout: Timeout,
270
+
271
+ /// The number of ticks before resending a `start_view_change` message.
272
+ /// (status=normal|view-change)
273
+ start_view_change_message_timeout: Timeout,
274
+
275
+ /// The number of ticks before a view change is timed out.
276
+ /// When triggered, begin sending SVC messages (to attempt to increment the view and try a
277
+ /// different primary) — but keep trying DVCs as well.
278
+ /// (status=view-change)
249
279
  view_change_status_timeout: Timeout,
250
280
 
251
- /// The number of ticks before resending a `start_view_change` or `do_view_change` message:
252
- view_change_message_timeout: Timeout,
281
+ /// The number of ticks before resending a `do_view_change` message:
282
+ /// (status=view-change)
283
+ do_view_change_message_timeout: Timeout,
284
+
285
+ /// The number of ticks before resending a `request_start_view` message.
286
+ /// (status=view-change backup)
287
+ request_start_view_message_timeout: Timeout,
253
288
 
254
289
  /// The number of ticks before repairing missing/disconnected headers and/or dirty entries:
290
+ /// (status=normal or (status=view-change and primary))
255
291
  repair_timeout: Timeout,
256
292
 
257
293
  /// Used to provide deterministic entropy to `choose_any_other_replica()`.
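
All of these timeout fields follow the same tick/fired/reset discipline exercised by tick() further down: every tick() call advances the counters, fired() reports expiry, and the matching on_*_timeout handler re-arms or stops the timeout. A self-contained sketch of that pattern, assuming a plain countdown (CountdownTimeout is hypothetical; the package's Timeout type is defined elsewhere in the tree and is richer than this):

const std = @import("std");

// Hypothetical countdown with the same surface as the pattern above:
// tick() advances it, fired() reports expiry, reset() re-arms it.
const CountdownTimeout = struct {
    after: u64,
    ticks: u64 = 0,
    ticking: bool = false,

    fn start(timeout: *CountdownTimeout) void {
        timeout.ticking = true;
        timeout.ticks = 0;
    }

    fn tick(timeout: *CountdownTimeout) void {
        if (timeout.ticking) timeout.ticks += 1;
    }

    fn fired(timeout: *const CountdownTimeout) bool {
        return timeout.ticking and timeout.ticks >= timeout.after;
    }

    fn reset(timeout: *CountdownTimeout) void {
        timeout.ticks = 0;
    }
};

test "a timeout fires only after `after` ticks" {
    var timeout = CountdownTimeout{ .after = 3 };
    timeout.start();
    timeout.tick();
    timeout.tick();
    try std.testing.expect(!timeout.fired());
    timeout.tick();
    try std.testing.expect(timeout.fired());
    timeout.reset();
    try std.testing.expect(!timeout.fired());
}
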
@@ -479,8 +515,6 @@ pub fn ReplicaType(
479
515
  time: Time,
480
516
  storage: *Storage,
481
517
  message_pool: *MessagePool,
482
- // TODO With https://github.com/coilhq/tigerbeetle/issues/71,
483
- // the separate message_bus_options won't be necessary.
484
518
  message_bus_options: MessageBus.Options,
485
519
  state_machine_options: StateMachine.Options,
486
520
  };
@@ -578,26 +612,46 @@ pub fn ReplicaType(
578
612
  .id = replica_index,
579
613
  .after = 50,
580
614
  },
581
- .commit_timeout = Timeout{
582
- .name = "commit_timeout",
615
+ .primary_abdicate_timeout = Timeout{
616
+ .name = "primary_abdicate_timeout",
583
617
  .id = replica_index,
584
- .after = 100,
618
+ .after = 1000,
619
+ },
620
+ .commit_message_timeout = Timeout{
621
+ .name = "commit_message_timeout",
622
+ .id = replica_index,
623
+ .after = 50,
624
+ },
625
+ .normal_heartbeat_timeout = Timeout{
626
+ .name = "normal_heartbeat_timeout",
627
+ .id = replica_index,
628
+ .after = 500,
585
629
  },
586
- .normal_status_timeout = Timeout{
587
- .name = "normal_status_timeout",
630
+ .start_view_change_window_timeout = Timeout{
631
+ .name = "start_view_change_window_timeout",
588
632
  .id = replica_index,
589
633
  .after = 500,
590
634
  },
635
+ .start_view_change_message_timeout = Timeout{
636
+ .name = "start_view_change_message_timeout",
637
+ .id = replica_index,
638
+ .after = 50,
639
+ },
591
640
  .view_change_status_timeout = Timeout{
592
641
  .name = "view_change_status_timeout",
593
642
  .id = replica_index,
594
643
  .after = 500,
595
644
  },
596
- .view_change_message_timeout = Timeout{
597
- .name = "view_change_message_timeout",
645
+ .do_view_change_message_timeout = Timeout{
646
+ .name = "do_view_change_message_timeout",
598
647
  .id = replica_index,
599
648
  .after = 50,
600
649
  },
650
+ .request_start_view_message_timeout = Timeout{
651
+ .name = "request_start_view_message_timeout",
652
+ .id = replica_index,
653
+ .after = 100,
654
+ },
601
655
  .repair_timeout = Timeout{
602
656
  .name = "repair_timeout",
603
657
  .id = replica_index,
@@ -686,18 +740,26 @@ pub fn ReplicaType(
686
740
 
687
741
  self.ping_timeout.tick();
688
742
  self.prepare_timeout.tick();
689
- self.commit_timeout.tick();
690
- self.normal_status_timeout.tick();
743
+ self.primary_abdicate_timeout.tick();
744
+ self.commit_message_timeout.tick();
745
+ self.normal_heartbeat_timeout.tick();
746
+ self.start_view_change_window_timeout.tick();
747
+ self.start_view_change_message_timeout.tick();
691
748
  self.view_change_status_timeout.tick();
692
- self.view_change_message_timeout.tick();
749
+ self.do_view_change_message_timeout.tick();
750
+ self.request_start_view_message_timeout.tick();
693
751
  self.repair_timeout.tick();
694
752
 
695
753
  if (self.ping_timeout.fired()) self.on_ping_timeout();
696
754
  if (self.prepare_timeout.fired()) self.on_prepare_timeout();
697
- if (self.commit_timeout.fired()) self.on_commit_timeout();
698
- if (self.normal_status_timeout.fired()) self.on_normal_status_timeout();
755
+ if (self.primary_abdicate_timeout.fired()) self.on_primary_abdicate_timeout();
756
+ if (self.commit_message_timeout.fired()) self.on_commit_message_timeout();
757
+ if (self.normal_heartbeat_timeout.fired()) self.on_normal_heartbeat_timeout();
758
+ if (self.start_view_change_window_timeout.fired()) self.on_start_view_change_window_timeout();
759
+ if (self.start_view_change_message_timeout.fired()) self.on_start_view_change_message_timeout();
699
760
  if (self.view_change_status_timeout.fired()) self.on_view_change_status_timeout();
700
- if (self.view_change_message_timeout.fired()) self.on_view_change_message_timeout();
761
+ if (self.do_view_change_message_timeout.fired()) self.on_do_view_change_message_timeout();
762
+ if (self.request_start_view_message_timeout.fired()) self.on_request_start_view_message_timeout();
701
763
  if (self.repair_timeout.fired()) self.on_repair_timeout();
702
764
 
703
765
  // None of the on_timeout() functions above should send a message to this replica.
@@ -743,6 +805,7 @@ pub fn ReplicaType(
743
805
  switch (message.header.command) {
744
806
  .ping => self.on_ping(message),
745
807
  .pong => self.on_pong(message),
808
+ .ping_client => self.on_ping_client(message),
746
809
  .request => self.on_request(message),
747
810
  .prepare => self.on_prepare(message),
748
811
  .prepare_ok => self.on_prepare_ok(message),
@@ -757,7 +820,7 @@ pub fn ReplicaType(
757
820
  .headers => self.on_headers(message),
758
821
  .nack_prepare => self.on_nack_prepare(message),
759
822
  // A replica should never handle misdirected messages intended for a client:
760
- .eviction, .reply => {
823
+ .pong_client, .eviction, .reply => {
761
824
  log.warn("{}: on_message: ignoring misdirected {s} message", .{
762
825
  self.replica,
763
826
  @tagName(message.header.command),
@@ -783,58 +846,34 @@ pub fn ReplicaType(
783
846
  tracer.flush();
784
847
  }
785
848
 
786
- // Pings are used:
787
- // - By clients, to learn about the current view.
788
- // - By replicas, to synchronise cluster time and to probe for network connectivity.
789
- //
790
- // In the second case we avoid setting the view to make sure pings can still be sent
791
- // during view changes.
849
+ /// Pings are used by replicas to synchronise cluster time and to probe for network connectivity.
792
850
  fn on_ping(self: *Self, message: *const Message) void {
851
+ assert(message.header.command == .ping);
793
852
  if (self.status != .normal and self.status != .view_change) return;
794
853
 
795
854
  assert(self.status == .normal or self.status == .view_change);
796
855
 
856
+ if (message.header.replica == self.replica) {
857
+ log.warn("{}: on_ping: ignoring (self)", .{self.replica});
858
+ return;
859
+ }
860
+
797
861
  // TODO Drop pings that were not addressed to us.
798
862
 
799
- var pong = Header{
863
+ self.send_header_to_replica(message.header.replica, .{
800
864
  .command = .pong,
801
865
  .cluster = self.cluster,
802
866
  .replica = self.replica,
803
- };
804
-
805
- if (message.header.client > 0) {
806
- assert(message.header.replica == 0);
807
-
808
- // We must only ever send our view number to a client via a pong message if we are
809
- // in normal status. Otherwise, we may be partitioned from the cluster with a newer
810
- // view number, leak this to the client, which would then pass this to the cluster
811
- // in subsequent client requests, which would then ignore these client requests with
812
- // a newer view number, locking out the client. The principle here is that we must
813
- // never send view numbers for views that have not yet started.
814
- if (self.status == .normal) {
815
- pong.view = self.view;
816
- self.send_header_to_client(message.header.client, pong);
817
- }
818
- } else {
819
- assert(message.header.view == 0);
820
-
821
- if (message.header.replica == self.replica) {
822
- log.warn("{}: on_ping: ignoring (self)", .{self.replica});
823
- } else {
824
- // Copy the ping's monotonic timestamp to our pong and add our wall clock sample:
825
- pong.op = message.header.op;
826
- pong.timestamp = @bitCast(u64, self.clock.realtime());
827
- self.send_header_to_replica(message.header.replica, pong);
828
- }
829
- }
867
+ // Copy the ping's monotonic timestamp to our pong and add our wall clock sample:
868
+ .op = message.header.op,
869
+ .timestamp = @bitCast(u64, self.clock.realtime()),
870
+ });
830
871
  }
831
872
 
832
873
  fn on_pong(self: *Self, message: *const Message) void {
833
- if (message.header.client > 0) return;
874
+ assert(message.header.command == .pong);
834
875
  if (message.header.replica == self.replica) return;
835
876
 
836
- assert(message.header.view == 0);
837
-
838
877
  const m0 = message.header.op;
839
878
  const t1 = @bitCast(i64, message.header.timestamp);
840
879
  const m2 = self.clock.monotonic();
@@ -842,6 +881,27 @@ pub fn ReplicaType(
842
881
  self.clock.learn(message.header.replica, m0, t1, m2);
843
882
  }
844
883
 
884
+ /// Pings are used by clients to learn about the current view.
885
+ fn on_ping_client(self: *Self, message: *const Message) void {
886
+ assert(message.header.command == .ping_client);
887
+ assert(message.header.client != 0);
888
+
889
+ // We must only ever send our view number to a client via a pong message if we are
890
+ // in normal status. Otherwise, we may be partitioned from the cluster with a newer
891
+ // view number, leak this to the client, which would then pass this to the cluster
892
+ // in subsequent client requests, which would then ignore these client requests with
893
+ // a newer view number, locking out the client. The principle here is that we must
894
+ // never send view numbers for views that have not yet started.
895
+ if (self.status != .normal) return;
896
+
897
+ self.send_header_to_client(message.header.client, .{
898
+ .command = .pong_client,
899
+ .cluster = self.cluster,
900
+ .replica = self.replica,
901
+ .view = self.view,
902
+ });
903
+ }
904
+
845
905
  /// When there is free space in the pipeline's prepare queue:
846
906
  /// The primary advances op-number, adds the request to the end of the log, and updates the
847
907
  /// information for this client in the client-table to contain the new request number, s.
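
In on_pong above, m0 is this replica's monotonic time when the ping was sent, t1 is the peer's wall-clock sample, and m2 is the monotonic time at which the pong arrived; clock.learn() uses that triple to bound the peer's clock offset. The actual bound lives in src/vsr/clock.zig, which is not part of this diff, so the following is only an illustrative NTP-style sketch with hypothetical names, not the package's formula:

const std = @import("std");

// Given a ping sent at local monotonic time m0, answered with the peer's
// wall-clock reading t1, and received at local monotonic time m2, the peer's
// offset can be bounded to within half the round-trip time on each side.
fn offset_bounds(m0: i64, t1: i64, m2: i64) struct { lower: i64, upper: i64 } {
    const rtt = m2 - m0;
    const midpoint = m0 + @divFloor(rtt, 2);
    const offset = t1 - midpoint;
    return .{
        .lower = offset - @divFloor(rtt, 2),
        .upper = offset + @divFloor(rtt, 2),
    };
}

test "the offset uncertainty is half the round-trip time on each side" {
    const bounds = offset_bounds(100, 260, 120);
    try std.testing.expectEqual(@as(i64, 140), bounds.lower);
    try std.testing.expectEqual(@as(i64, 160), bounds.upper);
}
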
@@ -905,6 +965,7 @@ pub fn ReplicaType(
905
965
  /// If the next replica is down or partitioned, then the primary's prepare timeout will fire,
906
966
  /// and the primary will resend but to another replica, until it receives enough prepare_ok's.
907
967
  fn on_prepare(self: *Self, message: *Message) void {
968
+ assert(message.header.command == .prepare);
908
969
  self.view_jump(message.header);
909
970
 
910
971
  if (self.is_repair(message)) {
@@ -949,8 +1010,6 @@ pub fn ReplicaType(
949
1010
  assert(message.header.op > self.commit_min);
950
1011
  assert(message.header.op <= self.op_checkpoint_trigger());
951
1012
 
952
- if (self.backup()) self.normal_status_timeout.reset();
953
-
954
1013
  if (message.header.op > self.op + 1) {
955
1014
  log.debug("{}: on_prepare: newer op", .{self.replica});
956
1015
  self.jump_to_newer_op_in_normal_status(message.header);
@@ -988,6 +1047,7 @@ pub fn ReplicaType(
988
1047
  }
989
1048
 
990
1049
  fn on_prepare_ok(self: *Self, message: *Message) void {
1050
+ assert(message.header.command == .prepare_ok);
991
1051
  if (self.ignore_prepare_ok(message)) return;
992
1052
 
993
1053
  assert(self.status == .normal);
@@ -1020,12 +1080,20 @@ pub fn ReplicaType(
1020
1080
  else
1021
1081
  self.quorum_replication;
1022
1082
 
1083
+ if (!prepare.ok_from_all_replicas.isSet(message.header.replica)) {
1084
+ self.primary_abdicating = false;
1085
+ if (!prepare.ok_quorum_received) {
1086
+ self.primary_abdicate_timeout.reset();
1087
+ }
1088
+ }
1089
+
1023
1090
  const count = self.count_message_and_receive_quorum_exactly_once(
1024
1091
  &prepare.ok_from_all_replicas,
1025
1092
  message,
1026
1093
  threshold,
1027
1094
  ) orelse return;
1028
1095
 
1096
+ const prepare_pending = self.primary_pipeline_pending().?;
1029
1097
  assert(count == threshold);
1030
1098
  assert(!prepare.ok_quorum_received);
1031
1099
  prepare.ok_quorum_received = true;
@@ -1035,6 +1103,16 @@ pub fn ReplicaType(
1035
1103
  prepare.message.header.checksum,
1036
1104
  });
1037
1105
 
1106
+ assert(self.prepare_timeout.ticking);
1107
+ assert(self.primary_abdicate_timeout.ticking);
1108
+ assert(!self.primary_abdicating);
1109
+ if (self.primary_pipeline_pending()) |_| {
1110
+ if (prepare_pending == prepare) self.prepare_timeout.reset();
1111
+ } else {
1112
+ self.prepare_timeout.stop();
1113
+ self.primary_abdicate_timeout.stop();
1114
+ }
1115
+
1038
1116
  self.commit_pipeline();
1039
1117
  }
1040
1118
 
@@ -1043,6 +1121,8 @@ pub fn ReplicaType(
1043
1121
  /// It's possible for the network to be one-way partitioned so that backups don't see the
1044
1122
  /// primary as down, but neither can the primary hear from the backups.
1045
1123
  fn on_commit(self: *Self, message: *const Message) void {
1124
+ assert(message.header.command == .commit);
1125
+
1046
1126
  self.view_jump(message.header);
1047
1127
 
1048
1128
  if (self.status != .normal) {
@@ -1070,6 +1150,13 @@ pub fn ReplicaType(
1070
1150
  assert(message.header.view == self.view);
1071
1151
  assert(message.header.replica == self.primary_index(message.header.view));
1072
1152
 
1153
+ // Old/duplicate heartbeats don't count.
1154
+ if (self.heartbeat_timestamp < message.header.timestamp) {
1155
+ self.heartbeat_timestamp = message.header.timestamp;
1156
+ self.normal_heartbeat_timeout.reset();
1157
+ self.start_view_change_from_all_replicas.unset(self.replica);
1158
+ }
1159
+
1073
1160
  // We may not always have the latest commit entry but if we do our checksum must match:
1074
1161
  if (self.journal.header_with_op(message.header.commit)) |commit_entry| {
1075
1162
  if (commit_entry.checksum == message.header.context) {
@@ -1082,7 +1169,6 @@ pub fn ReplicaType(
1082
1169
  }
1083
1170
  }
1084
1171
 
1085
- self.normal_status_timeout.reset();
1086
1172
  self.commit_journal(message.header.commit);
1087
1173
  }
1088
1174
 
@@ -1154,47 +1240,53 @@ pub fn ReplicaType(
1154
1240
  }
1155
1241
 
1156
1242
  fn on_start_view_change(self: *Self, message: *Message) void {
1157
- if (self.ignore_view_change_message(message)) return;
1243
+ assert(message.header.command == .start_view_change);
1244
+ if (self.ignore_start_view_change_message(message)) return;
1158
1245
 
1246
+ assert(self.replica_count > 1);
1159
1247
  assert(self.status == .normal or self.status == .view_change);
1160
1248
  assert(message.header.view >= self.view);
1161
- assert(message.header.replica != self.replica);
1162
1249
 
1163
1250
  self.view_jump(message.header);
1164
1251
 
1165
- assert(self.status == .view_change);
1166
1252
  assert(message.header.view == self.view);
1167
1253
 
1168
- // Wait until we have `f` messages (excluding ourself) for quorum:
1169
- assert(self.replica_count > 1);
1170
- const threshold = self.quorum_view_change - 1;
1254
+ // Wait until we have `f + 1` messages (possibly including ourself) for quorum.
1255
+ // This ensures that we do not start a view-change while normal request processing
1256
+ // is possible.
1257
+ const threshold = self.quorum_view_change;
1171
1258
 
1172
- const count = self.count_message_and_receive_quorum_exactly_once(
1173
- &self.start_view_change_from_other_replicas,
1174
- message,
1175
- threshold,
1176
- ) orelse return;
1259
+ self.start_view_change_from_all_replicas.set(message.header.replica);
1177
1260
 
1178
- assert(count == threshold);
1179
- assert(!self.start_view_change_from_other_replicas.isSet(self.replica));
1261
+ if (self.replica != message.header.replica and
1262
+ !self.start_view_change_window_timeout.ticking)
1263
+ {
1264
+ self.start_view_change_window_timeout.start();
1265
+ }
1266
+
1267
+ const count = self.start_view_change_from_all_replicas.count();
1268
+ assert(count <= threshold);
1269
+
1270
+ if (count < threshold) {
1271
+ log.debug("{}: on_start_view_change: view={} waiting for quorum ({}/{})", .{
1272
+ self.replica,
1273
+ self.view,
1274
+ count,
1275
+ threshold,
1276
+ });
1277
+ return;
1278
+ }
1180
1279
  log.debug("{}: on_start_view_change: view={} quorum received", .{
1181
1280
  self.replica,
1182
1281
  self.view,
1183
1282
  });
1184
1283
 
1185
- assert(!self.start_view_change_quorum);
1186
- assert(!self.do_view_change_quorum);
1187
- self.start_view_change_quorum = true;
1188
-
1189
- // When replica i receives start_view_change messages for its view from f other replicas,
1190
- // it sends a ⟨do_view_change v, l, v’, n, k, i⟩ message to the node that will be the
1191
- // primary in the new view. Here v is its view, l is its log, v′ is the view number of the
1192
- // latest view in which its status was normal, n is the op number, and k is the commit
1193
- // number.
1194
- self.send_do_view_change();
1195
- defer self.flush_loopback_queue();
1284
+ self.transition_to_view_change_status(self.view + 1);
1285
+ assert(self.start_view_change_from_all_replicas.count() == 0);
1196
1286
  }
1197
1287
 
1288
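
The reworked on_start_view_change above counts start_view_change messages from all replicas, itself included, and only transitions once f + 1 (quorum_view_change) distinct replicas are represented, so a single replica that merely lost its heartbeat cannot force a view change on its own. A condensed sketch of that counting, using std.StaticBitSet as a stand-in for the QuorumCounter type (which this diff does not show):

const std = @import("std");

test "an SVC quorum of f + 1 replicas (including self) triggers a view change" {
    const replica_count = 3;
    const quorum_view_change = 2; // f + 1, with f = 1.

    var svc_from_all_replicas = std.StaticBitSet(replica_count).initEmpty();

    // Our own start_view_change alone is not enough to start a view change:
    svc_from_all_replicas.set(0);
    try std.testing.expect(svc_from_all_replicas.count() < quorum_view_change);

    // A second replica's start_view_change completes the f + 1 quorum:
    svc_from_all_replicas.set(2);
    try std.testing.expect(svc_from_all_replicas.count() >= quorum_view_change);
}
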
+ /// DVC serves two purposes:
1289
+ ///
1198
1290
  /// When the new primary receives f + 1 do_view_change messages from different replicas
1199
1291
  /// (including itself), it sets its view number to that in the messages and selects as the
1200
1292
  /// new log the one contained in the message with the largest v′; if several messages have
@@ -1204,22 +1296,25 @@ pub fn ReplicaType(
1204
1296
  /// informs the other replicas of the completion of the view change by sending
1205
1297
  /// ⟨start_view v, l, n, k⟩ messages to the other replicas, where l is the new log, n is the
1206
1298
  /// op number, and k is the commit number.
1299
+ ///
1300
+ /// When a new backup receives a do_view_change message for a new view, it transitions to
1301
+ /// that new view in view-change status and begins to broadcast its own DVC.
1207
1302
  fn on_do_view_change(self: *Self, message: *Message) void {
1303
+ assert(message.header.command == .do_view_change);
1208
1304
  if (self.ignore_view_change_message(message)) return;
1209
1305
 
1210
1306
  assert(self.status == .normal or self.status == .view_change);
1211
1307
  assert(message.header.view >= self.view);
1212
- assert(self.primary_index(message.header.view) == self.replica);
1213
1308
 
1214
1309
  self.view_jump(message.header);
1215
1310
 
1216
1311
  assert(self.status == .view_change);
1217
1312
  assert(message.header.view == self.view);
1218
1313
 
1219
- // We may receive a `do_view_change` quorum from other replicas, which already have a
1220
- // `start_view_change_quorum`, before we receive a `start_view_change_quorum`:
1221
- if (!self.start_view_change_quorum) {
1222
- log.debug("{}: on_do_view_change: waiting for start_view_change quorum (view={})", .{
1314
+ if (self.primary_index(message.header.view) != self.replica) {
1315
+ for (self.do_view_change_from_all_replicas) |dvc| assert(dvc == null);
1316
+
1317
+ log.debug("{}: on_do_view_change: view={} backup awaiting start_view", .{
1223
1318
  self.replica,
1224
1319
  self.view,
1225
1320
  });
@@ -1244,7 +1339,6 @@ pub fn ReplicaType(
1244
1339
  self.view,
1245
1340
  });
1246
1341
 
1247
- assert(self.start_view_change_quorum);
1248
1342
  assert(!self.do_view_change_quorum);
1249
1343
  self.do_view_change_quorum = true;
1250
1344
 
@@ -1274,6 +1368,7 @@ pub fn ReplicaType(
1274
1368
  /// they execute all operations known to be committed that they haven’t executed previously,
1275
1369
  /// advance their commit number, and update the information in their client table.
1276
1370
  fn on_start_view(self: *Self, message: *const Message) void {
1371
+ assert(message.header.command == .start_view);
1277
1372
  if (self.ignore_view_change_message(message)) return;
1278
1373
 
1279
1374
  if (message.header.op > self.op_checkpoint_trigger()) {
@@ -1320,6 +1415,7 @@ pub fn ReplicaType(
1320
1415
  }
1321
1416
 
1322
1417
  fn on_request_start_view(self: *Self, message: *const Message) void {
1418
+ assert(message.header.command == .request_start_view);
1323
1419
  if (self.ignore_repair_message(message)) return;
1324
1420
 
1325
1421
  assert(self.status == .normal);
@@ -1350,6 +1446,7 @@ pub fn ReplicaType(
1350
1446
  /// prepare. If a guaranteed prepare is found to be faulty, the replica must repair it
1351
1447
  /// to restore durability.
1352
1448
  fn on_request_prepare(self: *Self, message: *const Message) void {
1449
+ assert(message.header.command == .request_prepare);
1353
1450
  if (self.ignore_repair_message(message)) return;
1354
1451
 
1355
1452
  assert(self.replica_count > 1);
@@ -1483,6 +1580,7 @@ pub fn ReplicaType(
1483
1580
  }
1484
1581
 
1485
1582
  fn on_request_headers(self: *Self, message: *const Message) void {
1583
+ assert(message.header.command == .request_headers);
1486
1584
  if (self.ignore_repair_message(message)) return;
1487
1585
 
1488
1586
  assert(self.status == .normal or self.status == .view_change);
@@ -1525,6 +1623,7 @@ pub fn ReplicaType(
1525
1623
  }
1526
1624
 
1527
1625
  fn on_nack_prepare(self: *Self, message: *Message) void {
1626
+ assert(message.header.command == .nack_prepare);
1528
1627
  if (self.ignore_repair_message(message)) return;
1529
1628
 
1530
1629
  assert(self.status == .view_change);
@@ -1622,6 +1721,7 @@ pub fn ReplicaType(
1622
1721
  }
1623
1722
 
1624
1723
  fn on_headers(self: *Self, message: *const Message) void {
1724
+ assert(message.header.command == .headers);
1625
1725
  if (self.ignore_repair_message(message)) return;
1626
1726
 
1627
1727
  assert(self.status == .normal or self.status == .view_change);
@@ -1665,12 +1765,12 @@ pub fn ReplicaType(
1665
1765
  assert(self.status == .normal);
1666
1766
  assert(self.primary());
1667
1767
 
1668
- const prepare = self.pipeline.queue.prepare_queue.head_ptr().?;
1669
- assert(prepare.message.header.command == .prepare);
1670
- assert(prepare.message.header.op == self.commit_min + 1);
1768
+ const prepare = self.primary_pipeline_pending().?;
1671
1769
 
1672
- if (prepare.ok_quorum_received) {
1673
- assert(self.committing);
1770
+ if (self.replica_count == 1) {
1771
+ // Replica=1 doesn't write prepares concurrently to avoid gaps in its WAL.
1772
+ assert(self.journal.writes.executing() <= 1);
1773
+ assert(self.journal.writes.executing() == 1 or self.committing);
1674
1774
 
1675
1775
  self.prepare_timeout.reset();
1676
1776
  return;
@@ -1701,10 +1801,15 @@ pub fn ReplicaType(
1701
1801
  }
1702
1802
 
1703
1803
  if (waiting_len == 0) {
1704
- self.prepare_timeout.reset();
1804
+ // TODO: This assert will be valid when the state-transfer is implemented and the
1805
+ // threshold=replica_count hack is removed from on_prepare_ok.
1806
+ // assert(self.quorum_replication == self.replica_count);
1807
+ assert(!prepare.ok_from_all_replicas.isSet(self.replica));
1808
+ assert(prepare.ok_from_all_replicas.count() == self.replica_count - 1);
1809
+ assert(prepare.message.header.op <= self.op);
1705
1810
 
1811
+ self.prepare_timeout.reset();
1706
1812
  log.debug("{}: on_prepare_timeout: waiting for journal", .{self.replica});
1707
- assert(!prepare.ok_from_all_replicas.isSet(self.replica));
1708
1813
 
1709
1814
  // We may be slow and waiting for the write to complete.
1710
1815
  //
@@ -1716,9 +1821,7 @@ pub fn ReplicaType(
1716
1821
  //
1717
1822
  // Retry the write through `on_repair()` which will work out which is which.
1718
1823
  // We do expect that the op would have been run through `on_prepare()` already.
1719
- assert(prepare.message.header.op <= self.op);
1720
1824
  self.on_repair(prepare.message);
1721
-
1722
1825
  return;
1723
1826
  }
1724
1827
 
@@ -1746,13 +1849,39 @@ pub fn ReplicaType(
1746
1849
  self.send_message_to_replica(replica, prepare.message);
1747
1850
  }
1748
1851
 
1749
- fn on_commit_timeout(self: *Self) void {
1750
- self.commit_timeout.reset();
1852
+ fn on_primary_abdicate_timeout(self: *Self) void {
1853
+ assert(self.status == .normal);
1854
+ assert(self.primary());
1855
+ assert(self.primary_pipeline_pending() != null);
1856
+ self.primary_abdicate_timeout.reset();
1857
+ if (self.replica_count == 1) return;
1858
+
1859
+ log.debug("{}: on_primary_abdicate_timeout: abdicating (view={})", .{
1860
+ self.replica,
1861
+ self.view,
1862
+ });
1863
+ self.primary_abdicating = true;
1864
+ }
1865
+
1866
+ fn on_commit_message_timeout(self: *Self) void {
1867
+ self.commit_message_timeout.reset();
1751
1868
 
1752
1869
  assert(self.status == .normal);
1753
1870
  assert(self.primary());
1754
1871
  assert(self.commit_min == self.commit_max);
1755
1872
 
1873
+ if (self.primary_abdicating) {
1874
+ assert(self.primary_abdicate_timeout.ticking);
1875
+ assert(self.pipeline.queue.prepare_queue.count > 0);
1876
+ assert(self.primary_pipeline_pending() != null);
1877
+
1878
+ log.debug("{}: on_commit_message_timeout: primary abdicating (view={})", .{
1879
+ self.replica,
1880
+ self.view,
1881
+ });
1882
+ return;
1883
+ }
1884
+
1756
1885
  const latest_committed_entry = checksum: {
1757
1886
  if (self.commit_max == self.superblock.working.vsr_state.commit_min) {
1758
1887
  break :checksum self.superblock.working.vsr_state.commit_min_checksum;
@@ -1768,37 +1897,86 @@ pub fn ReplicaType(
1768
1897
  .replica = self.replica,
1769
1898
  .view = self.view,
1770
1899
  .commit = self.commit_max,
1900
+ .timestamp = self.clock.monotonic(),
1771
1901
  });
1772
1902
  }
1773
1903
 
1774
- fn on_normal_status_timeout(self: *Self) void {
1904
+ fn on_normal_heartbeat_timeout(self: *Self) void {
1775
1905
  assert(self.status == .normal);
1776
1906
  assert(self.backup());
1777
- self.transition_to_view_change_status(self.view + 1);
1907
+ self.normal_heartbeat_timeout.reset();
1908
+
1909
+ if (self.replica_count == 1) return;
1910
+
1911
+ log.debug("{}: on_normal_heartbeat_timeout: heartbeat lost (view={})", .{
1912
+ self.replica,
1913
+ self.view,
1914
+ });
1915
+ self.send_start_view_change();
1916
+ }
1917
+
1918
+ fn on_start_view_change_window_timeout(self: *Self) void {
1919
+ assert(self.status == .normal or self.status == .view_change);
1920
+ assert(self.start_view_change_from_all_replicas.count() > 0);
1921
+ assert(self.replica_count > 1);
1922
+ self.start_view_change_window_timeout.stop();
1923
+
1924
+ // Don't reset our own SVC; it will be reset if/when we receive a heartbeat.
1925
+ const svc = self.start_view_change_from_all_replicas.isSet(self.replica);
1926
+ self.reset_quorum_start_view_change();
1927
+ if (svc) self.start_view_change_from_all_replicas.set(self.replica);
1928
+ }
1929
+
1930
+ fn on_start_view_change_message_timeout(self: *Self) void {
1931
+ assert(self.status == .normal or self.status == .view_change);
1932
+ self.start_view_change_message_timeout.reset();
1933
+ if (self.replica_count == 1) return;
1934
+
1935
+ if (self.start_view_change_from_all_replicas.isSet(self.replica)) {
1936
+ self.send_start_view_change();
1937
+ }
1778
1938
  }
1779
1939
 
1780
1940
  fn on_view_change_status_timeout(self: *Self) void {
1781
1941
  assert(self.status == .view_change);
1782
- self.transition_to_view_change_status(self.view + 1);
1942
+ assert(self.replica_count > 1);
1943
+ self.view_change_status_timeout.reset();
1944
+
1945
+ self.send_start_view_change();
1783
1946
  }
1784
1947
 
1785
- fn on_view_change_message_timeout(self: *Self) void {
1786
- self.view_change_message_timeout.reset();
1948
+ fn on_do_view_change_message_timeout(self: *Self) void {
1787
1949
  assert(self.status == .view_change);
1950
+ assert(self.replica_count > 1);
1951
+ self.do_view_change_message_timeout.reset();
1788
1952
 
1789
- // Keep sending `start_view_change` messages:
1790
- // We may have a `start_view_change_quorum` but other replicas may not.
1791
- // However, the primary may stop sending once it has a `do_view_change_quorum`.
1792
- if (!self.do_view_change_quorum) self.send_start_view_change();
1793
-
1794
- // It is critical that a `do_view_change` message implies a `start_view_change_quorum`:
1795
- if (self.start_view_change_quorum) {
1796
- // The primary need not retry to send a `do_view_change` message to itself:
1797
- // We assume the MessageBus will not drop messages sent by a replica to itself.
1798
- if (self.primary_index(self.view) != self.replica) self.send_do_view_change();
1953
+ if (self.primary_index(self.view) == self.replica and self.do_view_change_quorum) {
1954
+ // A primary in status=view_change with a complete DVC quorum must be repairing —
1955
+ // it does not need to signal other replicas.
1956
+ assert(self.view == self.log_view);
1957
+ } else {
1958
+ assert(self.view > self.log_view);
1959
+ self.send_do_view_change();
1799
1960
  }
1800
1961
  }
1801
1962
 
1963
+ fn on_request_start_view_message_timeout(self: *Self) void {
1964
+ assert(self.status == .view_change);
1965
+ assert(self.primary_index(self.view) != self.replica);
1966
+ self.request_start_view_message_timeout.reset();
1967
+
1968
+ log.debug("{}: on_request_start_view_message_timeout: view={}", .{
1969
+ self.replica,
1970
+ self.view,
1971
+ });
1972
+ self.send_header_to_replica(self.primary_index(self.view), .{
1973
+ .command = .request_start_view,
1974
+ .cluster = self.cluster,
1975
+ .replica = self.replica,
1976
+ .view = self.view,
1977
+ });
1978
+ }
1979
+
1802
1980
  fn on_repair_timeout(self: *Self) void {
1803
1981
  assert(self.status == .normal or self.status == .view_change);
1804
1982
  self.repair();
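
Taken together, on_primary_abdicate_timeout and the early return at the top of on_commit_message_timeout mean that a primary whose prepares are not being acknowledged stops advertising itself via heartbeats, which lets the backups' normal_heartbeat_timeout expire and start a view change instead of deadlocking. A hypothetical condensation of that decision (should_send_commit_heartbeat is not a function in the package):

const std = @import("std");

// While the primary is abdicating (a prepare has waited too long for
// prepare_ok's), it withholds commit heartbeats so that the backups'
// normal_heartbeat_timeout can fire and elect a different primary.
fn should_send_commit_heartbeat(primary_abdicating: bool, prepares_pending: bool) bool {
    if (primary_abdicating and prepares_pending) return false;
    return true;
}

test "an abdicating primary with a stuck prepare withholds heartbeats" {
    try std.testing.expect(!should_send_commit_heartbeat(true, true));
    try std.testing.expect(should_send_commit_heartbeat(false, true));
    try std.testing.expect(should_send_commit_heartbeat(false, false));
}
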
@@ -1926,13 +2104,6 @@ pub fn ReplicaType(
1926
2104
  assert(self.status == .normal);
1927
2105
  assert(self.primary());
1928
2106
  },
1929
- .start_view_change => {
1930
- assert(self.replica_count > 1);
1931
- if (self.replica_count == 2) assert(threshold == 1);
1932
-
1933
- assert(self.status == .view_change);
1934
- assert(self.replica != message.header.replica);
1935
- },
1936
2107
  .nack_prepare => {
1937
2108
  assert(self.replica_count > 1);
1938
2109
  if (self.replica_count == 2) assert(threshold >= 1);
@@ -2270,7 +2441,6 @@ pub fn ReplicaType(
2270
2441
  assert(prepare.message.header.checksum == self.commit_prepare.?.header.checksum);
2271
2442
  assert(prepare.message.header.op == self.commit_min);
2272
2443
  assert(prepare.message.header.op == self.commit_max);
2273
- assert(self.prepare_timeout.ticking);
2274
2444
 
2275
2445
  self.message_bus.unref(prepare.message);
2276
2446
 
@@ -2288,10 +2458,6 @@ pub fn ReplicaType(
2288
2458
  });
2289
2459
  self.write_prepare(next.message, .append);
2290
2460
  }
2291
- } else {
2292
- // When the pipeline is empty, stop the prepare timeout.
2293
- // The timeout will be restarted when another entry arrives for the pipeline.
2294
- self.prepare_timeout.stop();
2295
2461
  }
2296
2462
  }
2297
2463
 
@@ -2815,7 +2981,7 @@ pub fn ReplicaType(
2815
2981
  assert(
2816
2982
  header.view == self.view or
2817
2983
  header.command == .request_start_view or
2818
- header.command == .pong or header.command == .ping,
2984
+ header.command == .ping or header.command == .pong,
2819
2985
  );
2820
2986
  assert(header.size == @sizeOf(Header));
2821
2987
 
@@ -2864,25 +3030,20 @@ pub fn ReplicaType(
2864
3030
  assert(self.journal.header_with_op(self.op) != null);
2865
3031
  }
2866
3032
 
2867
- /// Returns whether the replica is a backup for the current view.
2868
- /// This may be used only when the replica status is normal.
2869
- fn backup(self: *Self) bool {
2870
- return !self.primary();
2871
- }
2872
-
2873
3033
  fn flush_loopback_queue(self: *Self) void {
2874
- // There are four cases where a replica will send a message to itself:
2875
- // However, of these four cases, all but one call send_message_to_replica().
3034
+ // There are five cases where a replica will send a message to itself:
3035
+ // However, of these five cases, all but one call send_message_to_replica().
2876
3036
  //
2877
3037
  // 1. In on_request(), the primary sends a synchronous prepare to itself, but this is
2878
3038
  // done by calling on_prepare() directly, and subsequent prepare timeout retries will
2879
3039
  // never resend to self.
2880
3040
  // 2. In on_prepare(), after writing to storage, the primary sends a (typically)
2881
3041
  // asynchronous prepare_ok to itself.
2882
- // 3. In on_start_view_change(), after receiving a quorum of start_view_change
2883
- // messages, the new primary sends a synchronous do_view_change to itself.
3042
+ // 3. In transition_to_view_change_status(), the new primary sends a synchronous DVC to
3043
+ // itself.
2884
3044
  // 4. In primary_start_view_as_the_new_primary(), the new primary sends itself a
2885
3045
  // prepare_ok message for each uncommitted message.
3046
+ // 5. In send_start_view_change(), a replica sends itself a start_view_change message.
2886
3047
  if (self.loopback_queue) |message| {
2887
3048
  defer self.message_bus.unref(message);
2888
3049
 
@@ -2898,6 +3059,8 @@ pub fn ReplicaType(
2898
3059
  }
2899
3060
 
2900
3061
  fn ignore_prepare_ok(self: *Self, message: *const Message) bool {
3062
+ assert(message.header.command == .prepare_ok);
3063
+
2901
3064
  if (self.primary_index(message.header.view) == self.replica) {
2902
3065
  assert(message.header.view <= self.view);
2903
3066
  }
@@ -3218,9 +3381,31 @@ pub fn ReplicaType(
3218
3381
  return false;
3219
3382
  }
3220
3383
 
3221
- fn ignore_view_change_message(self: *Self, message: *const Message) bool {
3222
- assert(message.header.command == .start_view_change or
3223
- message.header.command == .do_view_change or
3384
+ fn ignore_start_view_change_message(self: *const Self, message: *const Message) bool {
3385
+ assert(message.header.command == .start_view_change);
3386
+
3387
+ switch (self.status) {
3388
+ .normal, .view_change => {},
3389
+ .recovering => unreachable, // Single node clusters don't have view changes.
3390
+ .recovering_head => {
3391
+ log.debug("{}: on_start_view_change: ignoring (status={})", .{
3392
+ self.replica,
3393
+ self.status,
3394
+ });
3395
+ return true;
3396
+ },
3397
+ }
3398
+
3399
+ if (message.header.view < self.view) {
3400
+ log.debug("{}: on_start_view_change: ignoring (older view)", .{self.replica});
3401
+ return true;
3402
+ }
3403
+
3404
+ return false;
3405
+ }
3406
+
3407
+ fn ignore_view_change_message(self: *const Self, message: *const Message) bool {
3408
+ assert(message.header.command == .do_view_change or
3224
3409
  message.header.command == .start_view);
3225
3410
  assert(message.header.view > 0); // The initial view is already zero.
3226
3411
  assert(self.status != .recovering); // Single node clusters don't have view changes.
@@ -3228,6 +3413,7 @@ pub fn ReplicaType(
3228
3413
  const command: []const u8 = @tagName(message.header.command);
3229
3414
 
3230
3415
  if (self.status == .recovering_head and message.header.command != .start_view) {
3416
+ log.debug("{}: on_{s}: ignoring (recovering_head)", .{ self.replica, command });
3231
3417
  return true;
3232
3418
  }
3233
3419
 
@@ -3243,18 +3429,13 @@ pub fn ReplicaType(
3243
3429
 
3244
3430
  // These may be caused by faults in the network topology.
3245
3431
  switch (message.header.command) {
3246
- .start_view_change, .start_view => {
3432
+ .start_view => {
3247
3433
  if (message.header.replica == self.replica) {
3248
3434
  log.warn("{}: on_{s}: ignoring (self)", .{ self.replica, command });
3249
3435
  return true;
3250
3436
  }
3251
3437
  },
3252
- .do_view_change => {
3253
- if (self.primary_index(message.header.view) != self.replica) {
3254
- log.warn("{}: on_{s}: ignoring (backup)", .{ self.replica, command });
3255
- return true;
3256
- }
3257
- },
3438
+ .do_view_change => {},
3258
3439
  else => unreachable,
3259
3440
  }
3260
3441
 
@@ -3275,6 +3456,11 @@ pub fn ReplicaType(
3275
3456
  return false;
3276
3457
  }
3277
3458
 
3459
+ /// Returns the index into the configuration of the primary for a given view.
3460
+ fn primary_index(self: *const Self, view: u32) u8 {
3461
+ return @intCast(u8, @mod(view, self.replica_count));
3462
+ }
3463
+
3278
3464
  /// Returns whether the replica is the primary for the current view.
3279
3465
  /// This may be used only when the replica status is normal.
3280
3466
  fn primary(self: *const Self) bool {
@@ -3282,9 +3468,10 @@ pub fn ReplicaType(
3282
3468
  return self.primary_index(self.view) == self.replica;
3283
3469
  }
3284
3470
 
3285
- /// Returns the index into the configuration of the primary for a given view.
3286
- fn primary_index(self: *const Self, view: u32) u8 {
3287
- return @intCast(u8, @mod(view, self.replica_count));
3471
+ /// Returns whether the replica is a backup for the current view.
3472
+ /// This may be used only when the replica status is normal.
3473
+ fn backup(self: *const Self) bool {
3474
+ return !self.primary();
3288
3475
  }
3289
3476
 
3290
3477
  /// Advances `op` to where we need to be before `header` can be processed as a prepare.
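
primary_index above is pure round-robin: the primary for a view is the view modulo the replica count, so each view increment hands leadership to the next replica in the configuration. A tiny worked example (primary_index_for is an illustrative name, not the replica's method):

const std = @import("std");

// Round-robin primary selection, as primary_index() computes above.
fn primary_index_for(view: u32, replica_count: u8) u32 {
    return view % replica_count;
}

test "the primary rotates through the configuration as the view increments" {
    try std.testing.expectEqual(@as(u32, 0), primary_index_for(0, 3));
    try std.testing.expectEqual(@as(u32, 1), primary_index_for(7, 3));
    try std.testing.expectEqual(@as(u32, 2), primary_index_for(11, 3));
}
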
@@ -3512,14 +3699,17 @@ pub fn ReplicaType(
3512
3699
 
3513
3700
  log.debug("{}: primary_pipeline_next: prepare {}", .{ self.replica, message.header.checksum });
3514
3701
 
3515
- if (self.pipeline.queue.prepare_queue.tail_ptr()) |previous| {
3702
+ if (self.primary_pipeline_pending()) |_| {
3516
3703
  // Do not restart the prepare timeout as it is already ticking for another prepare.
3517
- assert(self.prepare_timeout.ticking);
3704
+ const previous = self.pipeline.queue.prepare_queue.tail_ptr().?;
3518
3705
  assert(previous.message.header.checksum == message.header.parent);
3706
+ assert(self.prepare_timeout.ticking);
3707
+ assert(self.primary_abdicate_timeout.ticking);
3519
3708
  } else {
3520
- // We are about to add the first prepare to the pipeline, so start the timeout.
3521
3709
  assert(!self.prepare_timeout.ticking);
3710
+ assert(!self.primary_abdicate_timeout.ticking);
3522
3711
  self.prepare_timeout.start();
3712
+ self.primary_abdicate_timeout.start();
3523
3713
  }
3524
3714
  self.pipeline.queue.push_prepare(message);
3525
3715
  self.on_prepare(message);
@@ -3529,6 +3719,24 @@ pub fn ReplicaType(
3529
3719
  assert(self.op == message.header.op);
3530
3720
  }
3531
3721
 
3722
+ /// Returns the next prepare in the pipeline waiting for a quorum.
3723
+ /// Returns null when the pipeline is empty.
3724
+ /// Returns null when the pipeline is nonempty but all prepares have a quorum.
3725
+ fn primary_pipeline_pending(self: *const Self) ?*const Prepare {
3726
+ assert(self.status == .normal);
3727
+ assert(self.primary());
3728
+
3729
+ var prepares = self.pipeline.queue.prepare_queue.iterator();
3730
+ while (prepares.next_ptr()) |prepare| {
3731
+ assert(prepare.message.header.command == .prepare);
3732
+ if (!prepare.ok_quorum_received) {
3733
+ return prepare;
3734
+ }
3735
+ } else {
3736
+ return null;
3737
+ }
3738
+ }
3739
+
3532
3740
  fn pipeline_prepare_by_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Message {
3533
3741
  assert(self.status == .normal or self.status == .view_change);
3534
3742
  assert(self.replica == self.primary_index(self.view) or checksum != null);
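
The new primary_pipeline_pending() above scans the prepare queue in order and stops at the first prepare that is still short of its prepare_ok quorum; the prepare and abdicate timeouts key off that prepare. A miniature of the same scan over a plain slice (PendingPrepare and pipeline_pending are illustrative names only):

const std = @import("std");

// Illustrative miniature of primary_pipeline_pending(): return the position of
// the first prepare still waiting on a prepare_ok quorum, or null when the
// pipeline is empty or every prepare already has its quorum.
const PendingPrepare = struct { ok_quorum_received: bool };

fn pipeline_pending(pipeline: []const PendingPrepare) ?usize {
    var index: usize = 0;
    while (index < pipeline.len) : (index += 1) {
        if (!pipeline[index].ok_quorum_received) return index;
    }
    return null;
}

test "returns null only when every prepare has its quorum" {
    const stalled = [_]PendingPrepare{
        .{ .ok_quorum_received = true },
        .{ .ok_quorum_received = false },
    };
    try std.testing.expectEqual(@as(?usize, 1), pipeline_pending(&stalled));

    const done = [_]PendingPrepare{.{ .ok_quorum_received = true }};
    try std.testing.expectEqual(@as(?usize, null), pipeline_pending(&done));
}
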
@@ -4349,7 +4557,7 @@ pub fn ReplicaType(
4349
4557
  return;
4350
4558
  }
4351
4559
 
4352
- const next = @mod(self.replica + 1, @intCast(u8, self.replica_count));
4560
+ const next = @mod(self.replica + 1, self.replica_count);
4353
4561
  if (next == self.primary_index(message.header.view)) {
4354
4562
  log.debug("{}: replicate: not replicating (completed)", .{self.replica});
4355
4563
  return;
@@ -4412,13 +4620,13 @@ pub fn ReplicaType(
4412
4620
  }
4413
4621
 
4414
4622
  fn reset_quorum_nack_prepare(self: *Self) void {
4623
+ assert(!self.nack_prepare_from_other_replicas.isSet(self.replica));
4415
4624
  self.reset_quorum_counter(&self.nack_prepare_from_other_replicas);
4416
4625
  self.nack_prepare_op = null;
4417
4626
  }
4418
4627
 
4419
4628
  fn reset_quorum_start_view_change(self: *Self) void {
4420
- self.reset_quorum_counter(&self.start_view_change_from_other_replicas);
4421
- self.start_view_change_quorum = false;
4629
+ self.reset_quorum_counter(&self.start_view_change_from_all_replicas);
4422
4630
  }
4423
4631
 
4424
4632
  fn send_prepare_ok(self: *Self, header: *const Header) void {
@@ -4506,28 +4714,28 @@ pub fn ReplicaType(
4506
4714
  }
4507
4715
 
4508
4716
  fn send_start_view_change(self: *Self) void {
4509
- assert(self.status == .view_change);
4510
- assert(self.log_view < self.view);
4511
- assert(!self.do_view_change_quorum);
4512
- // Send only to other replicas (and not to ourself) to avoid a quorum off-by-one error:
4513
- // This could happen if the replica mistakenly counts its own message in the quorum.
4514
- self.send_header_to_other_replicas(.{
4717
+ assert(self.status == .normal or self.status == .view_change);
4718
+
4719
+ const header: Header = .{
4515
4720
  .command = .start_view_change,
4516
4721
  .cluster = self.cluster,
4517
4722
  .replica = self.replica,
4518
4723
  .view = self.view,
4519
- });
4724
+ };
4725
+
4726
+ self.send_header_to_other_replicas(header);
4727
+
4728
+ if (!self.start_view_change_from_all_replicas.isSet(self.replica)) {
4729
+ self.send_header_to_replica(self.replica, header);
4730
+ defer self.flush_loopback_queue();
4731
+ }
4520
4732
  }
4521
4733
 
4522
4734
  fn send_do_view_change(self: *Self) void {
4523
4735
  assert(self.status == .view_change);
4524
- assert(self.start_view_change_quorum);
4736
+ assert(self.view > self.log_view);
4525
4737
  assert(!self.do_view_change_quorum);
4526
4738
 
4527
- const count_start_view_change = self.start_view_change_from_other_replicas.count();
4528
- assert(count_start_view_change >= self.quorum_view_change - 1);
4529
- assert(count_start_view_change <= self.replica_count - 1);
4530
-
4531
4739
  const message = self.create_view_change_message(.do_view_change);
4532
4740
  defer self.message_bus.unref(message);
4533
4741
 
@@ -4546,7 +4754,14 @@ pub fn ReplicaType(
4546
4754
  assert(message.header.commit == self.commit_min);
4547
4755
  DVCQuorum.verify_message(message);
4548
4756
 
4549
- self.send_message_to_replica(self.primary_index(self.view), message);
4757
+ self.send_message_to_other_replicas(message);
4758
+
4759
+ if (self.replica == self.primary_index(self.view) and
4760
+ self.do_view_change_from_all_replicas[self.replica] == null)
4761
+ {
4762
+ self.send_message_to_replica(self.replica, message);
4763
+ defer self.flush_loopback_queue();
4764
+ }
4550
4765
  }
4551
4766
 
4552
4767
  fn send_eviction_message_to_client(self: *Self, client: u128) void {
@@ -4568,6 +4783,8 @@ pub fn ReplicaType(
4568
4783
  }
4569
4784
 
4570
4785
  fn send_header_to_client(self: *Self, client: u128, header: Header) void {
4786
+ assert(header.cluster == self.cluster);
4787
+
4571
4788
  const message = self.create_message_from_header(header);
4572
4789
  defer self.message_bus.unref(message);
4573
4790
 
@@ -4643,14 +4860,15 @@ pub fn ReplicaType(
4643
4860
  assert(replica == self.primary_index(self.view));
4644
4861
  assert(message.header.replica == self.replica);
4645
4862
  },
4863
+ .reply => unreachable,
4646
4864
  .start_view_change => {
4647
- assert(self.status == .view_change);
4865
+ assert(self.status == .normal or self.status == .view_change);
4648
4866
  assert(message.header.view == self.view);
4649
4867
  assert(message.header.replica == self.replica);
4650
4868
  },
4651
4869
  .do_view_change => {
4652
4870
  assert(self.status == .view_change);
4653
- assert(self.start_view_change_quorum);
4871
+ assert(self.view > self.log_view);
4654
4872
  assert(!self.do_view_change_quorum);
4655
4873
  assert(message.header.view == self.view);
4656
4874
  assert(message.header.replica == self.replica);
@@ -4658,23 +4876,12 @@ pub fn ReplicaType(
4658
4876
  assert(message.header.commit == self.commit_min);
4659
4877
  assert(message.header.timestamp == self.log_view);
4660
4878
  },
4661
- .start_view => switch (self.status) {
4662
- .normal => {
4663
- // A backup may ask the primary to resend the start_view message.
4664
- assert(!self.start_view_change_quorum);
4665
- assert(!self.do_view_change_quorum);
4666
- assert(message.header.view == self.view);
4667
- assert(message.header.replica == self.replica);
4668
- assert(message.header.replica != replica);
4669
- },
4670
- .view_change => {
4671
- assert(self.start_view_change_quorum);
4672
- assert(self.do_view_change_quorum);
4673
- assert(message.header.view == self.view);
4674
- assert(message.header.replica == self.replica);
4675
- assert(message.header.replica != replica);
4676
- },
4677
- else => unreachable,
4879
+ .start_view => {
4880
+ assert(self.status == .normal);
4881
+ assert(!self.do_view_change_quorum);
4882
+ assert(message.header.view == self.view);
4883
+ assert(message.header.replica == self.replica);
4884
+ assert(message.header.replica != replica);
4678
4885
  },
4679
4886
  .headers => {
4680
4887
  assert(self.status == .normal or self.status == .view_change);
@@ -4682,11 +4889,18 @@ pub fn ReplicaType(
4682
4889
  assert(message.header.replica == self.replica);
4683
4890
  assert(message.header.replica != replica);
4684
4891
  },
4685
- .ping, .pong => {
4686
- assert(message.header.view == 0);
4892
+ .ping => {
4893
+ assert(self.status == .normal);
4894
+ assert(message.header.replica == self.replica);
4895
+ assert(message.header.replica != replica);
4896
+ },
4897
+ .pong => {
4898
+ assert(self.status == .normal or self.status == .view_change);
4687
4899
  assert(message.header.replica == self.replica);
4688
4900
  assert(message.header.replica != replica);
4689
4901
  },
4902
+ .ping_client => unreachable,
4903
+ .pong_client => unreachable,
4690
4904
  .commit => {
4691
4905
  assert(self.status == .normal);
4692
4906
  assert(self.primary());
@@ -4716,12 +4930,14 @@ pub fn ReplicaType(
4716
4930
  assert(message.header.replica != replica);
4717
4931
  assert(self.primary_index(self.view) == replica);
4718
4932
  },
4719
- else => {
4720
- log.info("{}: send_message_to_replica: TODO {s}", .{
4721
- self.replica,
4722
- @tagName(message.header.command),
4723
- });
4933
+ .eviction => {
4934
+ assert(self.status == .normal);
4935
+ assert(self.primary());
4936
+ assert(message.header.view == self.view);
4937
+ assert(message.header.replica == self.replica);
4724
4938
  },
4939
+ .request_block => unreachable,
4940
+ .block => unreachable,
4725
4941
  }
4726
4942
 
4727
4943
  if (replica != self.replica) {
@@ -4929,27 +5145,12 @@ pub fn ReplicaType(
4929
5145
  }
4930
5146
 
4931
5147
  if (self.status == .view_change and self.log_view < self.view) {
4932
- if (self.primary_index(self.view) != self.replica and
4933
- self.start_view_change_quorum and !self.do_view_change_quorum)
4934
- {
4935
- self.send_do_view_change();
4936
- }
5148
+ if (!self.do_view_change_quorum) self.send_do_view_change();
4937
5149
  }
4938
5150
  }
4939
5151
 
4940
5152
  fn set_op_and_commit_max(self: *Self, op: u64, commit_max: u64, method: []const u8) void {
4941
- assert(self.status == .view_change or self.status == .recovering);
4942
-
4943
- switch (self.status) {
4944
- .normal => unreachable,
4945
- .view_change => {},
4946
- .recovering => {
4947
- // The replica's view hasn't been set yet.
4948
- // It will be set shortly, when we transition to normal status.
4949
- assert(self.view == 0);
4950
- },
4951
- .recovering_head => unreachable,
4952
- }
5153
+ assert(self.status == .view_change);
4953
5154
 
4954
5155
  // Uncommitted ops may not survive a view change so we must assert `op` against
4955
5156
  // `commit_max` and not `self.op`. However, committed ops (`commit_max`) must survive:
@@ -5026,9 +5227,9 @@ pub fn ReplicaType(
5026
5227
  /// replica 0 doesn't have 6/7, then replica 1/2 must share the latest log_view. ∎)
5027
5228
  fn primary_set_log_from_do_view_change_messages(self: *Self) void {
5028
5229
  assert(self.status == .view_change);
5230
+ assert(self.view > self.log_view);
5029
5231
  assert(self.primary_index(self.view) == self.replica);
5030
5232
  assert(self.replica_count > 1);
5031
- assert(self.start_view_change_quorum);
5032
5233
  assert(self.do_view_change_quorum);
5033
5234
  assert(self.do_view_change_from_all_replicas[self.replica] != null);
5034
5235
  DVCQuorum.verify(self.do_view_change_from_all_replicas);
@@ -5249,12 +5450,16 @@ pub fn ReplicaType(
5249
5450
 
5250
5451
  assert(self.replica_count == 1);
5251
5452
  assert(!self.prepare_timeout.ticking);
5252
- assert(!self.normal_status_timeout.ticking);
5453
+ assert(!self.primary_abdicate_timeout.ticking);
5454
+ assert(!self.normal_heartbeat_timeout.ticking);
5455
+ assert(!self.start_view_change_window_timeout.ticking);
5253
5456
  assert(!self.view_change_status_timeout.ticking);
5254
- assert(!self.view_change_message_timeout.ticking);
5457
+ assert(!self.do_view_change_message_timeout.ticking);
5458
+ assert(!self.request_start_view_message_timeout.ticking);
5255
5459
 
5256
5460
  self.ping_timeout.start();
5257
- self.commit_timeout.start();
5461
+ self.start_view_change_message_timeout.start();
5462
+ self.commit_message_timeout.start();
5258
5463
  self.repair_timeout.start();
5259
5464
 
5260
5465
  self.pipeline.cache.deinit(self.message_bus.pool);
@@ -5269,12 +5474,17 @@ pub fn ReplicaType(
                 );

                 assert(!self.prepare_timeout.ticking);
-                assert(!self.commit_timeout.ticking);
+                assert(!self.primary_abdicate_timeout.ticking);
+                assert(!self.normal_heartbeat_timeout.ticking);
+                assert(!self.start_view_change_window_timeout.ticking);
+                assert(!self.commit_message_timeout.ticking);
                 assert(!self.view_change_status_timeout.ticking);
-                assert(!self.view_change_message_timeout.ticking);
+                assert(!self.do_view_change_message_timeout.ticking);
+                assert(!self.request_start_view_message_timeout.ticking);

                 self.ping_timeout.start();
-                self.normal_status_timeout.start();
+                self.normal_heartbeat_timeout.start();
+                self.start_view_change_message_timeout.start();
                 self.repair_timeout.start();
             }
         }
@@ -5285,6 +5495,7 @@ pub fn ReplicaType(
             assert(self.status == .view_change);
             assert(view_new >= self.view);
             assert(self.journal.header_with_op(self.op) != null);
+            assert(!self.primary_abdicating);

             self.status = .normal;

@@ -5295,6 +5506,8 @@ pub fn ReplicaType(
                 );

                 assert(!self.prepare_timeout.ticking);
+                assert(!self.normal_heartbeat_timeout.ticking);
+                assert(!self.primary_abdicate_timeout.ticking);
                 assert(!self.pipeline_repairing);
                 assert(self.pipeline == .queue);
                 assert(self.view == view_new);
@@ -5308,14 +5521,19 @@ pub fn ReplicaType(
                 self.view_durable_update();

                 self.ping_timeout.start();
-                self.commit_timeout.start();
-                self.normal_status_timeout.stop();
+                self.commit_message_timeout.start();
+                self.start_view_change_window_timeout.stop();
+                self.start_view_change_message_timeout.start();
                 self.view_change_status_timeout.stop();
-                self.view_change_message_timeout.stop();
+                self.do_view_change_message_timeout.stop();
+                self.request_start_view_message_timeout.stop();
                 self.repair_timeout.start();

                 // Do not reset the pipeline as there may be uncommitted ops to drive to completion.
-                if (self.pipeline.queue.prepare_queue.count > 0) self.prepare_timeout.start();
+                if (self.pipeline.queue.prepare_queue.count > 0) {
+                    self.prepare_timeout.start();
+                    self.primary_abdicate_timeout.start();
+                }
             } else {
                 log.debug("{}: transition_to_normal_from_view_change_status: view={}..{} backup", .{
                     self.replica,
@@ -5324,6 +5542,9 @@ pub fn ReplicaType(
                 });

                 assert(!self.prepare_timeout.ticking);
+                assert(!self.normal_heartbeat_timeout.ticking);
+                assert(!self.primary_abdicate_timeout.ticking);
+                assert(self.request_start_view_message_timeout.ticking);
                 assert(self.pipeline == .cache);

                 if (self.log_view == view_new and self.view == view_new) {
@@ -5336,24 +5557,27 @@ pub fn ReplicaType(
                 }

                 self.ping_timeout.start();
-                self.commit_timeout.stop();
-                self.normal_status_timeout.start();
+                self.commit_message_timeout.stop();
+                self.normal_heartbeat_timeout.start();
+                self.start_view_change_window_timeout.stop();
+                self.start_view_change_message_timeout.start();
                 self.view_change_status_timeout.stop();
-                self.view_change_message_timeout.stop();
+                self.do_view_change_message_timeout.stop();
+                self.request_start_view_message_timeout.stop();
                 self.repair_timeout.start();
             }

+            self.heartbeat_timestamp = 0;
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();

-            assert(self.start_view_change_quorum == false);
             assert(self.do_view_change_quorum == false);
             assert(self.nack_prepare_op == null);
         }

         /// A replica i that notices the need for a view change advances its view, sets its status
-        /// to view_change, and sends a ⟨start_view_change v, i⟩ message to all the other replicas,
+        /// to view_change, and sends a ⟨do_view_change v, i⟩ message to all the other replicas,
         /// where v identifies the new view. A replica notices the need for a view change either
         /// based on its own timer, or because it receives a start_view_change or do_view_change
         /// message for a view with a larger number than its own view.
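The updated doc comment describes sending do_view_change directly, rather than the standalone start_view_change round described before. A hedged sketch of the trigger the comment describes follows, with deliberately simplified state; this is not the actual replica transition code, and the names are illustrative:

```zig
const std = @import("std");

// Simplified replica view state, for illustration only.
const Status = enum { normal, view_change };

const ViewState = struct {
    view: u32,
    status: Status = .normal,

    /// Invoked on a local timeout, or when a start_view_change/do_view_change
    /// message arrives for a view larger than our own.
    fn notice_view_change(self: *ViewState, view_new: u32) bool {
        if (view_new <= self.view) return false; // stale or duplicate trigger
        self.view = view_new;
        self.status = .view_change;
        // The real replica would now broadcast ⟨do_view_change view, replica⟩.
        return true;
    }
};

test "view change trigger" {
    var state = ViewState{ .view = 7 };
    std.debug.assert(state.notice_view_change(8));
    std.debug.assert(state.status == .view_change);
    std.debug.assert(!state.notice_view_change(8)); // already in view 8
}
```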
@@ -5390,30 +5614,40 @@ pub fn ReplicaType(
             }

             self.ping_timeout.stop();
-            self.commit_timeout.stop();
-            self.normal_status_timeout.stop();
+            self.commit_message_timeout.stop();
+            self.normal_heartbeat_timeout.stop();
+            self.start_view_change_window_timeout.stop();
+            self.start_view_change_message_timeout.start();
             self.view_change_status_timeout.start();
-            self.view_change_message_timeout.start();
+            self.do_view_change_message_timeout.start();
             self.repair_timeout.stop();
             self.prepare_timeout.stop();
+            self.primary_abdicate_timeout.stop();
+
+            if (self.primary_index(self.view) == self.replica) {
+                self.request_start_view_message_timeout.stop();
+            } else {
+                self.request_start_view_message_timeout.start();
+            }

             // Do not reset quorum counters only on entering a view, assuming that the view will be
             // followed only by a single subsequent view change to the next view, because multiple
             // successive view changes can fail, e.g. after a view change timeout.
             // We must therefore reset our counters here to avoid counting messages from an older
             // view, which would violate the quorum intersection property essential for correctness.
+            self.heartbeat_timestamp = 0;
+            self.primary_abdicating = false;
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();

-            assert(self.start_view_change_quorum == false);
             assert(self.do_view_change_quorum == false);
             assert(self.nack_prepare_op == null);

             if (self.log_view == self.view) {
                 assert(status_before == .recovering_head);
             } else {
-                self.send_start_view_change();
+                self.send_do_view_change();
             }
         }
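The reset calls in this hunk clear the per-view quorum counters so that votes from an older view can never be counted toward a quorum in the new view, which is the quorum intersection property the comment refers to. A minimal sketch of such a counter as a replica bitset follows; the width and helper name are assumptions for illustration, not the actual vsr QuorumCounter:

```zig
const std = @import("std");

// Illustrative per-view quorum counter: one bit per replica.
const QuorumCounter = std.bit_set.IntegerBitSet(8);

fn reset(counter: *QuorumCounter) void {
    counter.* = QuorumCounter.initEmpty();
}

test "quorum counter is cleared between views" {
    var counter = QuorumCounter.initEmpty();
    counter.set(0);
    counter.set(2);
    std.debug.assert(counter.count() == 2);

    // Entering a new view: old votes must not carry over,
    // or quorum intersection across views would be violated.
    reset(&counter);
    std.debug.assert(counter.count() == 0);
}
```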
 
@@ -5590,6 +5824,7 @@ pub fn ReplicaType(
             }

             // TODO Debounce and decouple this from `on_message()` by moving into `tick()`:
+            // (Using request_start_view_message_timeout).
             log.debug("{}: view_jump: requesting start_view message", .{self.replica});
             self.send_header_to_replica(self.primary_index(header.view), .{
                 .command = .request_start_view,