tigerbeetle-node 0.11.12 → 0.11.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/README.md +212 -196
  2. package/dist/.client.node.sha256 +1 -1
  3. package/package.json +3 -2
  4. package/src/node.zig +1 -0
  5. package/src/tigerbeetle/scripts/benchmark.bat +9 -2
  6. package/src/tigerbeetle/scripts/benchmark.sh +1 -1
  7. package/src/tigerbeetle/scripts/fail_on_diff.sh +9 -0
  8. package/src/tigerbeetle/scripts/fuzz_loop_hash_log.sh +12 -0
  9. package/src/tigerbeetle/scripts/scripts/benchmark.bat +9 -2
  10. package/src/tigerbeetle/scripts/scripts/benchmark.sh +1 -1
  11. package/src/tigerbeetle/scripts/scripts/fail_on_diff.sh +9 -0
  12. package/src/tigerbeetle/scripts/scripts/fuzz_loop_hash_log.sh +12 -0
  13. package/src/tigerbeetle/src/benchmark.zig +253 -231
  14. package/src/tigerbeetle/src/config.zig +2 -3
  15. package/src/tigerbeetle/src/constants.zig +2 -10
  16. package/src/tigerbeetle/src/io/linux.zig +15 -6
  17. package/src/tigerbeetle/src/lsm/forest.zig +1 -0
  18. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +63 -14
  19. package/src/tigerbeetle/src/lsm/groove.zig +134 -70
  20. package/src/tigerbeetle/src/lsm/level_iterator.zig +2 -2
  21. package/src/tigerbeetle/src/lsm/manifest_level.zig +1 -0
  22. package/src/tigerbeetle/src/lsm/posted_groove.zig +7 -4
  23. package/src/tigerbeetle/src/lsm/segmented_array.zig +1 -0
  24. package/src/tigerbeetle/src/lsm/table.zig +29 -51
  25. package/src/tigerbeetle/src/lsm/table_immutable.zig +6 -17
  26. package/src/tigerbeetle/src/lsm/table_iterator.zig +2 -2
  27. package/src/tigerbeetle/src/lsm/table_mutable.zig +9 -26
  28. package/src/tigerbeetle/src/lsm/test.zig +1 -0
  29. package/src/tigerbeetle/src/lsm/tree.zig +2 -26
  30. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +7 -2
  31. package/src/tigerbeetle/src/message_bus.zig +1 -0
  32. package/src/tigerbeetle/src/simulator.zig +14 -3
  33. package/src/tigerbeetle/src/state_machine/auditor.zig +1 -0
  34. package/src/tigerbeetle/src/state_machine.zig +402 -184
  35. package/src/tigerbeetle/src/stdx.zig +9 -0
  36. package/src/tigerbeetle/src/testing/cluster.zig +1 -0
  37. package/src/tigerbeetle/src/testing/packet_simulator.zig +19 -9
  38. package/src/tigerbeetle/src/testing/state_machine.zig +1 -0
  39. package/src/tigerbeetle/src/unit_tests.zig +20 -22
  40. package/src/tigerbeetle/src/vsr/README.md +1 -1
  41. package/src/tigerbeetle/src/vsr/client.zig +4 -4
  42. package/src/tigerbeetle/src/vsr/clock.zig +2 -0
  43. package/src/tigerbeetle/src/vsr/journal.zig +2 -0
  44. package/src/tigerbeetle/src/vsr/replica.zig +481 -246
  45. package/src/tigerbeetle/src/vsr.zig +104 -31
@@ -208,8 +208,21 @@ pub fn ReplicaType(
208
208
  /// we require and assert in our protocol implementation.
209
209
  loopback_queue: ?*Message = null,
210
210
 
211
- /// Unique start_view_change messages for the same view from OTHER replicas (excluding ourself).
212
- start_view_change_from_other_replicas: QuorumCounter = quorum_counter_null,
211
+ /// The last timestamp received on a commit heartbeat.
212
+ /// The timestamp originates from the primary's monotonic clock. It is used to discard
213
+ /// delayed or duplicate heartbeat messages.
214
+ /// (status=normal backup)
215
+ heartbeat_timestamp: u64 = 0,
216
+
217
+ /// While set, don't send commit heartbeats.
218
+ /// Used when the primary believes that it is partitioned and needs to step down.
219
+ /// In particular, guards against a deadlock in the case where small messages (e.g.
220
+ /// heartbeats, pings/pongs) succeed, but large messages (e.g. prepares) fail.
221
+ /// (status=normal primary, pipeline has prepare with !ok_quorum_received)
222
+ primary_abdicating: bool = false,
223
+
224
+ /// Unique start_view_change messages for the same view from ALL replicas (including ourself).
225
+ start_view_change_from_all_replicas: QuorumCounter = quorum_counter_null,
213
226
 
214
227
  /// Unique do_view_change messages for the same view from ALL replicas (including ourself).
215
228
  do_view_change_from_all_replicas: QuorumMessages = quorum_messages_null,
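
The new heartbeat_timestamp field lets a backup discard late or duplicated commit heartbeats: only a strictly larger monotonic timestamp from the primary resets normal_heartbeat_timeout (see the on_commit hunk further down). A minimal sketch of that filter, using hypothetical names (Backup, on_heartbeat) rather than the replica's actual API:

const std = @import("std");

// Hypothetical stand-in for the backup-side filter: only a strictly newer
// monotonic timestamp counts as a fresh heartbeat; older or equal timestamps
// are dropped, so a delayed heartbeat cannot keep postponing a view change.
const Backup = struct {
    heartbeat_timestamp: u64 = 0,
    heartbeats_accepted: u32 = 0,

    fn on_heartbeat(backup: *Backup, timestamp: u64) void {
        if (backup.heartbeat_timestamp < timestamp) {
            backup.heartbeat_timestamp = timestamp;
            backup.heartbeats_accepted += 1; // The real code resets normal_heartbeat_timeout here.
        }
    }
};

test "delayed or duplicate heartbeats are ignored" {
    var backup = Backup{};
    backup.on_heartbeat(10);
    backup.on_heartbeat(10); // Duplicate.
    backup.on_heartbeat(7); // Delayed.
    backup.on_heartbeat(11);
    try std.testing.expectEqual(@as(u32, 2), backup.heartbeats_accepted);
    try std.testing.expectEqual(@as(u64, 11), backup.heartbeat_timestamp);
}
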
@@ -217,9 +230,6 @@ pub fn ReplicaType(
217
230
  /// Unique nack_prepare messages for the same view from OTHER replicas (excluding ourself).
218
231
  nack_prepare_from_other_replicas: QuorumCounter = quorum_counter_null,
219
232
 
220
- /// Whether a replica has received a quorum of start_view_change messages for the view change:
221
- start_view_change_quorum: bool = false,
222
-
223
233
  /// Whether the primary has received a quorum of do_view_change messages for the view change:
224
234
  /// Determines whether the primary may effect repairs according to the CTRL protocol.
225
235
  do_view_change_quorum: bool = false,
@@ -230,28 +240,54 @@ pub fn ReplicaType(
230
240
  /// The number of ticks before a primary or backup broadcasts a ping to other replicas.
231
241
  /// TODO Explain why we need this (MessageBus handshaking, leapfrogging faulty replicas,
232
242
  /// deciding whether starting a view change would be detrimental under some network partitions).
243
+ /// (status=normal replicas)
233
244
  ping_timeout: Timeout,
234
245
 
235
246
  /// The number of ticks without enough prepare_ok's before the primary resends a prepare.
247
+ /// (status=normal primary, pipeline has prepare with !ok_quorum_received)
236
248
  prepare_timeout: Timeout,
237
249
 
250
+ /// The number of ticks waiting for a prepare_ok.
251
+ /// When triggered, set primary_abdicating=true, which pauses outgoing commit heartbeats.
252
+ /// (status=normal primary, pipeline has prepare with !ok_quorum_received)
253
+ primary_abdicate_timeout: Timeout,
254
+
238
255
  /// The number of ticks before the primary sends a commit heartbeat:
239
256
  /// The primary always sends a commit heartbeat irrespective of when it last sent a prepare.
240
257
  /// This improves liveness when prepare messages cannot be replicated fully due to partitions.
241
- commit_timeout: Timeout,
242
-
243
- /// The number of ticks without hearing from the primary before starting a view change.
244
- /// This transitions from .normal status to .view_change status.
245
- normal_status_timeout: Timeout,
246
-
247
- /// The number of ticks before a view change is timed out:
248
- /// This transitions from `view_change` status to `view_change` status but for a newer view.
258
+ /// (status=normal primary)
259
+ commit_message_timeout: Timeout,
260
+
261
+ /// The number of ticks without a heartbeat.
262
+ /// Reset any time the backup receives a heartbeat from the primary.
263
+ /// Triggers SVC messages. If an SVC quorum is achieved, we will kick off a view-change.
264
+ /// (status=normal backup)
265
+ normal_heartbeat_timeout: Timeout,
266
+
267
+ /// The number of ticks before resetting the SVC quorum.
268
+ /// (status=normal|view-change, SVC quorum contains message from ANY OTHER replica)
269
+ start_view_change_window_timeout: Timeout,
270
+
271
+ /// The number of ticks before resending a `start_view_change` message.
272
+ /// (status=normal|view-change)
273
+ start_view_change_message_timeout: Timeout,
274
+
275
+ /// The number of ticks before a view change is timed out.
276
+ /// When triggered, begin sending SVC messages (to attempt to increment the view and try a
277
+ /// different primary) — but keep trying DVCs as well.
278
+ /// (status=view-change)
249
279
  view_change_status_timeout: Timeout,
250
280
 
251
- /// The number of ticks before resending a `start_view_change` or `do_view_change` message:
252
- view_change_message_timeout: Timeout,
281
+ /// The number of ticks before resending a `do_view_change` message:
282
+ /// (status=view-change)
283
+ do_view_change_message_timeout: Timeout,
284
+
285
+ /// The number of ticks before resending a `request_start_view` message.
286
+ /// (status=view-change backup)
287
+ request_start_view_message_timeout: Timeout,
253
288
 
254
289
  /// The number of ticks before repairing missing/disconnected headers and/or dirty entries:
290
+ /// (status=normal or (status=view-change and primary))
255
291
  repair_timeout: Timeout,
256
292
 
257
293
  /// Used to provide deterministic entropy to `choose_any_other_replica()`.
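
All of these timeout fields follow the same tick/fired/reset discipline exercised by tick() further down: every tick() call advances the counters, fired() reports expiry, and the matching on_*_timeout handler re-arms or stops the timeout. A self-contained sketch of that pattern, assuming a plain countdown (CountdownTimeout is hypothetical; the package's Timeout type is defined elsewhere in the tree and is richer than this):

const std = @import("std");

// Hypothetical countdown with the same surface as the pattern above:
// tick() advances it, fired() reports expiry, reset() re-arms it.
const CountdownTimeout = struct {
    after: u64,
    ticks: u64 = 0,
    ticking: bool = false,

    fn start(timeout: *CountdownTimeout) void {
        timeout.ticking = true;
        timeout.ticks = 0;
    }

    fn tick(timeout: *CountdownTimeout) void {
        if (timeout.ticking) timeout.ticks += 1;
    }

    fn fired(timeout: *const CountdownTimeout) bool {
        return timeout.ticking and timeout.ticks >= timeout.after;
    }

    fn reset(timeout: *CountdownTimeout) void {
        timeout.ticks = 0;
    }
};

test "a timeout fires only after `after` ticks" {
    var timeout = CountdownTimeout{ .after = 3 };
    timeout.start();
    timeout.tick();
    timeout.tick();
    try std.testing.expect(!timeout.fired());
    timeout.tick();
    try std.testing.expect(timeout.fired());
    timeout.reset();
    try std.testing.expect(!timeout.fired());
}
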
@@ -479,8 +515,6 @@ pub fn ReplicaType(
479
515
  time: Time,
480
516
  storage: *Storage,
481
517
  message_pool: *MessagePool,
482
- // TODO With https://github.com/coilhq/tigerbeetle/issues/71,
483
- // the separate message_bus_options won't be necessary.
484
518
  message_bus_options: MessageBus.Options,
485
519
  state_machine_options: StateMachine.Options,
486
520
  };
@@ -578,26 +612,46 @@ pub fn ReplicaType(
578
612
  .id = replica_index,
579
613
  .after = 50,
580
614
  },
581
- .commit_timeout = Timeout{
582
- .name = "commit_timeout",
615
+ .primary_abdicate_timeout = Timeout{
616
+ .name = "primary_abdicate_timeout",
583
617
  .id = replica_index,
584
- .after = 100,
618
+ .after = 1000,
619
+ },
620
+ .commit_message_timeout = Timeout{
621
+ .name = "commit_message_timeout",
622
+ .id = replica_index,
623
+ .after = 50,
624
+ },
625
+ .normal_heartbeat_timeout = Timeout{
626
+ .name = "normal_heartbeat_timeout",
627
+ .id = replica_index,
628
+ .after = 500,
585
629
  },
586
- .normal_status_timeout = Timeout{
587
- .name = "normal_status_timeout",
630
+ .start_view_change_window_timeout = Timeout{
631
+ .name = "start_view_change_window_timeout",
588
632
  .id = replica_index,
589
633
  .after = 500,
590
634
  },
635
+ .start_view_change_message_timeout = Timeout{
636
+ .name = "start_view_change_message_timeout",
637
+ .id = replica_index,
638
+ .after = 50,
639
+ },
591
640
  .view_change_status_timeout = Timeout{
592
641
  .name = "view_change_status_timeout",
593
642
  .id = replica_index,
594
643
  .after = 500,
595
644
  },
596
- .view_change_message_timeout = Timeout{
597
- .name = "view_change_message_timeout",
645
+ .do_view_change_message_timeout = Timeout{
646
+ .name = "do_view_change_message_timeout",
598
647
  .id = replica_index,
599
648
  .after = 50,
600
649
  },
650
+ .request_start_view_message_timeout = Timeout{
651
+ .name = "request_start_view_message_timeout",
652
+ .id = replica_index,
653
+ .after = 100,
654
+ },
601
655
  .repair_timeout = Timeout{
602
656
  .name = "repair_timeout",
603
657
  .id = replica_index,
@@ -686,18 +740,26 @@ pub fn ReplicaType(
686
740
 
687
741
  self.ping_timeout.tick();
688
742
  self.prepare_timeout.tick();
689
- self.commit_timeout.tick();
690
- self.normal_status_timeout.tick();
743
+ self.primary_abdicate_timeout.tick();
744
+ self.commit_message_timeout.tick();
745
+ self.normal_heartbeat_timeout.tick();
746
+ self.start_view_change_window_timeout.tick();
747
+ self.start_view_change_message_timeout.tick();
691
748
  self.view_change_status_timeout.tick();
692
- self.view_change_message_timeout.tick();
749
+ self.do_view_change_message_timeout.tick();
750
+ self.request_start_view_message_timeout.tick();
693
751
  self.repair_timeout.tick();
694
752
 
695
753
  if (self.ping_timeout.fired()) self.on_ping_timeout();
696
754
  if (self.prepare_timeout.fired()) self.on_prepare_timeout();
697
- if (self.commit_timeout.fired()) self.on_commit_timeout();
698
- if (self.normal_status_timeout.fired()) self.on_normal_status_timeout();
755
+ if (self.primary_abdicate_timeout.fired()) self.on_primary_abdicate_timeout();
756
+ if (self.commit_message_timeout.fired()) self.on_commit_message_timeout();
757
+ if (self.normal_heartbeat_timeout.fired()) self.on_normal_heartbeat_timeout();
758
+ if (self.start_view_change_window_timeout.fired()) self.on_start_view_change_window_timeout();
759
+ if (self.start_view_change_message_timeout.fired()) self.on_start_view_change_message_timeout();
699
760
  if (self.view_change_status_timeout.fired()) self.on_view_change_status_timeout();
700
- if (self.view_change_message_timeout.fired()) self.on_view_change_message_timeout();
761
+ if (self.do_view_change_message_timeout.fired()) self.on_do_view_change_message_timeout();
762
+ if (self.request_start_view_message_timeout.fired()) self.on_request_start_view_message_timeout();
701
763
  if (self.repair_timeout.fired()) self.on_repair_timeout();
702
764
 
703
765
  // None of the on_timeout() functions above should send a message to this replica.
@@ -743,6 +805,7 @@ pub fn ReplicaType(
743
805
  switch (message.header.command) {
744
806
  .ping => self.on_ping(message),
745
807
  .pong => self.on_pong(message),
808
+ .ping_client => self.on_ping_client(message),
746
809
  .request => self.on_request(message),
747
810
  .prepare => self.on_prepare(message),
748
811
  .prepare_ok => self.on_prepare_ok(message),
@@ -757,7 +820,7 @@ pub fn ReplicaType(
757
820
  .headers => self.on_headers(message),
758
821
  .nack_prepare => self.on_nack_prepare(message),
759
822
  // A replica should never handle misdirected messages intended for a client:
760
- .eviction, .reply => {
823
+ .pong_client, .eviction, .reply => {
761
824
  log.warn("{}: on_message: ignoring misdirected {s} message", .{
762
825
  self.replica,
763
826
  @tagName(message.header.command),
@@ -783,58 +846,34 @@ pub fn ReplicaType(
783
846
  tracer.flush();
784
847
  }
785
848
 
786
- // Pings are used:
787
- // - By clients, to learn about the current view.
788
- // - By replicas, to synchronise cluster time and to probe for network connectivity.
789
- //
790
- // In the second case we avoid setting the view to make sure pings can still be sent
791
- // during view changes.
849
+ /// Pings are used by replicas to synchronise cluster time and to probe for network connectivity.
792
850
  fn on_ping(self: *Self, message: *const Message) void {
851
+ assert(message.header.command == .ping);
793
852
  if (self.status != .normal and self.status != .view_change) return;
794
853
 
795
854
  assert(self.status == .normal or self.status == .view_change);
796
855
 
856
+ if (message.header.replica == self.replica) {
857
+ log.warn("{}: on_ping: ignoring (self)", .{self.replica});
858
+ return;
859
+ }
860
+
797
861
  // TODO Drop pings that were not addressed to us.
798
862
 
799
- var pong = Header{
863
+ self.send_header_to_replica(message.header.replica, .{
800
864
  .command = .pong,
801
865
  .cluster = self.cluster,
802
866
  .replica = self.replica,
803
- };
804
-
805
- if (message.header.client > 0) {
806
- assert(message.header.replica == 0);
807
-
808
- // We must only ever send our view number to a client via a pong message if we are
809
- // in normal status. Otherwise, we may be partitioned from the cluster with a newer
810
- // view number, leak this to the client, which would then pass this to the cluster
811
- // in subsequent client requests, which would then ignore these client requests with
812
- // a newer view number, locking out the client. The principle here is that we must
813
- // never send view numbers for views that have not yet started.
814
- if (self.status == .normal) {
815
- pong.view = self.view;
816
- self.send_header_to_client(message.header.client, pong);
817
- }
818
- } else {
819
- assert(message.header.view == 0);
820
-
821
- if (message.header.replica == self.replica) {
822
- log.warn("{}: on_ping: ignoring (self)", .{self.replica});
823
- } else {
824
- // Copy the ping's monotonic timestamp to our pong and add our wall clock sample:
825
- pong.op = message.header.op;
826
- pong.timestamp = @bitCast(u64, self.clock.realtime());
827
- self.send_header_to_replica(message.header.replica, pong);
828
- }
829
- }
867
+ // Copy the ping's monotonic timestamp to our pong and add our wall clock sample:
868
+ .op = message.header.op,
869
+ .timestamp = @bitCast(u64, self.clock.realtime()),
870
+ });
830
871
  }
831
872
 
832
873
  fn on_pong(self: *Self, message: *const Message) void {
833
- if (message.header.client > 0) return;
874
+ assert(message.header.command == .pong);
834
875
  if (message.header.replica == self.replica) return;
835
876
 
836
- assert(message.header.view == 0);
837
-
838
877
  const m0 = message.header.op;
839
878
  const t1 = @bitCast(i64, message.header.timestamp);
840
879
  const m2 = self.clock.monotonic();
@@ -842,6 +881,27 @@ pub fn ReplicaType(
842
881
  self.clock.learn(message.header.replica, m0, t1, m2);
843
882
  }
844
883
 
884
+ /// Pings are used by clients to learn about the current view.
885
+ fn on_ping_client(self: *Self, message: *const Message) void {
886
+ assert(message.header.command == .ping_client);
887
+ assert(message.header.client != 0);
888
+
889
+ // We must only ever send our view number to a client via a pong message if we are
890
+ // in normal status. Otherwise, we may be partitioned from the cluster with a newer
891
+ // view number, leak this to the client, which would then pass this to the cluster
892
+ // in subsequent client requests, which would then ignore these client requests with
893
+ // a newer view number, locking out the client. The principle here is that we must
894
+ // never send view numbers for views that have not yet started.
895
+ if (self.status != .normal) return;
896
+
897
+ self.send_header_to_client(message.header.client, .{
898
+ .command = .pong_client,
899
+ .cluster = self.cluster,
900
+ .replica = self.replica,
901
+ .view = self.view,
902
+ });
903
+ }
904
+
845
905
  /// When there is free space in the pipeline's prepare queue:
846
906
  /// The primary advances op-number, adds the request to the end of the log, and updates the
847
907
  /// information for this client in the client-table to contain the new request number, s.
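
In on_pong above, m0 is this replica's monotonic time when the ping was sent, t1 is the peer's wall-clock sample, and m2 is the monotonic time at which the pong arrived; clock.learn() uses that triple to bound the peer's clock offset. The actual bound lives in src/vsr/clock.zig, which is not part of this diff, so the following is only an illustrative NTP-style sketch with hypothetical names, not the package's formula:

const std = @import("std");

// Given a ping sent at local monotonic time m0, answered with the peer's
// wall-clock reading t1, and received at local monotonic time m2, the peer's
// offset can be bounded to within half the round-trip time on each side.
fn offset_bounds(m0: i64, t1: i64, m2: i64) struct { lower: i64, upper: i64 } {
    const rtt = m2 - m0;
    const midpoint = m0 + @divFloor(rtt, 2);
    const offset = t1 - midpoint;
    return .{
        .lower = offset - @divFloor(rtt, 2),
        .upper = offset + @divFloor(rtt, 2),
    };
}

test "the offset uncertainty is half the round-trip time on each side" {
    const bounds = offset_bounds(100, 260, 120);
    try std.testing.expectEqual(@as(i64, 140), bounds.lower);
    try std.testing.expectEqual(@as(i64, 160), bounds.upper);
}
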
@@ -905,6 +965,7 @@ pub fn ReplicaType(
905
965
  /// If the next replica is down or partitioned, then the primary's prepare timeout will fire,
906
966
  /// and the primary will resend but to another replica, until it receives enough prepare_ok's.
907
967
  fn on_prepare(self: *Self, message: *Message) void {
968
+ assert(message.header.command == .prepare);
908
969
  self.view_jump(message.header);
909
970
 
910
971
  if (self.is_repair(message)) {
@@ -949,8 +1010,6 @@ pub fn ReplicaType(
949
1010
  assert(message.header.op > self.commit_min);
950
1011
  assert(message.header.op <= self.op_checkpoint_trigger());
951
1012
 
952
- if (self.backup()) self.normal_status_timeout.reset();
953
-
954
1013
  if (message.header.op > self.op + 1) {
955
1014
  log.debug("{}: on_prepare: newer op", .{self.replica});
956
1015
  self.jump_to_newer_op_in_normal_status(message.header);
@@ -988,6 +1047,7 @@ pub fn ReplicaType(
988
1047
  }
989
1048
 
990
1049
  fn on_prepare_ok(self: *Self, message: *Message) void {
1050
+ assert(message.header.command == .prepare_ok);
991
1051
  if (self.ignore_prepare_ok(message)) return;
992
1052
 
993
1053
  assert(self.status == .normal);
@@ -1020,12 +1080,20 @@ pub fn ReplicaType(
1020
1080
  else
1021
1081
  self.quorum_replication;
1022
1082
 
1083
+ if (!prepare.ok_from_all_replicas.isSet(message.header.replica)) {
1084
+ self.primary_abdicating = false;
1085
+ if (!prepare.ok_quorum_received) {
1086
+ self.primary_abdicate_timeout.reset();
1087
+ }
1088
+ }
1089
+
1023
1090
  const count = self.count_message_and_receive_quorum_exactly_once(
1024
1091
  &prepare.ok_from_all_replicas,
1025
1092
  message,
1026
1093
  threshold,
1027
1094
  ) orelse return;
1028
1095
 
1096
+ const prepare_pending = self.primary_pipeline_pending().?;
1029
1097
  assert(count == threshold);
1030
1098
  assert(!prepare.ok_quorum_received);
1031
1099
  prepare.ok_quorum_received = true;
@@ -1035,6 +1103,16 @@ pub fn ReplicaType(
1035
1103
  prepare.message.header.checksum,
1036
1104
  });
1037
1105
 
1106
+ assert(self.prepare_timeout.ticking);
1107
+ assert(self.primary_abdicate_timeout.ticking);
1108
+ assert(!self.primary_abdicating);
1109
+ if (self.primary_pipeline_pending()) |_| {
1110
+ if (prepare_pending == prepare) self.prepare_timeout.reset();
1111
+ } else {
1112
+ self.prepare_timeout.stop();
1113
+ self.primary_abdicate_timeout.stop();
1114
+ }
1115
+
1038
1116
  self.commit_pipeline();
1039
1117
  }
1040
1118
 
@@ -1043,6 +1121,8 @@ pub fn ReplicaType(
1043
1121
  /// It's possible for the network to be one-way partitioned so that backups don't see the
1044
1122
  /// primary as down, but neither can the primary hear from the backups.
1045
1123
  fn on_commit(self: *Self, message: *const Message) void {
1124
+ assert(message.header.command == .commit);
1125
+
1046
1126
  self.view_jump(message.header);
1047
1127
 
1048
1128
  if (self.status != .normal) {
@@ -1070,6 +1150,13 @@ pub fn ReplicaType(
1070
1150
  assert(message.header.view == self.view);
1071
1151
  assert(message.header.replica == self.primary_index(message.header.view));
1072
1152
 
1153
+ // Old/duplicate heartbeats don't count.
1154
+ if (self.heartbeat_timestamp < message.header.timestamp) {
1155
+ self.heartbeat_timestamp = message.header.timestamp;
1156
+ self.normal_heartbeat_timeout.reset();
1157
+ self.start_view_change_from_all_replicas.unset(self.replica);
1158
+ }
1159
+
1073
1160
  // We may not always have the latest commit entry but if we do our checksum must match:
1074
1161
  if (self.journal.header_with_op(message.header.commit)) |commit_entry| {
1075
1162
  if (commit_entry.checksum == message.header.context) {
@@ -1082,7 +1169,6 @@ pub fn ReplicaType(
1082
1169
  }
1083
1170
  }
1084
1171
 
1085
- self.normal_status_timeout.reset();
1086
1172
  self.commit_journal(message.header.commit);
1087
1173
  }
1088
1174
 
@@ -1154,47 +1240,53 @@ pub fn ReplicaType(
1154
1240
  }
1155
1241
 
1156
1242
  fn on_start_view_change(self: *Self, message: *Message) void {
1157
- if (self.ignore_view_change_message(message)) return;
1243
+ assert(message.header.command == .start_view_change);
1244
+ if (self.ignore_start_view_change_message(message)) return;
1158
1245
 
1246
+ assert(self.replica_count > 1);
1159
1247
  assert(self.status == .normal or self.status == .view_change);
1160
1248
  assert(message.header.view >= self.view);
1161
- assert(message.header.replica != self.replica);
1162
1249
 
1163
1250
  self.view_jump(message.header);
1164
1251
 
1165
- assert(self.status == .view_change);
1166
1252
  assert(message.header.view == self.view);
1167
1253
 
1168
- // Wait until we have `f` messages (excluding ourself) for quorum:
1169
- assert(self.replica_count > 1);
1170
- const threshold = self.quorum_view_change - 1;
1254
+ // Wait until we have `f + 1` messages (possibly including ourself) for quorum.
1255
+ // This ensures that we do not start a view-change while normal request processing
1256
+ // is possible.
1257
+ const threshold = self.quorum_view_change;
1171
1258
 
1172
- const count = self.count_message_and_receive_quorum_exactly_once(
1173
- &self.start_view_change_from_other_replicas,
1174
- message,
1175
- threshold,
1176
- ) orelse return;
1259
+ self.start_view_change_from_all_replicas.set(message.header.replica);
1177
1260
 
1178
- assert(count == threshold);
1179
- assert(!self.start_view_change_from_other_replicas.isSet(self.replica));
1261
+ if (self.replica != message.header.replica and
1262
+ !self.start_view_change_window_timeout.ticking)
1263
+ {
1264
+ self.start_view_change_window_timeout.start();
1265
+ }
1266
+
1267
+ const count = self.start_view_change_from_all_replicas.count();
1268
+ assert(count <= threshold);
1269
+
1270
+ if (count < threshold) {
1271
+ log.debug("{}: on_start_view_change: view={} waiting for quorum ({}/{})", .{
1272
+ self.replica,
1273
+ self.view,
1274
+ count,
1275
+ threshold,
1276
+ });
1277
+ return;
1278
+ }
1180
1279
  log.debug("{}: on_start_view_change: view={} quorum received", .{
1181
1280
  self.replica,
1182
1281
  self.view,
1183
1282
  });
1184
1283
 
1185
- assert(!self.start_view_change_quorum);
1186
- assert(!self.do_view_change_quorum);
1187
- self.start_view_change_quorum = true;
1188
-
1189
- // When replica i receives start_view_change messages for its view from f other replicas,
1190
- // it sends a ⟨do_view_change v, l, v’, n, k, i⟩ message to the node that will be the
1191
- // primary in the new view. Here v is its view, l is its log, v′ is the view number of the
1192
- // latest view in which its status was normal, n is the op number, and k is the commit
1193
- // number.
1194
- self.send_do_view_change();
1195
- defer self.flush_loopback_queue();
1284
+ self.transition_to_view_change_status(self.view + 1);
1285
+ assert(self.start_view_change_from_all_replicas.count() == 0);
1196
1286
  }
1197
1287
 
1288
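
The reworked on_start_view_change above counts start_view_change messages from all replicas, itself included, and only transitions once f + 1 (quorum_view_change) distinct replicas are represented, so a single replica that merely lost its heartbeat cannot force a view change on its own. A condensed sketch of that counting, using std.StaticBitSet as a stand-in for the QuorumCounter type (which this diff does not show):

const std = @import("std");

test "an SVC quorum of f + 1 replicas (including self) triggers a view change" {
    const replica_count = 3;
    const quorum_view_change = 2; // f + 1, with f = 1.

    var svc_from_all_replicas = std.StaticBitSet(replica_count).initEmpty();

    // Our own start_view_change alone is not enough to start a view change:
    svc_from_all_replicas.set(0);
    try std.testing.expect(svc_from_all_replicas.count() < quorum_view_change);

    // A second replica's start_view_change completes the f + 1 quorum:
    svc_from_all_replicas.set(2);
    try std.testing.expect(svc_from_all_replicas.count() >= quorum_view_change);
}
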
+ /// DVC serves two purposes:
1289
+ ///
1198
1290
  /// When the new primary receives f + 1 do_view_change messages from different replicas
1199
1291
  /// (including itself), it sets its view number to that in the messages and selects as the
1200
1292
  /// new log the one contained in the message with the largest v′; if several messages have
@@ -1204,22 +1296,25 @@ pub fn ReplicaType(
1204
1296
  /// informs the other replicas of the completion of the view change by sending
1205
1297
  /// ⟨start_view v, l, n, k⟩ messages to the other replicas, where l is the new log, n is the
1206
1298
  /// op number, and k is the commit number.
1299
+ ///
1300
+ /// When a new backup receives a do_view_change message for a new view, it transitions to
1301
+ /// that new view in view-change status and begins to broadcast its own DVC.
1207
1302
  fn on_do_view_change(self: *Self, message: *Message) void {
1303
+ assert(message.header.command == .do_view_change);
1208
1304
  if (self.ignore_view_change_message(message)) return;
1209
1305
 
1210
1306
  assert(self.status == .normal or self.status == .view_change);
1211
1307
  assert(message.header.view >= self.view);
1212
- assert(self.primary_index(message.header.view) == self.replica);
1213
1308
 
1214
1309
  self.view_jump(message.header);
1215
1310
 
1216
1311
  assert(self.status == .view_change);
1217
1312
  assert(message.header.view == self.view);
1218
1313
 
1219
- // We may receive a `do_view_change` quorum from other replicas, which already have a
1220
- // `start_view_change_quorum`, before we receive a `start_view_change_quorum`:
1221
- if (!self.start_view_change_quorum) {
1222
- log.debug("{}: on_do_view_change: waiting for start_view_change quorum (view={})", .{
1314
+ if (self.primary_index(message.header.view) != self.replica) {
1315
+ for (self.do_view_change_from_all_replicas) |dvc| assert(dvc == null);
1316
+
1317
+ log.debug("{}: on_do_view_change: view={} backup awaiting start_view", .{
1223
1318
  self.replica,
1224
1319
  self.view,
1225
1320
  });
@@ -1244,7 +1339,6 @@ pub fn ReplicaType(
1244
1339
  self.view,
1245
1340
  });
1246
1341
 
1247
- assert(self.start_view_change_quorum);
1248
1342
  assert(!self.do_view_change_quorum);
1249
1343
  self.do_view_change_quorum = true;
1250
1344
 
@@ -1274,6 +1368,7 @@ pub fn ReplicaType(
1274
1368
  /// they execute all operations known to be committed that they haven’t executed previously,
1275
1369
  /// advance their commit number, and update the information in their client table.
1276
1370
  fn on_start_view(self: *Self, message: *const Message) void {
1371
+ assert(message.header.command == .start_view);
1277
1372
  if (self.ignore_view_change_message(message)) return;
1278
1373
 
1279
1374
  if (message.header.op > self.op_checkpoint_trigger()) {
@@ -1320,6 +1415,7 @@ pub fn ReplicaType(
1320
1415
  }
1321
1416
 
1322
1417
  fn on_request_start_view(self: *Self, message: *const Message) void {
1418
+ assert(message.header.command == .request_start_view);
1323
1419
  if (self.ignore_repair_message(message)) return;
1324
1420
 
1325
1421
  assert(self.status == .normal);
@@ -1350,6 +1446,7 @@ pub fn ReplicaType(
1350
1446
  /// prepare. If a guaranteed prepare is found to be faulty, the replica must repair it
1351
1447
  /// to restore durability.
1352
1448
  fn on_request_prepare(self: *Self, message: *const Message) void {
1449
+ assert(message.header.command == .request_prepare);
1353
1450
  if (self.ignore_repair_message(message)) return;
1354
1451
 
1355
1452
  assert(self.replica_count > 1);
@@ -1483,6 +1580,7 @@ pub fn ReplicaType(
1483
1580
  }
1484
1581
 
1485
1582
  fn on_request_headers(self: *Self, message: *const Message) void {
1583
+ assert(message.header.command == .request_headers);
1486
1584
  if (self.ignore_repair_message(message)) return;
1487
1585
 
1488
1586
  assert(self.status == .normal or self.status == .view_change);
@@ -1525,6 +1623,7 @@ pub fn ReplicaType(
1525
1623
  }
1526
1624
 
1527
1625
  fn on_nack_prepare(self: *Self, message: *Message) void {
1626
+ assert(message.header.command == .nack_prepare);
1528
1627
  if (self.ignore_repair_message(message)) return;
1529
1628
 
1530
1629
  assert(self.status == .view_change);
@@ -1622,6 +1721,7 @@ pub fn ReplicaType(
1622
1721
  }
1623
1722
 
1624
1723
  fn on_headers(self: *Self, message: *const Message) void {
1724
+ assert(message.header.command == .headers);
1625
1725
  if (self.ignore_repair_message(message)) return;
1626
1726
 
1627
1727
  assert(self.status == .normal or self.status == .view_change);
@@ -1665,12 +1765,12 @@ pub fn ReplicaType(
1665
1765
  assert(self.status == .normal);
1666
1766
  assert(self.primary());
1667
1767
 
1668
- const prepare = self.pipeline.queue.prepare_queue.head_ptr().?;
1669
- assert(prepare.message.header.command == .prepare);
1670
- assert(prepare.message.header.op == self.commit_min + 1);
1768
+ const prepare = self.primary_pipeline_pending().?;
1671
1769
 
1672
- if (prepare.ok_quorum_received) {
1673
- assert(self.committing);
1770
+ if (self.replica_count == 1) {
1771
+ // Replica=1 doesn't write prepares concurrently to avoid gaps in its WAL.
1772
+ assert(self.journal.writes.executing() <= 1);
1773
+ assert(self.journal.writes.executing() == 1 or self.committing);
1674
1774
 
1675
1775
  self.prepare_timeout.reset();
1676
1776
  return;
@@ -1701,10 +1801,15 @@ pub fn ReplicaType(
1701
1801
  }
1702
1802
 
1703
1803
  if (waiting_len == 0) {
1704
- self.prepare_timeout.reset();
1804
+ // TODO: This assert will be valid when the state-transfer is implemented and the
1805
+ // threshold=replica_count hack is removed from on_prepare_ok.
1806
+ // assert(self.quorum_replication == self.replica_count);
1807
+ assert(!prepare.ok_from_all_replicas.isSet(self.replica));
1808
+ assert(prepare.ok_from_all_replicas.count() == self.replica_count - 1);
1809
+ assert(prepare.message.header.op <= self.op);
1705
1810
 
1811
+ self.prepare_timeout.reset();
1706
1812
  log.debug("{}: on_prepare_timeout: waiting for journal", .{self.replica});
1707
- assert(!prepare.ok_from_all_replicas.isSet(self.replica));
1708
1813
 
1709
1814
  // We may be slow and waiting for the write to complete.
1710
1815
  //
@@ -1716,9 +1821,7 @@ pub fn ReplicaType(
1716
1821
  //
1717
1822
  // Retry the write through `on_repair()` which will work out which is which.
1718
1823
  // We do expect that the op would have been run through `on_prepare()` already.
1719
- assert(prepare.message.header.op <= self.op);
1720
1824
  self.on_repair(prepare.message);
1721
-
1722
1825
  return;
1723
1826
  }
1724
1827
 
@@ -1746,13 +1849,39 @@ pub fn ReplicaType(
1746
1849
  self.send_message_to_replica(replica, prepare.message);
1747
1850
  }
1748
1851
 
1749
- fn on_commit_timeout(self: *Self) void {
1750
- self.commit_timeout.reset();
1852
+ fn on_primary_abdicate_timeout(self: *Self) void {
1853
+ assert(self.status == .normal);
1854
+ assert(self.primary());
1855
+ assert(self.primary_pipeline_pending() != null);
1856
+ self.primary_abdicate_timeout.reset();
1857
+ if (self.replica_count == 1) return;
1858
+
1859
+ log.debug("{}: on_primary_abdicate_timeout: abdicating (view={})", .{
1860
+ self.replica,
1861
+ self.view,
1862
+ });
1863
+ self.primary_abdicating = true;
1864
+ }
1865
+
1866
+ fn on_commit_message_timeout(self: *Self) void {
1867
+ self.commit_message_timeout.reset();
1751
1868
 
1752
1869
  assert(self.status == .normal);
1753
1870
  assert(self.primary());
1754
1871
  assert(self.commit_min == self.commit_max);
1755
1872
 
1873
+ if (self.primary_abdicating) {
1874
+ assert(self.primary_abdicate_timeout.ticking);
1875
+ assert(self.pipeline.queue.prepare_queue.count > 0);
1876
+ assert(self.primary_pipeline_pending() != null);
1877
+
1878
+ log.debug("{}: on_commit_message_timeout: primary abdicating (view={})", .{
1879
+ self.replica,
1880
+ self.view,
1881
+ });
1882
+ return;
1883
+ }
1884
+
1756
1885
  const latest_committed_entry = checksum: {
1757
1886
  if (self.commit_max == self.superblock.working.vsr_state.commit_min) {
1758
1887
  break :checksum self.superblock.working.vsr_state.commit_min_checksum;
@@ -1768,37 +1897,86 @@ pub fn ReplicaType(
1768
1897
  .replica = self.replica,
1769
1898
  .view = self.view,
1770
1899
  .commit = self.commit_max,
1900
+ .timestamp = self.clock.monotonic(),
1771
1901
  });
1772
1902
  }
1773
1903
 
1774
- fn on_normal_status_timeout(self: *Self) void {
1904
+ fn on_normal_heartbeat_timeout(self: *Self) void {
1775
1905
  assert(self.status == .normal);
1776
1906
  assert(self.backup());
1777
- self.transition_to_view_change_status(self.view + 1);
1907
+ self.normal_heartbeat_timeout.reset();
1908
+
1909
+ if (self.replica_count == 1) return;
1910
+
1911
+ log.debug("{}: on_normal_heartbeat_timeout: heartbeat lost (view={})", .{
1912
+ self.replica,
1913
+ self.view,
1914
+ });
1915
+ self.send_start_view_change();
1916
+ }
1917
+
1918
+ fn on_start_view_change_window_timeout(self: *Self) void {
1919
+ assert(self.status == .normal or self.status == .view_change);
1920
+ assert(self.start_view_change_from_all_replicas.count() > 0);
1921
+ assert(self.replica_count > 1);
1922
+ self.start_view_change_window_timeout.stop();
1923
+
1924
+ // Don't reset our own SVC; it will be reset if/when we receive a heartbeat.
1925
+ const svc = self.start_view_change_from_all_replicas.isSet(self.replica);
1926
+ self.reset_quorum_start_view_change();
1927
+ if (svc) self.start_view_change_from_all_replicas.set(self.replica);
1928
+ }
1929
+
1930
+ fn on_start_view_change_message_timeout(self: *Self) void {
1931
+ assert(self.status == .normal or self.status == .view_change);
1932
+ self.start_view_change_message_timeout.reset();
1933
+ if (self.replica_count == 1) return;
1934
+
1935
+ if (self.start_view_change_from_all_replicas.isSet(self.replica)) {
1936
+ self.send_start_view_change();
1937
+ }
1778
1938
  }
1779
1939
 
1780
1940
  fn on_view_change_status_timeout(self: *Self) void {
1781
1941
  assert(self.status == .view_change);
1782
- self.transition_to_view_change_status(self.view + 1);
1942
+ assert(self.replica_count > 1);
1943
+ self.view_change_status_timeout.reset();
1944
+
1945
+ self.send_start_view_change();
1783
1946
  }
1784
1947
 
1785
- fn on_view_change_message_timeout(self: *Self) void {
1786
- self.view_change_message_timeout.reset();
1948
+ fn on_do_view_change_message_timeout(self: *Self) void {
1787
1949
  assert(self.status == .view_change);
1950
+ assert(self.replica_count > 1);
1951
+ self.do_view_change_message_timeout.reset();
1788
1952
 
1789
- // Keep sending `start_view_change` messages:
1790
- // We may have a `start_view_change_quorum` but other replicas may not.
1791
- // However, the primary may stop sending once it has a `do_view_change_quorum`.
1792
- if (!self.do_view_change_quorum) self.send_start_view_change();
1793
-
1794
- // It is critical that a `do_view_change` message implies a `start_view_change_quorum`:
1795
- if (self.start_view_change_quorum) {
1796
- // The primary need not retry to send a `do_view_change` message to itself:
1797
- // We assume the MessageBus will not drop messages sent by a replica to itself.
1798
- if (self.primary_index(self.view) != self.replica) self.send_do_view_change();
1953
+ if (self.primary_index(self.view) == self.replica and self.do_view_change_quorum) {
1954
+ // A primary in status=view_change with a complete DVC quorum must be repairing —
1955
+ // it does not need to signal other replicas.
1956
+ assert(self.view == self.log_view);
1957
+ } else {
1958
+ assert(self.view > self.log_view);
1959
+ self.send_do_view_change();
1799
1960
  }
1800
1961
  }
1801
1962
 
1963
+ fn on_request_start_view_message_timeout(self: *Self) void {
1964
+ assert(self.status == .view_change);
1965
+ assert(self.primary_index(self.view) != self.replica);
1966
+ self.request_start_view_message_timeout.reset();
1967
+
1968
+ log.debug("{}: on_request_start_view_message_timeout: view={}", .{
1969
+ self.replica,
1970
+ self.view,
1971
+ });
1972
+ self.send_header_to_replica(self.primary_index(self.view), .{
1973
+ .command = .request_start_view,
1974
+ .cluster = self.cluster,
1975
+ .replica = self.replica,
1976
+ .view = self.view,
1977
+ });
1978
+ }
1979
+
1802
1980
  fn on_repair_timeout(self: *Self) void {
1803
1981
  assert(self.status == .normal or self.status == .view_change);
1804
1982
  self.repair();
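
Taken together, on_primary_abdicate_timeout and the early return at the top of on_commit_message_timeout mean that a primary whose prepares are not being acknowledged stops advertising itself via heartbeats, which lets the backups' normal_heartbeat_timeout expire and start a view change instead of deadlocking. A hypothetical condensation of that decision (should_send_commit_heartbeat is not a function in the package):

const std = @import("std");

// While the primary is abdicating (a prepare has waited too long for
// prepare_ok's), it withholds commit heartbeats so that the backups'
// normal_heartbeat_timeout can fire and elect a different primary.
fn should_send_commit_heartbeat(primary_abdicating: bool, prepares_pending: bool) bool {
    if (primary_abdicating and prepares_pending) return false;
    return true;
}

test "an abdicating primary with a stuck prepare withholds heartbeats" {
    try std.testing.expect(!should_send_commit_heartbeat(true, true));
    try std.testing.expect(should_send_commit_heartbeat(false, true));
    try std.testing.expect(should_send_commit_heartbeat(false, false));
}
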
@@ -1926,13 +2104,6 @@ pub fn ReplicaType(
1926
2104
  assert(self.status == .normal);
1927
2105
  assert(self.primary());
1928
2106
  },
1929
- .start_view_change => {
1930
- assert(self.replica_count > 1);
1931
- if (self.replica_count == 2) assert(threshold == 1);
1932
-
1933
- assert(self.status == .view_change);
1934
- assert(self.replica != message.header.replica);
1935
- },
1936
2107
  .nack_prepare => {
1937
2108
  assert(self.replica_count > 1);
1938
2109
  if (self.replica_count == 2) assert(threshold >= 1);
@@ -2270,7 +2441,6 @@ pub fn ReplicaType(
2270
2441
  assert(prepare.message.header.checksum == self.commit_prepare.?.header.checksum);
2271
2442
  assert(prepare.message.header.op == self.commit_min);
2272
2443
  assert(prepare.message.header.op == self.commit_max);
2273
- assert(self.prepare_timeout.ticking);
2274
2444
 
2275
2445
  self.message_bus.unref(prepare.message);
2276
2446
 
@@ -2288,10 +2458,6 @@ pub fn ReplicaType(
2288
2458
  });
2289
2459
  self.write_prepare(next.message, .append);
2290
2460
  }
2291
- } else {
2292
- // When the pipeline is empty, stop the prepare timeout.
2293
- // The timeout will be restarted when another entry arrives for the pipeline.
2294
- self.prepare_timeout.stop();
2295
2461
  }
2296
2462
  }
2297
2463
 
@@ -2815,7 +2981,7 @@ pub fn ReplicaType(
2815
2981
  assert(
2816
2982
  header.view == self.view or
2817
2983
  header.command == .request_start_view or
2818
- header.command == .pong or header.command == .ping,
2984
+ header.command == .ping or header.command == .pong,
2819
2985
  );
2820
2986
  assert(header.size == @sizeOf(Header));
2821
2987
 
@@ -2864,25 +3030,20 @@ pub fn ReplicaType(
2864
3030
  assert(self.journal.header_with_op(self.op) != null);
2865
3031
  }
2866
3032
 
2867
- /// Returns whether the replica is a backup for the current view.
2868
- /// This may be used only when the replica status is normal.
2869
- fn backup(self: *Self) bool {
2870
- return !self.primary();
2871
- }
2872
-
2873
3033
  fn flush_loopback_queue(self: *Self) void {
2874
- // There are four cases where a replica will send a message to itself:
2875
- // However, of these four cases, all but one call send_message_to_replica().
3034
+ // There are five cases where a replica will send a message to itself:
3035
+ // However, of these five cases, all but one call send_message_to_replica().
2876
3036
  //
2877
3037
  // 1. In on_request(), the primary sends a synchronous prepare to itself, but this is
2878
3038
  // done by calling on_prepare() directly, and subsequent prepare timeout retries will
2879
3039
  // never resend to self.
2880
3040
  // 2. In on_prepare(), after writing to storage, the primary sends a (typically)
2881
3041
  // asynchronous prepare_ok to itself.
2882
- // 3. In on_start_view_change(), after receiving a quorum of start_view_change
2883
- // messages, the new primary sends a synchronous do_view_change to itself.
3042
+ // 3. In transition_to_view_change_status(), the new primary sends a synchronous DVC to
3043
+ // itself.
2884
3044
  // 4. In primary_start_view_as_the_new_primary(), the new primary sends itself a
2885
3045
  // prepare_ok message for each uncommitted message.
3046
+ // 5. In send_start_view_change(), a replica sends itself a start_view_change message.
2886
3047
  if (self.loopback_queue) |message| {
2887
3048
  defer self.message_bus.unref(message);
2888
3049
 
@@ -2898,6 +3059,8 @@ pub fn ReplicaType(
2898
3059
  }
2899
3060
 
2900
3061
  fn ignore_prepare_ok(self: *Self, message: *const Message) bool {
3062
+ assert(message.header.command == .prepare_ok);
3063
+
2901
3064
  if (self.primary_index(message.header.view) == self.replica) {
2902
3065
  assert(message.header.view <= self.view);
2903
3066
  }
@@ -3218,9 +3381,31 @@ pub fn ReplicaType(
3218
3381
  return false;
3219
3382
  }
3220
3383
 
3221
- fn ignore_view_change_message(self: *Self, message: *const Message) bool {
3222
- assert(message.header.command == .start_view_change or
3223
- message.header.command == .do_view_change or
3384
+ fn ignore_start_view_change_message(self: *const Self, message: *const Message) bool {
3385
+ assert(message.header.command == .start_view_change);
3386
+
3387
+ switch (self.status) {
3388
+ .normal, .view_change => {},
3389
+ .recovering => unreachable, // Single node clusters don't have view changes.
3390
+ .recovering_head => {
3391
+ log.debug("{}: on_start_view_change: ignoring (status={})", .{
3392
+ self.replica,
3393
+ self.status,
3394
+ });
3395
+ return true;
3396
+ },
3397
+ }
3398
+
3399
+ if (message.header.view < self.view) {
3400
+ log.debug("{}: on_start_view_change: ignoring (older view)", .{self.replica});
3401
+ return true;
3402
+ }
3403
+
3404
+ return false;
3405
+ }
3406
+
3407
+ fn ignore_view_change_message(self: *const Self, message: *const Message) bool {
3408
+ assert(message.header.command == .do_view_change or
3224
3409
  message.header.command == .start_view);
3225
3410
  assert(message.header.view > 0); // The initial view is already zero.
3226
3411
  assert(self.status != .recovering); // Single node clusters don't have view changes.
@@ -3228,6 +3413,7 @@ pub fn ReplicaType(
3228
3413
  const command: []const u8 = @tagName(message.header.command);
3229
3414
 
3230
3415
  if (self.status == .recovering_head and message.header.command != .start_view) {
3416
+ log.debug("{}: on_{s}: ignoring (recovering_head)", .{ self.replica, command });
3231
3417
  return true;
3232
3418
  }
3233
3419
 
@@ -3243,18 +3429,13 @@ pub fn ReplicaType(
3243
3429
 
3244
3430
  // These may be caused by faults in the network topology.
3245
3431
  switch (message.header.command) {
3246
- .start_view_change, .start_view => {
3432
+ .start_view => {
3247
3433
  if (message.header.replica == self.replica) {
3248
3434
  log.warn("{}: on_{s}: ignoring (self)", .{ self.replica, command });
3249
3435
  return true;
3250
3436
  }
3251
3437
  },
3252
- .do_view_change => {
3253
- if (self.primary_index(message.header.view) != self.replica) {
3254
- log.warn("{}: on_{s}: ignoring (backup)", .{ self.replica, command });
3255
- return true;
3256
- }
3257
- },
3438
+ .do_view_change => {},
3258
3439
  else => unreachable,
3259
3440
  }
3260
3441
 
@@ -3275,6 +3456,11 @@ pub fn ReplicaType(
3275
3456
  return false;
3276
3457
  }
3277
3458
 
3459
+ /// Returns the index into the configuration of the primary for a given view.
3460
+ fn primary_index(self: *const Self, view: u32) u8 {
3461
+ return @intCast(u8, @mod(view, self.replica_count));
3462
+ }
3463
+
3278
3464
  /// Returns whether the replica is the primary for the current view.
3279
3465
  /// This may be used only when the replica status is normal.
3280
3466
  fn primary(self: *const Self) bool {
@@ -3282,9 +3468,10 @@ pub fn ReplicaType(
3282
3468
  return self.primary_index(self.view) == self.replica;
3283
3469
  }
3284
3470
 
3285
- /// Returns the index into the configuration of the primary for a given view.
3286
- fn primary_index(self: *const Self, view: u32) u8 {
3287
- return @intCast(u8, @mod(view, self.replica_count));
3471
+ /// Returns whether the replica is a backup for the current view.
3472
+ /// This may be used only when the replica status is normal.
3473
+ fn backup(self: *const Self) bool {
3474
+ return !self.primary();
3288
3475
  }
3289
3476
 
3290
3477
  /// Advances `op` to where we need to be before `header` can be processed as a prepare.
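
primary_index above is pure round-robin: the primary for a view is the view modulo the replica count, so each view increment hands leadership to the next replica in the configuration. A tiny worked example (primary_index_for is an illustrative name, not the replica's method):

const std = @import("std");

// Round-robin primary selection, as primary_index() computes above.
fn primary_index_for(view: u32, replica_count: u8) u32 {
    return view % replica_count;
}

test "the primary rotates through the configuration as the view increments" {
    try std.testing.expectEqual(@as(u32, 0), primary_index_for(0, 3));
    try std.testing.expectEqual(@as(u32, 1), primary_index_for(7, 3));
    try std.testing.expectEqual(@as(u32, 2), primary_index_for(11, 3));
}
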
@@ -3512,14 +3699,17 @@ pub fn ReplicaType(
3512
3699
 
3513
3700
  log.debug("{}: primary_pipeline_next: prepare {}", .{ self.replica, message.header.checksum });
3514
3701
 
3515
- if (self.pipeline.queue.prepare_queue.tail_ptr()) |previous| {
3702
+ if (self.primary_pipeline_pending()) |_| {
3516
3703
  // Do not restart the prepare timeout as it is already ticking for another prepare.
3517
- assert(self.prepare_timeout.ticking);
3704
+ const previous = self.pipeline.queue.prepare_queue.tail_ptr().?;
3518
3705
  assert(previous.message.header.checksum == message.header.parent);
3706
+ assert(self.prepare_timeout.ticking);
3707
+ assert(self.primary_abdicate_timeout.ticking);
3519
3708
  } else {
3520
- // We are about to add the first prepare to the pipeline, so start the timeout.
3521
3709
  assert(!self.prepare_timeout.ticking);
3710
+ assert(!self.primary_abdicate_timeout.ticking);
3522
3711
  self.prepare_timeout.start();
3712
+ self.primary_abdicate_timeout.start();
3523
3713
  }
3524
3714
  self.pipeline.queue.push_prepare(message);
3525
3715
  self.on_prepare(message);
@@ -3529,6 +3719,24 @@ pub fn ReplicaType(
3529
3719
  assert(self.op == message.header.op);
3530
3720
  }
3531
3721
 
3722
+ /// Returns the next prepare in the pipeline waiting for a quorum.
3723
+ /// Returns null when the pipeline is empty.
3724
+ /// Returns null when the pipeline is nonempty but all prepares have a quorum.
3725
+ fn primary_pipeline_pending(self: *const Self) ?*const Prepare {
3726
+ assert(self.status == .normal);
3727
+ assert(self.primary());
3728
+
3729
+ var prepares = self.pipeline.queue.prepare_queue.iterator();
3730
+ while (prepares.next_ptr()) |prepare| {
3731
+ assert(prepare.message.header.command == .prepare);
3732
+ if (!prepare.ok_quorum_received) {
3733
+ return prepare;
3734
+ }
3735
+ } else {
3736
+ return null;
3737
+ }
3738
+ }
3739
+
3532
3740
  fn pipeline_prepare_by_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Message {
3533
3741
  assert(self.status == .normal or self.status == .view_change);
3534
3742
  assert(self.replica == self.primary_index(self.view) or checksum != null);
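
The new primary_pipeline_pending() above scans the prepare queue in order and stops at the first prepare that is still short of its prepare_ok quorum; the prepare and abdicate timeouts key off that prepare. A miniature of the same scan over a plain slice (PendingPrepare and pipeline_pending are illustrative names only):

const std = @import("std");

// Illustrative miniature of primary_pipeline_pending(): return the position of
// the first prepare still waiting on a prepare_ok quorum, or null when the
// pipeline is empty or every prepare already has its quorum.
const PendingPrepare = struct { ok_quorum_received: bool };

fn pipeline_pending(pipeline: []const PendingPrepare) ?usize {
    var index: usize = 0;
    while (index < pipeline.len) : (index += 1) {
        if (!pipeline[index].ok_quorum_received) return index;
    }
    return null;
}

test "returns null only when every prepare has its quorum" {
    const stalled = [_]PendingPrepare{
        .{ .ok_quorum_received = true },
        .{ .ok_quorum_received = false },
    };
    try std.testing.expectEqual(@as(?usize, 1), pipeline_pending(&stalled));

    const done = [_]PendingPrepare{.{ .ok_quorum_received = true }};
    try std.testing.expectEqual(@as(?usize, null), pipeline_pending(&done));
}
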
@@ -4349,7 +4557,7 @@ pub fn ReplicaType(
4349
4557
  return;
4350
4558
  }
4351
4559
 
4352
- const next = @mod(self.replica + 1, @intCast(u8, self.replica_count));
4560
+ const next = @mod(self.replica + 1, self.replica_count);
4353
4561
  if (next == self.primary_index(message.header.view)) {
4354
4562
  log.debug("{}: replicate: not replicating (completed)", .{self.replica});
4355
4563
  return;
@@ -4412,13 +4620,13 @@ pub fn ReplicaType(
4412
4620
  }
4413
4621
 
4414
4622
  fn reset_quorum_nack_prepare(self: *Self) void {
4623
+ assert(!self.nack_prepare_from_other_replicas.isSet(self.replica));
4415
4624
  self.reset_quorum_counter(&self.nack_prepare_from_other_replicas);
4416
4625
  self.nack_prepare_op = null;
4417
4626
  }
4418
4627
 
4419
4628
  fn reset_quorum_start_view_change(self: *Self) void {
4420
- self.reset_quorum_counter(&self.start_view_change_from_other_replicas);
4421
- self.start_view_change_quorum = false;
4629
+ self.reset_quorum_counter(&self.start_view_change_from_all_replicas);
4422
4630
  }
4423
4631
 
4424
4632
  fn send_prepare_ok(self: *Self, header: *const Header) void {
@@ -4506,28 +4714,28 @@ pub fn ReplicaType(
4506
4714
  }
4507
4715
 
4508
4716
  fn send_start_view_change(self: *Self) void {
4509
- assert(self.status == .view_change);
4510
- assert(self.log_view < self.view);
4511
- assert(!self.do_view_change_quorum);
4512
- // Send only to other replicas (and not to ourself) to avoid a quorum off-by-one error:
4513
- // This could happen if the replica mistakenly counts its own message in the quorum.
4514
- self.send_header_to_other_replicas(.{
4717
+ assert(self.status == .normal or self.status == .view_change);
4718
+
4719
+ const header: Header = .{
4515
4720
  .command = .start_view_change,
4516
4721
  .cluster = self.cluster,
4517
4722
  .replica = self.replica,
4518
4723
  .view = self.view,
4519
- });
4724
+ };
4725
+
4726
+ self.send_header_to_other_replicas(header);
4727
+
4728
+ if (!self.start_view_change_from_all_replicas.isSet(self.replica)) {
4729
+ self.send_header_to_replica(self.replica, header);
4730
+ defer self.flush_loopback_queue();
4731
+ }
4520
4732
  }
4521
4733
 
4522
4734
  fn send_do_view_change(self: *Self) void {
4523
4735
  assert(self.status == .view_change);
4524
- assert(self.start_view_change_quorum);
4736
+ assert(self.view > self.log_view);
4525
4737
  assert(!self.do_view_change_quorum);
4526
4738
 
4527
- const count_start_view_change = self.start_view_change_from_other_replicas.count();
4528
- assert(count_start_view_change >= self.quorum_view_change - 1);
4529
- assert(count_start_view_change <= self.replica_count - 1);
4530
-
4531
4739
  const message = self.create_view_change_message(.do_view_change);
4532
4740
  defer self.message_bus.unref(message);
4533
4741
 
@@ -4546,7 +4754,14 @@ pub fn ReplicaType(
4546
4754
  assert(message.header.commit == self.commit_min);
4547
4755
  DVCQuorum.verify_message(message);
4548
4756
 
4549
- self.send_message_to_replica(self.primary_index(self.view), message);
4757
+ self.send_message_to_other_replicas(message);
4758
+
4759
+ if (self.replica == self.primary_index(self.view) and
4760
+ self.do_view_change_from_all_replicas[self.replica] == null)
4761
+ {
4762
+ self.send_message_to_replica(self.replica, message);
4763
+ defer self.flush_loopback_queue();
4764
+ }
4550
4765
  }
4551
4766
 
4552
4767
  fn send_eviction_message_to_client(self: *Self, client: u128) void {
@@ -4568,6 +4783,8 @@ pub fn ReplicaType(
4568
4783
  }
4569
4784
 
4570
4785
  fn send_header_to_client(self: *Self, client: u128, header: Header) void {
4786
+ assert(header.cluster == self.cluster);
4787
+
4571
4788
  const message = self.create_message_from_header(header);
4572
4789
  defer self.message_bus.unref(message);
4573
4790
 
@@ -4643,14 +4860,15 @@ pub fn ReplicaType(
4643
4860
  assert(replica == self.primary_index(self.view));
4644
4861
  assert(message.header.replica == self.replica);
4645
4862
  },
4863
+ .reply => unreachable,
4646
4864
  .start_view_change => {
4647
- assert(self.status == .view_change);
4865
+ assert(self.status == .normal or self.status == .view_change);
4648
4866
  assert(message.header.view == self.view);
4649
4867
  assert(message.header.replica == self.replica);
4650
4868
  },
4651
4869
  .do_view_change => {
4652
4870
  assert(self.status == .view_change);
4653
- assert(self.start_view_change_quorum);
4871
+ assert(self.view > self.log_view);
4654
4872
  assert(!self.do_view_change_quorum);
4655
4873
  assert(message.header.view == self.view);
4656
4874
  assert(message.header.replica == self.replica);
@@ -4658,23 +4876,12 @@ pub fn ReplicaType(
4658
4876
  assert(message.header.commit == self.commit_min);
4659
4877
  assert(message.header.timestamp == self.log_view);
4660
4878
  },
4661
- .start_view => switch (self.status) {
4662
- .normal => {
4663
- // A backup may ask the primary to resend the start_view message.
4664
- assert(!self.start_view_change_quorum);
4665
- assert(!self.do_view_change_quorum);
4666
- assert(message.header.view == self.view);
4667
- assert(message.header.replica == self.replica);
4668
- assert(message.header.replica != replica);
4669
- },
4670
- .view_change => {
4671
- assert(self.start_view_change_quorum);
4672
- assert(self.do_view_change_quorum);
4673
- assert(message.header.view == self.view);
4674
- assert(message.header.replica == self.replica);
4675
- assert(message.header.replica != replica);
4676
- },
4677
- else => unreachable,
4879
+ .start_view => {
4880
+ assert(self.status == .normal);
4881
+ assert(!self.do_view_change_quorum);
4882
+ assert(message.header.view == self.view);
4883
+ assert(message.header.replica == self.replica);
4884
+ assert(message.header.replica != replica);
4678
4885
  },
4679
4886
  .headers => {
4680
4887
  assert(self.status == .normal or self.status == .view_change);
@@ -4682,11 +4889,18 @@ pub fn ReplicaType(
4682
4889
  assert(message.header.replica == self.replica);
4683
4890
  assert(message.header.replica != replica);
4684
4891
  },
4685
- .ping, .pong => {
4686
- assert(message.header.view == 0);
4892
+ .ping => {
4893
+ assert(self.status == .normal);
4894
+ assert(message.header.replica == self.replica);
4895
+ assert(message.header.replica != replica);
4896
+ },
4897
+ .pong => {
4898
+ assert(self.status == .normal or self.status == .view_change);
4687
4899
  assert(message.header.replica == self.replica);
4688
4900
  assert(message.header.replica != replica);
4689
4901
  },
4902
+ .ping_client => unreachable,
4903
+ .pong_client => unreachable,
4690
4904
  .commit => {
4691
4905
  assert(self.status == .normal);
4692
4906
  assert(self.primary());
@@ -4716,12 +4930,14 @@ pub fn ReplicaType(
4716
4930
  assert(message.header.replica != replica);
4717
4931
  assert(self.primary_index(self.view) == replica);
4718
4932
  },
4719
- else => {
4720
- log.info("{}: send_message_to_replica: TODO {s}", .{
4721
- self.replica,
4722
- @tagName(message.header.command),
4723
- });
4933
+ .eviction => {
4934
+ assert(self.status == .normal);
4935
+ assert(self.primary());
4936
+ assert(message.header.view == self.view);
4937
+ assert(message.header.replica == self.replica);
4724
4938
  },
4939
+ .request_block => unreachable,
4940
+ .block => unreachable,
4725
4941
  }
4726
4942
 
4727
4943
  if (replica != self.replica) {
@@ -4929,27 +5145,12 @@ pub fn ReplicaType(
4929
5145
  }
4930
5146
 
4931
5147
  if (self.status == .view_change and self.log_view < self.view) {
4932
- if (self.primary_index(self.view) != self.replica and
4933
- self.start_view_change_quorum and !self.do_view_change_quorum)
4934
- {
4935
- self.send_do_view_change();
4936
- }
5148
+ if (!self.do_view_change_quorum) self.send_do_view_change();
4937
5149
  }
4938
5150
  }
4939
5151
 
4940
5152
  fn set_op_and_commit_max(self: *Self, op: u64, commit_max: u64, method: []const u8) void {
4941
- assert(self.status == .view_change or self.status == .recovering);
4942
-
4943
- switch (self.status) {
4944
- .normal => unreachable,
4945
- .view_change => {},
4946
- .recovering => {
4947
- // The replica's view hasn't been set yet.
4948
- // It will be set shortly, when we transition to normal status.
4949
- assert(self.view == 0);
4950
- },
4951
- .recovering_head => unreachable,
4952
- }
5153
+ assert(self.status == .view_change);
4953
5154
 
4954
5155
  // Uncommitted ops may not survive a view change so we must assert `op` against
4955
5156
  // `commit_max` and not `self.op`. However, committed ops (`commit_max`) must survive:
@@ -5026,9 +5227,9 @@ pub fn ReplicaType(
5026
5227
  /// replica 0 doesn't have 6/7, then replica 1/2 must share the latest log_view. ∎)
5027
5228
  fn primary_set_log_from_do_view_change_messages(self: *Self) void {
5028
5229
  assert(self.status == .view_change);
5230
+ assert(self.view > self.log_view);
5029
5231
  assert(self.primary_index(self.view) == self.replica);
5030
5232
  assert(self.replica_count > 1);
5031
- assert(self.start_view_change_quorum);
5032
5233
  assert(self.do_view_change_quorum);
5033
5234
  assert(self.do_view_change_from_all_replicas[self.replica] != null);
5034
5235
  DVCQuorum.verify(self.do_view_change_from_all_replicas);
@@ -5249,12 +5450,16 @@ pub fn ReplicaType(
5249
5450
 
5250
5451
  assert(self.replica_count == 1);
5251
5452
  assert(!self.prepare_timeout.ticking);
5252
- assert(!self.normal_status_timeout.ticking);
5453
+ assert(!self.primary_abdicate_timeout.ticking);
5454
+ assert(!self.normal_heartbeat_timeout.ticking);
5455
+ assert(!self.start_view_change_window_timeout.ticking);
5253
5456
  assert(!self.view_change_status_timeout.ticking);
5254
- assert(!self.view_change_message_timeout.ticking);
5457
+ assert(!self.do_view_change_message_timeout.ticking);
5458
+ assert(!self.request_start_view_message_timeout.ticking);
5255
5459
 
5256
5460
  self.ping_timeout.start();
5257
- self.commit_timeout.start();
5461
+ self.start_view_change_message_timeout.start();
5462
+ self.commit_message_timeout.start();
5258
5463
  self.repair_timeout.start();
5259
5464
 
5260
5465
  self.pipeline.cache.deinit(self.message_bus.pool);
@@ -5269,12 +5474,17 @@ pub fn ReplicaType(
                 );

                 assert(!self.prepare_timeout.ticking);
-                assert(!self.commit_timeout.ticking);
+                assert(!self.primary_abdicate_timeout.ticking);
+                assert(!self.normal_heartbeat_timeout.ticking);
+                assert(!self.start_view_change_window_timeout.ticking);
+                assert(!self.commit_message_timeout.ticking);
                 assert(!self.view_change_status_timeout.ticking);
-                assert(!self.view_change_message_timeout.ticking);
+                assert(!self.do_view_change_message_timeout.ticking);
+                assert(!self.request_start_view_message_timeout.ticking);

                 self.ping_timeout.start();
-                self.normal_status_timeout.start();
+                self.normal_heartbeat_timeout.start();
+                self.start_view_change_message_timeout.start();
                 self.repair_timeout.start();
             }
         }
@@ -5285,6 +5495,7 @@ pub fn ReplicaType(
             assert(self.status == .view_change);
             assert(view_new >= self.view);
             assert(self.journal.header_with_op(self.op) != null);
+            assert(!self.primary_abdicating);

             self.status = .normal;

@@ -5295,6 +5506,8 @@ pub fn ReplicaType(
                 );

                 assert(!self.prepare_timeout.ticking);
+                assert(!self.normal_heartbeat_timeout.ticking);
+                assert(!self.primary_abdicate_timeout.ticking);
                 assert(!self.pipeline_repairing);
                 assert(self.pipeline == .queue);
                 assert(self.view == view_new);
@@ -5308,14 +5521,19 @@ pub fn ReplicaType(
                 self.view_durable_update();

                 self.ping_timeout.start();
-                self.commit_timeout.start();
-                self.normal_status_timeout.stop();
+                self.commit_message_timeout.start();
+                self.start_view_change_window_timeout.stop();
+                self.start_view_change_message_timeout.start();
                 self.view_change_status_timeout.stop();
-                self.view_change_message_timeout.stop();
+                self.do_view_change_message_timeout.stop();
+                self.request_start_view_message_timeout.stop();
                 self.repair_timeout.start();

                 // Do not reset the pipeline as there may be uncommitted ops to drive to completion.
-                if (self.pipeline.queue.prepare_queue.count > 0) self.prepare_timeout.start();
+                if (self.pipeline.queue.prepare_queue.count > 0) {
+                    self.prepare_timeout.start();
+                    self.primary_abdicate_timeout.start();
+                }
             } else {
                 log.debug("{}: transition_to_normal_from_view_change_status: view={}..{} backup", .{
                     self.replica,
@@ -5324,6 +5542,9 @@ pub fn ReplicaType(
                 });

                 assert(!self.prepare_timeout.ticking);
+                assert(!self.normal_heartbeat_timeout.ticking);
+                assert(!self.primary_abdicate_timeout.ticking);
+                assert(self.request_start_view_message_timeout.ticking);
                 assert(self.pipeline == .cache);

                 if (self.log_view == view_new and self.view == view_new) {
@@ -5336,24 +5557,27 @@ pub fn ReplicaType(
                 }

                 self.ping_timeout.start();
-                self.commit_timeout.stop();
-                self.normal_status_timeout.start();
+                self.commit_message_timeout.stop();
+                self.normal_heartbeat_timeout.start();
+                self.start_view_change_window_timeout.stop();
+                self.start_view_change_message_timeout.start();
                 self.view_change_status_timeout.stop();
-                self.view_change_message_timeout.stop();
+                self.do_view_change_message_timeout.stop();
+                self.request_start_view_message_timeout.stop();
                 self.repair_timeout.start();
             }

+            self.heartbeat_timestamp = 0;
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();

-            assert(self.start_view_change_quorum == false);
             assert(self.do_view_change_quorum == false);
             assert(self.nack_prepare_op == null);
         }

         /// A replica i that notices the need for a view change advances its view, sets its status
-        /// to view_change, and sends a ⟨start_view_change v, i⟩ message to all the other replicas,
+        /// to view_change, and sends a ⟨do_view_change v, i⟩ message to all the other replicas,
         /// where v identifies the new view. A replica notices the need for a view change either
         /// based on its own timer, or because it receives a start_view_change or do_view_change
         /// message for a view with a larger number than its own view.
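The updated doc comment describes sending do_view_change directly, rather than the standalone start_view_change round described before. A hedged sketch of the trigger the comment describes follows, with deliberately simplified state; this is not the actual replica transition code, and the names are illustrative:

```zig
const std = @import("std");

// Simplified replica view state, for illustration only.
const Status = enum { normal, view_change };

const ViewState = struct {
    view: u32,
    status: Status = .normal,

    /// Invoked on a local timeout, or when a start_view_change/do_view_change
    /// message arrives for a view larger than our own.
    fn notice_view_change(self: *ViewState, view_new: u32) bool {
        if (view_new <= self.view) return false; // stale or duplicate trigger
        self.view = view_new;
        self.status = .view_change;
        // The real replica would now broadcast ⟨do_view_change view, replica⟩.
        return true;
    }
};

test "view change trigger" {
    var state = ViewState{ .view = 7 };
    std.debug.assert(state.notice_view_change(8));
    std.debug.assert(state.status == .view_change);
    std.debug.assert(!state.notice_view_change(8)); // already in view 8
}
```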
@@ -5390,30 +5614,40 @@ pub fn ReplicaType(
             }

             self.ping_timeout.stop();
-            self.commit_timeout.stop();
-            self.normal_status_timeout.stop();
+            self.commit_message_timeout.stop();
+            self.normal_heartbeat_timeout.stop();
+            self.start_view_change_window_timeout.stop();
+            self.start_view_change_message_timeout.start();
             self.view_change_status_timeout.start();
-            self.view_change_message_timeout.start();
+            self.do_view_change_message_timeout.start();
             self.repair_timeout.stop();
             self.prepare_timeout.stop();
+            self.primary_abdicate_timeout.stop();
+
+            if (self.primary_index(self.view) == self.replica) {
+                self.request_start_view_message_timeout.stop();
+            } else {
+                self.request_start_view_message_timeout.start();
+            }

             // Do not reset quorum counters only on entering a view, assuming that the view will be
             // followed only by a single subsequent view change to the next view, because multiple
             // successive view changes can fail, e.g. after a view change timeout.
             // We must therefore reset our counters here to avoid counting messages from an older
             // view, which would violate the quorum intersection property essential for correctness.
+            self.heartbeat_timestamp = 0;
+            self.primary_abdicating = false;
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();

-            assert(self.start_view_change_quorum == false);
             assert(self.do_view_change_quorum == false);
             assert(self.nack_prepare_op == null);

             if (self.log_view == self.view) {
                 assert(status_before == .recovering_head);
             } else {
-                self.send_start_view_change();
+                self.send_do_view_change();
             }
         }
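The reset calls in this hunk clear the per-view quorum counters so that votes from an older view can never be counted toward a quorum in the new view, which is the quorum intersection property the comment refers to. A minimal sketch of such a counter as a replica bitset follows; the width and helper name are assumptions for illustration, not the actual vsr QuorumCounter:

```zig
const std = @import("std");

// Illustrative per-view quorum counter: one bit per replica.
const QuorumCounter = std.bit_set.IntegerBitSet(8);

fn reset(counter: *QuorumCounter) void {
    counter.* = QuorumCounter.initEmpty();
}

test "quorum counter is cleared between views" {
    var counter = QuorumCounter.initEmpty();
    counter.set(0);
    counter.set(2);
    std.debug.assert(counter.count() == 2);

    // Entering a new view: old votes must not carry over,
    // or quorum intersection across views would be violated.
    reset(&counter);
    std.debug.assert(counter.count() == 0);
}
```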
 
@@ -5590,6 +5824,7 @@ pub fn ReplicaType(
             }

             // TODO Debounce and decouple this from `on_message()` by moving into `tick()`:
+            // (Using request_start_view_message_timeout).
             log.debug("{}: view_jump: requesting start_view message", .{self.replica});
             self.send_header_to_replica(self.primary_index(header.view), .{
                 .command = .request_start_view,