tigerbeetle-node 0.11.8 → 0.11.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/dist/.client.node.sha256 +1 -1
  2. package/package.json +4 -3
  3. package/scripts/build_lib.sh +41 -0
  4. package/src/node.zig +1 -1
  5. package/src/tigerbeetle/scripts/validate_docs.sh +7 -1
  6. package/src/tigerbeetle/src/benchmark.zig +3 -3
  7. package/src/tigerbeetle/src/config.zig +31 -16
  8. package/src/tigerbeetle/src/constants.zig +48 -9
  9. package/src/tigerbeetle/src/ewah.zig +5 -5
  10. package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
  11. package/src/tigerbeetle/src/lsm/binary_search.zig +1 -1
  12. package/src/tigerbeetle/src/lsm/bloom_filter.zig +1 -1
  13. package/src/tigerbeetle/src/lsm/compaction.zig +34 -21
  14. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +84 -104
  15. package/src/tigerbeetle/src/lsm/grid.zig +19 -13
  16. package/src/tigerbeetle/src/lsm/manifest_log.zig +8 -10
  17. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +18 -13
  18. package/src/tigerbeetle/src/lsm/merge_iterator.zig +1 -1
  19. package/src/tigerbeetle/src/lsm/segmented_array.zig +17 -17
  20. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +1 -1
  21. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +1 -1
  22. package/src/tigerbeetle/src/lsm/table.zig +8 -20
  23. package/src/tigerbeetle/src/lsm/table_immutable.zig +1 -1
  24. package/src/tigerbeetle/src/lsm/table_iterator.zig +3 -3
  25. package/src/tigerbeetle/src/lsm/table_mutable.zig +14 -2
  26. package/src/tigerbeetle/src/lsm/test.zig +5 -4
  27. package/src/tigerbeetle/src/lsm/tree.zig +1 -2
  28. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +85 -115
  29. package/src/tigerbeetle/src/message_bus.zig +4 -4
  30. package/src/tigerbeetle/src/message_pool.zig +7 -10
  31. package/src/tigerbeetle/src/ring_buffer.zig +22 -12
  32. package/src/tigerbeetle/src/simulator.zig +366 -239
  33. package/src/tigerbeetle/src/state_machine/auditor.zig +5 -5
  34. package/src/tigerbeetle/src/state_machine/workload.zig +3 -3
  35. package/src/tigerbeetle/src/state_machine.zig +190 -178
  36. package/src/tigerbeetle/src/{util.zig → stdx.zig} +2 -0
  37. package/src/tigerbeetle/src/storage.zig +13 -6
  38. package/src/tigerbeetle/src/{test → testing/cluster}/message_bus.zig +3 -3
  39. package/src/tigerbeetle/src/{test → testing/cluster}/network.zig +46 -22
  40. package/src/tigerbeetle/src/testing/cluster/state_checker.zig +169 -0
  41. package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +202 -0
  42. package/src/tigerbeetle/src/testing/cluster.zig +443 -0
  43. package/src/tigerbeetle/src/{test → testing}/fuzz.zig +0 -0
  44. package/src/tigerbeetle/src/testing/hash_log.zig +66 -0
  45. package/src/tigerbeetle/src/{test → testing}/id.zig +0 -0
  46. package/src/tigerbeetle/src/testing/packet_simulator.zig +365 -0
  47. package/src/tigerbeetle/src/{test → testing}/priority_queue.zig +1 -1
  48. package/src/tigerbeetle/src/testing/reply_sequence.zig +139 -0
  49. package/src/tigerbeetle/src/{test → testing}/state_machine.zig +3 -1
  50. package/src/tigerbeetle/src/testing/storage.zig +757 -0
  51. package/src/tigerbeetle/src/{test → testing}/table.zig +21 -0
  52. package/src/tigerbeetle/src/{test → testing}/time.zig +0 -0
  53. package/src/tigerbeetle/src/tigerbeetle.zig +2 -0
  54. package/src/tigerbeetle/src/tracer.zig +3 -3
  55. package/src/tigerbeetle/src/unit_tests.zig +4 -4
  56. package/src/tigerbeetle/src/vopr.zig +2 -2
  57. package/src/tigerbeetle/src/vsr/client.zig +5 -2
  58. package/src/tigerbeetle/src/vsr/clock.zig +93 -53
  59. package/src/tigerbeetle/src/vsr/journal.zig +109 -98
  60. package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +2 -2
  61. package/src/tigerbeetle/src/vsr/replica.zig +1983 -1430
  62. package/src/tigerbeetle/src/vsr/replica_format.zig +13 -13
  63. package/src/tigerbeetle/src/vsr/superblock.zig +240 -142
  64. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -7
  65. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +1 -1
  66. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
  67. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +49 -14
  68. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +38 -19
  69. package/src/tigerbeetle/src/vsr/superblock_quorums.zig +48 -48
  70. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +51 -51
  71. package/src/tigerbeetle/src/vsr.zig +99 -33
  72. package/src/tigerbeetle/src/demo.zig +0 -132
  73. package/src/tigerbeetle/src/demo_01_create_accounts.zig +0 -35
  74. package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +0 -7
  75. package/src/tigerbeetle/src/demo_03_create_transfers.zig +0 -37
  76. package/src/tigerbeetle/src/demo_04_create_pending_transfers.zig +0 -61
  77. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +0 -37
  78. package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +0 -24
  79. package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +0 -7
  80. package/src/tigerbeetle/src/test/cluster.zig +0 -352
  81. package/src/tigerbeetle/src/test/conductor.zig +0 -366
  82. package/src/tigerbeetle/src/test/packet_simulator.zig +0 -398
  83. package/src/tigerbeetle/src/test/state_checker.zig +0 -169
  84. package/src/tigerbeetle/src/test/storage.zig +0 -864
  85. package/src/tigerbeetle/src/test/storage_checker.zig +0 -204
@@ -3,6 +3,7 @@ const Allocator = std.mem.Allocator;
3
3
  const assert = std.debug.assert;
4
4
 
5
5
  const constants = @import("../constants.zig");
6
+ const stdx = @import("../stdx.zig");
6
7
 
7
8
  const StaticAllocator = @import("../static_allocator.zig");
8
9
  const GridType = @import("../lsm/grid.zig").GridType;
@@ -24,26 +25,8 @@ const tracer = @import("../tracer.zig");
24
25
  pub const Status = enum {
25
26
  normal,
26
27
  view_change,
27
- // Recovery (for replica_count > 1):
28
- //
29
- // 1. Open the replica:
30
- // a. At replica start: `status=recovering`.
31
- // b. Recover the WAL. Mark questionable entries as faulty.
32
- // c. If the WAL has no entries (besides the initial commit), skip to step 3 with view 0.
33
- // 2. Run VSR recovery protocol:
34
- // a. Send a `recovery` message to every replica (except self).
35
- // b. Wait for f+1 `recovery_response` messages from replicas in `normal` status.
36
- // Each `recovery_response` includes the current view number.
37
- // Each `recovery_response` must include a nonce matching the `recovery` message.
38
- // c. Wait for a `recovery_response` from the primary of the highest known view.
39
- // 3. Transition to `status=normal` with the discovered view number:
40
- // * Set `op` to the highest op in the primary's recovery response.
41
- // * Repair faulty messages.
42
- // * Commit through to the discovered `commit_max`.
43
- // * Set `state_machine.prepare_timeout` to the current op's timestamp.
44
- //
45
- // TODO Document state transfer in this progression.
46
28
  recovering,
29
+ recovering_head,
47
30
  };
48
31
 
49
32
  const Nonce = u128;
@@ -59,27 +42,17 @@ const Prepare = struct {
59
42
  ok_quorum_received: bool = false,
60
43
  };
61
44
 
45
+ const Request = struct {
46
+ message: *Message, // header.command == .request
47
+ realtime: i64,
48
+ };
49
+
62
50
  const QuorumMessages = [constants.replicas_max]?*Message;
63
51
  const quorum_messages_null = [_]?*Message{null} ** constants.replicas_max;
64
52
 
65
53
  const QuorumCounter = std.StaticBitSet(constants.replicas_max);
66
54
  const quorum_counter_null = QuorumCounter.initEmpty();
67
55
 
68
- // CRITICAL: The number of prepare headers to include in the body:
69
- // We must provide enough headers to cover all uncommitted headers so that the new
70
- // primary (if we are in a view change) can decide whether to discard uncommitted headers
71
- // that cannot be repaired because they are gaps, and this must be relative to the
72
- // cluster as a whole (not relative to the difference between our op and commit number)
73
- // as otherwise we would break correctness.
74
- const view_change_headers_count = constants.pipeline_max;
75
-
76
- comptime {
77
- assert(view_change_headers_count > 0);
78
- assert(view_change_headers_count >= constants.pipeline_max);
79
- assert(view_change_headers_count <=
80
- @divFloor(constants.message_size_max - @sizeOf(Header), @sizeOf(Header)));
81
- }
82
-
83
56
  pub fn ReplicaType(
84
57
  comptime StateMachine: type,
85
58
  comptime MessageBus: type,
@@ -129,22 +102,44 @@ pub fn ReplicaType(
129
102
  /// For executing service up-calls after an operation has been committed:
130
103
  state_machine: StateMachine,
131
104
 
132
- // TODO Document.
105
+ /// Durably store VSR state, the "root" of the LSM tree, and other replica metadata.
133
106
  superblock: SuperBlock,
107
+
108
+ /// Context for SuperBlock.open() and .checkpoint().
134
109
  superblock_context: SuperBlock.Context = undefined,
110
+ /// Context for SuperBlock.view_change().
111
+ superblock_context_view_change: SuperBlock.Context = undefined,
112
+
135
113
  grid: Grid,
136
114
  opened: bool,
137
115
 
138
- /// The current view, initially 0:
116
+ /// The current view.
117
+ /// Initialized from the superblock's VSRState.
118
+ ///
119
+ /// Invariants:
120
+ /// * `replica.view = replica.log_view` when status=normal
121
+ /// * `replica.view ≥ replica.log_view`
122
+ /// * `replica.view ≥ replica.view_durable`
123
+ /// * `replica.view = 0` when replica_count=1.
139
124
  view: u32,
140
125
 
141
- /// The latest view, in which the replica's status was normal.
142
- view_normal: u32,
126
+ /// The latest view where
127
+ /// - the replica was a primary and acquired a DVC quorum, or
128
+ /// - the replica was a backup and processed a SV message.
129
+ /// i.e. the latest view in which this replica changed its head message.
130
+ ///
131
+ /// Initialized from the superblock's VSRState.
132
+ ///
133
+ /// Invariants (see `view` for others):
134
+ /// * `replica.log_view ≥ replica.log_view_durable`
135
+ /// * `replica.log_view = 0` when replica_count=1.
136
+ log_view: u32,
143
137
 
144
138
  /// The current status, either normal, view_change, or recovering:
145
139
  status: Status = .recovering,
146
140
 
147
141
  /// The op number assigned to the most recently prepared operation.
142
+ /// This op is sometimes referred to as the replica's "head" or "head op".
148
143
  ///
149
144
  /// Invariants (not applicable during status=recovering):
150
145
  /// * `replica.op` exists in the Journal.
@@ -159,10 +154,6 @@ pub fn ReplicaType(
159
154
  // Also verify that a corresponding header exists in the WAL.
160
155
  op: u64,
161
156
 
162
- /// The op of the highest checkpointed message.
163
- // TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
164
- op_checkpoint: u64,
165
-
166
157
  /// The op number of the latest committed and executed operation (according to the replica):
167
158
  /// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
168
159
  ///
@@ -190,15 +181,20 @@ pub fn ReplicaType(
190
181
  /// * checkpointing
191
182
  committing: bool = false,
192
183
 
193
- /// Whether we are reading a prepare from storage in order to push to the pipeline.
194
- repairing_pipeline: bool = false,
195
-
196
- /// The primary's pipeline of inflight prepares waiting to commit in FIFO order.
197
- /// This allows us to pipeline without the complexity of out-of-order commits.
198
- ///
199
- /// After a view change, the old primary's pipeline is left untouched so that it is able to
200
- /// help the new primary repair, even in the face of local storage faults.
201
- pipeline: RingBuffer(Prepare, constants.pipeline_max, .array) = .{},
184
+ /// Whether we are reading a prepare from storage to construct the pipeline.
185
+ pipeline_repairing: bool = false,
186
+
187
+ /// The pipeline is a queue for a replica which is the primary and in status=normal.
188
+ /// At all other times the pipeline is a cache.
189
+ pipeline: union(enum) {
190
+ /// The primary's pipeline of inflight prepares waiting to commit in FIFO order,
191
+ /// with a tail of pending requests which have not begun to prepare.
192
+ /// This allows us to pipeline without the complexity of out-of-order commits.
193
+ queue: PipelineQueue,
194
+ /// Prepares in the cache may be committed or uncommitted, and may not belong to the
195
+ /// current view.
196
+ cache: PipelineCache,
197
+ },
202
198
 
203
199
  /// In some cases, a replica may send a message to itself. We do not submit these messages
204
200
  /// to the message bus but rather queue them here for guaranteed immediate delivery, which
@@ -214,9 +210,6 @@ pub fn ReplicaType(
214
210
  /// Unique nack_prepare messages for the same view from OTHER replicas (excluding ourself).
215
211
  nack_prepare_from_other_replicas: QuorumCounter = quorum_counter_null,
216
212
 
217
- /// Unique recovery_response messages from OTHER replicas (excluding ourself).
218
- recovery_response_from_other_replicas: QuorumMessages = quorum_messages_null,
219
-
220
213
  /// Whether a replica has received a quorum of start_view_change messages for the view change:
221
214
  start_view_change_quorum: bool = false,
222
215
 
@@ -254,9 +247,6 @@ pub fn ReplicaType(
254
247
  /// The number of ticks before repairing missing/disconnected headers and/or dirty entries:
255
248
  repair_timeout: Timeout,
256
249
 
257
- /// The number of ticks before attempting to send another set of `recovery` messages.
258
- recovery_timeout: Timeout,
259
-
260
250
  /// The nonce of the `recovery` messages.
261
251
  recovery_nonce: Nonce,
262
252
 
@@ -268,6 +258,7 @@ pub fn ReplicaType(
268
258
  /// Seeded with the replica's index number.
269
259
  prng: std.rand.DefaultPrng,
270
260
 
261
+ context: ?*anyopaque = null,
271
262
  /// Simulator hooks.
272
263
  on_change_state: ?fn (replica: *const Self) void = null,
273
264
  /// Called immediately after a compaction.
@@ -355,33 +346,83 @@ pub fn ReplicaType(
355
346
  // Open the (Forest inside) StateMachine:
356
347
  self.opened = false;
357
348
  self.state_machine.open(state_machine_open_callback);
358
- while (!self.opened) {
359
- // self.grid.tick();
360
- self.superblock.storage.tick();
361
- }
349
+ while (!self.opened) self.superblock.storage.tick();
362
350
 
363
351
  self.opened = false;
364
352
  self.journal.recover(journal_recover_callback);
365
353
  while (!self.opened) self.superblock.storage.tick();
366
354
 
367
- if (self.journal.is_empty()) {
368
- // The data file is brand new — no messages have ever been written.
369
- // Transition to normal status; no need to run the VSR recovery protocol.
370
- assert(self.journal.dirty.count == 0);
371
- assert(self.journal.faulty.count == 0);
372
- assert(self.commit_min == 0);
373
- assert(self.commit_max == 0);
374
- assert(self.op_checkpoint == 0);
375
- assert(self.op == 0);
376
- assert(self.view == 0);
377
-
378
- log.debug("{}: open: empty data file", .{self.replica});
379
- self.transition_to_normal_from_recovering_status(0);
380
- assert(self.status == .normal);
381
- } else if (self.replica_count == 1) {
382
- if (self.journal.faulty.count != 0) @panic("journal is corrupt");
355
+ const vsr_headers = self.superblock.working.vsr_headers();
356
+ var op_head: u64 = vsr_headers.slice[0].op;
357
+ for (self.journal.headers) |*header| {
358
+ if (header.command == .prepare and header.op > op_head) {
359
+ assert(self.log_view >= header.view);
360
+ assert(self.log_view == self.view);
361
+
362
+ op_head = header.op;
363
+ }
364
+ }
365
+
366
+ self.op = op_head;
367
+ for (vsr_headers.slice) |*header| {
368
+ const slot = .{ .index = header.op % constants.journal_slot_count };
369
+ if (self.journal.has(header)) {
370
+ // Header is already in the WAL.
371
+ assert(!self.journal.dirty.bit(slot));
372
+ assert(!self.journal.faulty.bit(slot));
373
+ } else if (self.journal.header_for_op(header.op)) |journal_header| {
374
+ assert(!self.journal.dirty.bit(slot));
375
+ assert(!self.journal.faulty.bit(slot));
376
+
377
+ if (header.op < journal_header.op) {
378
+ // Don't overwrite a newer op.
379
+ // (This must be a SV message because a DVC would not have a newer op).
380
+ assert(self.log_view == self.view);
381
+ } else {
382
+ self.journal.set_header_as_dirty(header);
383
+ }
384
+ } else {
385
+ assert(self.journal.dirty.bit(slot) == self.journal.faulty.bit(slot));
386
+
387
+ self.journal.headers[slot.index] = header.*;
388
+ self.journal.dirty.set(slot);
389
+ // Don't touch faulty — if it is set, we don't want to unset it. The WAL slot
390
+ // may contain a corrupt version of this op, and we don't want to incorrectly
391
+ // nack it. (This is why we do not call replace_header()/set_header_as_dirty()
392
+ // here.)
393
+ }
394
+ }
395
+
396
+ const header_head = self.journal.header_with_op(self.op).?;
397
+ assert(header_head.view <= self.superblock.working.vsr_state.log_view);
398
+
399
+ if (self.replica_count == 1) {
400
+ if (self.journal.faulty.count > 0) {
401
+ @panic("journal is corrupt");
402
+ }
403
+ assert(self.op_head_certain());
404
+
405
+ if (self.commit_min < self.op) {
406
+ self.commit_journal(self.op);
407
+ } else {
408
+ self.transition_to_normal_from_recovering_status();
409
+ }
383
410
  } else {
384
- assert(self.status == .recovering);
411
+ // Even if op_head_certain() returns false, a DVC always has a certain head op.
412
+ if (self.log_view < self.view or self.op_head_certain()) {
413
+ if (self.log_view == self.view) {
414
+ if (self.primary_index(self.view) == self.replica) {
415
+ self.transition_to_view_change_status(self.view + 1);
416
+ } else {
417
+ self.transition_to_normal_from_recovering_status();
418
+ }
419
+ } else {
420
+ assert(self.view > self.log_view);
421
+ self.transition_to_view_change_status(self.view);
422
+ }
423
+ } else {
424
+ self.transition_to_recovering_head();
425
+ }
385
426
  }
386
427
  }
387
428
 
@@ -504,11 +545,11 @@ pub fn ReplicaType(
504
545
  .grid = self.grid,
505
546
  .opened = self.opened,
506
547
  .view = self.superblock.working.vsr_state.view,
507
- .view_normal = self.superblock.working.vsr_state.view_normal,
548
+ .log_view = self.superblock.working.vsr_state.log_view,
508
549
  .op = 0,
509
- .op_checkpoint = self.superblock.working.vsr_state.commit_min,
510
550
  .commit_min = self.superblock.working.vsr_state.commit_min,
511
551
  .commit_max = self.superblock.working.vsr_state.commit_max,
552
+ .pipeline = .{ .cache = .{} },
512
553
  .ping_timeout = Timeout{
513
554
  .name = "ping_timeout",
514
555
  .id = replica_index,
@@ -544,11 +585,6 @@ pub fn ReplicaType(
544
585
  .id = replica_index,
545
586
  .after = 50,
546
587
  },
547
- .recovery_timeout = Timeout{
548
- .name = "recovery_timeout",
549
- .id = replica_index,
550
- .after = 200,
551
- },
552
588
  .recovery_nonce = recovery_nonce,
553
589
  .prng = std.rand.DefaultPrng.init(replica_index),
554
590
  };
@@ -586,7 +622,11 @@ pub fn ReplicaType(
586
622
  self.grid.deinit(allocator);
587
623
  defer self.message_bus.deinit(allocator);
588
624
 
589
- while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
625
+ // TODO(Zig) 0.10: inline-switch.
626
+ switch (self.pipeline) {
627
+ .queue => |*pipeline| pipeline.deinit(self.message_bus.pool),
628
+ .cache => |*pipeline| pipeline.deinit(self.message_bus.pool),
629
+ }
590
630
 
591
631
  if (self.loopback_queue) |loopback_message| {
592
632
  assert(loopback_message.next == null);
@@ -606,10 +646,6 @@ pub fn ReplicaType(
606
646
  for (self.do_view_change_from_all_replicas) |message| {
607
647
  if (message) |m| self.message_bus.unref(m);
608
648
  }
609
-
610
- for (self.recovery_response_from_other_replicas) |message| {
611
- if (message) |m| self.message_bus.unref(m);
612
- }
613
649
  }
614
650
 
615
651
  /// The client table records for each client the latest session and the latest committed reply.
@@ -629,36 +665,8 @@ pub fn ReplicaType(
629
665
 
630
666
  // TODO Replica owns Time; should it tick() here instead of Clock?
631
667
  self.clock.tick();
632
- // self.grid.tick();
633
668
  self.message_bus.tick();
634
669
 
635
- if (self.status == .recovering) {
636
- if (self.recovery_timeout.ticking) {
637
- // Continue running the VSR recovery protocol.
638
- self.recovery_timeout.tick();
639
- if (self.recovery_timeout.fired()) self.on_recovery_timeout();
640
- } else if (self.replica_count == 1) {
641
- // A cluster-of-one does not run the VSR recovery protocol.
642
- if (self.committing) return;
643
- assert(self.journal.faulty.count == 0);
644
- assert(self.op == 0);
645
- // TODO Assert that this path isn't taken more than once.
646
- self.op = self.journal.op_maximum();
647
- assert(self.op >= self.commit_min);
648
- assert(self.op >= self.op_checkpoint);
649
- assert(self.op <= self.op_checkpoint_trigger());
650
- assert(self.journal.header_with_op(self.op) != null);
651
- self.commit_journal(self.op);
652
- // The recovering→normal transition is deferred until all ops are committed.
653
- } else {
654
- // The journal just finished recovery.
655
- // Now try to learn the current view via the VSR recovery protocol.
656
- self.recovery_timeout.start();
657
- self.recover();
658
- }
659
- return;
660
- }
661
-
662
670
  self.ping_timeout.tick();
663
671
  self.prepare_timeout.tick();
664
672
  self.commit_timeout.tick();
@@ -725,8 +733,6 @@ pub fn ReplicaType(
725
733
  .start_view_change => self.on_start_view_change(message),
726
734
  .do_view_change => self.on_do_view_change(message),
727
735
  .start_view => self.on_start_view(message),
728
- .recovery => self.on_recovery(message),
729
- .recovery_response => self.on_recovery_response(message),
730
736
  .request_start_view => self.on_request_start_view(message),
731
737
  .request_prepare => self.on_request_prepare(message),
732
738
  .request_headers => self.on_request_headers(message),
@@ -807,18 +813,22 @@ pub fn ReplicaType(
807
813
  self.clock.learn(message.header.replica, m0, t1, m2);
808
814
  }
809
815
 
810
- /// The primary advances op-number, adds the request to the end of the log, and updates the
811
- /// information for this client in the client-table to contain the new request number, s.
812
- /// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the
813
- /// current view-number, m is the message it received from the client, n is the op-number
814
- /// it assigned to the request, and k is the commit-number.
816
+ /// When there is free space in the pipeline's prepare queue:
817
+ /// The primary advances op-number, adds the request to the end of the log, and updates the
818
+ /// information for this client in the client-table to contain the new request number, s.
819
+ /// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the
820
+ /// current view-number, m is the message it received from the client, n is the op-number
821
+ /// it assigned to the request, and k is the commit-number.
822
+ /// Otherwise, when there is room in the pipeline's request queue:
823
+ /// The request is queued, and will be dequeued & prepared when the pipeline head commits.
824
+ /// Otherwise, drop the request.
815
825
  fn on_request(self: *Self, message: *Message) void {
816
826
  if (self.ignore_request_message(message)) return;
817
827
 
818
828
  assert(self.status == .normal);
819
829
  assert(self.primary());
820
830
  assert(self.commit_min == self.commit_max);
821
- assert(self.commit_max + self.pipeline.count == self.op);
831
+ assert(self.commit_max + self.pipeline.queue.prepare_queue.count == self.op);
822
832
 
823
833
  assert(message.header.command == .request);
824
834
  assert(message.header.view <= self.view); // The client's view may be behind ours.
@@ -828,59 +838,16 @@ pub fn ReplicaType(
828
838
  return;
829
839
  };
830
840
 
831
- log.debug("{}: on_request: request {}", .{ self.replica, message.header.checksum });
832
-
833
- // Guard against the wall clock going backwards by taking the max with timestamps issued:
834
- self.state_machine.prepare_timestamp = std.math.max(
835
- // The cluster `commit_timestamp` may be ahead of our `prepare_timestamp` because this
836
- // may be our first prepare as a recently elected primary:
837
- std.math.max(
838
- self.state_machine.prepare_timestamp,
839
- self.state_machine.commit_timestamp,
840
- ) + 1,
841
- @intCast(u64, realtime),
842
- );
843
- assert(self.state_machine.prepare_timestamp > self.state_machine.commit_timestamp);
844
-
845
- const prepare_timestamp = self.state_machine.prepare(
846
- message.header.operation.cast(StateMachine),
847
- message.body(),
848
- );
849
-
850
- const latest_entry = self.journal.header_with_op(self.op).?;
851
- message.header.parent = latest_entry.checksum;
852
- message.header.context = message.header.checksum;
853
- message.header.view = self.view;
854
- message.header.op = self.op + 1;
855
- message.header.commit = self.commit_max;
856
- message.header.timestamp = prepare_timestamp;
857
- message.header.replica = self.replica;
858
- message.header.command = .prepare;
859
-
860
- message.header.set_checksum_body(message.body());
861
- message.header.set_checksum();
862
-
863
- log.debug("{}: on_request: prepare {}", .{ self.replica, message.header.checksum });
864
-
865
- self.pipeline.push_assume_capacity(.{ .message = message.ref() });
866
- assert(self.pipeline.count >= 1);
841
+ const request = .{
842
+ .message = message.ref(),
843
+ .realtime = realtime,
844
+ };
867
845
 
868
- if (self.pipeline.count == 1) {
869
- // This is the only prepare in the pipeline, start the timeout:
870
- assert(!self.prepare_timeout.ticking);
871
- self.prepare_timeout.start();
846
+ if (self.pipeline.queue.prepare_queue.full()) {
847
+ self.pipeline.queue.push_request(request);
872
848
  } else {
873
- // Do not restart the prepare timeout as it is already ticking for another prepare.
874
- assert(self.prepare_timeout.ticking);
875
- const previous = self.pipeline.get_ptr(self.pipeline.count - 2).?;
876
- assert(previous.message.header.checksum == message.header.parent);
849
+ self.primary_pipeline_prepare(request);
877
850
  }
878
-
879
- self.on_prepare(message);
880
-
881
- // We expect `on_prepare()` to increment `self.op` to match the primary's latest prepare:
882
- // This is critical to ensure that pipelined prepares do not receive the same op number.
883
- assert(self.op == message.header.op);
884
851
  }
885
852
 
886
853
  /// Replication is simple, with a single code path for the primary and backups.
@@ -937,7 +904,7 @@ pub fn ReplicaType(
937
904
  log.debug("{}: on_prepare: ignoring op={} (too far ahead, checkpoint={})", .{
938
905
  self.replica,
939
906
  message.header.op,
940
- self.op_checkpoint,
907
+ self.op_checkpoint(),
941
908
  });
942
909
  // When we are the primary, `on_request` enforces this invariant.
943
910
  assert(self.backup());
@@ -948,7 +915,7 @@ pub fn ReplicaType(
948
915
  assert(message.header.view == self.view);
949
916
  assert(self.primary() or self.backup());
950
917
  assert(message.header.replica == self.primary_index(message.header.view));
951
- assert(message.header.op > self.op_checkpoint);
918
+ assert(message.header.op > self.op_checkpoint());
952
919
  assert(message.header.op > self.op);
953
920
  assert(message.header.op > self.commit_min);
954
921
  assert(message.header.op <= self.op_checkpoint_trigger());
@@ -998,11 +965,20 @@ pub fn ReplicaType(
998
965
  assert(message.header.view == self.view);
999
966
  assert(self.primary());
1000
967
 
1001
- const prepare = self.pipeline_prepare_for_prepare_ok(message) orelse return;
968
+ const prepare = self.pipeline.queue.prepare_by_prepare_ok(message) orelse {
969
+ // This can be normal, for example, if an old prepare_ok is replayed.
970
+ log.debug("{}: on_prepare_ok: not preparing ok={} checksum={}", .{
971
+ self.replica,
972
+ message.header.op,
973
+ message.header.context,
974
+ });
975
+ return;
976
+ };
1002
977
 
1003
978
  assert(prepare.message.header.checksum == message.header.context);
1004
979
  assert(prepare.message.header.op >= self.commit_max + 1);
1005
- assert(prepare.message.header.op <= self.commit_max + self.pipeline.count);
980
+ assert(prepare.message.header.op <= self.commit_max +
981
+ self.pipeline.queue.prepare_queue.count);
1006
982
  assert(prepare.message.header.op <= self.op);
1007
983
 
1008
984
  // Wait until we have `f + 1` prepare_ok messages (including ourself) for quorum:
@@ -1010,7 +986,7 @@ pub fn ReplicaType(
1010
986
  // TODO: When Block recover & state transfer are implemented, this can be removed.
1011
987
  const threshold =
1012
988
  if (prepare.message.header.op == self.op_checkpoint_trigger() or
1013
- prepare.message.header.op == self.op_checkpoint + constants.lsm_batch_multiple + 1)
989
+ prepare.message.header.op == self.op_checkpoint() + constants.lsm_batch_multiple + 1)
1014
990
  self.replica_count
1015
991
  else
1016
992
  self.quorum_replication;
@@ -1199,28 +1175,6 @@ pub fn ReplicaType(
1199
1175
  /// informs the other replicas of the completion of the view change by sending
1200
1176
  /// ⟨start_view v, l, n, k⟩ messages to the other replicas, where l is the new log, n is the
1201
1177
  /// op number, and k is the commit number.
1202
- ///
1203
- /// For each DVC in the quorum:
1204
- ///
1205
- /// * The headers must all belong to the same hash chain. (Gaps are allowed).
1206
- /// (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
1207
- /// loaded into the new primary with `replace_header()`, not `repair_header()`).
1208
- ///
1209
- /// Across all DVCs in the quorum:
1210
- ///
1211
- /// * The headers of every DVC with the same view_normal must agree. In other words:
1212
- /// dvc₁.headers[i].op == dvc₂.headers[j].op implies
1213
- /// dvc₁.headers[i].checksum == dvc₂.headers[j].checksum.
1214
- /// (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
1215
- /// loaded into the new primary with `replace_header()`, not `repair_header()`).
1216
- ///
1217
- /// Perhaps unintuitively, it is safe to advertise a header before its message is prepared
1218
- /// (e.g. the write is still queued). The header is either:
1219
- ///
1220
- /// * committed — so another replica in the quorum must have a copy, according to the quorum
1221
- /// intersection property. Or,
1222
- /// * uncommitted — if the header is chosen, but cannot be recovered from any replica, then
1223
- /// it will be discarded by the nack protocol.
1224
1178
  fn on_do_view_change(self: *Self, message: *Message) void {
1225
1179
  if (self.ignore_view_change_message(message)) return;
1226
1180
 
@@ -1255,6 +1209,7 @@ pub fn ReplicaType(
1255
1209
 
1256
1210
  assert(count == threshold);
1257
1211
  assert(self.do_view_change_from_all_replicas[self.replica] != null);
1212
+ DVCQuorum.verify(self.do_view_change_from_all_replicas);
1258
1213
  log.debug("{}: on_do_view_change: view={} quorum received", .{
1259
1214
  self.replica,
1260
1215
  self.view,
@@ -1265,6 +1220,13 @@ pub fn ReplicaType(
1265
1220
  self.do_view_change_quorum = true;
1266
1221
 
1267
1222
  self.primary_set_log_from_do_view_change_messages();
1223
+ // We aren't status=normal yet, but our headers from our prior log_view may have been
1224
+ // replaced. If we participate in another DVC (before reaching status=normal, which
1225
+ // would update our log_view), we must disambiguate our (new) headers from the
1226
+ // headers of any other replica with the same log_view so that the next primary can
1227
+ // identify an unambiguous set of canonical headers.
1228
+ self.log_view = self.view;
1229
+
1268
1230
  assert(self.op >= self.commit_max);
1269
1231
  assert(self.state_machine.prepare_timestamp >=
1270
1232
  self.journal.header_with_op(self.op).?.timestamp);
@@ -1295,7 +1257,9 @@ pub fn ReplicaType(
1295
1257
  unreachable;
1296
1258
  }
1297
1259
 
1298
- assert(self.status == .view_change or self.status == .normal);
1260
+ assert(self.status == .view_change or
1261
+ self.status == .normal or
1262
+ self.status == .recovering_head);
1299
1263
  assert(message.header.view >= self.view);
1300
1264
  assert(message.header.replica != self.replica);
1301
1265
  assert(message.header.replica == self.primary_index(message.header.view));
@@ -1307,13 +1271,23 @@ pub fn ReplicaType(
1307
1271
  assert(message.header.op == op_highest(message_body_as_headers(message)));
1308
1272
 
1309
1273
  self.set_op_and_commit_max(message.header.op, message.header.commit, "on_start_view");
1310
- self.replace_headers(message_body_as_headers(message));
1274
+ for (message_body_as_headers_chain_consecutive(message)) |*header| {
1275
+ self.replace_header(header);
1276
+ }
1311
1277
 
1312
1278
  assert(self.op == message.header.op);
1313
1279
 
1314
- if (self.status == .view_change) {
1315
- self.transition_to_normal_from_view_change_status(message.header.view);
1316
- self.send_prepare_oks_after_view_change();
1280
+ switch (self.status) {
1281
+ .normal => {},
1282
+ .view_change => {
1283
+ self.transition_to_normal_from_view_change_status(message.header.view);
1284
+ self.send_prepare_oks_after_view_change();
1285
+ },
1286
+ .recovering_head => {
1287
+ self.transition_to_normal_from_recovering_status();
1288
+ self.send_prepare_oks_after_view_change();
1289
+ },
1290
+ .recovering => unreachable,
1317
1291
  }
1318
1292
 
1319
1293
  assert(self.status == .normal);
@@ -1329,6 +1303,7 @@ pub fn ReplicaType(
1329
1303
  if (self.ignore_repair_message(message)) return;
1330
1304
 
1331
1305
  assert(self.status == .normal);
1306
+ assert(self.view == self.log_view);
1332
1307
  assert(message.header.view == self.view);
1333
1308
  assert(message.header.replica != self.replica);
1334
1309
  assert(self.primary());
@@ -1345,391 +1320,90 @@ pub fn ReplicaType(
1345
1320
  self.send_message_to_replica(message.header.replica, start_view);
1346
1321
  }
1347
1322
 
1348
- fn on_recovery(self: *Self, message: *const Message) void {
1349
- assert(self.replica_count > 1);
1350
-
1351
- if (self.status != .normal) {
1352
- log.debug("{}: on_recovery: ignoring ({})", .{ self.replica, self.status });
1353
- return;
1354
- }
1355
-
1356
- if (message.header.replica == self.replica) {
1357
- log.warn("{}: on_recovery: ignoring (self)", .{self.replica});
1358
- return;
1359
- }
1360
-
1361
- const response = self.message_bus.get_message();
1362
- defer self.message_bus.unref(response);
1323
+ /// If the requested prepare has been guaranteed by this replica:
1324
+ /// * Read the prepare from storage, and forward it to the replica that requested it.
1325
+ /// * Otherwise send no reply — it isn't safe to nack.
1326
+ /// If the requested prepare has *not* been guaranteed by this replica, then send a nack.
1327
+ ///
1328
+ /// A prepare is considered "guaranteed" by a replica if that replica has acknowledged it
1329
+ /// to the cluster. The cluster sees the replica as an underwriter of a guaranteed
1330
+ /// prepare. If a guaranteed prepare is found to by faulty, the replica must repair it
1331
+ /// to restore durability.
1332
+ fn on_request_prepare(self: *Self, message: *const Message) void {
1333
+ if (self.ignore_repair_message(message)) return;
1363
1334
 
1364
- log.debug("{}: on_recovery: view={} op={} commit={} nonce={}", .{
1365
- self.replica,
1366
- self.view,
1367
- self.op,
1368
- self.commit_max,
1369
- message.header.context,
1370
- });
1335
+ assert(self.replica_count > 1);
1336
+ assert(self.status == .normal or self.status == .view_change);
1337
+ assert(message.header.view == self.view);
1338
+ assert(message.header.replica != self.replica);
1371
1339
 
1372
- response.header.* = .{
1373
- .command = .recovery_response,
1374
- .cluster = self.cluster,
1375
- .context = message.header.context, // Echo the request's nonce.
1376
- .replica = self.replica,
1377
- .view = self.view,
1378
- .op = self.op,
1379
- .commit = self.commit_max,
1340
+ const op = message.header.op;
1341
+ const slot = self.journal.slot_for_op(op);
1342
+ const checksum: ?u128 = switch (message.header.timestamp) {
1343
+ 0 => null,
1344
+ 1 => message.header.context,
1345
+ else => unreachable,
1380
1346
  };
1381
1347
 
1382
- // A recovery response attaches at least as many headers as a DVC message attaches.
1383
- // To understand why, consider this scenario, where:
1384
- //
1385
- // replica_count 3
1386
- // do_view_change.headers.len 3 (= pipeline_max)
1387
- // recovery_response.headers.len 2 (!)
1388
- // replica 0 log 3, 4a, 5a, 6a, 7a, 8a (status=normal, primary)
1389
- // replica 1 log 3, 4a, 5a, --, --, -- (status=normal, backup)
1390
- // replica 2 log 3, 4b, 5b, --, --, -- (status=recovering)
1391
- //
1392
- // 1. Replica 2 receives a recovery_response quorum.
1393
- // 2. Replica 2 sets `replica.op` to 8a.
1394
- // 3. Replica 2 sets its headers from the primary's recovery_response (8a, 7a)
1395
- // (via `replace_header()`).
1396
- // 4. Replica 2 transitions to status=normal.
1397
- // 5. Replica 0 fails (before replica 2 has a chance to repair its hash chain.)
1398
- // 6. Replica 1 initiates a view change.
1399
- // 7. Replica 1 collects a DVC quorum:
1400
- // replica 1: 3, 4a, 5a (view_normal=latest)
1401
- // replica 2: 5b, 7a, 8a (view_normal=latest)
1402
- // Replicas 1 and 2 share the highest view_normal, so both sets of headers are canonical.
1403
- // 8. Replica 1 loads the canonical headers (via `replace_header()`) from both DVCs.
1404
- // Messages 8a and 7a will be dropped via `do_view_change_op_max()` (due to the
1405
- // gap at op 6). But there is a conflict at op=5. For correctness, replica 1 must
1406
- // pick 5a — 5a may be committed by replica 0.
1407
- // Without replica 0's assistance, replica 1 has no way to pick between 5a/5b.
1408
- //
1409
- // Including at least as many headers in the recovery response as the DVC maintains the
1410
- // invariant: DVCs with the same view_normal must never disagree on the identity of a
1411
- // message.
1412
- //
1413
- // (DVCs can still safely include gaps — but they must be of the form [4a,__,6a],
1414
- // not [4a,__,6b]).
1415
- const count = self.copy_latest_headers_and_set_size(
1416
- 0,
1417
- self.op,
1418
- view_change_headers_count,
1419
- response,
1420
- );
1421
- assert(count > 0); // We expect that self.op always exists.
1422
- assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
1423
-
1424
- response.header.set_checksum_body(response.body());
1425
- response.header.set_checksum();
1426
-
1427
- assert(self.status == .normal);
1428
- // The checksum for a recovery message is deterministic, and cannot be used as a nonce:
1429
- assert(response.header.context != message.header.checksum);
1430
-
1431
- self.send_message_to_replica(message.header.replica, response);
1432
- }
1433
-
1434
- fn on_recovery_response(self: *Self, message: *Message) void {
1435
- assert(self.replica_count > 1);
1348
+ // Only the primary may respond to `request_prepare` messages without a checksum.
1349
+ assert(checksum != null or self.primary_index(self.view) == self.replica);
1436
1350
 
1437
- if (self.status != .recovering) {
1438
- log.debug("{}: on_recovery_response: ignoring ({})", .{
1351
+ // Try to serve the message directly from the pipeline.
1352
+ // This saves us from going to disk. And we don't need to worry that the WAL's copy
1353
+ // of an uncommitted prepare is lost/corrupted.
1354
+ if (self.pipeline_prepare_by_op_and_checksum(op, checksum)) |prepare| {
1355
+ log.debug("{}: on_request_prepare: op={} checksum={} reply from pipeline", .{
1439
1356
  self.replica,
1440
- self.status,
1357
+ op,
1358
+ checksum,
1441
1359
  });
1360
+ self.send_message_to_replica(message.header.replica, prepare);
1442
1361
  return;
1443
1362
  }
1444
1363
 
1445
- if (message.header.replica == self.replica) {
1446
- log.warn("{}: on_recovery_response: ignoring (self)", .{self.replica});
1447
- return;
1448
- }
1449
-
1450
- if (message.header.context != self.recovery_nonce) {
1451
- log.warn("{}: on_recovery_response: ignoring (different nonce)", .{self.replica});
1452
- return;
1453
- }
1454
-
1455
- var responses: *QuorumMessages = &self.recovery_response_from_other_replicas;
1456
- if (responses[message.header.replica]) |existing| {
1457
- assert(message.header.replica == existing.header.replica);
1458
-
1459
- if (message.header.checksum == existing.header.checksum) {
1460
- // The response was replayed by the network; ignore it.
1461
- log.debug("{}: on_recovery_response: ignoring (duplicate message)", .{
1364
+ if (self.journal.prepare_inhabited[slot.index]) {
1365
+ const prepare_checksum = self.journal.prepare_checksums[slot.index];
1366
+ // Consult `journal.prepare_checksums` (rather than `journal.headers`):
1367
+ // the former may have the prepare we want — even if journal recovery marked the
1368
+ // slot as faulty and left the in-memory header as reserved.
1369
+ if (checksum == null or checksum.? == prepare_checksum) {
1370
+ log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
1462
1371
  self.replica,
1372
+ op,
1373
+ checksum,
1463
1374
  });
1464
- return;
1465
- }
1466
1375
 
1467
- // We received a second (distinct) response from a replica. Possible causes:
1468
- // * We retried the `recovery` message, because we had not yet received a quorum.
1469
- // * The `recovery` message was duplicated/misdirected by the network, and the
1470
- // receiver's state changed in the mean time.
1471
-
1472
- log.debug(
1473
- "{}: on_recovery_response: replacing response replica={} view={}..{} op={}..{} commit={}..{}",
1474
- .{
1475
- self.replica,
1476
- existing.header.replica,
1477
- existing.header.view,
1478
- message.header.view,
1479
- existing.header.op,
1480
- message.header.op,
1481
- existing.header.commit,
1482
- message.header.commit,
1483
- },
1484
- );
1376
+ // Improve availability by calling `read_prepare_with_op_and_checksum` instead
1377
+ // of `read_prepare` even if `journal.headers` contains the target message.
1378
+ // The latter skips the read when the target prepare is present but dirty (e.g.
1379
+ // it was recovered with decision=fix).
1380
+ // TODO Do not reissue the read if we are already reading in order to send to
1381
+ // this particular destination replica.
1382
+ self.journal.read_prepare_with_op_and_checksum(
1383
+ on_request_prepare_read,
1384
+ op,
1385
+ prepare_checksum,
1386
+ message.header.replica,
1387
+ );
1485
1388
 
1486
- if (message.header.view < existing.header.view or
1487
- (message.header.view == existing.header.view and
1488
- message.header.op < existing.header.op) or
1489
- (message.header.view == existing.header.view and
1490
- message.header.op == existing.header.op and
1491
- message.header.commit < existing.header.commit))
1492
- {
1493
- // The second message is older than the first one (reordered packets).
1494
- log.debug("{}: on_recovery_response: ignoring (older)", .{self.replica});
1389
+ // We have guaranteed the prepare (not safe to nack).
1390
+ // Our copy may or may not be valid, but we will try to read & forward it.
1495
1391
  return;
1496
1392
  }
1393
+ }
1497
1394
 
1498
- // The second message is newer than the first one.
1499
- assert(message.header.view >= existing.header.view);
1500
- // The op number may regress if an uncommitted op was discarded in a higher view.
1501
- assert(message.header.op >= existing.header.op or
1502
- message.header.view > existing.header.view);
1503
- assert(message.header.commit >= existing.header.commit);
1504
-
1505
- self.message_bus.unref(existing);
1506
- responses[message.header.replica] = null;
1507
- } else {
1508
- log.debug(
1509
- "{}: on_recovery_response: replica={} view={} op={} commit={}",
1510
- .{
1511
- self.replica,
1512
- message.header.replica,
1513
- message.header.view,
1514
- message.header.op,
1515
- message.header.commit,
1516
- },
1517
- );
1518
- }
1519
-
1520
- assert(responses[message.header.replica] == null);
1521
- responses[message.header.replica] = message.ref();
1522
-
1523
- // Wait until we have:
1524
- // * at least `f + 1` messages for quorum (not including ourself), and
1525
- // * a response from the primary of the highest discovered view.
1526
- const count = self.count_quorum(responses, .recovery_response, self.recovery_nonce);
1527
- assert(count <= self.replica_count - 1);
1528
-
1529
- const threshold = self.quorum_view_change;
1530
- if (count < threshold) {
1531
- log.debug("{}: on_recovery_response: waiting for quorum ({}/{})", .{
1532
- self.replica,
1533
- count,
1534
- threshold,
1535
- });
1536
- return;
1537
- }
1538
-
1539
- const view = blk: { // The latest known view.
1540
- var view: u32 = 0;
1541
- for (self.recovery_response_from_other_replicas) |received, replica| {
1542
- if (received) |response| {
1543
- assert(replica != self.replica);
1544
- assert(response.header.replica == replica);
1545
- assert(response.header.context == self.recovery_nonce);
1546
-
1547
- view = std.math.max(view, response.header.view);
1548
- }
1549
- }
1550
- break :blk view;
1551
- };
1552
-
1553
- const primary_response = responses[self.primary_index(view)];
1554
- if (primary_response == null) {
1555
- log.debug(
1556
- "{}: on_recovery_response: ignoring (awaiting response from primary of view={})",
1557
- .{
1558
- self.replica,
1559
- view,
1560
- },
1561
- );
1562
- return;
1563
- }
1564
-
1565
- if (primary_response.?.header.view != view) {
1566
- // The primary (according to the view quorum) isn't the primary (according to itself).
1567
- // The `recovery_timeout` will retry shortly with another round.
1568
- log.debug(
1569
- "{}: on_recovery_response: ignoring (primary view={} != quorum view={})",
1570
- .{
1571
- self.replica,
1572
- primary_response.?.header.view,
1573
- view,
1574
- },
1575
- );
1576
- return;
1577
- }
1578
-
1579
- // This recovering→normal status transition occurs exactly once.
1580
- // All further `recovery_response` messages are ignored.
1581
-
1582
- // TODO When the view is recovered from the superblock (instead of via the VSR recovery
1583
- // protocol), if the view number indicates that this replica is a primary, it must
1584
- // transition to status=view_change instead of status=normal.
1585
-
1586
- const primary_headers = message_body_as_headers(primary_response.?);
1587
- assert(primary_headers.len > 0);
1588
-
1589
- const commit = primary_response.?.header.commit;
1590
- {
1591
- const op = op_highest(primary_headers);
1592
- assert(op == primary_response.?.header.op);
1593
-
1594
- self.set_op_and_commit_max(op, commit, "on_recovery_response");
1595
-
1596
- // TODO If the view's primary is >1 WAL ahead of us, these headers could cause
1597
- // problems. We don't want to jump this far ahead to repair, but we still need to
1598
- // use the hash chain to figure out which headers to request. Maybe include our
1599
- // `op_checkpoint` in the recovery (request) message so that the response can give
1600
- // more useful (i.e. older) headers.
1601
- self.replace_headers(primary_headers);
1602
-
1603
- if (self.op < constants.journal_slot_count) {
1604
- if (self.journal.header_with_op(0)) |header| {
1605
- assert(header.command == .prepare);
1606
- assert(header.operation == .root);
1607
- } else {
1608
- // This is the first wrap of the log, and the root prepare is corrupt.
1609
- // Repair the root repair. This is necessary to maintain the invariant that
1610
- // the op=commit_min exists in-memory.
1611
- //
1612
- // op=0 wouldn't have been repaired by replace_headers above, because it is
1613
- // already "checkpointed".
1614
- const header = Header.root_prepare(self.cluster);
1615
- self.journal.set_header_as_dirty(&header);
1616
- log.debug("{}: on_recovery_response: repair root op", .{self.replica});
1617
- }
1618
- }
1619
-
1620
- assert(self.op == op);
1621
- assert(self.journal.header_with_op(self.op) != null);
1622
- }
1623
-
1624
- assert(self.status == .recovering);
1625
- self.transition_to_normal_from_recovering_status(view);
1626
- assert(self.status == .normal);
1627
- assert(self.backup());
1628
-
1629
- log.info("{}: on_recovery_response: recovery done responses={} view={} headers={}..{}" ++
1630
- " commit={} dirty={} faulty={}", .{
1631
- self.replica,
1632
- count,
1633
- view,
1634
- primary_headers[primary_headers.len - 1].op,
1635
- primary_headers[0].op,
1636
- commit,
1637
- self.journal.dirty.count,
1638
- self.journal.faulty.count,
1639
- });
1640
-
1641
- self.state_machine.prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
1642
- // `state_machine.commit_timestamp` is updated as messages are committed.
1643
-
1644
- self.reset_quorum_recovery_response();
1645
- self.commit_journal(commit);
1646
- self.repair();
1647
- }
1648
-
1649
- /// If the requested prepare has been guaranteed by this replica:
1650
- /// * Read the prepare from storage, and forward it to the replica that requested it.
1651
- /// * Otherwise send no reply — it isn't safe to nack.
1652
- /// If the requested prepare has *not* been guaranteed by this replica, then send a nack.
1653
- ///
1654
- /// A prepare is considered "guaranteed" by a replica if that replica has acknowledged it
1655
- /// to the cluster. The cluster sees the replica as an underwriter of a guaranteed
1656
- /// prepare. If a guaranteed prepare is found to by faulty, the replica must repair it
1657
- /// to restore durability.
1658
- fn on_request_prepare(self: *Self, message: *const Message) void {
1659
- if (self.ignore_repair_message(message)) return;
1660
-
1661
- assert(self.replica_count > 1);
1662
- assert(self.status == .normal or self.status == .view_change);
1663
- assert(message.header.view == self.view);
1664
- assert(message.header.replica != self.replica);
1665
-
1666
- const op = message.header.op;
1667
- const slot = self.journal.slot_for_op(op);
1668
- const checksum: ?u128 = switch (message.header.timestamp) {
1669
- 0 => null,
1670
- 1 => message.header.context,
1671
- else => unreachable,
1672
- };
1673
-
1674
- // Only the primary may respond to `request_prepare` messages without a checksum.
1675
- assert(checksum != null or self.primary_index(self.view) == self.replica);
1676
-
1677
- // Try to serve the message directly from the pipeline.
1678
- // This saves us from going to disk. And we don't need to worry that the WAL's copy
1679
- // of an uncommitted prepare is lost/corrupted.
1680
- if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
1681
- log.debug("{}: on_request_prepare: op={} checksum={} reply from pipeline", .{
1682
- self.replica,
1683
- op,
1684
- checksum,
1685
- });
1686
- self.send_message_to_replica(message.header.replica, prepare.message);
1687
- return;
1688
- }
1689
-
1690
- if (self.journal.prepare_inhabited[slot.index]) {
1691
- const prepare_checksum = self.journal.prepare_checksums[slot.index];
1692
- // Consult `journal.prepare_checksums` (rather than `journal.headers`):
1693
- // the former may have the prepare we want — even if journal recovery marked the
1694
- // slot as faulty and left the in-memory header as reserved.
1695
- if (checksum == null or checksum.? == prepare_checksum) {
1696
- log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
1697
- self.replica,
1698
- op,
1699
- checksum,
1700
- });
1701
-
1702
- // Improve availability by calling `read_prepare_with_op_and_checksum` instead
1703
- // of `read_prepare` — even if `journal.headers` contains the target message.
1704
- // The latter skips the read when the target prepare is present but dirty (e.g.
1705
- // it was recovered with decision=fix).
1706
- // TODO Do not reissue the read if we are already reading in order to send to
1707
- // this particular destination replica.
1708
- self.journal.read_prepare_with_op_and_checksum(
1709
- on_request_prepare_read,
1710
- op,
1711
- prepare_checksum,
1712
- message.header.replica,
1713
- );
1714
-
1715
- // We have guaranteed the prepare (not safe to nack).
1716
- // Our copy may or may not be valid, but we will try to read & forward it.
1717
- return;
1718
- }
1719
- }
1720
-
1721
- {
1722
- // We may have guaranteed the prepare but our copy is faulty (not safe to nack).
1723
- if (self.journal.faulty.bit(slot)) return;
1724
- if (self.journal.header_with_op_and_checksum(message.header.op, checksum)) |_| {
1725
- if (self.journal.dirty.bit(slot)) {
1726
- // We know of the prepare but have yet to write it (safe to nack).
1727
- // Continue through below...
1728
- } else {
1729
- // We have guaranteed the prepare and our copy is clean (not safe to nack).
1730
- return;
1731
- }
1732
- }
1395
+ {
1396
+ // We may have guaranteed the prepare but our copy is faulty (not safe to nack).
1397
+ if (self.journal.faulty.bit(slot)) return;
1398
+ if (self.journal.header_with_op_and_checksum(message.header.op, checksum)) |_| {
1399
+ if (self.journal.dirty.bit(slot)) {
1400
+ // We know of the prepare but have yet to write it (safe to nack).
1401
+ // Continue through below...
1402
+ } else {
1403
+ // We have guaranteed the prepare and our copy is clean (not safe to nack).
1404
+ return;
1405
+ }
1406
+ }
1733
1407
  }
1734
1408
 
1735
1409
  // Protocol-Aware Recovery's CTRL protocol only runs during the view change, when the
@@ -1970,8 +1644,9 @@ pub fn ReplicaType(
1970
1644
  assert(self.status == .normal);
1971
1645
  assert(self.primary());
1972
1646
 
1973
- const prepare = self.pipeline.head_ptr().?;
1647
+ const prepare = self.pipeline.queue.prepare_queue.head_ptr().?;
1974
1648
  assert(prepare.message.header.command == .prepare);
1649
+ assert(prepare.message.header.op == self.commit_min + 1);
1975
1650
 
1976
1651
  if (prepare.ok_quorum_received) {
1977
1652
  self.prepare_timeout.reset();
@@ -2017,10 +1692,10 @@ pub fn ReplicaType(
2017
1692
  // We may be slow and waiting for the write to complete.
2018
1693
  //
2019
1694
  // We may even have maxed out our IO depth and been unable to initiate the write,
2020
- // which can happen if `constants.pipeline_max` exceeds `constants.journal_iops_write_max`.
2021
- // This can lead to deadlock for a cluster of one or two (if we do not retry here),
2022
- // since there is no other way for the primary to repair the dirty op because no
2023
- // other replica has it.
1695
+ // which can happen if `constants.pipeline_prepare_queue_max` exceeds
1696
+ // `constants.journal_iops_write_max`. This can lead to deadlock for a cluster of
1697
+ // one or two (if we do not retry here), since there is no other way for the primary
1698
+ // to repair the dirty op because no other replica has it.
2024
1699
  //
2025
1700
  // Retry the write through `on_repair()` which will work out which is which.
2026
1701
  // We do expect that the op would have been run through `on_prepare()` already.
@@ -2107,13 +1782,6 @@ pub fn ReplicaType(
2107
1782
  self.repair();
2108
1783
  }
2109
1784
 
2110
- fn on_recovery_timeout(self: *Self) void {
2111
- assert(self.status == .recovering);
2112
- assert(self.replica_count > 1);
2113
- self.recovery_timeout.reset();
2114
- self.recover();
2115
- }
2116
-
2117
1785
  fn reference_message_and_receive_quorum_exactly_once(
2118
1786
  self: *Self,
2119
1787
  messages: *QuorumMessages,
@@ -2301,7 +1969,7 @@ pub fn ReplicaType(
2301
1969
  assert(message.header.view == self.view);
2302
1970
  assert(message.header.op == self.op);
2303
1971
 
2304
- if (self.replica_count == 1 and self.pipeline.count > 1) {
1972
+ if (self.replica_count == 1 and self.pipeline.queue.prepare_queue.count > 1) {
2305
1973
  // In a cluster-of-one, the prepares must always be written to the WAL sequentially
2306
1974
  // (never concurrently). This ensures that there will be no gaps in the WAL during
2307
1975
  // crash recovery.
@@ -2364,10 +2032,9 @@ pub fn ReplicaType(
2364
2032
  /// A function which calls `commit_journal()` to set `commit_max` must first call
2365
2033
  /// `view_jump()`. Otherwise, we may fork the log.
2366
2034
  fn commit_journal(self: *Self, commit: u64) void {
2367
- // TODO Restrict `view_change` status only to the primary purely as defense-in-depth.
2368
- // Be careful of concurrency when doing this, as successive view changes can happen quickly.
2369
2035
  assert(self.status == .normal or self.status == .view_change or
2370
2036
  (self.status == .recovering and self.replica_count == 1));
2037
+ assert(!(self.status == .normal and self.primary()));
2371
2038
  assert(self.commit_min <= self.commit_max);
2372
2039
  assert(self.commit_min <= self.op);
2373
2040
  assert(self.commit_max <= self.op or self.commit_max > self.op);
@@ -2392,6 +2059,7 @@ pub fn ReplicaType(
2392
2059
  log.debug("{}: commit_journal: already committing...", .{self.replica});
2393
2060
  return;
2394
2061
  }
2062
+ assert(!(self.status == .normal and self.primary()));
2395
2063
 
2396
2064
  // We check the hash chain before we read each op, rather than once upfront, because
2397
2065
  // it's possible for `commit_max` to change while we read asynchronously, after we
@@ -2413,6 +2081,8 @@ pub fn ReplicaType(
2413
2081
  assert(self.committing);
2414
2082
  assert(self.status == .normal or self.status == .view_change or
2415
2083
  (self.status == .recovering and self.replica_count == 1));
2084
+ assert(!(self.status == .normal and self.primary()));
2085
+ assert(self.pipeline == .cache);
2416
2086
  assert(self.commit_min <= self.commit_max);
2417
2087
  assert(self.commit_min <= self.op);
2418
2088
 
@@ -2427,8 +2097,23 @@ pub fn ReplicaType(
2427
2097
  // Even a naive state transfer may fail to correct for this.
2428
2098
  if (self.commit_min < self.commit_max and self.commit_min < self.op) {
2429
2099
  const op = self.commit_min + 1;
2430
- const checksum = self.journal.header_with_op(op).?.checksum;
2431
- self.journal.read_prepare(commit_journal_next_callback, op, checksum, null);
2100
+ const header = self.journal.header_with_op(op).?;
2101
+
2102
+ if (self.pipeline.cache.prepare_by_op_and_checksum(op, header.checksum)) |prepare| {
2103
+ log.debug("{}: commit_journal_next: cached prepare op={} checksum={}", .{
2104
+ self.replica,
2105
+ op,
2106
+ header.checksum,
2107
+ });
2108
+ self.commit_journal_next_callback(prepare, null);
2109
+ } else {
2110
+ self.journal.read_prepare(
2111
+ commit_journal_next_callback,
2112
+ op,
2113
+ header.checksum,
2114
+ null,
2115
+ );
2116
+ }
2432
2117
  } else {
2433
2118
  self.commit_ops_done();
2434
2119
  // This is an optimization to expedite the view change before the `repair_timeout`:
@@ -2438,7 +2123,7 @@ pub fn ReplicaType(
2438
2123
  assert(self.replica_count == 1);
2439
2124
  assert(self.commit_min == self.commit_max);
2440
2125
  assert(self.commit_min == self.op);
2441
- self.transition_to_normal_from_recovering_status(0);
2126
+ self.transition_to_normal_from_recovering_status();
2442
2127
  } else {
2443
2128
  // We expect that a cluster-of-one only calls commit_journal() in recovering status.
2444
2129
  assert(self.replica_count > 1);
@@ -2457,14 +2142,6 @@ pub fn ReplicaType(
2457
2142
  return;
2458
2143
  }
2459
2144
 
2460
- const slot = self.journal.slot_with_op_and_checksum(
2461
- prepare.?.header.op,
2462
- prepare.?.header.checksum,
2463
- ).?;
2464
- assert(self.journal.prepare_inhabited[slot.index]);
2465
- assert(self.journal.prepare_checksums[slot.index] == prepare.?.header.checksum);
2466
- assert(self.journal.has(prepare.?.header));
2467
-
2468
2145
  switch (self.status) {
2469
2146
  .normal => {},
2470
2147
  .view_change => {
@@ -2484,6 +2161,7 @@ pub fn ReplicaType(
2484
2161
  assert(self.replica_count == 1);
2485
2162
  assert(self.primary_index(self.view) == self.replica);
2486
2163
  },
2164
+ .recovering_head => unreachable,
2487
2165
  }
2488
2166
 
2489
2167
  const op = self.commit_min + 1;
@@ -2497,7 +2175,15 @@ pub fn ReplicaType(
2497
2175
  assert(self.commit_min <= self.commit_max);
2498
2176
  assert(self.commit_min <= self.op);
2499
2177
 
2500
- self.commit_journal_next();
2178
+ if (self.status == .normal and self.primary()) {
2179
+ if (self.pipeline.queue.prepare_queue.empty()) {
2180
+ self.commit_ops_done();
2181
+ } else {
2182
+ self.commit_pipeline_next();
2183
+ }
2184
+ } else {
2185
+ self.commit_journal_next();
2186
+ }
2501
2187
  }
2502
2188
 
2503
2189
  /// Begin the commit path that is common between `commit_pipeline` and `commit_journal`:
@@ -2551,8 +2237,14 @@ pub fn ReplicaType(
2551
2237
  assert(self.commit_min <= self.commit_max);
2552
2238
 
2553
2239
  if (self.status == .normal and self.primary()) {
2554
- const prepare = self.pipeline.pop().?;
2240
+ const prepare = self.pipeline.queue.pop_prepare().?;
2241
+ if (self.pipeline.queue.pop_request()) |request| {
2242
+ // Start preparing the next request in the queue (if any).
2243
+ self.primary_pipeline_prepare(request);
2244
+ }
2245
+
2555
2246
  assert(self.commit_min == self.commit_max);
2247
+ assert(prepare.message.header.command == .prepare);
2556
2248
  assert(prepare.message.header.checksum == self.commit_prepare.?.header.checksum);
2557
2249
  assert(prepare.message.header.op == self.commit_min);
2558
2250
  assert(prepare.message.header.op == self.commit_max);
@@ -2560,7 +2252,7 @@ pub fn ReplicaType(
2560
2252
 
2561
2253
  self.message_bus.unref(prepare.message);
2562
2254
 
2563
- if (self.pipeline.head_ptr()) |next| {
2255
+ if (self.pipeline.queue.prepare_queue.head_ptr()) |next| {
2564
2256
  assert(next.message.header.op == self.commit_min + 1);
2565
2257
  assert(next.message.header.op == self.commit_prepare.?.header.op + 1);
2566
2258
 
@@ -2588,8 +2280,8 @@ pub fn ReplicaType(
2588
2280
  const self = @fieldParentPtr(Self, "state_machine", state_machine);
2589
2281
  assert(self.committing);
2590
2282
  assert(self.commit_callback != null);
2591
- assert(self.op_checkpoint == self.superblock.staging.vsr_state.commit_min);
2592
- assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
2283
+ assert(self.op_checkpoint() == self.superblock.staging.vsr_state.commit_min);
2284
+ assert(self.op_checkpoint() == self.superblock.working.vsr_state.commit_min);
2593
2285
 
2594
2286
  const op = self.commit_prepare.?.header.op;
2595
2287
  assert(op == self.commit_min);
@@ -2604,7 +2296,7 @@ pub fn ReplicaType(
2604
2296
  "(op={} current_checkpoint={} next_checkpoint={})", .{
2605
2297
  self.replica,
2606
2298
  self.op,
2607
- self.op_checkpoint,
2299
+ self.op_checkpoint(),
2608
2300
  self.op_checkpoint_next(),
2609
2301
  });
2610
2302
  tracer.start(
@@ -2638,19 +2330,15 @@ pub fn ReplicaType(
2638
2330
  // Therefore, only ops "A..D" are committed to disk.
2639
2331
  // Thus, the SuperBlock's `commit_min` is set to 7-2=5.
2640
2332
  const vsr_state_commit_min = self.op_checkpoint_next();
2641
- const vsr_state_new = .{
2642
- .commit_min_checksum = self.journal.header_with_op(vsr_state_commit_min).?.checksum,
2643
- .commit_min = vsr_state_commit_min,
2644
- .commit_max = self.commit_max,
2645
- .view_normal = self.view_normal,
2646
- .view = self.view,
2647
- };
2648
- assert(self.superblock.working.vsr_state.monotonic(vsr_state_new));
2649
2333
 
2650
2334
  self.superblock.checkpoint(
2651
2335
  commit_op_checkpoint_superblock_callback,
2652
2336
  &self.superblock_context,
2653
- vsr_state_new,
2337
+ .{
2338
+ .commit_min_checksum = self.journal.header_with_op(vsr_state_commit_min).?.checksum,
2339
+ .commit_min = vsr_state_commit_min,
2340
+ .commit_max = self.commit_max,
2341
+ },
2654
2342
  );
2655
2343
  }
2656
2344
 
@@ -2661,15 +2349,14 @@ pub fn ReplicaType(
2661
2349
  assert(self.commit_prepare.?.header.op == self.op);
2662
2350
  assert(self.commit_prepare.?.header.op == self.commit_min);
2663
2351
 
2664
- self.op_checkpoint = self.op_checkpoint_next();
2665
- assert(self.op_checkpoint == self.commit_min - constants.lsm_batch_multiple);
2666
- assert(self.op_checkpoint == self.superblock.staging.vsr_state.commit_min);
2667
- assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
2352
+ assert(self.op_checkpoint() == self.commit_min - constants.lsm_batch_multiple);
2353
+ assert(self.op_checkpoint() == self.superblock.staging.vsr_state.commit_min);
2354
+ assert(self.op_checkpoint() == self.superblock.working.vsr_state.commit_min);
2668
2355
 
2669
2356
  log.debug("{}: commit_op_compact_callback: checkpoint done (op={} new_checkpoint={})", .{
2670
2357
  self.replica,
2671
2358
  self.op,
2672
- self.op_checkpoint,
2359
+ self.op_checkpoint(),
2673
2360
  });
2674
2361
  tracer.end(
2675
2362
  &self.tracer_slot_checkpoint,
@@ -2720,7 +2407,7 @@ pub fn ReplicaType(
2720
2407
  // this commit.
2721
2408
 
2722
2409
  assert(self.journal.has(prepare.header));
2723
- if (self.op_checkpoint == self.commit_min) {
2410
+ if (self.op_checkpoint() == self.commit_min) {
2724
2411
  // op_checkpoint's slot may have been overwritten in the WAL — but we can
2725
2412
  // always use the VSRState to anchor the hash chain.
2726
2413
  assert(prepare.header.parent ==
@@ -2752,6 +2439,7 @@ pub fn ReplicaType(
2752
2439
  const reply_body_size = @intCast(u32, self.state_machine.commit(
2753
2440
  prepare.header.client,
2754
2441
  prepare.header.op,
2442
+ prepare.header.timestamp,
2755
2443
  prepare.header.operation.cast(StateMachine),
2756
2444
  prepare.buffer[@sizeOf(Header)..prepare.header.size],
2757
2445
  reply.buffer[@sizeOf(Header)..],
@@ -2788,7 +2476,7 @@ pub fn ReplicaType(
2788
2476
  if (self.superblock.working.vsr_state.op_compacted(prepare.header.op)) {
2789
2477
  // We are recovering from a checkpoint. Prior to the crash, the client table was
2790
2478
  // updated with entries for one bar beyond the op_checkpoint.
2791
- assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
2479
+ assert(self.op_checkpoint() == self.superblock.working.vsr_state.commit_min);
2792
2480
  if (self.client_table().get(prepare.header.client)) |entry| {
2793
2481
  assert(entry.reply.header.command == .reply);
2794
2482
  assert(entry.reply.header.op >= prepare.header.op);
@@ -2799,7 +2487,7 @@ pub fn ReplicaType(
2799
2487
  log.debug("{}: commit_op: skip client table update: prepare.op={} checkpoint={}", .{
2800
2488
  self.replica,
2801
2489
  prepare.header.op,
2802
- self.op_checkpoint,
2490
+ self.op_checkpoint(),
2803
2491
  });
2804
2492
  } else {
2805
2493
  if (reply.header.operation == .register) {
@@ -2821,7 +2509,7 @@ pub fn ReplicaType(
2821
2509
  fn commit_pipeline(self: *Self) void {
2822
2510
  assert(self.status == .normal);
2823
2511
  assert(self.primary());
2824
- assert(self.pipeline.count > 0);
2512
+ assert(self.pipeline.queue.prepare_queue.count > 0);
2825
2513
 
2826
2514
  // Guard against multiple concurrent invocations of commit_journal()/commit_pipeline():
2827
2515
  if (self.committing) {
@@ -2838,10 +2526,10 @@ pub fn ReplicaType(
2838
2526
  assert(self.status == .normal);
2839
2527
  assert(self.primary());
2840
2528
 
2841
- if (self.pipeline.head_ptr()) |prepare| {
2529
+ if (self.pipeline.queue.prepare_queue.head_ptr()) |prepare| {
2842
2530
  assert(self.commit_min == self.commit_max);
2843
2531
  assert(self.commit_min + 1 == prepare.message.header.op);
2844
- assert(self.commit_min + self.pipeline.count == self.op);
2532
+ assert(self.commit_min + self.pipeline.queue.prepare_queue.count == self.op);
2845
2533
  assert(self.journal.has(prepare.message.header));
2846
2534
 
2847
2535
  if (!prepare.ok_quorum_received) {
@@ -2867,9 +2555,6 @@ pub fn ReplicaType(
2867
2555
  assert(self.commit_min <= self.op);
2868
2556
 
2869
2557
  if (self.status == .normal and self.primary()) {
2870
- if (self.pipeline.head_ptr()) |pipeline_head| {
2871
- assert(pipeline_head.message.header.op == self.commit_min + 1);
2872
- }
2873
2558
  self.commit_pipeline_next();
2874
2559
  } else {
2875
2560
  self.commit_ops_done();
@@ -2890,10 +2575,7 @@ pub fn ReplicaType(
2890
2575
  ) usize {
2891
2576
  assert(op_max >= op_min);
2892
2577
  assert(count_max == null or count_max.? > 0);
2893
- assert(message.header.command == .do_view_change or
2894
- message.header.command == .start_view or
2895
- message.header.command == .headers or
2896
- message.header.command == .recovery_response);
2578
+ assert(message.header.command == .headers);
2897
2579
 
2898
2580
  const body_size_max = @sizeOf(Header) * std.math.min(
2899
2581
  @divExact(message.buffer.len - @sizeOf(Header), @sizeOf(Header)),
@@ -2934,7 +2616,6 @@ pub fn ReplicaType(
2934
2616
  assert(m.header.replica == replica);
2935
2617
  switch (command) {
2936
2618
  .do_view_change => assert(m.header.view == self.view),
2937
- .recovery_response => assert(m.header.replica != self.replica),
2938
2619
  else => unreachable,
2939
2620
  }
2940
2621
  count += 1;
@@ -3021,17 +2702,30 @@ pub fn ReplicaType(
3021
2702
  assert(self.client_table().count() <= constants.clients_max);
3022
2703
  }
3023
2704
 
2705
+ /// Construct a SV/DVC message, including attached headers from the current log_view.
2706
+ ///
3024
2707
  /// The caller owns the returned message, if any, which has exactly 1 reference.
3025
2708
  fn create_view_change_message(self: *Self, command: Command) *Message {
3026
- assert(command == .do_view_change or command == .start_view);
3027
-
3028
2709
  // We may send a start_view message in normal status to resolve a backup's view jump:
3029
2710
  assert(self.status == .normal or self.status == .view_change);
2711
+ assert((self.status == .normal) == (command == .start_view));
2712
+ assert((self.status == .view_change) == (command == .do_view_change));
2713
+ assert(self.view >= self.log_view);
2714
+ assert(self.view >= self.view_durable());
2715
+ assert(self.log_view >= self.log_view_durable());
2716
+
2717
+ assert(command != .do_view_change or self.log_view < self.view);
2718
+ assert(command != .start_view or self.log_view == self.view);
3030
2719
 
3031
2720
  const message = self.message_bus.get_message();
3032
2721
  defer self.message_bus.unref(message);
3033
2722
 
2723
+ const headers = self.create_view_change_headers();
2724
+ assert(headers.len > 0);
2725
+ assert(headers.get(0).op == self.op);
2726
+
3034
2727
  message.header.* = .{
2728
+ .size = @intCast(u32, @sizeOf(Header) * (1 + headers.len)),
3035
2729
  .command = command,
3036
2730
  .cluster = self.cluster,
3037
2731
  .replica = self.replica,
@@ -3040,33 +2734,167 @@ pub fn ReplicaType(
3040
2734
  // number contained in the prepare headers we include in the body. The former shows
3041
2735
  // how recent a view change the replica participated in, which may be much higher.
3042
2736
  // We use the `timestamp` field to send this in addition to the current view number:
3043
- .timestamp = if (command == .do_view_change) self.view_normal else 0,
2737
+ .timestamp = if (command == .do_view_change) self.log_view else 0,
3044
2738
  .op = self.op,
3045
2739
  // See the comment in `on_do_view_change()` for why `commit_min` is crucial:
3046
2740
  .commit = if (command == .do_view_change) self.commit_min else self.commit_max,
3047
2741
  };
3048
2742
 
3049
- const count = self.copy_latest_headers_and_set_size(
3050
- 0,
3051
- self.op,
3052
- view_change_headers_count,
3053
- message,
2743
+ stdx.copy_disjoint(
2744
+ .exact,
2745
+ Header,
2746
+ std.mem.bytesAsSlice(Header, message.body()),
2747
+ headers.constSlice(),
3054
2748
  );
3055
- assert(count > 0); // We expect that self.op always exists.
3056
- assert(@divExact(message.header.size, @sizeOf(Header)) == 1 + count);
3057
-
3058
2749
  message.header.set_checksum_body(message.body());
3059
2750
  message.header.set_checksum();
3060
2751
 
3061
2752
  return message.ref();
3062
2753
  }
3063
2754
 
2755
+ fn create_view_change_headers(self: *const Self) vsr.ViewChangeHeaders.BoundedArray {
2756
+ assert(self.status == .normal or self.status == .view_change);
2757
+ assert(self.view >= self.log_view);
2758
+ assert(self.view >= self.view_durable());
2759
+ assert(self.log_view >= self.log_view_durable());
2760
+
2761
+ var headers = vsr.ViewChangeHeaders.BoundedArray{ .buffer = undefined };
2762
+
2763
+ // Always include the head message.
2764
+ headers.appendAssumeCapacity(self.journal.header_with_op(self.op).?.*);
2765
+
2766
+ if (self.view == self.log_view) {
2767
+ // Construct SV message headers. (On the backup, these are only stored in the
2768
+ // superblock).
2769
+ if (self.primary_index(self.view) == self.replica and self.status == .normal) {
2770
+ assert(self.op >= self.commit_max);
2771
+
2772
+ // The primary starting a new view has a pristine log suffix.
2773
+ //
2774
+ // +1 because commit_min may have been overwritten (and not repaired) if it
2775
+ // falls on a checkpoint boundary.
2776
+ var op = self.op;
2777
+ while (op > self.commit_min + 1) : (op -= 1) {
2778
+ const header_next = self.journal.header_with_op(op).?;
2779
+ const header_prev = self.journal.header_with_op(op - 1).?;
2780
+ assert(header_prev.checksum == header_next.parent);
2781
+
2782
+ headers.append(header_prev.*) catch break;
2783
+ }
2784
+ } else {
2785
+ // Either:
2786
+ // - The primary started a new view but has not finished repair.
2787
+ // - The backup joining a new view has a pristine log suffix — it just
2788
+ // loaded a SV.
2789
+ //
2790
+ // In each case we send as much of a suffix as is available (fallthrough).
2791
+ }
2792
+ } else {
2793
+ // Construct DVC message headers.
2794
+ assert(self.view > self.log_view);
2795
+
2796
+ if (self.log_view_durable() == self.log_view) {
2797
+ const headers_durable = self.superblock.working.vsr_headers().slice;
2798
+ assert(headers_durable[0].op <= self.op);
2799
+
2800
+ if (self.log_view_durable() < self.view_durable()) {
2801
+ // Ensure that if we started a DVC before a crash, that we will resume
2802
+ // sending the exact same DVC after recovery.
2803
+ // (An alternative implementation would be to load the superblock's DVC
2804
+ // headers (including gaps) into the journal during open(), but that is more
2805
+ // complicated to implement correctly).
2806
+ assert(headers_durable[0].op == self.op);
2807
+ assert(headers_durable[0].checksum == headers.get(0).checksum);
2808
+
2809
+ for (headers_durable[1..]) |*header| headers.appendAssumeCapacity(header.*);
2810
+ } else {
2811
+ // Durable SV anchor. See Example 4.
2812
+ assert(self.log_view_durable() == self.view_durable());
2813
+
2814
+ var op = self.op;
2815
+ while (op > headers_durable[headers_durable.len - 1].op) : (op -= 1) {
2816
+ const header_prev = self.journal.header_with_op(op - 1) orelse continue;
2817
+ const header_next = self.journal.header_with_op(op);
2818
+ assert(header_next == null or header_prev.checksum == header_next.?.parent);
2819
+
2820
+ headers.append(header_prev.*) catch break;
2821
+ }
2822
+ }
2823
+ return headers;
2824
+ }
2825
+
2826
+ // The DVC anchor: Within the log suffix following the anchor, we have additional
2827
+ // guarantees about the state of the log headers which allow us to tolerate certain
2828
+ // gaps (by locally guaranteeing that the gap does not hide a break).
2829
+ // See Example 2/3 for more detail.
2830
+ const op_dvc_anchor = std.math.max(
2831
+ self.commit_min,
2832
+ // +1: We can have a full pipeline, but not yet have performed any repair.
2833
+ // In such a case, we want to send those pipeline_prepare_queue_max headers in
2834
+ // the DVC, but not the preceding op (which may belong to a different chain).
2835
+ // This satisfies the DVC invariant because the first op in the pipeline is
2836
+ // "connected" to the canonical chain (via its "parent" checksum).
2837
+ //
2838
+ // For example, as a follower, we might have received pipeline_prepare_queue_max
2839
+ // headers in the SV message, but not done any repair before the next view
2840
+ // change.
2841
+ 1 + self.op -| constants.pipeline_prepare_queue_max,
2842
+ );
2843
+
2844
+ if (self.primary_index(self.log_view) == self.replica) {
2845
+ // Retired primary: see Example 2a.
2846
+ var op = self.op;
2847
+ while (op > op_dvc_anchor) : (op -= 1) {
2848
+ const header_next = self.journal.header_with_op(op).?;
2849
+ // Exclude gaps since we cannot distinguish the gap from a break.
2850
+ const header_prev = self.journal.header_with_op(op - 1) orelse break;
2851
+ if (header_prev.checksum != header_next.parent) break;
2852
+
2853
+ headers.append(header_prev.*) catch break;
2854
+ }
2855
+ } else {
2856
+ // Retired backup: see Example 2b.
2857
+ var op = self.op;
2858
+ while (op > self.commit_min) : (op -= 1) {
2859
+ const header_prev = self.journal.header_with_op(op - 1) orelse continue;
2860
+ const header_next = self.journal.header_with_op(op);
2861
+ assert(header_next == null or header_prev.checksum == header_next.?.parent);
2862
+
2863
+ headers.append(header_prev.*) catch break;
2864
+
2865
+ // Stop once we connect to the anchor.
2866
+ if (header_prev.op <= op_dvc_anchor + 1) break;
2867
+ } else {
2868
+ assert(self.commit_min == self.op);
2869
+ }
2870
+ }
2871
+ }
2872
+
2873
+ // Include as many extra headers as possible, but with no additional gaps (since they
2874
+ // cannot be differentiated from breaks).
2875
+ // - This reduces the number of headers that the new primary will need to repair.
2876
+ // - More importantly, this ensures that a replica which re-sends its DVC does not
2877
+ // alter the DVC's headers, even if the replica finished a commit (updating
2878
+ // commit_min, possibly modifying the suffix anchor) in the mean time.
2879
+ // (This is not required for correctness, but enables additional verification
2880
+ // in on_do_view_change().)
2881
+ var op = headers.get(headers.len - 1).op;
2882
+ while (op > 0 and headers.len < constants.view_change_headers_max) : (op -= 1) {
2883
+ const header_next = self.journal.header_with_op(op).?;
2884
+ const header_prev = self.journal.header_with_op(op - 1) orelse break;
2885
+ if (header_prev.checksum != header_next.parent) break;
2886
+
2887
+ headers.appendAssumeCapacity(header_prev.*);
2888
+ }
2889
+
2890
+ vsr.ViewChangeHeaders.verify(headers.constSlice());
2891
+ return headers;
2892
+ }
2893
+
3064
2894
  /// The caller owns the returned message, if any, which has exactly 1 reference.
3065
2895
  fn create_message_from_header(self: *Self, header: Header) *Message {
3066
2896
  assert(header.replica == self.replica);
3067
- assert(header.view == self.view or
3068
- header.command == .request_start_view or
3069
- header.command == .recovery);
2897
+ assert(header.view == self.view or header.command == .request_start_view);
3070
2898
  assert(header.size == @sizeOf(Header));
3071
2899
 
3072
2900
  const message = self.message_bus.pool.get_message();
@@ -3079,67 +2907,6 @@ pub fn ReplicaType(
3079
2907
  return message.ref();
3080
2908
  }
3081
2909
 
3082
- /// Returns the op of the highest canonical message, according to this replica (the new
3083
- /// primary) prior to loading the current view change's DVC quorum headers.
3084
- /// When this replica participated in the last `view_normal`, this is just `replica.op`.
3085
- ///
3086
- /// - A *canonical* message was part of the last view_normal.
3087
- /// - An *uncanonical* message may have been removed/changed by a prior view.
3088
- /// - Canonical messages do not necessarily survive into the new view, but they take
3089
- /// precedence over uncanonical messages.
3090
- /// - Canonical messages may be committed or uncommitted.
3091
- ///
3092
- /// Consider these logs:
3093
- ///
3094
- /// replica 0: 4, 5, 6b, 7b, 8b (commit_min=6b, primary, status=normal, view=X)
3095
- /// replica 1: 4, 5, 6b, --, -- (commit_min=5, backup, status=normal, view=X)
3096
- /// replica 2: 4, 5, 6a, --, 8b (view<X)
3097
- ///
3098
- /// 1. Replica 0 crashes immediately after committing 6b.
3099
- /// 2. Replicas 1 and 2 must determine the new chain HEAD.
3100
- /// 3. 8b is discarded due to the gap in 7.
3101
- /// 4. To distinguish between 6a and 6b (and safely discard 6a), the new primary trusts ops
3102
- /// from the DVC(s) with the greatest `view_normal`.
3103
- fn primary_op_canonical_max(self: *const Self, view_normal_canonical: u64) usize {
3104
- assert(self.replica_count > 1);
3105
- assert(self.status == .view_change);
3106
- assert(self.primary_index(self.view) == self.replica);
3107
- assert(self.do_view_change_quorum);
3108
- assert(!self.repair_timeout.ticking);
3109
- assert(self.journal.header_with_op(self.op) != null);
3110
- assert(self.view_normal <= view_normal_canonical);
3111
-
3112
- if (self.view_normal == view_normal_canonical) return self.op;
3113
-
3114
- const uncanonical_op_count = std.math.min(
3115
- // Do not reset any ops that we have already committed.
3116
- self.op - self.commit_min,
3117
- // The number of uncommitted ops cannot be more than the length of the pipeline.
3118
- // Do not reset any ops that we did not include in our do_view_change message.
3119
- constants.pipeline_max,
3120
- );
3121
-
3122
- assert(uncanonical_op_count <= constants.pipeline_max);
3123
- if (uncanonical_op_count == 0) return self.op;
3124
-
3125
- // * When uncanonical_op_count = self.op - self.commit_min,
3126
- // self.op - uncanonical_op_count = self.commit_min.
3127
- // * When uncanonical_op_count = constants.pipeline_max,
3128
- // constants.pipeline_max < self.op - self.commit_min holds.
3129
- const canonical_op_max = self.op - uncanonical_op_count;
3130
-
3131
- log.debug("{}: on_do_view_change: not canonical ops={}..{}", .{
3132
- self.replica,
3133
- canonical_op_max + 1,
3134
- self.op,
3135
- });
3136
-
3137
- assert(canonical_op_max <= self.op);
3138
- assert(canonical_op_max >= self.commit_min);
3139
- assert(canonical_op_max + constants.pipeline_max >= self.op);
3140
- return canonical_op_max;
3141
- }
3142
-
3143
2910
  /// Discards uncommitted ops during a view change from after and including `op`.
3144
2911
  /// This is required to maximize availability in the presence of storage faults.
3145
2912
  /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
@@ -3192,8 +2959,8 @@ pub fn ReplicaType(
3192
2959
  // asynchronous prepare_ok to itself.
3193
2960
  // 3. In on_start_view_change(), after receiving a quorum of start_view_change
3194
2961
  // messages, the new primary sends a synchronous do_view_change to itself.
3195
- // 4. In start_view_as_the_new_primary(), the new primary sends itself a prepare_ok
3196
- // message for each uncommitted message.
2962
+ // 4. In primary_start_view_as_the_new_primary(), the new primary sends itself a
2963
+ // prepare_ok message for each uncommitted message.
3197
2964
  if (self.loopback_queue) |message| {
3198
2965
  defer self.message_bus.unref(message);
3199
2966
 
@@ -3278,7 +3045,8 @@ pub fn ReplicaType(
3278
3045
  log.warn("{}: on_{s}: ignoring (no context)", .{ self.replica, command });
3279
3046
  return true;
3280
3047
  },
3281
- else => {},
3048
+ .headers, .request_headers => {},
3049
+ else => unreachable,
3282
3050
  }
3283
3051
  }
3284
3052
 
@@ -3344,13 +3112,14 @@ pub fn ReplicaType(
3344
3112
  if (self.ignore_request_message_duplicate(message)) return true;
3345
3113
  if (self.ignore_request_message_preparing(message)) return true;
3346
3114
 
3347
- // Verify that the new request will fit in the WAL.
3348
- // The message's op hasn't been assigned yet, but it will be `self.op + 1`.
3349
- if (self.op == self.op_checkpoint_trigger()) {
3115
+ // Don't accept more requests than will fit in the current checkpoint.
3116
+ // (The request's op hasn't been assigned yet, but it will be `self.op + 1`
3117
+ // when primary_pipeline_next() converts the request to a prepare.)
3118
+ if (self.op + self.pipeline.queue.request_queue.count == self.op_checkpoint_trigger()) {
3350
3119
  log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint_trigger={})", .{
3351
3120
  self.replica,
3352
3121
  self.op + 1,
3353
- self.op_checkpoint,
3122
+ self.op_checkpoint(),
3354
3123
  });
3355
3124
  return true;
3356
3125
  }
@@ -3419,7 +3188,7 @@ pub fn ReplicaType(
3419
3188
  } else if (message.header.operation == .register) {
3420
3189
  log.debug("{}: on_request: new session", .{self.replica});
3421
3190
  return false;
3422
- } else if (self.pipeline_prepare_for_client(message.header.client)) |_| {
3191
+ } else if (self.pipeline.queue.message_by_client(message.header.client)) |_| {
3423
3192
  // The client registered with the previous primary, which committed and replied back
3424
3193
  // to the client before the view change, after which the register operation was
3425
3194
  // reloaded into the pipeline to be driven to completion by the new primary, which
@@ -3491,21 +3260,31 @@ pub fn ReplicaType(
3491
3260
  assert(message.header.client > 0);
3492
3261
  assert(message.header.view <= self.view); // See ignore_request_message_backup().
3493
3262
 
3494
- if (self.pipeline_prepare_for_client(message.header.client)) |prepare| {
3495
- assert(prepare.message.header.command == .prepare);
3496
- assert(prepare.message.header.client == message.header.client);
3497
- assert(prepare.message.header.op > self.commit_max);
3263
+ if (self.pipeline.queue.message_by_client(message.header.client)) |pipeline_message| {
3264
+ assert(pipeline_message.header.client == message.header.client);
3265
+ assert(pipeline_message.header.command == .request or
3266
+ pipeline_message.header.command == .prepare);
3498
3267
 
3499
- if (message.header.checksum == prepare.message.header.context) {
3500
- log.debug("{}: on_request: ignoring (already preparing)", .{self.replica});
3268
+ if (pipeline_message.header.command == .request and
3269
+ pipeline_message.header.checksum == message.header.checksum)
3270
+ {
3271
+ log.debug("{}: on_request: ignoring (already queued)", .{self.replica});
3501
3272
  return true;
3502
- } else {
3503
- log.err("{}: on_request: ignoring (client forked)", .{self.replica});
3273
+ }
3274
+
3275
+ if (pipeline_message.header.command == .prepare and
3276
+ pipeline_message.header.context == message.header.checksum)
3277
+ {
3278
+ assert(pipeline_message.header.op > self.commit_max);
3279
+ log.debug("{}: on_request: ignoring (already preparing)", .{self.replica});
3504
3280
  return true;
3505
3281
  }
3282
+
3283
+ log.err("{}: on_request: ignoring (client forked)", .{self.replica});
3284
+ return true;
3506
3285
  }
3507
3286
 
3508
- if (self.pipeline.full()) {
3287
+ if (self.pipeline.queue.full()) {
3509
3288
  log.debug("{}: on_request: ignoring (pipeline full)", .{self.replica});
3510
3289
  return true;
3511
3290
  }
@@ -3521,7 +3300,10 @@ pub fn ReplicaType(
3521
3300
 
3522
3301
  const command: []const u8 = @tagName(message.header.command);
3523
3302
 
3524
- // 4.3 Recovery
3303
+ if (self.status == .recovering_head and message.header.command != .start_view) {
3304
+ return true;
3305
+ }
3306
+
3525
3307
  // While a replica's status is recovering it does not participate in either the request
3526
3308
  // processing protocol or the view change protocol.
3527
3309
  // This is critical for correctness (to avoid data loss):
@@ -3614,28 +3396,7 @@ pub fn ReplicaType(
3614
3396
  assert(self.journal.header_with_op(self.op) == null);
3615
3397
  }
3616
3398
 
3617
- fn message_body_as_headers(message: *const Message) []const Header {
3618
- assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
3619
- assert(message.header.command == .do_view_change or
3620
- message.header.command == .start_view or
3621
- message.header.command == .headers or
3622
- message.header.command == .recovery_response);
3623
-
3624
- const headers = std.mem.bytesAsSlice(
3625
- Header,
3626
- message.buffer[@sizeOf(Header)..message.header.size],
3627
- );
3628
-
3629
- for (headers[0 .. headers.len - 1]) |header, index| {
3630
- // Headers must be provided in reverse order for the sake of `repair_header()`.
3631
- // Otherwise, headers may never be repaired where the hash chain never connects.
3632
- assert(header.op > headers[index + 1].op);
3633
- }
3634
-
3635
- return headers;
3636
- }
3637
-
3638
- /// Returns whether the highest known op is certain.
3399
+ /// Returns whether the head op is certain.
3639
3400
  ///
3640
3401
  /// After recovering the WAL, there are 2 possible outcomes:
3641
3402
  /// * All entries valid. The highest op is certain, and safe to set as `replica.op`.
@@ -3663,41 +3424,34 @@ pub fn ReplicaType(
3663
3424
  /// * ` ✓ ✗ o `: View change is safe.
3664
3425
  /// * ` ✓ = o `: View change is unsafe if any slots are faulty.
3665
3426
  /// (`replica.op_checkpoint` == `replica.op`).
3666
- // TODO Use this function once we switch from recovery protocol to the superblock.
3667
- // If there is an "unsafe" fault, we will need to request a start_view from the primary to
3668
- // learn the op.
3669
- fn op_certain(self: *const Self) bool {
3427
+ fn op_head_certain(self: *const Self) bool {
3670
3428
  assert(self.status == .recovering);
3671
- assert(self.op_checkpoint <= self.op);
3429
+ assert(self.op_checkpoint() <= self.op);
3672
3430
 
3673
- const slot_op_checkpoint = self.journal.slot_for_op(self.op_checkpoint).index;
3674
- const slot_op = self.journal.slot_with_op(self.op).?.index;
3431
+ const slot_op_checkpoint = self.journal.slot_for_op(self.op_checkpoint());
3432
+ const slot_op_head = self.journal.slot_with_op(self.op).?;
3675
3433
  const slot_known_range = vsr.SlotRange{
3676
3434
  .head = slot_op_checkpoint,
3677
- .tail = slot_op,
3435
+ .tail = slot_op_head,
3678
3436
  };
3679
3437
 
3680
3438
  var iterator = self.journal.faulty.bits.iterator(.{ .kind = .set });
3681
3439
  while (iterator.next()) |slot| {
3682
- // The command is `reserved` when the entry was found faulty during WAL recovery.
3683
- // Faults found after WAL recovery are not relevant, because we know their op.
3684
- if (self.journal.headers[slot.index].command == .reserved) {
3685
- if (slot_op_checkpoint == slot_op or
3686
- !slot_known_range.contains(slot))
3687
- {
3688
- log.warn("{}: op_certain: op not known (faulty_slot={}, op={}, op_checkpoint={})", .{
3689
- self.replica,
3690
- slot.index,
3691
- self.op,
3692
- self.op_checkpoint,
3693
- });
3694
- return false;
3695
- }
3440
+ if (slot_op_checkpoint.index == slot_op_head.index or
3441
+ !slot_known_range.contains(.{ .index = slot }))
3442
+ {
3443
+ return false;
3696
3444
  }
3697
3445
  }
3698
3446
  return true;
3699
3447
  }
3700
3448
 
3449
+ /// The op of the highest checkpointed message.
3450
+ // TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
3451
+ pub fn op_checkpoint(self: *const Self) u64 {
3452
+ return self.superblock.working.vsr_state.commit_min;
3453
+ }
3454
+
3701
3455
  /// Returns the op that will be `op_checkpoint` after the next checkpoint.
3702
3456
  ///
3703
3457
  /// For a replica with journal_slot_count=8 and lsm_batch_multiple=2:
@@ -3722,21 +3476,21 @@ pub fn ReplicaType(
3722
3476
  /// % op_checkpoint_trigger
3723
3477
  ///
3724
3478
  fn op_checkpoint_next(self: *const Self) u64 {
3725
- assert(self.op_checkpoint <= self.commit_min);
3726
- assert(self.op_checkpoint <= self.op);
3727
- assert(self.op_checkpoint == 0 or
3728
- (self.op_checkpoint + 1) % constants.lsm_batch_multiple == 0);
3479
+ assert(self.op_checkpoint() <= self.commit_min);
3480
+ assert(self.op_checkpoint() <= self.op);
3481
+ assert(self.op_checkpoint() == 0 or
3482
+ (self.op_checkpoint() + 1) % constants.lsm_batch_multiple == 0);
3729
3483
 
3730
- const op = if (self.op_checkpoint == 0)
3484
+ const op = if (self.op_checkpoint() == 0)
3731
3485
  // First wrap: op_checkpoint_next = 8-2-1 = 5
3732
3486
  constants.journal_slot_count - constants.lsm_batch_multiple - 1
3733
3487
  else
3734
3488
  // Second wrap: op_checkpoint_next = 5+8-2 = 11
3735
3489
  // Third wrap: op_checkpoint_next = 11+8-2 = 17
3736
- self.op_checkpoint + constants.journal_slot_count - constants.lsm_batch_multiple;
3490
+ self.op_checkpoint() + constants.journal_slot_count - constants.lsm_batch_multiple;
3737
3491
  assert((op + 1) % constants.lsm_batch_multiple == 0);
3738
3492
  // The checkpoint always advances.
3739
- assert(op > self.op_checkpoint);
3493
+ assert(op > self.op_checkpoint());
3740
3494
 
3741
3495
  return op;
3742
3496
  }
@@ -3790,110 +3544,94 @@ pub fn ReplicaType(
3790
3544
  }
3791
3545
  }
3792
3546
 
3793
- /// Searches the pipeline for a prepare for a given op and checksum.
3794
- /// When `checksum` is `null`, match any checksum.
3795
- fn pipeline_prepare_for_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Prepare {
3796
- assert(self.status == .normal or self.status == .view_change);
3797
-
3798
- // To optimize the search, we can leverage the fact that the pipeline is ordered and
3799
- // continuous.
3800
- if (self.pipeline.count == 0) return null;
3801
- const head_op = self.pipeline.head_ptr().?.message.header.op;
3802
- const tail_op = self.pipeline.tail_ptr().?.message.header.op;
3803
- if (op < head_op) return null;
3804
- if (op > tail_op) return null;
3805
-
3806
- const pipeline_prepare = self.pipeline.get_ptr(op - head_op).?;
3807
- assert(pipeline_prepare.message.header.op == op);
3808
-
3809
- if (checksum == null or pipeline_prepare.message.header.checksum == checksum.?) {
3810
- return pipeline_prepare;
3811
- } else {
3812
- return null;
3813
- }
3814
- }
3815
-
3816
- /// Searches the pipeline for a prepare for a given client.
3817
- fn pipeline_prepare_for_client(self: *Self, client: u128) ?*Prepare {
3547
+ fn primary_pipeline_prepare(self: *Self, request: Request) void {
3818
3548
  assert(self.status == .normal);
3819
3549
  assert(self.primary());
3820
3550
  assert(self.commit_min == self.commit_max);
3551
+ assert(self.commit_max + self.pipeline.queue.prepare_queue.count == self.op);
3552
+ assert(!self.pipeline.queue.prepare_queue.full());
3553
+ self.pipeline.queue.verify();
3821
3554
 
3822
- var op = self.commit_max + 1;
3823
- var parent = self.journal.header_with_op(self.commit_max).?.checksum;
3824
- var iterator = self.pipeline.iterator_mutable();
3825
- while (iterator.next_ptr()) |prepare| {
3826
- assert(prepare.message.header.command == .prepare);
3827
- assert(prepare.message.header.op == op);
3828
- assert(prepare.message.header.parent == parent);
3829
-
3830
- // A client may have multiple requests in the pipeline if these were committed by
3831
- // the previous primary and were reloaded into the pipeline after a view change.
3832
- if (prepare.message.header.client == client) return prepare;
3555
+ const message = request.message;
3556
+ assert(!self.ignore_request_message(message));
3833
3557
 
3834
- parent = prepare.message.header.checksum;
3835
- op += 1;
3836
- }
3558
+ log.debug("{}: primary_pipeline_next: request checksum={} client={}", .{
3559
+ self.replica,
3560
+ message.header.checksum,
3561
+ message.header.client,
3562
+ });
3837
3563
 
3838
- assert(self.pipeline.count <= constants.pipeline_max);
3839
- assert(self.commit_max + self.pipeline.count == op - 1);
3840
- assert(self.commit_max + self.pipeline.count == self.op);
3564
+ // Guard against the wall clock going backwards by taking the max with timestamps issued:
3565
+ self.state_machine.prepare_timestamp = std.math.max(
3566
+ // The cluster `commit_timestamp` may be ahead of our `prepare_timestamp` because this
3567
+ // may be our first prepare as a recently elected primary:
3568
+ std.math.max(
3569
+ self.state_machine.prepare_timestamp,
3570
+ self.state_machine.commit_timestamp,
3571
+ ) + 1,
3572
+ @intCast(u64, request.realtime),
3573
+ );
3574
+ assert(self.state_machine.prepare_timestamp > self.state_machine.commit_timestamp);
3841
3575
 
3842
- return null;
3843
- }
3576
+ const prepare_timestamp = self.state_machine.prepare(
3577
+ message.header.operation.cast(StateMachine),
3578
+ message.body(),
3579
+ );
3844
3580
 
3845
- /// Searches the pipeline for a prepare for a given client and checksum.
3846
- /// Passing the prepare_ok message prevents these u128s from being accidentally swapped.
3847
- /// Asserts that the returned prepare, if any, exactly matches the prepare_ok.
3848
- fn pipeline_prepare_for_prepare_ok(self: *Self, ok: *const Message) ?*Prepare {
3849
- assert(ok.header.command == .prepare_ok);
3581
+ const latest_entry = self.journal.header_with_op(self.op).?;
3582
+ message.header.parent = latest_entry.checksum;
3583
+ message.header.context = message.header.checksum;
3584
+ message.header.view = self.view;
3585
+ message.header.op = self.op + 1;
3586
+ message.header.commit = self.commit_max;
3587
+ message.header.timestamp = prepare_timestamp;
3588
+ message.header.replica = self.replica;
3589
+ message.header.command = .prepare;
3850
3590
 
3851
- assert(self.status == .normal);
3852
- assert(self.primary());
3591
+ message.header.set_checksum_body(message.body());
3592
+ message.header.set_checksum();
3853
3593
 
3854
- const prepare = self.pipeline_prepare_for_client(ok.header.client) orelse {
3855
- log.debug("{}: pipeline_prepare_for_prepare_ok: not preparing", .{self.replica});
3856
- return null;
3857
- };
3594
+ log.debug("{}: primary_pipeline_next: prepare {}", .{ self.replica, message.header.checksum });
3858
3595
 
3859
- if (ok.header.context != prepare.message.header.checksum) {
3860
- // This can be normal, for example, if an old prepare_ok is replayed.
3861
- log.debug("{}: pipeline_prepare_for_prepare_ok: preparing a different client op", .{
3862
- self.replica,
3863
- });
3864
- return null;
3596
+ if (self.pipeline.queue.prepare_queue.tail_ptr()) |previous| {
3597
+ // Do not restart the prepare timeout as it is already ticking for another prepare.
3598
+ assert(self.prepare_timeout.ticking);
3599
+ assert(previous.message.header.checksum == message.header.parent);
3600
+ } else {
3601
+ // We are about to add the first prepare to the pipeline, so start the timeout.
3602
+ assert(!self.prepare_timeout.ticking);
3603
+ self.prepare_timeout.start();
3865
3604
  }
3605
+ self.pipeline.queue.push_prepare(message);
3606
+ self.on_prepare(message);
3866
3607
 
3867
- assert(prepare.message.header.parent == ok.header.parent);
3868
- assert(prepare.message.header.client == ok.header.client);
3869
- assert(prepare.message.header.request == ok.header.request);
3870
- assert(prepare.message.header.cluster == ok.header.cluster);
3871
- assert(prepare.message.header.epoch == ok.header.epoch);
3872
- // A prepare may be committed in the same view or in a newer view:
3873
- assert(prepare.message.header.view <= ok.header.view);
3874
- assert(prepare.message.header.op == ok.header.op);
3875
- assert(prepare.message.header.commit == ok.header.commit);
3876
- assert(prepare.message.header.timestamp == ok.header.timestamp);
3877
- assert(prepare.message.header.operation == ok.header.operation);
3878
-
3879
- return prepare;
3608
+ // We expect `on_prepare()` to increment `self.op` to match the primary's latest prepare:
3609
+ // This is critical to ensure that pipelined prepares do not receive the same op number.
3610
+ assert(self.op == message.header.op);
3880
3611
  }
3881
3612
 
3882
- fn recover(self: *Self) void {
3883
- assert(self.status == .recovering);
3884
- assert(self.replica_count > 1);
3885
-
3886
- log.debug("{}: recover: sending recovery messages nonce={}", .{
3887
- self.replica,
3888
- self.recovery_nonce,
3889
- });
3613
+ fn pipeline_prepare_by_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Message {
3614
+ assert(self.status == .normal or self.status == .view_change);
3615
+ assert(self.replica == self.primary_index(self.view) or checksum != null);
3890
3616
 
3891
- self.send_header_to_other_replicas(.{
3892
- .command = .recovery,
3893
- .cluster = self.cluster,
3894
- .context = self.recovery_nonce,
3895
- .replica = self.replica,
3896
- });
3617
+ if (checksum == null) {
3618
+ // The PipelineCache may hold messages that have been discarded, so we must be
3619
+ // careful not to access it unless we can verify the entry's checksum.
3620
+ //
3621
+ // Only on_request_prepare() queries the pipeline with checksum=null.
3622
+ // And primaries ignore request_prepare messages during their view change
3623
+ // (during which time the pipeline is not yet repaired, and so is untrusted).
3624
+ assert(self.primary());
3625
+ assert(self.pipeline == .queue);
3626
+ }
3627
+
3628
+ return switch (self.pipeline) {
3629
+ .cache => |*cache| cache.prepare_by_op_and_checksum(op, checksum.?),
3630
+ .queue => |*queue| if (queue.prepare_by_op_and_checksum(op, checksum)) |prepare|
3631
+ prepare.message
3632
+ else
3633
+ null,
3634
+ };
3897
3635
  }
3898
3636
 
3899
3637
  /// Starting from the latest journal entry, backfill any missing or disconnected headers.
@@ -3911,8 +3649,8 @@ pub fn ReplicaType(
3911
3649
  assert(self.status == .normal or self.status == .view_change);
3912
3650
  assert(self.repairs_allowed());
3913
3651
 
3914
- assert(self.op_checkpoint <= self.op);
3915
- assert(self.op_checkpoint <= self.commit_min);
3652
+ assert(self.op_checkpoint() <= self.op);
3653
+ assert(self.op_checkpoint() <= self.commit_min);
3916
3654
  assert(self.commit_min <= self.op);
3917
3655
  assert(self.commit_min <= self.commit_max);
3918
3656
  assert(self.journal.header_with_op(self.op) != null);
@@ -3954,36 +3692,43 @@ pub fn ReplicaType(
3954
3692
  }
3955
3693
 
3956
3694
  // Request any missing or disconnected headers:
3957
- // TODO Snapshots: Ensure that self.commit_min op always exists in the journal.
3958
- var broken = self.journal.find_latest_headers_break_between(self.commit_min, self.op);
3959
- if (broken) |range| {
3960
- log.debug("{}: repair: break: view={} op_min={} op_max={} (commit={}..{} op={})", .{
3961
- self.replica,
3962
- self.view,
3963
- range.op_min,
3964
- range.op_max,
3965
- self.commit_min,
3966
- self.commit_max,
3695
+ if (self.commit_min != self.op) {
3696
+ var broken = self.journal.find_latest_headers_break_between(
3697
+ self.commit_min + 1,
3967
3698
  self.op,
3968
- });
3969
- assert(range.op_min > self.commit_min);
3970
- assert(range.op_max < self.op);
3971
- // A range of `op_min=0` or `op_max=0` should be impossible as a header break:
3972
- // This is the root op that is prepared when the cluster is initialized.
3973
- assert(range.op_min > 0);
3974
- assert(range.op_max > 0);
3975
-
3976
- if (self.choose_any_other_replica()) |replica| {
3977
- self.send_header_to_replica(replica, .{
3978
- .command = .request_headers,
3979
- .cluster = self.cluster,
3980
- .replica = self.replica,
3981
- .view = self.view,
3982
- .commit = range.op_min,
3983
- .op = range.op_max,
3984
- });
3699
+ );
3700
+ if (broken) |range| {
3701
+ log.debug(
3702
+ "{}: repair: break: view={} op_min={} op_max={} (commit={}..{} op={})",
3703
+ .{
3704
+ self.replica,
3705
+ self.view,
3706
+ range.op_min,
3707
+ range.op_max,
3708
+ self.commit_min,
3709
+ self.commit_max,
3710
+ self.op,
3711
+ },
3712
+ );
3713
+ assert(range.op_min > self.commit_min);
3714
+ assert(range.op_max < self.op);
3715
+ // A range of `op_min=0` or `op_max=0` should be impossible as a header break:
3716
+ // This is the root op that is prepared when the cluster is initialized.
3717
+ assert(range.op_min > 0);
3718
+ assert(range.op_max > 0);
3719
+
3720
+ if (self.choose_any_other_replica()) |replica| {
3721
+ self.send_header_to_replica(replica, .{
3722
+ .command = .request_headers,
3723
+ .cluster = self.cluster,
3724
+ .replica = self.replica,
3725
+ .view = self.view,
3726
+ .commit = range.op_min,
3727
+ .op = range.op_max,
3728
+ });
3729
+ }
3730
+ return;
3985
3731
  }
3986
- return;
3987
3732
  }
3988
3733
 
3989
3734
  // Assert that all headers are now present and connected with a perfect hash chain:
@@ -4003,9 +3748,12 @@ pub fn ReplicaType(
4003
3748
  }
4004
3749
 
4005
3750
  if (self.status == .view_change and self.primary_index(self.view) == self.replica) {
4006
- if (self.primary_repair_pipeline_op() != null) return self.primary_repair_pipeline();
4007
- // Start the view as the new primary:
4008
- self.start_view_as_the_new_primary();
3751
+ // Repair the pipeline, which may discover faulty prepares and drive more repairs.
3752
+ switch (self.primary_repair_pipeline()) {
3753
+ // primary_repair_pipeline() is already working.
3754
+ .busy => {},
3755
+ .done => self.primary_start_view_as_the_new_primary(),
3756
+ }
4009
3757
  }
4010
3758
  }
4011
3759
 
@@ -4073,8 +3821,8 @@ pub fn ReplicaType(
4073
3821
  return false;
4074
3822
  }
4075
3823
 
4076
- if (header.op <= self.op_checkpoint) {
4077
- if (header.op == 0 and self.op_checkpoint == 0) {
3824
+ if (header.op <= self.op_checkpoint()) {
3825
+ if (header.op == 0 and self.op_checkpoint() == 0) {
4078
3826
  // Repairing the root op is allowed until the first checkpoint.
4079
3827
  } else {
4080
3828
  // It is critical that we do not repair checkpointed ops; their slots now belong
@@ -4082,7 +3830,7 @@ pub fn ReplicaType(
4082
3830
  // correctness violation.
4083
3831
  log.debug("{}: repair_header: false (precedes self.op_checkpoint={})", .{
4084
3832
  self.replica,
4085
- self.op_checkpoint,
3833
+ self.op_checkpoint(),
4086
3834
  });
4087
3835
  return false;
4088
3836
  }
@@ -4200,175 +3948,173 @@ pub fn ReplicaType(
4200
3948
  }
4201
3949
 
4202
3950
  /// Reads prepares into the pipeline (before we start the view as the new primary).
4203
- fn primary_repair_pipeline(self: *Self) void {
3951
+ fn primary_repair_pipeline(self: *Self) enum { done, busy } {
4204
3952
  assert(self.status == .view_change);
4205
3953
  assert(self.primary_index(self.view) == self.replica);
4206
- assert(self.commit_max < self.op);
3954
+ assert(self.commit_max == self.commit_min);
3955
+ assert(self.commit_max <= self.op);
4207
3956
  assert(self.journal.dirty.count == 0);
3957
+ assert(self.pipeline == .cache);
4208
3958
 
4209
- if (self.repairing_pipeline) {
3959
+ if (self.pipeline_repairing) {
4210
3960
  log.debug("{}: primary_repair_pipeline: already repairing...", .{self.replica});
4211
- return;
3961
+ return .busy;
4212
3962
  }
4213
3963
 
4214
- log.debug("{}: primary_repair_pipeline: repairing", .{self.replica});
4215
-
4216
- assert(!self.repairing_pipeline);
4217
- self.repairing_pipeline = true;
3964
+ if (self.primary_repair_pipeline_op()) |_| {
3965
+ log.debug("{}: primary_repair_pipeline: repairing", .{self.replica});
3966
+ assert(!self.pipeline_repairing);
3967
+ self.pipeline_repairing = true;
3968
+ self.primary_repair_pipeline_read();
3969
+ return .busy;
3970
+ }
4218
3971
 
4219
- self.primary_repair_pipeline_read();
3972
+ // All prepares needed to reconstruct the pipeline queue are now available in the cache.
3973
+ return .done;
4220
3974
  }
4221
3975
 
4222
- /// Discard messages from the prepare pipeline.
4223
- /// Retain uncommitted messages that belong in the current view to maximize durability.
4224
- fn primary_repair_pipeline_diff(self: *Self) void {
3976
+ fn primary_repair_pipeline_done(self: *Self) PipelineQueue {
4225
3977
  assert(self.status == .view_change);
4226
3978
  assert(self.primary_index(self.view) == self.replica);
3979
+ assert(self.commit_max == self.commit_min);
3980
+ assert(self.commit_max <= self.op);
3981
+ assert(self.journal.dirty.count == 0);
3982
+ assert(self.valid_hash_chain_between(self.commit_min, self.op));
3983
+ assert(self.pipeline == .cache);
3984
+ assert(!self.pipeline_repairing);
3985
+ assert(self.primary_repair_pipeline() == .done);
3986
+ assert(self.commit_max + constants.pipeline_prepare_queue_max >= self.op);
4227
3987
 
4228
- // Discard messages from the front of the pipeline that committed since we were primary.
4229
- while (self.pipeline.head_ptr()) |prepare| {
4230
- if (prepare.message.header.op > self.commit_max) break;
4231
-
4232
- self.message_bus.unref(self.pipeline.pop().?.message);
4233
- }
4234
-
4235
- // Discard the whole pipeline if it is now disconnected from the WAL's hash chain.
4236
- if (self.pipeline.head_ptr()) |pipeline_head| {
4237
- const parent = self.journal.header_with_op_and_checksum(
4238
- pipeline_head.message.header.op - 1,
4239
- pipeline_head.message.header.parent,
4240
- );
4241
- if (parent == null) {
4242
- while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
4243
- assert(self.pipeline.count == 0);
4244
- }
4245
- }
3988
+ var pipeline_queue = PipelineQueue{};
3989
+ var op = self.commit_max + 1;
3990
+ var parent = self.journal.header_with_op(self.commit_max).?.checksum;
3991
+ while (op <= self.op) : (op += 1) {
3992
+ const journal_header = self.journal.header_with_op(op).?;
3993
+ assert(journal_header.op == op);
3994
+ assert(journal_header.parent == parent);
4246
3995
 
4247
- // Discard messages from the back of the pipeline that are not part of this view.
4248
- while (self.pipeline.tail_ptr()) |prepare| {
4249
- if (self.journal.has(prepare.message.header)) break;
3996
+ const prepare =
3997
+ self.pipeline.cache.prepare_by_op_and_checksum(op, journal_header.checksum).?;
3998
+ assert(prepare.header.op == op);
3999
+ assert(prepare.header.op <= self.op);
4000
+ assert(prepare.header.checksum == journal_header.checksum);
4001
+ assert(prepare.header.parent == parent);
4002
+ assert(self.journal.has(prepare.header));
4250
4003
 
4251
- self.message_bus.unref(self.pipeline.pop_tail().?.message);
4004
+ pipeline_queue.push_prepare(prepare.ref());
4005
+ parent = prepare.header.checksum;
4252
4006
  }
4007
+ assert(self.commit_max + pipeline_queue.prepare_queue.count == self.op);
4253
4008
 
4254
- log.debug("{}: primary_repair_pipeline_diff: {} prepare(s)", .{
4255
- self.replica,
4256
- self.pipeline.count,
4257
- });
4258
-
4259
- self.verify_pipeline();
4260
-
4261
- // Do not reset `repairing_pipeline` here as this must be reset by the read callback.
4262
- // Otherwise, we would be making `primary_repair_pipeline()` reentrant.
4009
+ pipeline_queue.verify();
4010
+ return pipeline_queue;
4263
4011
  }
4264
4012
 
4265
4013
  /// Returns the next `op` number that needs to be read into the pipeline.
4266
- fn primary_repair_pipeline_op(self: *Self) ?u64 {
4014
+ /// Returns null when all necessary prepares are in the pipeline cache.
4015
+ fn primary_repair_pipeline_op(self: *const Self) ?u64 {
4267
4016
  assert(self.status == .view_change);
4268
4017
  assert(self.primary_index(self.view) == self.replica);
4018
+ assert(self.commit_max == self.commit_min);
4019
+ assert(self.commit_max <= self.op);
4020
+ assert(self.pipeline == .cache);
4269
4021
 
4270
- // We cannot rely on `pipeline.count` below unless the pipeline has first been diffed.
4271
- self.primary_repair_pipeline_diff();
4272
-
4273
- const op = self.commit_max + self.pipeline.count + 1;
4274
- if (op <= self.op) return op;
4275
-
4276
- assert(self.commit_max + self.pipeline.count == self.op);
4022
+ var op = self.commit_max + 1;
4023
+ while (op <= self.op) : (op += 1) {
4024
+ const op_header = self.journal.header_with_op(op).?;
4025
+ if (!self.pipeline.cache.contains_header(op_header)) {
4026
+ return op;
4027
+ }
4028
+ }
4277
4029
  return null;
4278
4030
  }
4279
4031
 
4280
4032
  fn primary_repair_pipeline_read(self: *Self) void {
4281
- assert(self.repairing_pipeline);
4282
4033
  assert(self.status == .view_change);
4283
4034
  assert(self.primary_index(self.view) == self.replica);
4035
+ assert(self.commit_max == self.commit_min);
4036
+ assert(self.commit_max <= self.op);
4037
+ assert(self.pipeline == .cache);
4038
+ assert(self.pipeline_repairing);
4284
4039
 
4285
- if (self.primary_repair_pipeline_op()) |op| {
4286
- assert(op > self.commit_max);
4287
- assert(op <= self.op);
4288
- assert(self.commit_max + self.pipeline.count + 1 == op);
4289
-
4290
- const checksum = self.journal.header_with_op(op).?.checksum;
4291
-
4292
- log.debug("{}: primary_repair_pipeline_read: op={} checksum={}", .{
4293
- self.replica,
4294
- op,
4295
- checksum,
4296
- });
4297
-
4298
- self.journal.read_prepare(repair_pipeline_push, op, checksum, null);
4299
- } else {
4300
- log.debug("{}: primary_repair_pipeline_read: repaired", .{self.replica});
4301
- self.repairing_pipeline = false;
4302
- self.repair();
4303
- }
4040
+ const op = self.primary_repair_pipeline_op().?;
4041
+ const op_checksum = self.journal.header_with_op(op).?.checksum;
4042
+ log.debug("{}: primary_repair_pipeline_read: op={} checksum={}", .{
4043
+ self.replica,
4044
+ op,
4045
+ op_checksum,
4046
+ });
4047
+ self.journal.read_prepare(repair_pipeline_read_callback, op, op_checksum, null);
4304
4048
  }
4305
4049
 
4306
- fn repair_pipeline_push(
4050
+ fn repair_pipeline_read_callback(
4307
4051
  self: *Self,
4308
4052
  prepare: ?*Message,
4309
4053
  destination_replica: ?u8,
4310
4054
  ) void {
4311
4055
  assert(destination_replica == null);
4312
4056
 
4313
- assert(self.repairing_pipeline);
4314
- self.repairing_pipeline = false;
4057
+ assert(self.pipeline_repairing);
4058
+ self.pipeline_repairing = false;
4315
4059
 
4316
4060
  if (prepare == null) {
4317
- log.debug("{}: repair_pipeline_push: prepare == null", .{self.replica});
4061
+ log.debug("{}: repair_pipeline_read_callback: prepare == null", .{self.replica});
4318
4062
  return;
4319
4063
  }
4320
4064
 
4321
4065
  // Our state may have advanced significantly while we were reading from disk.
4322
4066
  if (self.status != .view_change) {
4323
- log.debug("{}: repair_pipeline_push: no longer in view change status", .{
4067
+ assert(self.primary_index(self.view) != self.replica);
4068
+
4069
+ log.debug("{}: repair_pipeline_read_callback: no longer in view change status", .{
4324
4070
  self.replica,
4325
4071
  });
4326
4072
  return;
4327
4073
  }
4328
4074
 
4329
4075
  if (self.primary_index(self.view) != self.replica) {
4330
- log.debug("{}: repair_pipeline_push: no longer primary", .{self.replica});
4076
+ log.debug("{}: repair_pipeline_read_callback: no longer primary", .{self.replica});
4331
4077
  return;
4332
4078
  }
4333
4079
 
4334
4080
  // We may even be several views ahead and may now have a completely different pipeline.
4335
4081
  const op = self.primary_repair_pipeline_op() orelse {
4336
- log.debug("{}: repair_pipeline_push: pipeline changed", .{self.replica});
4082
+ log.debug("{}: repair_pipeline_read_callback: pipeline changed", .{self.replica});
4337
4083
  return;
4338
4084
  };
4339
4085
 
4340
4086
  assert(op > self.commit_max);
4341
4087
  assert(op <= self.op);
4342
- assert(self.commit_max + self.pipeline.count + 1 == op);
4343
4088
 
4344
4089
  if (prepare.?.header.op != op) {
4345
- log.debug("{}: repair_pipeline_push: op changed", .{self.replica});
4090
+ log.debug("{}: repair_pipeline_read_callback: op changed", .{self.replica});
4346
4091
  return;
4347
4092
  }
4348
4093
 
4349
4094
  if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
4350
- log.debug("{}: repair_pipeline_push: checksum changed", .{self.replica});
4095
+ log.debug("{}: repair_pipeline_read_callback: checksum changed", .{self.replica});
4351
4096
  return;
4352
4097
  }
4353
4098
 
4354
4099
  assert(self.status == .view_change);
4355
4100
  assert(self.primary_index(self.view) == self.replica);
4356
4101
 
4357
- log.debug("{}: repair_pipeline_push: op={} checksum={}", .{
4102
+ log.debug("{}: repair_pipeline_read_callback: op={} checksum={}", .{
4358
4103
  self.replica,
4359
4104
  prepare.?.header.op,
4360
4105
  prepare.?.header.checksum,
4361
4106
  });
4362
4107
 
4363
- if (self.pipeline.tail_ptr()) |parent| {
4364
- assert(prepare.?.header.parent == parent.message.header.checksum);
4365
- }
4366
-
4367
- self.pipeline.push_assume_capacity(.{ .message = prepare.?.ref() });
4368
- assert(self.pipeline.count >= 1);
4108
+ const prepare_evicted = self.pipeline.cache.insert(prepare.?.ref());
4109
+ if (prepare_evicted) |message_evicted| self.message_bus.unref(message_evicted);
4369
4110
 
4370
- self.repairing_pipeline = true;
4371
- self.primary_repair_pipeline_read();
4111
+ if (self.primary_repair_pipeline_op()) |_| {
4112
+ assert(!self.pipeline_repairing);
4113
+ self.pipeline_repairing = true;
4114
+ self.primary_repair_pipeline_read();
4115
+ } else {
4116
+ self.repair();
4117
+ }
4372
4118
  }
4373
4119
 
4374
4120
  fn repair_prepares(self: *Self) void {
@@ -4376,7 +4122,7 @@ pub fn ReplicaType(
4376
4122
  assert(self.repairs_allowed());
4377
4123
  assert(self.journal.dirty.count > 0);
4378
4124
  assert(self.op >= self.commit_min);
4379
- assert(self.op - self.commit_min + 1 <= constants.journal_slot_count);
4125
+ assert(self.op - self.commit_min <= constants.journal_slot_count);
4380
4126
 
4381
4127
  // Request enough prepares to utilize our max IO depth:
4382
4128
  var budget = self.journal.writes.available();
@@ -4434,7 +4180,7 @@ pub fn ReplicaType(
4434
4180
  // belong) to a newer op, from the new WAL wrap. Additionally, we may not
4435
4181
  // still have access to its surrounding commits to verify the hash chain.
4436
4182
  assert(op <= self.commit_min);
4437
- assert(op <= self.op_checkpoint);
4183
+ assert(op <= self.op_checkpoint());
4438
4184
  assert(self.journal.faulty.bit(slot));
4439
4185
 
4440
4186
  log.debug("{}: repair_prepares: remove slot={} " ++
@@ -4516,9 +4262,9 @@ pub fn ReplicaType(
4516
4262
  //
4517
4263
  // Using the pipeline to repair is faster than a `request_prepare`.
4518
4264
  // Also, messages in the pipeline are never corrupt.
4519
- if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
4520
- assert(prepare.message.header.op == op);
4521
- assert(prepare.message.header.checksum == checksum);
4265
+ if (self.pipeline_prepare_by_op_and_checksum(op, checksum)) |prepare| {
4266
+ assert(prepare.header.op == op);
4267
+ assert(prepare.header.checksum == checksum);
4522
4268
 
4523
4269
  if (self.replica_count == 1) {
4524
4270
  // This op won't start writing until all ops in the pipeline preceding it have
@@ -4528,7 +4274,8 @@ pub fn ReplicaType(
4528
4274
  op,
4529
4275
  checksum,
4530
4276
  });
4531
- assert(op > self.pipeline.head_ptr().?.message.header.op);
4277
+ const pipeline_head = self.pipeline.queue.prepare_queue.head_ptr().?;
4278
+ assert(pipeline_head.message.header.op < op);
4532
4279
  return false;
4533
4280
  }
4534
4281
 
@@ -4537,7 +4284,7 @@ pub fn ReplicaType(
4537
4284
  op,
4538
4285
  checksum,
4539
4286
  });
4540
- self.write_prepare(prepare.message, .pipeline);
4287
+ self.write_prepare(prepare, .pipeline);
4541
4288
  return true;
4542
4289
  }
4543
4290
 
@@ -4638,29 +4385,10 @@ pub fn ReplicaType(
4638
4385
  }
4639
4386
  }
4640
4387
 
4641
- /// The caller must ensure that the headers are trustworthy.
4642
- ///
4643
- /// Asserts that sequential ops are hash-chained. (Gaps are permitted).
4644
- fn replace_headers(self: *Self, headers: []const Header) void {
4645
- for (headers) |*header, i| {
4646
- if (i > 0) {
4647
- const next = &headers[i - 1];
4648
- assert(next.view >= header.view);
4649
- if (next.op == header.op + 1) {
4650
- assert(next.parent == header.checksum);
4651
- } else {
4652
- assert(next.op > header.op);
4653
- }
4654
- }
4655
-
4656
- self.replace_header(header);
4657
- }
4658
- }
4659
-
4660
4388
  /// Replaces the header if the header is different and not already committed.
4661
4389
  /// The caller must ensure that the header is trustworthy.
4662
4390
  fn replace_header(self: *Self, header: *const Header) void {
4663
- assert(self.op_checkpoint <= self.commit_min);
4391
+ assert(self.op_checkpoint() <= self.commit_min);
4664
4392
  assert(header.command == .prepare);
4665
4393
  assert(header.op <= self.op); // Never advance the op.
4666
4394
  assert(header.op <= self.op_checkpoint_trigger());
@@ -4670,7 +4398,7 @@ pub fn ReplicaType(
4670
4398
  assert(existing_header.checksum == header.checksum);
4671
4399
  return;
4672
4400
  } else {
4673
- if (header.op <= self.op_checkpoint) {
4401
+ if (header.op <= self.op_checkpoint()) {
4674
4402
  // Never replace a checkpointed op — those slots are needed by the following
4675
4403
  // WAL wrap.
4676
4404
  return;
@@ -4769,35 +4497,11 @@ pub fn ReplicaType(
4769
4497
  self.nack_prepare_op = null;
4770
4498
  }
4771
4499
 
4772
- fn reset_quorum_prepare_ok(self: *Self) void {
4773
- // "prepare_ok"s from previous views are not valid, even if the pipeline entry is reused
4774
- // after a cycle of view changes. In other words, when a view change cycles around, so
4775
- // that the original primary becomes a primary of a new view, pipeline entries may be
4776
- // reused. However, the pipeline's prepare_ok quorums must not be reused, since the
4777
- // replicas that sent them may have swapped them out during a previous view change.
4778
- var iterator = self.pipeline.iterator_mutable();
4779
- while (iterator.next_ptr()) |prepare| {
4780
- prepare.ok_quorum_received = false;
4781
- prepare.ok_from_all_replicas = quorum_counter_null;
4782
- assert(prepare.ok_from_all_replicas.count() == 0);
4783
- }
4784
- }
4785
-
4786
4500
  fn reset_quorum_start_view_change(self: *Self) void {
4787
4501
  self.reset_quorum_counter(&self.start_view_change_from_other_replicas);
4788
4502
  self.start_view_change_quorum = false;
4789
4503
  }
4790
4504
 
4791
- fn reset_quorum_recovery_response(self: *Self) void {
4792
- for (self.recovery_response_from_other_replicas) |*received, replica| {
4793
- if (received.*) |message| {
4794
- assert(replica != self.replica);
4795
- self.message_bus.unref(message);
4796
- received.* = null;
4797
- }
4798
- }
4799
- }
4800
-
4801
4505
  fn send_prepare_ok(self: *Self, header: *const Header) void {
4802
4506
  assert(header.command == .prepare);
4803
4507
  assert(header.cluster == self.cluster);
@@ -4920,6 +4624,7 @@ pub fn ReplicaType(
4920
4624
  // operations after the highest `commit_min` may yet have been committed before the old
4921
4625
  // primary crashed. The new primary will use the NACK protocol to be sure of a discard.
4922
4626
  assert(message.header.commit == self.commit_min);
4627
+ DVCQuorum.verify_message(message);
4923
4628
 
4924
4629
  self.send_message_to_replica(self.primary_index(self.view), message);
4925
4630
  }
@@ -5051,18 +4756,6 @@ pub fn ReplicaType(
5051
4756
  },
5052
4757
  else => unreachable,
5053
4758
  },
5054
- .recovery => {
5055
- assert(self.status == .recovering);
5056
- assert(message.header.replica == self.replica);
5057
- assert(message.header.replica != replica);
5058
- assert(message.header.context == self.recovery_nonce);
5059
- },
5060
- .recovery_response => {
5061
- assert(self.status == .normal);
5062
- assert(message.header.view == self.view);
5063
- assert(message.header.replica == self.replica);
5064
- assert(message.header.replica != replica);
5065
- },
5066
4759
  .headers => {
5067
4760
  assert(self.status == .normal or self.status == .view_change);
5068
4761
  assert(message.header.view == self.view);
@@ -5111,6 +4804,42 @@ pub fn ReplicaType(
5111
4804
  },
5112
4805
  }
5113
4806
 
4807
+ if (replica != self.replica) {
4808
+ // Critical: Do not advertise a view/log_view before it is durable.
4809
+ // See view_durable()/log_view_durable().
4810
+ if (message.header.view > self.view_durable() and
4811
+ message.header.command != .request_start_view)
4812
+ {
4813
+ log.debug("{}: send_message_to_replica: dropped {s} " ++
4814
+ "(view_durable={} message.view={})", .{
4815
+ self.replica,
4816
+ @tagName(message.header.command),
4817
+ self.view_durable(),
4818
+ message.header.view,
4819
+ });
4820
+ return;
4821
+ }
4822
+
4823
+ if (message.header.command == .do_view_change) {
4824
+ const message_log_view = message.header.timestamp;
4825
+ if (self.log_view_durable() < message_log_view) {
4826
+ log.debug("{}: send_message_to_replica: dropped {s} " ++
4827
+ "(log_view_durable={} message.log_view={})", .{
4828
+ self.replica,
4829
+ @tagName(message.header.command),
4830
+ self.log_view_durable(),
4831
+ message_log_view,
4832
+ });
4833
+ return;
4834
+ }
4835
+ assert(std.mem.eql(
4836
+ u8,
4837
+ message.body(),
4838
+ std.mem.sliceAsBytes(self.superblock.working.vsr_headers().slice),
4839
+ ));
4840
+ }
4841
+ }
4842
+
5114
4843
  if (replica == self.replica) {
5115
4844
  assert(self.loopback_queue == null);
5116
4845
  self.loopback_queue = message.ref();
@@ -5119,6 +4848,142 @@ pub fn ReplicaType(
5119
4848
  }
5120
4849
  }
5121
4850
 
4851
+ /// The highest durable view.
4852
+ /// A replica must not advertise a view higher than its durable view.
4853
+ ///
4854
+ /// The advertised `view` must never backtrack after a crash.
4855
+ /// This ensures the old primary is isolated — if a backup's view backtracks, it could
4856
+ /// ack a prepare to the old primary, forking the log. See VRR §8.2 for more detail.
4857
+ ///
4858
+ /// Equivalent to `superblock.working.vsr_state.view`.
4859
+ fn view_durable(self: *const Self) u32 {
4860
+ return self.superblock.working.vsr_state.view;
4861
+ }
4862
+
4863
+ /// The highest durable log_view.
4864
+ /// A replica must not advertise a log_view (in a DVC) higher than its durable log_view.
4865
+ ///
4866
+ /// A replica's advertised `log_view` must never backtrack after a crash.
4867
+ /// (`log_view` is only advertised within DVC messages).
4868
+ ///
4869
+ /// To understand why, consider the following replica logs, where:
4870
+ ///
4871
+ /// - numbers in replica rows denote the version of the op, and
4872
+ /// - a<b<c denotes the view in which the op was prepared.
4873
+ ///
4874
+ /// Replica 0 prepares some ops, but they never arrive at replica 1/2:
4875
+ ///
4876
+ /// view=a
4877
+ /// op │ 0 1 2
4878
+ /// replica 0 │ 1a 2a 3a (log_view=a, leader)
4879
+ /// replica 1 │ - - - (log_view=a, follower — but never receives any prepares)
4880
+ /// (replica 2) │ - - - (log_view=_, partitioned)
4881
+ ///
4882
+ /// After a view change, replica 1 prepares some ops, but they never arrive at replica 0/2:
4883
+ ///
4884
+ /// view=b
4885
+ /// op │ 0 1 2
4886
+ /// (replica 0) │ 1a 2a 3a (log_view=a, partitioned)
4887
+ /// replica 1 │ 4b 5b 6b (log_view=b, leader)
4888
+ /// replica 2 │ - - - (log_view=b, follower — but never receives any prepares)
4889
+ ///
4890
+ /// After another view change, replica 2 loads replica 1's ops:
4891
+ ///
4892
+ /// view=c
4893
+ /// op │ 0 1 2
4894
+ /// replica 0 │ 1a 2a 3a (log_view=c, follower)
4895
+ /// (replica 1) │ 4b 5b 6b (log_view=b, partitioned)
4896
+ /// replica 2 │ 1c 2c 3c (log_view=c, leader)
4897
+ ///
4898
+ /// Suppose replica 0 crashes and its log_view regresses to a.
4899
+ /// If replica 2 is partitioned, replicas 0 and 1 start view d with the DVCs:
4900
+ ///
4901
+ /// replica 0 │ 1a 2a 3a (log_view=a, log_view backtracked!)
4902
+ /// replica 1 │ 4b 5b 6b (log_view=b)
4903
+ ///
4904
+ /// Replica 1's higher log_view is canonical, so 4b/5b/6b replace 1a/2a/3a even though
4905
+ /// the latter may have been committed during view c. The log has forked.
4906
+ ///
4907
+ /// Therefore, a replica's log_view must never regress.
4908
+ ///
4909
+ /// Equivalent to `superblock.working.vsr_state.log_view`.
4910
+ fn log_view_durable(self: *const Self) u32 {
4911
+ return self.superblock.working.vsr_state.log_view;
4912
+ }
4913
+
4914
+ fn view_durable_updating(self: *const Self) bool {
4915
+ return self.superblock.view_change_in_progress();
4916
+ }
4917
+
4918
+ /// Persist the current view and log_view to the superblock.
4919
+ /// `view_durable` and `log_view_durable` will update asynchronously, when their respective
4920
+ /// updates are durable.
4921
+ fn view_durable_update(self: *Self) void {
4922
+ assert(self.status == .normal or self.status == .view_change);
4923
+ assert(self.view >= self.log_view);
4924
+ assert(self.view >= self.view_durable());
4925
+ assert(self.log_view >= self.log_view_durable());
4926
+ assert(self.log_view > self.log_view_durable() or self.view > self.view_durable());
4927
+ // The primary must only persist the SV headers after repairs are done.
4928
+ // Otherwise headers could be nacked, truncated, then restored after a crash.
4929
+ assert(self.log_view < self.view or self.replica != self.primary_index(self.view) or
4930
+ self.status == .normal);
4931
+
4932
+ if (self.view_durable_updating()) return;
4933
+
4934
+ log.debug("{}: view_durable_update: view_durable={}..{} log_view_durable={}..{}", .{
4935
+ self.replica,
4936
+ self.view_durable(),
4937
+ self.view,
4938
+ self.log_view_durable(),
4939
+ self.log_view,
4940
+ });
4941
+
4942
+ self.superblock.view_change(
4943
+ view_durable_update_callback,
4944
+ &self.superblock_context_view_change,
4945
+ .{
4946
+ .commit_max = self.commit_max,
4947
+ .view = self.view,
4948
+ .log_view = self.log_view,
4949
+ .headers = self.create_view_change_headers(),
4950
+ },
4951
+ );
4952
+ assert(self.view_durable_updating());
4953
+ }
4954
+
4955
+ fn view_durable_update_callback(context: *SuperBlock.Context) void {
4956
+ const self = @fieldParentPtr(Self, "superblock_context_view_change", context);
4957
+ assert(self.status == .normal or self.status == .view_change);
4958
+ assert(!self.view_durable_updating());
4959
+ assert(self.superblock.working.vsr_state.view <= self.view);
4960
+ assert(self.superblock.working.vsr_state.log_view <= self.log_view);
4961
+ assert(self.superblock.working.vsr_state.commit_min <= self.commit_min);
4962
+ assert(self.superblock.working.vsr_state.commit_max <= self.commit_max);
4963
+
4964
+ log.debug("{}: view_durable_update_callback: " ++
4965
+ "(view_durable={} log_view_durable={})", .{
4966
+ self.replica,
4967
+ self.view_durable(),
4968
+ self.log_view_durable(),
4969
+ });
4970
+
4971
+ assert(self.view_durable() <= self.view);
4972
+ assert(self.log_view_durable() <= self.view_durable());
4973
+ assert(self.log_view_durable() <= self.log_view);
4974
+
4975
+ // The view/log_view incremented while the previous view-change update was being saved.
4976
+ const update = self.log_view_durable() < self.log_view or
4977
+ self.view_durable() < self.view;
4978
+
4979
+ const update_dvc = update and self.log_view < self.view;
4980
+ const update_sv = update and self.log_view == self.view and
4981
+ (self.replica != self.primary_index(self.view) or self.status == .normal);
4982
+ assert(!(update_dvc and update_sv));
4983
+
4984
+ if (update_dvc or update_sv) self.view_durable_update();
4985
+ }
4986
+
5122
4987
  fn set_op_and_commit_max(self: *Self, op: u64, commit_max: u64, method: []const u8) void {
5123
4988
  assert(self.status == .view_change or self.status == .recovering);
5124
4989
 
@@ -5130,6 +4995,7 @@ pub fn ReplicaType(
5130
4995
  // It will be set shortly, when we transition to normal status.
5131
4996
  assert(self.view == 0);
5132
4997
  },
4998
+ .recovering_head => unreachable,
5133
4999
  }
5134
5000
 
5135
5001
  // Uncommitted ops may not survive a view change so we must assert `op` against
@@ -5156,9 +5022,7 @@ pub fn ReplicaType(
5156
5022
  });
5157
5023
  }
5158
5024
 
5159
- assert(commit_max >=
5160
- self.commit_max - std.math.min(constants.pipeline_max, self.commit_max));
5161
-
5025
+ assert(commit_max >= self.commit_max -| constants.pipeline_prepare_queue_max);
5162
5026
  assert(self.commit_min <= self.commit_max);
5163
5027
  assert(self.op >= self.commit_max or self.op < self.commit_max);
5164
5028
 
@@ -5201,48 +5065,84 @@ pub fn ReplicaType(
5201
5065
  /// where the new primary's headers depends on which of replica 1 and 2's DVC is used
5202
5066
  /// for repair before the other (i.e. whether they repair op 6 or 7 first).
5203
5067
  ///
5204
- /// For the above case to occur, replicas 0, 1, and 2 must all share the highest `view_normal`.
5205
- /// And since they share the latest `view_normal`, ops 5,6,7 were just installed by
5068
+ /// For the above case to occur, replicas 0, 1, and 2 must all share the highest `log_view`.
5069
+ /// And since they share the latest `log_view`, ops 5,6,7 were just installed by
5206
5070
  /// `replace_header`, which is order-independent (it doesn't use the hash chain).
5207
5071
  ///
5208
- /// (If replica 0's view_normal was greater than 1/2's, then replica 0 must have all
5072
+ /// (If replica 0's log_view was greater than 1/2's, then replica 0 must have all
5209
5073
  /// headers from previous views. Which means 6,7 are from the current view. But since
5210
- /// replica 0 doesn't have 6/7, then replica 1/2 must share the latest view_normal. ∎)
5074
+ /// replica 0 doesn't have 6/7, then replica 1/2 must share the latest log_view. ∎)
5211
5075
  fn primary_set_log_from_do_view_change_messages(self: *Self) void {
5212
5076
  assert(self.status == .view_change);
5213
5077
  assert(self.primary_index(self.view) == self.replica);
5214
5078
  assert(self.replica_count > 1);
5215
5079
  assert(self.start_view_change_quorum);
5216
5080
  assert(self.do_view_change_quorum);
5081
+ assert(self.do_view_change_from_all_replicas[self.replica] != null);
5082
+ DVCQuorum.verify(self.do_view_change_from_all_replicas);
5217
5083
 
5218
- const do_view_change_head = self.do_view_change_quorum_head();
5219
- assert(do_view_change_head.view_normal >= self.view_normal);
5220
- assert(do_view_change_head.op >= self.commit_min);
5221
- assert(do_view_change_head.op >= do_view_change_head.commit_min_max);
5222
- assert(do_view_change_head.commit_min_max >= self.commit_min);
5084
+ const dvcs_all = DVCQuorum.dvcs_all(self.do_view_change_from_all_replicas);
5085
+ assert(dvcs_all.len == self.quorum_view_change);
5223
5086
 
5224
- // The `prepare_timestamp` prevents a primary's own clock from running backwards.
5225
- // Therefore, `prepare_timestamp`:
5087
+ const dvcs_canonical = DVCQuorum.dvcs_canonical(self.do_view_change_from_all_replicas);
5088
+ assert(dvcs_canonical.len > 0);
5089
+
5090
+ for (dvcs_all.constSlice()) |message| {
5091
+ log.debug(
5092
+ "{}: on_do_view_change: dvc: " ++
5093
+ "replica={} log_view={} op={} commit_min={}",
5094
+ .{
5095
+ self.replica,
5096
+ message.header.replica,
5097
+ @intCast(u32, message.header.timestamp),
5098
+ message.header.op,
5099
+ message.header.commit, // The `commit_min` of the replica.
5100
+ },
5101
+ );
5102
+ }
5103
+
5104
+ for (dvcs_canonical.constSlice()) |message| {
5105
+ for (message_body_as_headers_chain_disjoint(message)) |*header| {
5106
+ log.debug(
5107
+ "{}: on_do_view_change: canonical: replica={} op={} checksum={}",
5108
+ .{
5109
+ self.replica,
5110
+ message.header.replica,
5111
+ header.op,
5112
+ header.checksum,
5113
+ },
5114
+ );
5115
+ }
5116
+ }
5117
+
5118
+ const do_view_change_commit_min_max = DVCQuorum.commit_min_max(
5119
+ self.do_view_change_from_all_replicas,
5120
+ .{
5121
+ .replica = self.replica,
5122
+ .commit_min = self.commit_min,
5123
+ },
5124
+ );
5125
+ assert(do_view_change_commit_min_max >= self.commit_min);
5126
+
5127
+ // The `prepare_timestamp` prevents a primary's own clock from running backwards.
5128
+ // Therefore, `prepare_timestamp`:
5226
5129
  // 1. is advanced if behind the cluster, but never reset if ahead of the cluster, i.e.
5227
5130
  // 2. may not always reflect the timestamp of the latest prepared op, and
5228
5131
  // 3. should be advanced before discarding the timestamps of any uncommitted headers.
5229
- if (self.state_machine.prepare_timestamp < do_view_change_head.timestamp) {
5230
- self.state_machine.prepare_timestamp = do_view_change_head.timestamp;
5132
+ const timestamp_max = DVCQuorum.timestamp_max(self.do_view_change_from_all_replicas);
5133
+ if (self.state_machine.prepare_timestamp < timestamp_max) {
5134
+ self.state_machine.prepare_timestamp = timestamp_max;
5231
5135
  }
5232
5136
 
5233
- const view_normal_canonical = do_view_change_head.view_normal;
5234
- // `op_canonical` must be computed before calling `set_op_and_commit_max()`, since
5235
- // that may change `replica.op`.
5236
- //
5237
- // Don't remove the uncanonical headers yet — even though the removed headers are
5238
- // a subset of the DVC headers, removing and then adding them back would cause clean
5239
- // headers to become dirty.
5240
- const op_canonical = self.primary_op_canonical_max(view_normal_canonical);
5241
- assert(op_canonical <= self.op);
5242
- assert(op_canonical >= self.op -| constants.pipeline_max);
5243
- assert(op_canonical >= self.commit_min);
5244
-
5245
- if (do_view_change_head.op > self.op_checkpoint_trigger()) {
5137
+ var headers_canonical = DVCQuorum.headers_canonical(self.do_view_change_from_all_replicas);
5138
+ const header_head = headers_canonical.next().?;
5139
+ assert(header_head.op == header_head.op);
5140
+ assert(header_head.op >= do_view_change_commit_min_max);
5141
+ assert(header_head.op >= self.op_checkpoint());
5142
+ assert(header_head.op >= self.commit_min);
5143
+ assert(header_head.op >= self.commit_max);
5144
+
5145
+ if (header_head.op > self.op_checkpoint_trigger()) {
5246
5146
  // This replica is too far behind, i.e. the new `self.op` is too far ahead of the
5247
5147
  // last checkpoint. If we wrap now, we overwrite un-checkpointed transfers in the WAL,
5248
5148
  // precluding recovery.
@@ -5253,32 +5153,40 @@ pub fn ReplicaType(
5253
5153
  }
5254
5154
 
5255
5155
  self.set_op_and_commit_max(
5256
- do_view_change_head.op,
5257
- // `set_op_and_commit_max()` expects the highest commit_max that we know of.
5258
- // But DVCs include replica's `commit_min`, not `commit_max`.
5156
+ header_head.op,
5259
5157
  std.math.max(
5260
5158
  self.commit_max,
5261
- do_view_change_head.commit_min_max,
5159
+ std.math.max(
5160
+ // `set_op_and_commit_max()` expects the highest commit_max that we know of.
5161
+ // But DVCs include replica's `commit_min`, not `commit_max`.
5162
+ do_view_change_commit_min_max,
5163
+ // An op cannot be uncommitted if it is definitely outside the pipeline.
5164
+ // Use `do_view_change_op_head` instead of `replica.op` since the former is
5165
+ // about to become the new `replica.op`.
5166
+ header_head.op -| constants.pipeline_prepare_queue_max,
5167
+ ),
5262
5168
  ),
5263
5169
  "on_do_view_change",
5264
5170
  );
5265
- // "`replica.op` exists" invariant may be broken until after the canonical DVC headers
5266
- // are installed.
5267
-
5268
- // First, set all the canonical headers from the replica(s) with highest `view_normal`:
5269
- for (self.do_view_change_from_all_replicas) |received| {
5270
- if (received) |message| {
5271
- const view_normal = @intCast(u32, message.header.timestamp);
5272
- // The view in which this replica's status was normal must be before this view.
5273
- assert(view_normal < message.header.view);
5171
+ // "`replica.op` exists" invariant may be broken briefly between set_op_and_commit_max()
5172
+ // and replace_header().
5173
+ self.replace_header(&header_head);
5174
+ assert(self.journal.header_with_op(self.op) != null);
5274
5175
 
5275
- if (view_normal < view_normal_canonical) continue;
5276
- assert(view_normal == view_normal_canonical);
5176
+ while (headers_canonical.next()) |header| {
5177
+ assert(header.op < header_head.op);
5178
+ self.replace_header(&header);
5179
+ }
5277
5180
 
5278
- const message_headers = message_body_as_headers(message);
5279
- for (message_headers) |*header| {
5181
+ const dvcs_uncanonical =
5182
+ DVCQuorum.dvcs_uncanonical(self.do_view_change_from_all_replicas);
5183
+ for (dvcs_uncanonical.constSlice()) |message| {
5184
+ for (message_body_as_headers_chain_disjoint(message)) |*header| {
5185
+ // We must trust headers that other replicas have committed, because
5186
+ // repair_header() will not repair a header if the hash chain has a gap.
5187
+ if (header.op <= message.header.commit) {
5280
5188
  log.debug(
5281
- "{}: on_do_view_change: canonical: replica={} op={} checksum={}",
5189
+ "{}: on_do_view_change: committed: replica={} op={} checksum={}",
5282
5190
  .{
5283
5191
  self.replica,
5284
5192
  message.header.replica,
@@ -5286,295 +5194,98 @@ pub fn ReplicaType(
5286
5194
  header.checksum,
5287
5195
  },
5288
5196
  );
5289
- }
5290
- self.replace_headers(message_headers);
5291
- }
5292
- }
5293
-
5294
- // Since we used do_view_change_head to set the replica.op, it must have been loaded
5295
- // into the headers (if it wasn't present already).
5296
- assert(self.journal.header_with_op(self.op) != null);
5297
-
5298
- // Now that the canonical headers are all in place, repair any other headers:
5299
- for (self.do_view_change_from_all_replicas) |received| {
5300
- if (received) |message| {
5301
- const view_normal = @intCast(u32, message.header.timestamp);
5302
- assert(view_normal < message.header.view);
5303
-
5304
- if (view_normal == view_normal_canonical) continue;
5305
- assert(view_normal < view_normal_canonical);
5306
-
5307
- for (message_body_as_headers(message)) |*header| {
5308
- // We must trust headers that other replicas have committed, because
5309
- // repair_header() will not repair a header if the hash chain has a gap.
5310
- if (header.op <= message.header.commit) {
5311
- log.debug(
5312
- "{}: on_do_view_change: committed: replica={} op={} checksum={}",
5313
- .{
5314
- self.replica,
5315
- message.header.replica,
5316
- header.op,
5317
- header.checksum,
5318
- },
5319
- );
5320
- self.replace_header(header);
5321
- } else {
5322
- _ = self.repair_header(header);
5323
- }
5197
+ self.replace_header(header);
5198
+ } else {
5199
+ _ = self.repair_header(header);
5324
5200
  }
5325
5201
  }
5326
5202
  }
5327
-
5328
- const op_max = self.do_view_change_op_max(op_canonical);
5329
- assert(op_max <= self.op);
5330
- assert(op_max >= self.commit_min);
5331
- if (op_max != self.op) {
5332
- log.debug("{}: primary_set_log_from_do_view_change_messages: discard op={}..{}", .{
5333
- self.replica,
5334
- op_max + 1,
5335
- self.op,
5336
- });
5337
- self.journal.remove_entries_from(op_max + 1);
5338
- self.op = op_max;
5339
- }
5340
- assert(self.journal.header_with_op(self.op) != null);
5341
5203
  }
5342
5204
 
5343
- fn do_view_change_quorum_head(self: *const Self) struct {
5344
- /// The highest `view_normal` of any DVC.
5345
- ///
5346
- /// The headers bundled with DVCs with the highest `view_normal` are canonical, since
5347
- /// the replica has knowledge of previous view changes in which headers were replaced.
5348
- view_normal: u32,
5349
- /// The highest `commit_min` from any DVC (this is not a `commit_max`).
5350
- commit_min_max: u64,
5351
- /// The highest `op` from a DVC with the highest `view_normal`.
5352
- op: u64,
5353
- /// The higest timestamp from any DVC.
5354
- timestamp: u64,
5355
- } {
5205
+ fn primary_start_view_as_the_new_primary(self: *Self) void {
5356
5206
  assert(self.status == .view_change);
5357
5207
  assert(self.primary_index(self.view) == self.replica);
5358
- assert(self.replica_count > 1);
5359
- assert(self.start_view_change_quorum);
5208
+ assert(self.view == self.log_view);
5360
5209
  assert(self.do_view_change_quorum);
5361
- assert(self.do_view_change_from_all_replicas[self.replica] != null);
5362
-
5363
- var v: ?u32 = null; // The highest `view_normal` from any replica.
5364
- var n: ?u64 = null; // The highest `op` for the highest `view_normal` from any replica.
5365
- var k: ?u64 = null; // The highest `commit_min` from any replica.
5366
- var t: ?u64 = null; // The highest `timestamp` from any replica.
5367
-
5368
- for (self.do_view_change_from_all_replicas) |received, replica| {
5369
- if (received) |message| {
5370
- assert(message.header.command == .do_view_change);
5371
- assert(message.header.cluster == self.cluster);
5372
- assert(message.header.replica == replica);
5373
- assert(message.header.view == self.view);
5374
- assert(message.header.op >= message.header.commit);
5375
- assert(message.header.op - message.header.commit <= constants.journal_slot_count);
5376
-
5377
- // The view when this replica was last in normal status, which:
5378
- // * may be higher than the view in any of the prepare headers.
5379
- // * must be lower than the view of this view change.
5380
- const view_normal = @intCast(u32, message.header.timestamp);
5381
- assert(view_normal < message.header.view);
5382
-
5383
- if (replica == self.replica) {
5384
- assert(view_normal == self.view_normal);
5385
- assert(message.header.op == self.op);
5386
- // We may have a newer commit than our DVC due to async commits (see below).
5387
- assert(message.header.commit <= self.commit_min);
5388
- }
5210
+ assert(!self.pipeline_repairing);
5211
+ assert(self.primary_repair_pipeline() == .done);
5389
5212
 
5390
- log.debug(
5391
- "{}: on_do_view_change: " ++
5392
- "replica={} view_normal={} op={} commit_min={}",
5393
- .{
5394
- self.replica,
5395
- message.header.replica,
5396
- view_normal,
5397
- message.header.op,
5398
- message.header.commit, // The `commit_min` of the replica.
5399
- },
5400
- );
5213
+ assert(self.commit_min == self.commit_max);
5214
+ assert(self.journal.dirty.count == 0);
5215
+ assert(self.journal.faulty.count == 0);
5216
+ assert(self.nack_prepare_op == null);
5217
+ assert(self.valid_hash_chain_between(self.commit_min, self.op));
5401
5218
 
5402
- if (v == null or view_normal > v.?) {
5403
- v = view_normal;
5404
- n = message.header.op;
5405
- } else if (view_normal == v.? and message.header.op > n.?) {
5406
- n = message.header.op;
5407
- }
5219
+ {
5220
+ const pipeline_queue = self.primary_repair_pipeline_done();
5221
+ assert(pipeline_queue.request_queue.empty());
5222
+ assert(pipeline_queue.prepare_queue.count + self.commit_max == self.op);
5223
+ if (!pipeline_queue.prepare_queue.empty()) {
5224
+ const prepares = &pipeline_queue.prepare_queue;
5225
+ assert(prepares.head_ptr_const().?.message.header.op == self.commit_max + 1);
5226
+ assert(prepares.tail_ptr_const().?.message.header.op == self.op);
5227
+ }
5408
5228
 
5409
- if (k == null or message.header.commit > k.?) k = message.header.commit;
5229
+ var pipeline_prepares = pipeline_queue.prepare_queue.iterator();
5230
+ while (pipeline_prepares.next()) |prepare| {
5231
+ assert(self.journal.has(prepare.message.header));
5232
+ assert(!prepare.ok_quorum_received);
5233
+ assert(prepare.ok_from_all_replicas.count() == 0);
5410
5234
 
5411
- const message_headers = message_body_as_headers(message);
5412
- if (t == null or t.? < message_headers[0].timestamp) {
5413
- t = message_headers[0].timestamp;
5414
- }
5235
+ log.debug("{}: start_view_as_the_new_primary: pipeline " ++
5236
+ "(op={} checksum={x} parent={x})", .{
5237
+ self.replica,
5238
+ prepare.message.header.op,
5239
+ prepare.message.header.checksum,
5240
+ prepare.message.header.parent,
5241
+ });
5415
5242
  }
5416
- }
5417
5243
 
5418
- // Consider the case:
5419
- // 1. Start committing op=N…M.
5420
- // 2. Send `do_view_change` to self.
5421
- // 3. Finish committing op=N…M.
5422
- // 4. Remaining `do_view_change` messages arrive, completing the quorum.
5423
- // In this scenario, our own DVC's commit is `N-1`, but `commit_min=M`.
5424
- // Don't let the commit backtrack.
5425
- if (k.? < self.commit_min) {
5426
- assert(self.commit_min >
5427
- self.do_view_change_from_all_replicas[self.replica].?.header.commit);
5428
- log.debug("{}: on_do_view_change: bump commit_min view={} commit={}..{}", .{
5429
- self.replica,
5430
- self.view,
5431
- k.?,
5432
- self.commit_min,
5433
- });
5434
- k = self.commit_min;
5244
+ self.pipeline.cache.deinit(self.message_bus.pool);
5245
+ self.pipeline = .{ .queue = pipeline_queue };
5246
+ self.pipeline.queue.verify();
5435
5247
  }
5436
5248
 
5437
- assert(v.? >= self.view_normal);
5438
- assert(k.? >= self.commit_min);
5439
-
5440
- return .{
5441
- .view_normal = v.?,
5442
- .commit_min_max = k.?,
5443
- .op = n.?,
5444
- .timestamp = t.?,
5445
- };
5446
- }
5447
-
5448
- /// Identify headers to discard during a view change before the primary starts the view.
5449
- /// This is required to maximize availability in the presence of storage faults.
5450
- /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
5451
- ///
5452
- /// Returns the highest op that:
5453
- /// - precedes any hash chain breaks in the uncanonical headers, and
5454
- /// - precedes any gaps in the uncommitted headers.
5455
- ///
5456
- /// Breaks
5457
- ///
5458
- /// If there is a hash chain break, none of the headers from the canonical DVCs replaced
5459
- /// the broken (leftover uncanonical) op.
5460
- /// Removing these is necessary for correctness and liveness, to ensure that
5461
- /// disconnected headers do not remain in place in lieu of gaps.
5462
- ///
5463
- /// Gaps
5464
- ///
5465
- /// It is possible for the new primary to have done an op jump in a previous view, and
5466
- /// introduced a header gap for an op, which may have then been discarded by another primary
5467
- /// during a view change, before surviving into this view as a gap because our latest op was
5468
- /// set as the latest op for the quorum.
5469
- ///
5470
- /// In this case, it may be impossible for the new primary to repair the missing header as
5471
- /// the rest of the cluster may have already discarded it. We therefore iterate over our
5472
- /// uncommitted header gaps to discard any that may be impossible to repair.
5473
- ///
5474
- /// For example, if the old primary replicates ops=7,8,9 (all uncommitted) but only op=9 is
5475
- /// prepared on another replica before the old primary crashes, then this function finds a
5476
- /// gap for ops=7,8 and will attempt to discard ops 7,8,9.
5477
- fn do_view_change_op_max(self: *const Self, op_canonical: u64) u64 {
5478
- assert(self.replica_count > 1);
5479
- assert(self.status == .view_change);
5480
- assert(self.primary_index(self.view) == self.replica);
5481
- assert(self.do_view_change_quorum);
5482
- assert(!self.repair_timeout.ticking);
5483
- assert(self.op >= self.commit_max);
5484
- // At least one replica in the new quorum committed in the new replica.op's WAL wrap —
5485
- // wrapping implies a checkpoint (which implies a commit).
5486
- assert(self.op - self.commit_max <= constants.journal_slot_count);
5487
- assert(self.op - self.commit_min <= constants.journal_slot_count);
5488
-
5489
- assert(op_canonical <= self.op);
5490
- assert(op_canonical >= self.commit_min);
5491
-
5492
- // Any uncanonical ops remaining either:
5493
- // * Connect to the hash chain on the right.
5494
- // * Do not connect on the right (hash chain break).
5495
- //
5496
- // If there is a hash chain break, none of the headers from the canonical DVCs replaced
5497
- // the broken op. It is truncated like a gap.
5498
- //
5499
- // Removing these is necessary for correctness and liveness, to ensure that
5500
- // disconnected headers do not remain in place in lieu of gaps.
5501
- const op_before_break = blk: {
5502
- var op: u64 = op_canonical;
5503
- while (op < self.op) : (op += 1) {
5504
- if (self.journal.header_with_op(op)) |header| {
5505
- if (self.journal.header_with_op(op + 1)) |next| {
5506
- // Broken hash chain.
5507
- if (header.checksum != next.parent) break :blk op;
5508
- }
5509
- }
5510
- } else break :blk self.op;
5511
- };
5512
-
5513
- // Find the beginning of the lowest gap.
5514
- //
5515
- // While iterating > commit_max does not in itself guarantee that an op is uncommitted
5516
- // (the old primary may have committed the op shortly before crashing), nevertheless,
5517
- // if it was committed it would have survived into the new view as a header not a gap.
5518
- const op_before_gap = blk: {
5519
- // An op cannot be uncommitted if it is definitely outside the pipeline.
5520
- const op_committed = std.math.max(self.commit_max, self.op -| constants.pipeline_max);
5521
- assert(op_committed <= self.op);
5522
-
5523
- var op = op_committed;
5524
- while (op < self.op) : (op += 1) {
5525
- if (self.journal.header_with_op(op + 1) == null) break :blk op;
5526
- } else break :blk self.op;
5527
- };
5528
-
5529
- return std.math.min(op_before_break, op_before_gap);
5530
- }
5531
-
5532
- fn start_view_as_the_new_primary(self: *Self) void {
5533
- assert(self.status == .view_change);
5534
- assert(self.primary_index(self.view) == self.replica);
5535
- assert(self.do_view_change_quorum);
5536
- assert(!self.repairing_pipeline);
5537
-
5538
- assert(self.commit_min == self.commit_max);
5539
- assert(self.primary_repair_pipeline_op() == null);
5540
- self.verify_pipeline();
5541
- assert(self.commit_max + self.pipeline.count == self.op);
5542
- assert(self.valid_hash_chain_between(self.commit_min, self.op));
5543
-
5544
- assert(self.journal.dirty.count == 0);
5545
- assert(self.journal.faulty.count == 0);
5546
- assert(self.nack_prepare_op == null);
5547
-
5548
- const start_view = self.create_view_change_message(.start_view);
5549
- defer self.message_bus.unref(start_view);
5550
-
5551
5249
  self.transition_to_normal_from_view_change_status(self.view);
5552
- // Detect if the transition to normal status above accidentally resets the pipeline:
5553
- assert(self.commit_max + self.pipeline.count == self.op);
5250
+ self.view_durable_update();
5554
5251
 
5555
5252
  assert(self.status == .normal);
5556
5253
  assert(self.primary());
5557
5254
 
5558
- assert(start_view.references == 1);
5559
- assert(start_view.header.command == .start_view);
5560
- assert(start_view.header.view == self.view);
5561
- assert(start_view.header.op == self.op);
5562
- assert(start_view.header.commit == self.commit_max);
5563
-
5564
5255
  // Send prepare_ok messages to ourself to contribute to the pipeline.
5565
5256
  self.send_prepare_oks_after_view_change();
5566
5257
 
5567
- self.send_message_to_other_replicas(start_view);
5258
+ // SVs will be sent out (via timeout) after the view_durable update completes.
5259
+ assert(self.view_durable_updating());
5260
+ assert(self.log_view > self.log_view_durable());
5568
5261
  }
5569
5262
 
5570
- fn transition_to_normal_from_recovering_status(self: *Self, new_view: u32) void {
5263
+ fn transition_to_recovering_head(self: *Self) void {
5571
5264
  assert(self.status == .recovering);
5572
- assert(self.view == 0);
5265
+ assert(self.view == self.log_view);
5266
+ assert(self.op >= self.commit_min);
5267
+ assert(!self.committing);
5268
+ assert(self.replica_count > 1);
5269
+ assert(self.journal.header_with_op(self.op) != null);
5270
+ assert(self.pipeline == .cache);
5271
+
5272
+ self.status = .recovering_head;
5273
+
5274
+ log.warn("{}: transition_to_recovering_head: op_checkpoint={} op_head={}", .{
5275
+ self.replica,
5276
+ self.op_checkpoint(),
5277
+ self.op,
5278
+ });
5279
+ }
5280
+
5281
+ fn transition_to_normal_from_recovering_status(self: *Self) void {
5282
+ assert(self.status == .recovering or self.status == .recovering_head);
5283
+ assert(self.view == self.log_view);
5573
5284
  assert(!self.committing);
5574
- assert(self.replica_count > 1 or new_view == 0);
5285
+ assert(self.replica_count > 1 or self.commit_min == self.op);
5575
5286
  assert(self.journal.header_with_op(self.op) != null);
5576
- self.view = new_view;
5577
- self.view_normal = new_view;
5287
+ assert(self.pipeline == .cache);
5288
+
5578
5289
  self.status = .normal;
5579
5290
 
5580
5291
  if (self.primary()) {
@@ -5586,7 +5297,7 @@ pub fn ReplicaType(
5586
5297
  },
5587
5298
  );
5588
5299
 
5589
- assert(self.journal.is_empty() or self.replica_count == 1);
5300
+ assert(self.replica_count == 1);
5590
5301
  assert(!self.prepare_timeout.ticking);
5591
5302
  assert(!self.normal_status_timeout.ticking);
5592
5303
  assert(!self.view_change_status_timeout.ticking);
@@ -5595,7 +5306,9 @@ pub fn ReplicaType(
5595
5306
  self.ping_timeout.start();
5596
5307
  self.commit_timeout.start();
5597
5308
  self.repair_timeout.start();
5598
- self.recovery_timeout.stop();
5309
+
5310
+ self.pipeline.cache.deinit(self.message_bus.pool);
5311
+ self.pipeline = .{ .queue = .{} };
5599
5312
  } else {
5600
5313
  log.debug(
5601
5314
  "{}: transition_to_normal_from_recovering_status: view={} backup",
@@ -5613,31 +5326,30 @@ pub fn ReplicaType(
5613
5326
  self.ping_timeout.start();
5614
5327
  self.normal_status_timeout.start();
5615
5328
  self.repair_timeout.start();
5616
- self.recovery_timeout.stop();
5617
5329
  }
5618
5330
  }
5619
5331
 
5620
- fn transition_to_normal_from_view_change_status(self: *Self, new_view: u32) void {
5332
+ fn transition_to_normal_from_view_change_status(self: *Self, view_new: u32) void {
5621
5333
  // In the VRR paper it's possible to transition from normal to normal for the same view.
5622
5334
  // For example, this could happen after a state transfer triggered by an op jump.
5623
5335
  assert(self.status == .view_change);
5624
- assert(new_view >= self.view);
5336
+ assert(view_new >= self.view);
5625
5337
  assert(self.journal.header_with_op(self.op) != null);
5626
- self.view = new_view;
5627
- self.view_normal = new_view;
5338
+
5628
5339
  self.status = .normal;
5629
5340
 
5630
5341
  if (self.primary()) {
5631
5342
  log.debug(
5632
- "{}: transition_to_normal_from_view_change_status: view={} primary",
5633
- .{
5634
- self.replica,
5635
- self.view,
5636
- },
5343
+ "{}: transition_to_normal_from_view_change_status: view={}..{} primary",
5344
+ .{ self.replica, self.view, view_new },
5637
5345
  );
5638
5346
 
5639
5347
  assert(!self.prepare_timeout.ticking);
5640
- assert(!self.recovery_timeout.ticking);
5348
+ assert(!self.pipeline_repairing);
5349
+ assert(self.pipeline == .queue);
5350
+ assert(self.view == view_new);
5351
+ assert(self.log_view == view_new);
5352
+ assert(self.commit_min == self.commit_max);
5641
5353
 
5642
5354
  self.ping_timeout.start();
5643
5355
  self.commit_timeout.start();
@@ -5647,15 +5359,25 @@ pub fn ReplicaType(
5647
5359
  self.repair_timeout.start();
5648
5360
 
5649
5361
  // Do not reset the pipeline as there may be uncommitted ops to drive to completion.
5650
- if (self.pipeline.count > 0) self.prepare_timeout.start();
5362
+ if (self.pipeline.queue.prepare_queue.count > 0) self.prepare_timeout.start();
5651
5363
  } else {
5652
- log.debug("{}: transition_to_normal_from_view_change_status: view={} backup", .{
5364
+ log.debug("{}: transition_to_normal_from_view_change_status: view={}..{} backup", .{
5653
5365
  self.replica,
5654
5366
  self.view,
5367
+ view_new,
5655
5368
  });
5656
5369
 
5657
5370
  assert(!self.prepare_timeout.ticking);
5658
- assert(!self.recovery_timeout.ticking);
5371
+ assert(self.pipeline == .cache);
5372
+
5373
+ if (self.log_view == view_new and self.view == view_new) {
5374
+ // We recovered into the same view we crashed in, with a detour through
5375
+ // status=recovering_head.
5376
+ } else {
5377
+ self.view = view_new;
5378
+ self.log_view = view_new;
5379
+ self.view_durable_update();
5380
+ }
5659
5381
 
5660
5382
  self.ping_timeout.start();
5661
5383
  self.commit_timeout.stop();
@@ -5668,7 +5390,6 @@ pub fn ReplicaType(
5668
5390
  self.reset_quorum_start_view_change();
5669
5391
  self.reset_quorum_do_view_change();
5670
5392
  self.reset_quorum_nack_prepare();
5671
- self.reset_quorum_prepare_ok();
5672
5393
 
5673
5394
  assert(self.start_view_change_quorum == false);
5674
5395
  assert(self.do_view_change_quorum == false);
@@ -5680,17 +5401,34 @@ pub fn ReplicaType(
5680
5401
  /// where v identifies the new view. A replica notices the need for a view change either
5681
5402
  /// based on its own timer, or because it receives a start_view_change or do_view_change
5682
5403
  /// message for a view with a larger number than its own view.
5683
- fn transition_to_view_change_status(self: *Self, new_view: u32) void {
5404
+ fn transition_to_view_change_status(self: *Self, view_new: u32) void {
5684
5405
  log.debug("{}: transition_to_view_change_status: view={}..{}", .{
5685
5406
  self.replica,
5686
5407
  self.view,
5687
- new_view,
5408
+ view_new,
5688
5409
  });
5689
- assert(self.status == .normal or self.status == .view_change);
5690
- assert(new_view > self.view);
5691
- self.view = new_view;
5410
+ assert(self.status == .normal or
5411
+ self.status == .view_change or
5412
+ self.status == .recovering or
5413
+ self.status == .recovering_head);
5414
+
5415
+ const status_before = self.status;
5692
5416
  self.status = .view_change;
5693
5417
 
5418
+ if (self.view == view_new) {
5419
+ assert(status_before == .recovering or status_before == .recovering_head);
5420
+ } else {
5421
+ assert(view_new > self.view);
5422
+ self.view = view_new;
5423
+ self.view_durable_update();
5424
+ }
5425
+
5426
+ if (self.pipeline == .queue) {
5427
+ var queue = self.pipeline.queue;
5428
+ self.pipeline = .{ .cache = PipelineCache.init_from_queue(&queue) };
5429
+ queue.deinit(self.message_bus.pool);
5430
+ }
5431
+
5694
5432
  self.ping_timeout.stop();
5695
5433
  self.commit_timeout.stop();
5696
5434
  self.normal_status_timeout.stop();
@@ -5698,7 +5436,6 @@ pub fn ReplicaType(
5698
5436
  self.view_change_message_timeout.start();
5699
5437
  self.repair_timeout.stop();
5700
5438
  self.prepare_timeout.stop();
5701
- assert(!self.recovery_timeout.ticking);
5702
5439
 
5703
5440
  // Do not reset quorum counters only on entering a view, assuming that the view will be
5704
5441
  // followed only by a single subsequent view change to the next view, because multiple
@@ -5708,7 +5445,6 @@ pub fn ReplicaType(
5708
5445
  self.reset_quorum_start_view_change();
5709
5446
  self.reset_quorum_do_view_change();
5710
5447
  self.reset_quorum_nack_prepare();
5711
- self.reset_quorum_prepare_ok();
5712
5448
 
5713
5449
  assert(self.start_view_change_quorum == false);
5714
5450
  assert(self.do_view_change_quorum == false);
@@ -5785,7 +5521,7 @@ pub fn ReplicaType(
5785
5521
  fn valid_hash_chain_between(self: *const Self, op_min: u64, op_max: u64) bool {
5786
5522
  assert(op_min <= op_max);
5787
5523
  // Headers with ops preceding the checkpoint may be unavailable due to a WAL wrap.
5788
- assert(op_min >= self.op_checkpoint);
5524
+ assert(op_min >= self.op_checkpoint());
5789
5525
 
5790
5526
  // If we use anything less than self.op then we may commit ops for a forked hash chain
5791
5527
  // that have since been reordered by a new primary.
@@ -5796,7 +5532,7 @@ pub fn ReplicaType(
5796
5532
  while (op > op_min) {
5797
5533
  op -= 1;
5798
5534
 
5799
- if (self.op_checkpoint == op) {
5535
+ if (self.op_checkpoint() == op) {
5800
5536
  // op_checkpoint's slot may have been overwritten in the WAL — but we can
5801
5537
  // always use the VSRState to anchor the hash chain.
5802
5538
  assert(op == op_min);
@@ -5807,7 +5543,7 @@ pub fn ReplicaType(
5807
5543
  log.debug("{}: valid_hash_chain_between: break A: {} (checkpoint={})", .{
5808
5544
  self.replica,
5809
5545
  self.superblock.working.vsr_state.commit_min_checksum,
5810
- self.op_checkpoint,
5546
+ self.op_checkpoint(),
5811
5547
  });
5812
5548
  log.debug("{}: valid_hash_chain_between: break B: {}", .{
5813
5549
  self.replica,
@@ -5836,37 +5572,6 @@ pub fn ReplicaType(
5836
5572
  return true;
5837
5573
  }
5838
5574
 
5839
- fn verify_pipeline(self: *Self) void {
5840
- assert(self.status == .view_change);
5841
-
5842
- var op = self.commit_max + 1;
5843
- var parent = self.journal.header_with_op(self.commit_max).?.checksum;
5844
-
5845
- var iterator = self.pipeline.iterator();
5846
- while (iterator.next_ptr()) |prepare| {
5847
- assert(prepare.message.header.command == .prepare);
5848
- assert(!prepare.ok_quorum_received);
5849
- assert(prepare.ok_from_all_replicas.count() == 0);
5850
-
5851
- log.debug("{}: verify_pipeline: op={} checksum={x} parent={x}", .{
5852
- self.replica,
5853
- prepare.message.header.op,
5854
- prepare.message.header.checksum,
5855
- prepare.message.header.parent,
5856
- });
5857
-
5858
- assert(self.journal.has(prepare.message.header));
5859
- assert(prepare.message.header.op == op);
5860
- assert(prepare.message.header.op <= self.op);
5861
- assert(prepare.message.header.parent == parent);
5862
-
5863
- parent = prepare.message.header.checksum;
5864
- op += 1;
5865
- }
5866
- assert(self.pipeline.count <= constants.pipeline_max);
5867
- assert(self.commit_max + self.pipeline.count == op - 1);
5868
- }
5869
-
5870
5575
  fn view_jump(self: *Self, header: *const Header) void {
5871
5576
  const to: Status = switch (header.command) {
5872
5577
  .prepare, .commit => .normal,
@@ -5874,7 +5579,10 @@ pub fn ReplicaType(
5874
5579
  else => unreachable,
5875
5580
  };
5876
5581
 
5877
- if (self.status != .normal and self.status != .view_change) return;
5582
+ switch (self.status) {
5583
+ .normal, .view_change, .recovering_head => {},
5584
+ .recovering => return,
5585
+ }
5878
5586
 
5879
5587
  if (header.view < self.view) return;
5880
5588
 
@@ -5898,18 +5606,20 @@ pub fn ReplicaType(
5898
5606
  .view_change => if (header.view == self.view) return,
5899
5607
  else => unreachable,
5900
5608
  },
5609
+ .recovering_head => {},
5901
5610
  else => unreachable,
5902
5611
  }
5903
5612
 
5904
5613
  switch (to) {
5905
5614
  .normal => {
5906
5615
  if (header.view == self.view) {
5907
- assert(self.status == .view_change);
5616
+ assert(self.status == .view_change or self.status == .recovering_head);
5908
5617
 
5909
5618
  log.debug("{}: view_jump: waiting to exit view change", .{self.replica});
5910
5619
  } else {
5911
5620
  assert(header.view > self.view);
5912
- assert(self.status == .view_change or self.status == .normal);
5621
+ assert(self.status == .view_change or self.status == .recovering_head or
5622
+ self.status == .normal);
5913
5623
 
5914
5624
  log.debug("{}: view_jump: waiting to jump to newer view", .{self.replica});
5915
5625
  }
@@ -5924,8 +5634,10 @@ pub fn ReplicaType(
5924
5634
  });
5925
5635
  },
5926
5636
  .view_change => {
5927
- assert(header.view > self.view);
5928
- assert(self.status == .view_change or self.status == .normal);
5637
+ assert(self.status == .recovering_head or header.view > self.view);
5638
+ assert(self.status != .recovering_head or header.command == .start_view);
5639
+ assert(self.status == .recovering_head or self.status == .view_change or
5640
+ self.status == .normal);
5929
5641
 
5930
5642
  if (header.view == self.view + 1) {
5931
5643
  log.debug("{}: view_jump: jumping to view change", .{self.replica});
@@ -5944,10 +5656,10 @@ pub fn ReplicaType(
5944
5656
  assert(message.header.view <= self.view);
5945
5657
  assert(message.header.op <= self.op);
5946
5658
 
5947
- if (message.header.op == self.op_checkpoint) {
5659
+ if (message.header.op == self.op_checkpoint()) {
5948
5660
  assert(message.header.op == 0);
5949
5661
  } else {
5950
- assert(message.header.op > self.op_checkpoint);
5662
+ assert(message.header.op > self.op_checkpoint());
5951
5663
  }
5952
5664
 
5953
5665
  if (!self.journal.has(message.header)) {
@@ -5968,6 +5680,18 @@ pub fn ReplicaType(
5968
5680
  return;
5969
5681
  }
5970
5682
 
5683
+ // Criteria for caching:
5684
+ // - The primary does not update the cache since it is (or will be) reconstructing its
5685
+ // pipeline.
5686
+ // - Cache uncommitted ops, since it will avoid a WAL read in the common case.
5687
+ if (self.pipeline == .cache and
5688
+ self.replica != self.primary_index(self.view) and
5689
+ self.commit_min < message.header.op)
5690
+ {
5691
+ const prepare_evicted = self.pipeline.cache.insert(message.ref());
5692
+ if (prepare_evicted) |m| self.message_bus.unref(m);
5693
+ }
5694
+
5971
5695
  self.journal.write_prepare(write_prepare_callback, message, trigger);
5972
5696
  }
5973
5697
 
@@ -5993,3 +5717,832 @@ pub fn ReplicaType(
5993
5717
  }
5994
5718
  };
5995
5719
  }
5720
+
5721
+ /// A do-view-change:
5722
+ /// - selects the view's head
5723
+ /// - discards uncommitted ops (to maximize availability in the presence of storage faults)
5724
+ /// - retains all committed ops
5725
+ /// - retains all possibly-committed ops (because they might be committed — we can't tell)
5726
+ /// (Some of these may be discarded during repair, via the nack protocol).
5727
+ /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
5728
+ ///
5729
+ /// Terminology:
5730
+ ///
5731
+ /// - The *head* message (of a view) is the message (committed or uncommitted) within that view with
5732
+ /// the highest op.
5733
+ ///
5734
+ /// - *gap*: There is a header for op X and X+n (n>1), but no header at op X+1.
5735
+ /// - *break*/*chain break*: The header for op X is not the parent of the header for op X+1.
5736
+ /// - *fork*: A correctness bug in which a committed (or possibly committed) message is discarded.
5737
+ ///
5738
+ /// - An *uncanonical* message may have been removed/changed during a prior view.
5739
+ /// - A *canonical* message was part of the most recent log_view.
5740
+ /// - Canonical messages do not necessarily survive into the new view, but they take
5741
+ /// precedence over uncanonical messages.
5742
+ /// - Canonical messages may be committed or uncommitted.
5743
+ ///
5744
+ /// - *DVC* refers to a command=do_view_change message.
5745
+ /// - *SV* refers to a command=start_view message.
5746
+ /// - The *pipeline suffix* is the last pipeline_prepare_queue_max messages of the log (counting
5747
+ /// backwards from the head op). For example, when pipeline_prepare_queue_max=3,
5748
+ ///
5749
+ /// - the pipeline suffix of log "1,2,3,4,5" is "3,4,5".
5750
+ /// - the pipeline suffix of log "1,2,3,5" is "3,5".
5751
+ ///
5752
+ ///
5753
+ /// Invariants:
5754
+ ///
5755
+ /// For each DVC message:
5756
+ ///
5757
+ /// - The headers all belong to the same hash chain.
5758
+ /// - Reason: If multiple replicas with the same canonical log_view disagree about an op, the new
5759
+ /// primary could not determine which is correct.
5760
+ /// - Gaps are permitted, but the DVC-sender is responsible for ensuring they do not conceal
5761
+ /// chain breaks.
5762
+ /// - For example,
5763
+ /// - a DVC of 6a,8a is valid (6a/8a belong to the same chain).
5764
+ /// - a DVC of 6b,8a is invalid (the gap at 7 conceal a chain break).
5765
+ /// - a DVC of 6b,7b,8a is invalid (7b/8a is a chain break)..
5766
+ ///
5767
+ /// - The headers must connect to the cluster's committed ops (the "DVC anchor").
5768
+ /// This means that either:
5769
+ /// - the DVC includes the op=C header, or
5770
+ /// - the DVC includes the op=C+1 header (where C+1's parent is C).
5771
+ /// (Where `C = "DVC anchor" = max(replica.commit_min, replica.op -| pipeline_prepare_queue_max)`).
5772
+ /// - Reason: The new primary may truncate the entire pipeline (6-9) due to a gap (6),
5773
+ /// but afterwards it still requires a head op to repair/chain backward from.
5774
+ /// (According to the intersection property, a gap in the pipeline indicates an
5775
+ /// uncommitted op).
5776
+ /// - For example, given pipeline_prepare_queue_max=3:
5777
+ /// - a DVC of 7,8 is invalid if replica.commit_min=5.
5778
+ /// - a DVC of 7,8 is valid if replica.commit_min=6.
5779
+ /// - a DVC of 5,7,8 is valid. (5,_,7,8)
5780
+ /// - a DVC of 5,8 is valid. (5,_,_,8)
5781
+ /// - a DVC of 0,1 is valid.
5782
+ ///
5783
+ /// Across all DVCs in the quorum:
5784
+ ///
5785
+ /// - The headers of every DVC with the same log_view must not conflict.
5786
+ /// - In other words:
5787
+ /// dvc₁.headers[i].op == dvc₂.headers[j].op implies
5788
+ /// dvc₁.headers[i].checksum == dvc₂.headers[j].checksum.
5789
+ /// - Reason: the headers bundled with the DVC(s) with the highest log_view will be
5790
+ /// loaded into the new primary with `replace_header()`, not `repair_header()`.
5791
+ ///
5792
+ /// Perhaps unintuitively, it is safe to advertise a header before its message is prepared
5793
+ /// (e.g. the write is still queued). The header is either:
5794
+ ///
5795
+ /// - committed — so another replica in the quorum must have a copy, according to the quorum
5796
+ /// intersection property. Or,
5797
+ /// - uncommitted — if the header is chosen, but cannot be recovered from any replica, then
5798
+ /// it will be discarded by the nack protocol.
5799
+ ///
5800
+ ///
5801
+ /// Examples
5802
+ ///
5803
+ /// In these examples:
5804
+ /// - pipeline_prepare_queue_max=3
5805
+ /// - Brackets denote the suffix of the replica's log that is actually included in the DVC headers.
5806
+ /// - Parenthesis denote a replica that did not participate in the DVC (for example, because it is
5807
+ /// partitioned).
5808
+ ///
5809
+ /// Example 1: No gap in canonical headers
5810
+ ///
5811
+ /// Consider a view change with DVCs:
5812
+ ///
5813
+ /// replica headers log_view
5814
+ /// 0 1 [2 3 4b] 4 (new primary)
5815
+ /// 1 1 2 3 4a 5 6 [7 8 9] 5
5816
+ /// 2 (1 2 3 4a 5 6 7 8 9) 5 (partitioned)
5817
+ ///
5818
+ /// Replica 1's headers are canonical, so replica 0 constructs the log:
5819
+ ///
5820
+ /// 1 2 3 4b 7 8 9
5821
+ ///
5822
+ /// The 5/6 gap conceals a hash break — 4b should be 4a.
5823
+ /// The view must initially keeps all of these headers, and after the DVC quorum is handled, repairs
5824
+ /// backwards from 7. (If it instead discarded at the gap (5…9), the log would fork (4a→4b).)
5825
+ ///
5826
+ ///
5827
+ /// Example 2: Gap in pipeline suffix
5828
+ ///
5829
+ /// Consider a set of replicas performing a DVC:
5830
+ ///
5831
+ /// replica headers log_view
5832
+ /// 0 1 [2 3 4b] 4 (new primary)
5833
+ /// 1 1 2 3 4a 5 6 8 9 5
5834
+ /// 2 (1 2 3 ? ? ? ? ? ?) 5 (partitioned)
5835
+ ///
5836
+ /// Which headers should replica 1 include in its DVC?
5837
+ /// The cases are be distinguished by `log_view % replica_count`.
5838
+ ///
5839
+ /// (These examples are still applicable if the gap is not in the first op of the pipeline suffix).
5840
+ ///
5841
+ ///
5842
+ /// Example 2a: Gap in the pipeline suffix of a retired primary
5843
+ ///
5844
+ /// The replica was a primary during its retired log_view.
5845
+ /// It may have gaps or breaks in its pipeline suffix iff:
5846
+ /// - it didn't finish repairs before the next view change, and
5847
+ /// - some uncommitted ops were truncated during the DVC (since this "moves" the suffix backwards).
5848
+ ///
5849
+ /// We cannot send op 6 in the DVC because if repairs did not complete, it may be the wrong message.
5850
+ ///
5851
+ /// However, even though we may not have a full unbroken suffix of pipeline_prepare_queue_max
5852
+ /// messages, we know that our unbroken suffix (however long it may be) includes all
5853
+ /// possibly-committed messages, since otherwise the retired log_view would not have started.
5854
+ ///
5855
+ /// Therefore, the retired primary sends a DVC with only the unbroken log suffix:
5856
+ ///
5857
+ /// replica headers
5858
+ /// 1 1 2 3 4a 5 6 [8 9] (retired primary)
5859
+ ///
5860
+ ///
5861
+ /// Example 2b: Gap in the pipeline suffix of a retired follower
5862
+ ///
5863
+ /// The replica was a follower during its retired log_view.
5864
+ /// Followers always load a full suffix of headers from the view's SV message.
5865
+ /// If there is now a gap in it the follower's suffix, this must be due to missed prepares.
5866
+ ///
5867
+ /// Therefore, ops to the left of the gap (where the gap is within the suffix) are part of the
5868
+ /// suffix's hash chain, even though we cannot test this by chaining checksum/parent.
5869
+ ///
5870
+ /// Therefore, the retired follower sends the DVC:
5871
+ ///
5872
+ /// replica headers
5873
+ /// 1 1 2 3 4a 5 [6 8 9] (retired follower)
5874
+ ///
5875
+ ///
5876
+ /// Example 3: Break in pipeline suffix
5877
+ ///
5878
+ /// Consider a set of replicas performing a DVC:
5879
+ ///
5880
+ /// replica headers log_view
5881
+ /// 0 1 [2 3 4b] 4 (new primary)
5882
+ /// 1 1 2 3 4b 5a 6a 7a [8b 9b] 5
5883
+ /// 2 (1 2 3 4b 5b 7b 7b 8b 9b) 5 (partitioned)
5884
+ ///
5885
+ /// (Note the chain break at replica 1's 7a/8b.)
5886
+ /// This scenario is exactly analogous to Example 2, except that it can only occur on a retired
5887
+ /// primary, never a retired follower.
5888
+ ///
5889
+ /// The retired primary sends a DVC with only the unbroken log suffix:
5890
+ ///
5891
+ /// replica headers
5892
+ /// 1 1 2 3 4a 5 6 7a [8 9] (retired primary)
5893
+ ///
5894
+ ///
5895
+ /// Example 4: Gap in retiring primary suffix after recovery
5896
+ ///
5897
+ /// Suppose that replica 1 starts a view as the primary of view 4, with the suffix:
5898
+ ///
5899
+ /// log_view 4
5900
+ /// view 4
5901
+ /// journal 1 2 3
5902
+ /// head 3
5903
+ ///
5904
+ /// During this view, it prepares several ops:
5905
+ ///
5906
+ /// log_view 4
5907
+ /// view 4
5908
+ /// journal 1 2 3 4 5 6 7
5909
+ /// head 7
5910
+ ///
5911
+ /// However, the WAL writes are reordered — ops 4,5,7 writes finish before op=6's write has begun:
5912
+ ///
5913
+ /// log_view 4
5914
+ /// view 4
5915
+ /// journal 1 2 3 4 5 6 7
5916
+ /// wal 1 2 3 4 5 _ 7
5917
+ /// head 7
5918
+ ///
5919
+ /// Replica 1 crashes and recovers, and immediately begins sending a DVC for view=5.
5920
+ /// Under normal circumstances, the retired primary cannot distinguish between a gap and a break
5921
+ /// due to the possibility that its did not complete repair (see Example 2a).
5922
+ /// In this instance though, the gap is safe to skip over because it is to the right of the durable
5923
+ /// SV's head (op=3).
5924
+ ///
5925
+ /// log_view 4
5926
+ /// view 5
5927
+ /// journal 1 2 3 [4 5 _ 7]
5928
+ /// head 7
5929
+ ///
5930
+ const DVCQuorum = struct {
5931
+ const DVCArray = std.BoundedArray(*const Message, constants.replicas_max);
5932
+
5933
+ fn verify(dvc_quorum: QuorumMessages) void {
5934
+ const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
5935
+ assert(dvcs.len >= 2);
5936
+ for (dvcs.constSlice()) |message| verify_message(message);
5937
+
5938
+ var log_views_all = std.BoundedArray(u32, constants.replicas_max){ .buffer = undefined };
5939
+ for (dvcs.constSlice()) |message| {
5940
+ const log_view = @intCast(u32, message.header.timestamp);
5941
+ if (std.mem.count(u32, log_views_all.constSlice(), &.{log_view}) == 0) {
5942
+ log_views_all.appendAssumeCapacity(log_view);
5943
+ }
5944
+ }
5945
+
5946
+ // Verify that DVCs with the same log_view do not conflict.
5947
+ for (log_views_all.constSlice()) |log_view| {
5948
+ const view_dvcs = dvcs_with_log_view(dvc_quorum, log_view);
5949
+ var view_headers = HeaderIterator.init(view_dvcs, null);
5950
+ while (view_headers.next()) |_| {}
5951
+ }
5952
+ }
5953
+
5954
+ fn verify_message(message: *const Message) void {
5955
+ assert(message.header.command == .do_view_change);
5956
+ assert(message.header.op >= message.header.commit);
5957
+ assert(message.header.op - message.header.commit <= constants.journal_slot_count);
5958
+
5959
+ // The log_view:
5960
+ // * may be higher than the view in any of the prepare headers.
5961
+ // * must be lower than the view of this view change.
5962
+ const log_view = @intCast(u32, message.header.timestamp);
5963
+ assert(log_view < message.header.view);
5964
+
5965
+ // Ignore the headers, but perform the validation.
5966
+ _ = message_body_as_headers_chain_disjoint(message);
5967
+ }
5968
+
5969
+ fn dvcs_all(dvc_quorum: QuorumMessages) DVCArray {
5970
+ var array = DVCArray{ .buffer = undefined };
5971
+ for (dvc_quorum) |received, replica| {
5972
+ if (received) |message| {
5973
+ assert(message.header.command == .do_view_change);
5974
+ assert(message.header.replica == replica);
5975
+
5976
+ array.appendAssumeCapacity(message);
5977
+ }
5978
+ }
5979
+ return array;
5980
+ }
5981
+
5982
+ fn dvcs_canonical(dvc_quorum: QuorumMessages) DVCArray {
5983
+ return dvcs_with_log_view(dvc_quorum, DVCQuorum.log_view_max(dvc_quorum));
5984
+ }
5985
+
5986
+ fn dvcs_with_log_view(dvc_quorum: QuorumMessages, log_view: u32) DVCArray {
5987
+ var array = DVCArray{ .buffer = undefined };
5988
+ const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
5989
+ for (dvcs.constSlice()) |message| {
5990
+ const message_log_view = @intCast(u32, message.header.timestamp);
5991
+ if (message_log_view == log_view) {
5992
+ array.appendAssumeCapacity(message);
5993
+ }
5994
+ }
5995
+ return array;
5996
+ }
5997
+
5998
+ fn dvcs_uncanonical(dvc_quorum: QuorumMessages) DVCArray {
5999
+ const log_view_max_ = DVCQuorum.log_view_max(dvc_quorum);
6000
+ var array = DVCArray{ .buffer = undefined };
6001
+ const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
6002
+ for (dvcs.constSlice()) |message| {
6003
+ const log_view = @intCast(u32, message.header.timestamp);
6004
+ assert(log_view <= log_view_max_);
6005
+
6006
+ if (log_view < log_view_max_) {
6007
+ array.appendAssumeCapacity(message);
6008
+ }
6009
+ }
6010
+ return array;
6011
+ }
6012
+
6013
+ /// Returns the highest `log_view` of any DVC.
6014
+ ///
6015
+ /// The headers bundled with DVCs with the highest `log_view` are canonical, since
6016
+ /// the replica has knowledge of previous view changes in which headers were replaced.
6017
+ fn log_view_max(dvc_quorum: QuorumMessages) u32 {
6018
+ var log_view_max_: ?u32 = null;
6019
+ const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
6020
+ for (dvcs.constSlice()) |message| {
6021
+ // The view when this replica was last in normal status, which:
6022
+ // * may be higher than the view in any of the prepare headers.
6023
+ // * must be lower than the view of this view change.
6024
+ const log_view = @intCast(u32, message.header.timestamp);
6025
+ assert(log_view < message.header.view);
6026
+
6027
+ if (log_view_max_ == null or log_view_max_.? < log_view) {
6028
+ log_view_max_ = log_view;
6029
+ }
6030
+ }
6031
+ return log_view_max_.?;
6032
+ }
6033
+
6034
+ /// Returns the highest `commit_min` from any DVC (this is not a `commit_max`).
6035
+ fn commit_min_max(dvc_quorum: QuorumMessages, local: struct {
6036
+ replica: u64,
6037
+ commit_min: u64,
6038
+ }) u64 {
6039
+ assert(dvc_quorum[local.replica].?.header.commit <= local.commit_min);
6040
+
6041
+ var commit_min_max_: ?u64 = null;
6042
+ const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
6043
+ for (dvcs.constSlice()) |message| {
6044
+ if (commit_min_max_ == null or commit_min_max_.? < message.header.commit) {
6045
+ commit_min_max_ = message.header.commit;
6046
+ }
6047
+ }
6048
+
6049
+ // Consider the case:
6050
+ // 1. Start committing op=N…M.
6051
+ // 2. Send `do_view_change` to self.
6052
+ // 3. Finish committing op=N…M.
6053
+ // 4. Remaining `do_view_change` messages arrive, completing the quorum.
6054
+ // In this scenario, our own DVC's commit is `N-1`, but `commit_min=M`.
6055
+ // Don't let the commit backtrack.
6056
+ if (commit_min_max_.? < local.commit_min) {
6057
+ const dvc_old = dvc_quorum[local.replica].?;
6058
+ assert(dvc_old.header.commit < local.commit_min);
6059
+ assert(dvc_old.header.commit <= commit_min_max_.?);
6060
+
6061
+ log.debug("{}: on_do_view_change: bump commit_min commit={}..{}", .{
6062
+ local.replica,
6063
+ commit_min_max_.?,
6064
+ local.commit_min,
6065
+ });
6066
+ commit_min_max_ = local.commit_min;
6067
+ }
6068
+
6069
+ assert(commit_min_max_.? >= local.commit_min);
6070
+ return commit_min_max_.?;
6071
+ }
6072
+
6073
+ /// Returns the highest `timestamp` from any replica.
6074
+ fn timestamp_max(dvc_quorum: QuorumMessages) u64 {
6075
+ var timestamp_max_: ?u64 = null;
6076
+ const dvcs = DVCQuorum.dvcs_all(dvc_quorum);
6077
+ for (dvcs.constSlice()) |message| {
6078
+ const message_headers = message_body_as_headers_chain_disjoint(message);
6079
+ if (timestamp_max_ == null or timestamp_max_.? < message_headers[0].timestamp) {
6080
+ timestamp_max_ = message_headers[0].timestamp;
6081
+ }
6082
+ }
6083
+ return timestamp_max_.?;
6084
+ }
6085
+
6086
+ fn op_max_canonical(dvc_quorum: QuorumMessages) u64 {
6087
+ var op_max: ?u64 = null;
6088
+ const dvcs = DVCQuorum.dvcs_canonical(dvc_quorum);
6089
+ for (dvcs.constSlice()) |message| {
6090
+ if (op_max == null or op_max.? < message.header.op) {
6091
+ op_max = message.header.op;
6092
+ }
6093
+ }
6094
+ return op_max.?;
6095
+ }
6096
+
6097
+ /// Return an iterator over the canonical DVC's headers, from high-to-low op.
6098
+ /// The first header returned is the new head message.
6099
+ fn headers_canonical(dvc_quorum: QuorumMessages) HeaderIterator {
6100
+ const dvcs = DVCQuorum.dvcs_canonical(dvc_quorum);
6101
+
6102
+ const op_head_max = op_max_canonical(dvc_quorum);
6103
+ // The number of uncommitted ops cannot be more than the length of the pipeline.
6104
+ const op_suffix_min = op_head_max -| constants.pipeline_prepare_queue_max;
6105
+ assert(op_suffix_min <= op_head_max);
6106
+
6107
+ var op_head_min = op_suffix_min;
6108
+ var ops_in_suffix = std.StaticBitSet(constants.pipeline_prepare_queue_max).initEmpty();
6109
+ for (dvcs.constSlice()) |message| {
6110
+ const message_headers = message_body_as_headers_chain_disjoint(message);
6111
+ for (message_headers) |header| {
6112
+ if (header.op > op_suffix_min) {
6113
+ ops_in_suffix.set((header.op - op_suffix_min) - 1);
6114
+ }
6115
+ }
6116
+ op_head_min = std.math.max(op_head_min, message_headers[message_headers.len - 1].op);
6117
+ }
6118
+ assert(op_head_max == 0 or ops_in_suffix.isSet((op_head_max - op_suffix_min) - 1));
6119
+ assert(op_head_min >= op_suffix_min);
6120
+ assert(op_head_min <= op_head_max);
6121
+
6122
+ const op_head = blk: {
6123
+ var op = op_head_min + 1;
6124
+ while (op < op_head_max) : (op += 1) {
6125
+ if (!ops_in_suffix.isSet((op - op_suffix_min) - 1)) {
6126
+ break :blk op - 1;
6127
+ }
6128
+ } else {
6129
+ break :blk op_head_max;
6130
+ }
6131
+ };
6132
+ assert(op_head >= op_head_min);
6133
+ assert(op_head <= op_head_max);
6134
+
6135
+ return HeaderIterator.init(dvcs, op_head);
6136
+ }
6137
+
6138
+ /// Iterate the headers of a set of (same-log_view) DVCs, from high-to-low op.
6139
+ const HeaderIterator = struct {
6140
+ dvcs: DVCArray,
6141
+ dvcs_offsets: std.BoundedArray(usize, constants.replicas_max),
6142
+
6143
+ child: ?struct {
6144
+ op: u64,
6145
+ parent: u128,
6146
+ } = null,
6147
+
6148
+ fn init(dvcs: DVCArray, op_head: ?u64) HeaderIterator {
6149
+ assert(dvcs.len > 0);
6150
+
6151
+ var dvcs_log_view: ?u32 = null;
6152
+ for (dvcs.constSlice()) |message| {
6153
+ const log_view = @intCast(u32, message.header.timestamp);
6154
+ if (dvcs_log_view) |view| {
6155
+ assert(view == log_view);
6156
+ } else {
6157
+ dvcs_log_view = log_view;
6158
+ }
6159
+ }
6160
+
6161
+ var dvcs_offsets = std.BoundedArray(usize, constants.replicas_max){
6162
+ .buffer = undefined,
6163
+ };
6164
+
6165
+ if (op_head) |op_head_| {
6166
+ // Skip over discarded headers.
6167
+ for (dvcs.constSlice()) |message| {
6168
+ const offset = for (message_body_as_headers_chain_disjoint(message)) |header, i| {
6169
+ if (header.op <= op_head_) break i;
6170
+ } else 0;
6171
+ dvcs_offsets.appendAssumeCapacity(offset);
6172
+ }
6173
+ } else {
6174
+ for (dvcs.constSlice()) |_| dvcs_offsets.appendAssumeCapacity(0);
6175
+ }
6176
+ assert(dvcs.len == dvcs_offsets.len);
6177
+
6178
+ return .{
6179
+ .dvcs = dvcs,
6180
+ .dvcs_offsets = dvcs_offsets,
6181
+ };
6182
+ }
6183
+
6184
/// Returns the next header to consider, or null once every DVC's headers are exhausted.
/// Headers are returned in strictly descending op order (enforced by the `iterator.child`
/// asserts below). When several DVCs hold a header with the same op, those headers are
/// asserted identical (same checksum) and all of their offsets are advanced together.
fn next(iterator: *HeaderIterator) ?Header {
    const ReplicaSet = std.StaticBitSet(constants.replicas_max);
    // The candidate header with the greatest op seen so far in this pass.
    var next_header: ?*const Header = null;
    // The set of DVC indexes whose current header matches `next_header` (op and checksum).
    var next_advance = ReplicaSet.initEmpty();

    for (iterator.dvcs.constSlice()) |message, i| {
        const message_headers = message_body_as_headers_chain_disjoint(message);
        const message_headers_offset = iterator.dvcs_offsets.get(i);
        // This DVC's headers are exhausted; skip it.
        if (message_headers_offset == message_headers.len) continue;

        const header = &message_headers[message_headers_offset];
        if (next_header == null or
            next_header.?.op < header.op)
        {
            // Found a greater op: adopt it and restart the agreement set.
            next_header = header;
            next_advance = ReplicaSet.initEmpty();
        }
        // Two DVC headers with the same op must be the very same prepare.
        assert((next_header.?.op == header.op) ==
            (next_header.?.checksum == header.checksum));

        if (next_header.?.op == header.op) {
            next_advance.set(i);
        }
    }
    assert((next_advance.count() == 0) == (next_header == null));

    // Advance every DVC whose current header was just consumed.
    var next_advance_iterator = next_advance.iterator(.{});
    while (next_advance_iterator.next()) |i| {
        iterator.dvcs_offsets.slice()[i] += 1;
    }

    if (next_header) |header| {
        if (iterator.child) |child| {
            // Ops strictly descend across successive calls; the hash chain must connect
            // exactly when the ops are consecutive.
            assert(child.op > header.op);
            assert((child.op == header.op + 1) == (child.parent == header.checksum));
        }
        iterator.child = .{ .op = header.op, .parent = header.parent };
        return header.*;
    } else {
        return null;
    }
}
6226
+ };
6227
+ };
6228
+
6229
/// Interprets the message body as a slice of prepare headers, validating each one.
/// Asserts that the headers are in descending op order.
/// The headers may contain gaps and/or breaks.
fn message_body_as_headers(message: *const Message) []const Header {
    // The body must contain at least one header.
    assert(message.header.size > @sizeOf(Header));
    assert(message.header.command == .do_view_change or
        message.header.command == .start_view or
        message.header.command == .headers);

    const headers = std.mem.bytesAsSlice(
        Header,
        message.buffer[@sizeOf(Header)..message.header.size],
    );

    for (headers) |*header, index| {
        assert(!constants.verify or header.valid_checksum());
        assert(header.cluster == message.header.cluster);
        assert(header.command == .prepare);
        assert(header.view <= message.header.view);

        if (index > 0) {
            // Headers must be provided in reverse order for the sake of `repair_header()`.
            // Otherwise, headers may never be repaired where the hash chain never connects.
            assert(header.op < headers[index - 1].op);
        }
    }

    return headers;
}
6259
+
6260
/// Interprets the message body as prepare headers and validates chain consistency.
/// Asserts that the headers are in descending op order, and there are no breaks.
/// The headers may contain gaps.
fn message_body_as_headers_chain_disjoint(message: *const Message) []const Header {
    assert(message.header.command == .do_view_change or message.header.command == .start_view);

    const message_headers = message_body_as_headers(message);
    assert(message_headers.len > 0);
    // The first (newest) header matches the message's own op.
    assert(message_headers[0].op == message.header.op);

    for (message_headers) |*header, index| {
        assert(header.op <= message.header.op);

        if (index > 0) {
            const child_header = &message_headers[index - 1];
            // Views never increase as ops decrease.
            assert(header.view <= child_header.view);
            // The hash chain connects exactly when ops are consecutive (no breaks).
            assert((header.op + 1 == child_header.op) == (header.checksum == child_header.parent));
            assert(header.timestamp < child_header.timestamp);
        }
    }
    return message_headers;
}
6282
+
6283
/// Interprets the message body as prepare headers forming an unbroken, gapless chain.
/// Asserts that the headers are in descending op order, and there are no gaps or breaks.
fn message_body_as_headers_chain_consecutive(message: *const Message) []const Header {
    assert(message.header.command == .start_view);

    const message_headers = message_body_as_headers_chain_disjoint(message);
    for (message_headers) |*header, index| {
        if (index > 0) {
            const child_header = &message_headers[index - 1];
            // Ops are consecutive and the hash chain connects every adjacent pair.
            assert(header.op + 1 == child_header.op);
            assert(header.checksum == child_header.parent);
        }
    }
    return message_headers;
}
6298
+
6299
/// The PipelineQueue belongs to a normal-status primary. It consists of two queues:
/// - A prepare queue, containing all messages currently being prepared.
/// - A request queue, containing all messages which are waiting to begin preparing.
///
/// Invariants:
/// - prepare_queue contains only messages with command=prepare.
/// - prepare_queue's messages have sequential, increasing ops.
/// - prepare_queue's messages are hash-chained.
/// - request_queue contains only messages with command=request.
/// - If request_queue is not empty, then prepare_queue is full OR 1-less than full.
///   (The caller is responsible for maintaining this invariant. If the caller removes an entry
///   from `prepare_queue`, an entry from request_queue should be moved over promptly.)
///
/// Note: The prepare queue may contain multiple prepares from a single client, but the request
/// queue may not (see message_by_client()).
const PipelineQueue = struct {
    const PrepareQueue = RingBuffer(Prepare, constants.pipeline_prepare_queue_max, .array);
    const RequestQueue = RingBuffer(Request, constants.pipeline_request_queue_max, .array);

    /// Messages that are preparing (uncommitted, being written to the WAL (may already be written
    /// to the WAL) and replicated (may just be waiting for acks)).
    prepare_queue: PrepareQueue = .{},
    /// Messages that are accepted from the client, but not yet preparing.
    /// When `pipeline_prepare_queue_max + pipeline_request_queue_max = clients_max`, the request
    /// queue guards against clients starving one another.
    request_queue: RequestQueue = .{},

    /// Drains both queues, releasing every message back to the pool.
    fn deinit(pipeline: *PipelineQueue, message_pool: *MessagePool) void {
        while (pipeline.request_queue.pop()) |r| message_pool.unref(r.message);
        while (pipeline.prepare_queue.pop()) |p| message_pool.unref(p.message);
    }

    /// Asserts all of the struct-level invariants documented above.
    fn verify(pipeline: PipelineQueue) void {
        assert(pipeline.request_queue.count <= constants.pipeline_request_queue_max);
        assert(pipeline.prepare_queue.count <= constants.pipeline_prepare_queue_max);
        assert(pipeline.request_queue.empty() or
            constants.pipeline_prepare_queue_max == pipeline.prepare_queue.count or
            constants.pipeline_prepare_queue_max == pipeline.prepare_queue.count + 1);

        if (pipeline.prepare_queue.head_ptr_const()) |head| {
            // Walk the prepare queue verifying sequential ops and an unbroken hash chain.
            var op = head.message.header.op;
            var parent = head.message.header.parent;
            var prepare_iterator = pipeline.prepare_queue.iterator();
            while (prepare_iterator.next_ptr()) |prepare| {
                assert(prepare.message.header.command == .prepare);
                assert(prepare.message.header.op == op);
                assert(prepare.message.header.parent == parent);

                parent = prepare.message.header.checksum;
                op += 1;
            }
        }

        var request_iterator = pipeline.request_queue.iterator();
        while (request_iterator.next()) |request| {
            assert(request.message.header.command == .request);
        }
    }

    /// Returns whether the pipeline has no room for another request.
    fn full(pipeline: PipelineQueue) bool {
        if (pipeline.prepare_queue.full()) {
            return pipeline.request_queue.full();
        } else {
            // Requests may only queue while the prepare queue is (nearly) full.
            assert(pipeline.request_queue.empty() or
                pipeline.prepare_queue.count + 1 == constants.pipeline_prepare_queue_max);
            return false;
        }
    }

    /// Searches the pipeline for a prepare for a given op and checksum.
    /// When `checksum` is `null`, match any checksum.
    fn prepare_by_op_and_checksum(pipeline: *PipelineQueue, op: u64, checksum: ?u128) ?*Prepare {
        if (pipeline.prepare_queue.empty()) return null;

        // To optimize the search, we can leverage the fact that the pipeline's entries are
        // ordered and consecutive.
        const head_op = pipeline.prepare_queue.head_ptr().?.message.header.op;
        const tail_op = pipeline.prepare_queue.tail_ptr().?.message.header.op;
        if (op < head_op) return null;
        if (op > tail_op) return null;

        // Direct index: op offset from the head is the queue position.
        const prepare = pipeline.prepare_queue.get_ptr(op - head_op).?;
        assert(prepare.message.header.op == op);

        if (checksum == null) return prepare;
        if (checksum.? == prepare.message.header.checksum) return prepare;
        return null;
    }

    /// Searches the pipeline for a prepare matching the given ack.
    /// Asserts that the returned prepare corresponds to the prepare_ok.
    fn prepare_by_prepare_ok(pipeline: *PipelineQueue, ok: *const Message) ?*Prepare {
        assert(ok.header.command == .prepare_ok);

        // A prepare_ok's `context` carries the prepare's checksum.
        const prepare = pipeline.prepare_by_op_and_checksum(
            ok.header.op,
            ok.header.context,
        ) orelse return null;
        assert(prepare.message.header.command == .prepare);
        assert(prepare.message.header.parent == ok.header.parent);
        assert(prepare.message.header.client == ok.header.client);
        assert(prepare.message.header.request == ok.header.request);
        assert(prepare.message.header.cluster == ok.header.cluster);
        assert(prepare.message.header.epoch == ok.header.epoch);
        // A prepare may be committed in the same view or in a newer view:
        assert(prepare.message.header.view <= ok.header.view);
        assert(prepare.message.header.op == ok.header.op);
        assert(prepare.message.header.commit == ok.header.commit);
        assert(prepare.message.header.timestamp == ok.header.timestamp);
        assert(prepare.message.header.operation == ok.header.operation);

        return prepare;
    }

    /// Search the pipeline (both request & prepare queues) for a message from the given client.
    /// - A client may have multiple prepares in the pipeline if these were committed by the
    ///   previous primary and were reloaded into the pipeline after a view change.
    /// - A client may have at most one request in the pipeline.
    /// If there are multiple messages in the pipeline from the client, the *latest* message is
    /// returned (to help the caller identify bad client behavior).
    fn message_by_client(pipeline: PipelineQueue, client_id: u128) ?*const Message {
        var message: ?*const Message = null;
        var prepare_iterator = pipeline.prepare_queue.iterator();
        while (prepare_iterator.next_ptr()) |prepare| {
            if (prepare.message.header.client == client_id) message = prepare.message;
        }

        // The request queue is checked second so that a queued request wins over any prepare.
        var request_iterator = pipeline.request_queue.iterator();
        while (request_iterator.next()) |request| {
            if (request.message.header.client == client_id) message = request.message;
        }
        return message;
    }

    /// Warning: This temporarily violates the prepare/request queue count invariant.
    /// After invocation, call pop_request→push_prepare to begin preparing the next request.
    fn pop_prepare(pipeline: *PipelineQueue) ?Prepare {
        if (pipeline.prepare_queue.pop()) |prepare| {
            assert(pipeline.request_queue.empty() or
                pipeline.prepare_queue.count + 1 == constants.pipeline_prepare_queue_max);
            return prepare;
        } else {
            // An empty prepare queue implies an empty request queue (see struct invariants).
            assert(pipeline.request_queue.empty());
            return null;
        }
    }

    /// Removes and returns the oldest queued request, if any.
    fn pop_request(pipeline: *PipelineQueue) ?Request {
        return pipeline.request_queue.pop();
    }

    /// Appends a request; asserts the at-most-one-request-per-client invariant.
    fn push_request(pipeline: *PipelineQueue, request: Request) void {
        assert(request.message.header.command == .request);
        var queue_iterator = pipeline.request_queue.iterator();
        while (queue_iterator.next()) |queue_request| {
            assert(queue_request.message.header.client != request.message.header.client);
        }

        pipeline.request_queue.push_assume_capacity(request);
        if (constants.verify) pipeline.verify();
    }

    /// Appends a prepare; asserts that it extends the queue's op sequence and hash chain.
    fn push_prepare(pipeline: *PipelineQueue, message: *Message) void {
        assert(message.header.command == .prepare);
        if (pipeline.prepare_queue.tail()) |tail| {
            assert(message.header.op == tail.message.header.op + 1);
            assert(message.header.parent == tail.message.header.checksum);
            assert(message.header.view >= tail.message.header.view);
        } else {
            assert(pipeline.request_queue.empty());
        }

        pipeline.prepare_queue.push_assume_capacity(.{ .message = message });
        if (constants.verify) pipeline.verify();
    }
};
6475
+
6476
/// Prepares in the cache may be committed or uncommitted, and may not belong to the current view.
///
/// Invariants:
/// - The cache contains only messages with command=prepare.
/// - If a message with op X is in the cache, it is in `prepares[X % prepares.len]`.
const PipelineCache = struct {
    const prepares_max =
        constants.pipeline_prepare_queue_max +
        constants.pipeline_request_queue_max;

    // Direct-mapped cache slots, keyed by `op % prepares_max`.
    prepares: [prepares_max]?*Message = [_]?*Message{null} ** prepares_max,

    /// Converting a PipelineQueue to a PipelineCache discards all accumulated acks.
    /// "prepare_ok"s from previous views are not valid, even if the pipeline entry is reused
    /// after a cycle of view changes. In other words, when a view change cycles around, so
    /// that the original primary becomes a primary of a new view, pipeline entries may be
    /// reused. However, the pipeline's prepare_ok quorums must not be reused, since the
    /// replicas that sent them may have swapped them out during a previous view change.
    fn init_from_queue(queue: *PipelineQueue) PipelineCache {
        var cache = PipelineCache{};
        var prepares = queue.prepare_queue.iterator();
        while (prepares.next()) |prepare| {
            // The queue's ops are consecutive, so no two of them can collide in the cache.
            const prepare_evicted = cache.insert(prepare.message.ref());
            assert(prepare_evicted == null);
            assert(prepare.message.header.command == .prepare);
        }
        return cache;
    }

    /// Releases every cached message back to the pool and clears all slots.
    fn deinit(pipeline: *PipelineCache, message_pool: *MessagePool) void {
        for (pipeline.prepares) |*entry| {
            if (entry.*) |m| {
                message_pool.unref(m);
                entry.* = null;
            }
        }
    }

    /// Returns whether the cache holds no prepares at all.
    fn empty(pipeline: *const PipelineCache) bool {
        // Fix: the previous implementation captured on `*entry` (a non-optional pointer,
        // so the payload capture was invalid) and returned `true` upon finding an occupied
        // slot — the inverse of what "empty" means. A single occupied slot means not empty.
        for (pipeline.prepares) |entry| {
            if (entry) |_| return false;
        }
        return true;
    }

    /// Returns whether the cache holds a prepare matching this exact header (op and checksum).
    fn contains_header(pipeline: *const PipelineCache, header: *const Header) bool {
        assert(header.command == .prepare);

        const slot = header.op % prepares_max;
        const prepare = pipeline.prepares[slot] orelse return false;
        return prepare.header.op == header.op and prepare.header.checksum == header.checksum;
    }

    /// Unlike the PipelineQueue, cached messages may not belong to the current view.
    /// Thus, a matching checksum is required.
    fn prepare_by_op_and_checksum(pipeline: *PipelineCache, op: u64, checksum: u128) ?*Message {
        const slot = op % prepares_max;
        const prepare = pipeline.prepares[slot] orelse return null;
        if (prepare.header.op != op) return null;
        if (prepare.header.checksum != checksum) return null;
        return prepare;
    }

    /// Inserts a prepare into its direct-mapped slot.
    /// Returns the message evicted from the cache, if any.
    /// Note: the caller is responsible for unref'ing any evicted message.
    fn insert(pipeline: *PipelineCache, prepare: *Message) ?*Message {
        assert(prepare.header.command == .prepare);

        const slot = prepare.header.op % prepares_max;
        const prepare_evicted = pipeline.prepares[slot];
        pipeline.prepares[slot] = prepare;
        return prepare_evicted;
    }
};