tigerbeetle-node 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. package/README.md +305 -103
  2. package/dist/index.d.ts +70 -67
  3. package/dist/index.js +70 -67
  4. package/dist/index.js.map +1 -1
  5. package/package.json +6 -6
  6. package/scripts/download_node_headers.sh +14 -7
  7. package/src/index.ts +11 -10
  8. package/src/node.zig +22 -20
  9. package/src/tigerbeetle/scripts/benchmark.bat +4 -3
  10. package/src/tigerbeetle/scripts/benchmark.sh +25 -10
  11. package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
  12. package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
  13. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
  14. package/src/tigerbeetle/scripts/install.sh +20 -4
  15. package/src/tigerbeetle/scripts/install_zig.bat +5 -1
  16. package/src/tigerbeetle/scripts/install_zig.sh +32 -26
  17. package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
  18. package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
  19. package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
  20. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
  21. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +12 -3
  22. package/src/tigerbeetle/src/benchmark.zig +19 -9
  23. package/src/tigerbeetle/src/benchmark_array_search.zig +317 -0
  24. package/src/tigerbeetle/src/benchmarks/perf.zig +299 -0
  25. package/src/tigerbeetle/src/c/tb_client/context.zig +103 -0
  26. package/src/tigerbeetle/src/c/tb_client/packet.zig +80 -0
  27. package/src/tigerbeetle/src/c/tb_client/signal.zig +288 -0
  28. package/src/tigerbeetle/src/c/tb_client/thread.zig +328 -0
  29. package/src/tigerbeetle/src/c/tb_client.h +221 -0
  30. package/src/tigerbeetle/src/c/tb_client.zig +104 -0
  31. package/src/tigerbeetle/src/c/test.zig +1 -0
  32. package/src/tigerbeetle/src/cli.zig +143 -84
  33. package/src/tigerbeetle/src/config.zig +161 -20
  34. package/src/tigerbeetle/src/demo.zig +14 -8
  35. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +2 -2
  36. package/src/tigerbeetle/src/ewah.zig +318 -0
  37. package/src/tigerbeetle/src/ewah_benchmark.zig +121 -0
  38. package/src/tigerbeetle/src/eytzinger_benchmark.zig +317 -0
  39. package/src/tigerbeetle/src/fifo.zig +17 -1
  40. package/src/tigerbeetle/src/io/darwin.zig +12 -10
  41. package/src/tigerbeetle/src/io/linux.zig +25 -9
  42. package/src/tigerbeetle/src/io/windows.zig +13 -9
  43. package/src/tigerbeetle/src/iops.zig +101 -0
  44. package/src/tigerbeetle/src/lsm/README.md +214 -0
  45. package/src/tigerbeetle/src/lsm/binary_search.zig +341 -0
  46. package/src/tigerbeetle/src/lsm/bloom_filter.zig +125 -0
  47. package/src/tigerbeetle/src/lsm/compaction.zig +557 -0
  48. package/src/tigerbeetle/src/lsm/composite_key.zig +77 -0
  49. package/src/tigerbeetle/src/lsm/direction.zig +11 -0
  50. package/src/tigerbeetle/src/lsm/eytzinger.zig +587 -0
  51. package/src/tigerbeetle/src/lsm/forest.zig +204 -0
  52. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +412 -0
  53. package/src/tigerbeetle/src/lsm/grid.zig +549 -0
  54. package/src/tigerbeetle/src/lsm/groove.zig +1002 -0
  55. package/src/tigerbeetle/src/lsm/k_way_merge.zig +474 -0
  56. package/src/tigerbeetle/src/lsm/level_iterator.zig +315 -0
  57. package/src/tigerbeetle/src/lsm/manifest.zig +580 -0
  58. package/src/tigerbeetle/src/lsm/manifest_level.zig +925 -0
  59. package/src/tigerbeetle/src/lsm/manifest_log.zig +953 -0
  60. package/src/tigerbeetle/src/lsm/node_pool.zig +231 -0
  61. package/src/tigerbeetle/src/lsm/posted_groove.zig +387 -0
  62. package/src/tigerbeetle/src/lsm/segmented_array.zig +1318 -0
  63. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
  64. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
  65. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +894 -0
  66. package/src/tigerbeetle/src/lsm/table.zig +967 -0
  67. package/src/tigerbeetle/src/lsm/table_immutable.zig +203 -0
  68. package/src/tigerbeetle/src/lsm/table_iterator.zig +306 -0
  69. package/src/tigerbeetle/src/lsm/table_mutable.zig +174 -0
  70. package/src/tigerbeetle/src/lsm/test.zig +423 -0
  71. package/src/tigerbeetle/src/lsm/tree.zig +1090 -0
  72. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +457 -0
  73. package/src/tigerbeetle/src/main.zig +141 -109
  74. package/src/tigerbeetle/src/message_bus.zig +49 -48
  75. package/src/tigerbeetle/src/message_pool.zig +22 -12
  76. package/src/tigerbeetle/src/ring_buffer.zig +126 -30
  77. package/src/tigerbeetle/src/simulator.zig +205 -140
  78. package/src/tigerbeetle/src/state_machine.zig +1268 -721
  79. package/src/tigerbeetle/src/static_allocator.zig +65 -0
  80. package/src/tigerbeetle/src/storage.zig +40 -14
  81. package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
  82. package/src/tigerbeetle/src/test/accounting/workload.zig +819 -0
  83. package/src/tigerbeetle/src/test/cluster.zig +104 -88
  84. package/src/tigerbeetle/src/test/conductor.zig +365 -0
  85. package/src/tigerbeetle/src/test/fuzz.zig +121 -0
  86. package/src/tigerbeetle/src/test/id.zig +89 -0
  87. package/src/tigerbeetle/src/test/message_bus.zig +15 -24
  88. package/src/tigerbeetle/src/test/network.zig +26 -17
  89. package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
  90. package/src/tigerbeetle/src/test/state_checker.zig +94 -68
  91. package/src/tigerbeetle/src/test/state_machine.zig +135 -69
  92. package/src/tigerbeetle/src/test/storage.zig +78 -28
  93. package/src/tigerbeetle/src/tigerbeetle.zig +19 -16
  94. package/src/tigerbeetle/src/unit_tests.zig +15 -0
  95. package/src/tigerbeetle/src/util.zig +51 -0
  96. package/src/tigerbeetle/src/vopr.zig +494 -0
  97. package/src/tigerbeetle/src/vopr_hub/README.md +58 -0
  98. package/src/tigerbeetle/src/vopr_hub/SETUP.md +199 -0
  99. package/src/tigerbeetle/src/vopr_hub/go.mod +3 -0
  100. package/src/tigerbeetle/src/vopr_hub/main.go +1022 -0
  101. package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +3 -0
  102. package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +403 -0
  103. package/src/tigerbeetle/src/vsr/client.zig +34 -7
  104. package/src/tigerbeetle/src/vsr/journal.zig +164 -174
  105. package/src/tigerbeetle/src/vsr/replica.zig +1602 -651
  106. package/src/tigerbeetle/src/vsr/superblock.zig +1761 -0
  107. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +255 -0
  108. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +644 -0
  109. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +561 -0
  110. package/src/tigerbeetle/src/vsr.zig +118 -170
  111. package/src/tigerbeetle/scripts/vopr.bat +0 -48
  112. package/src/tigerbeetle/scripts/vopr.sh +0 -33
package/src/tigerbeetle/src/vsr/replica.zig
@@ -4,14 +4,20 @@ const assert = std.debug.assert;
 
  const config = @import("../config.zig");
 
+ const StaticAllocator = @import("../static_allocator.zig");
+ const GridType = @import("../lsm/grid.zig").GridType;
+ const MessagePool = @import("../message_pool.zig").MessagePool;
  const Message = @import("../message_pool.zig").MessagePool.Message;
  const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
+ const ClientTable = @import("superblock_client_table.zig").ClientTable;
+ const format_journal = @import("./journal.zig").format_journal;
 
  const vsr = @import("../vsr.zig");
  const Header = vsr.Header;
  const Timeout = vsr.Timeout;
  const Command = vsr.Command;
  const Version = vsr.Version;
+ const VSRState = vsr.VSRState;
 
  const log = std.log.scoped(.replica);
 
@@ -39,32 +45,6 @@ pub const Status = enum {
  recovering,
  };
 
- const ClientTable = std.AutoHashMapUnmanaged(u128, ClientTableEntry);
-
- /// We found two bugs in the VRR paper relating to the client table:
- ///
- /// 1. a correctness bug, where successive client crashes may cause request numbers to collide for
- /// different request payloads, resulting in requests receiving the wrong reply, and
- ///
- /// 2. a liveness bug, where if the client table is updated for request and prepare messages with
- /// the client's latest request number, then the client may be locked out from the cluster if the
- /// request is ever reordered through a view change.
- ///
- /// We therefore take a different approach with the implementation of our client table, to:
- ///
- /// 1. register client sessions explicitly through the state machine to ensure that client session
- /// numbers always increase, and
- ///
- /// 2. make a more careful distinction between uncommitted and committed request numbers,
- /// considering that uncommitted requests may not survive a view change.
- const ClientTableEntry = struct {
- /// The client's session number as committed to the cluster by a register request.
- session: u64,
-
- /// The reply sent to the client's latest committed request.
- reply: *Message,
- };
-
  const Nonce = u128;
 
  const Prepare = struct {
@@ -84,18 +64,40 @@ const quorum_messages_null = [_]?*Message{null} ** config.replicas_max;
  const QuorumCounter = std.StaticBitSet(config.replicas_max);
  const quorum_counter_null = QuorumCounter.initEmpty();
 
- pub fn Replica(
+ // CRITICAL: The number of prepare headers to include in the body:
+ // We must provide enough headers to cover all uncommitted headers so that the new
+ // leader (if we are in a view change) can decide whether to discard uncommitted headers
+ // that cannot be repaired because they are gaps, and this must be relative to the
+ // cluster as a whole (not relative to the difference between our op and commit number)
+ // as otherwise we would break correctness.
+ const view_change_headers_count = config.pipeline_max;
+
+ comptime {
+ assert(view_change_headers_count > 0);
+ assert(view_change_headers_count >= config.pipeline_max);
+ assert(view_change_headers_count <=
+ @divFloor(config.message_size_max - @sizeOf(Header), @sizeOf(Header)));
+ }
+
+ pub fn ReplicaType(
  comptime StateMachine: type,
  comptime MessageBus: type,
  comptime Storage: type,
  comptime Time: type,
  ) type {
+ const Grid = GridType(Storage);
+ const SuperBlock = vsr.SuperBlockType(Storage);
+
  return struct {
  const Self = @This();
 
  const Journal = vsr.Journal(Self, Storage);
  const Clock = vsr.Clock(Time);
 
+ /// We use this allocator during open/init and then disable it.
+ /// An accidental dynamic allocation after open/init will cause an assertion failure.
+ static_allocator: StaticAllocator,
+
  /// The number of the cluster to which this replica belongs:
  cluster: u32,
 
@@ -111,6 +113,8 @@ pub fn Replica(
  /// The minimum number of replicas required to form a view change quorum:
  quorum_view_change: u8,
 
+ time: Time,
+
  /// A distributed fault-tolerant clock for lower and upper bounds on the leader's wall clock:
  clock: Clock,
 
@@ -118,14 +122,17 @@ pub fn Replica(
  journal: Journal,
 
  /// An abstraction to send messages from the replica to another replica or client.
- /// The message bus will also deliver messages to this replica by calling `on_message()`.
- message_bus: *MessageBus,
+ /// The message bus will also deliver messages to this replica by calling `on_message_from_bus()`.
+ message_bus: MessageBus,
 
  /// For executing service up-calls after an operation has been committed:
- state_machine: *StateMachine,
+ state_machine: StateMachine,
 
- /// The client table records for each client the latest session and the latest committed reply.
- client_table: ClientTable,
+ // TODO Document.
+ superblock: SuperBlock,
+ superblock_context: SuperBlock.Context = undefined,
+ grid: Grid,
+ opened: bool,
 
  /// The current view, initially 0:
  view: u32,
@@ -136,24 +143,46 @@ pub fn Replica(
  /// The current status, either normal, view_change, or recovering:
  status: Status = .recovering,
 
- /// The op number assigned to the most recently prepared operation:
+ /// The op number assigned to the most recently prepared operation.
+ ///
+ /// Invariants (not applicable during status=recovering):
+ /// * `replica.op` exists in the Journal.
+ /// * `replica.op ≥ replica.commit_min`.
+ /// * `replica.op ≤ replica.op_checkpoint_trigger`: don't wrap the WAL until we are sure
+ /// that the overwritten entry will not be required for recovery.
+ // TODO: When recovery protocol is removed, load the `op` from the WAL, and verify that it is ≥op_checkpoint.
+ // Also verify that a corresponding header exists in the WAL.
  op: u64,
 
  /// The op of the highest checkpointed message.
- // TODO Update this to use LSM storage.
  // TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
- // TODO Enforce invariant op≥op_checkpoint.
- op_checkpoint: u64 = 0,
+ op_checkpoint: u64,
 
  /// The op number of the latest committed and executed operation (according to the replica):
  /// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
+ ///
+ /// Invariants (not applicable during status=recovering):
+ /// * `replica.commit_min` exists in the Journal.
+ /// * `replica.commit_min ≤ replica.op`
+ /// * `replica.commit_min ≥ replica.op_checkpoint`.
+ /// * never decreases
  commit_min: u64,
 
  /// The op number of the latest committed operation (according to the cluster):
  /// This is the commit number in terms of the VRR paper.
+ ///
+ /// Invariants:
+ /// * `replica.commit_max ≥ replica.commit_min`.
+ /// * never decreases
  commit_max: u64,
 
- /// Whether we are reading a prepare from storage in order to commit.
+ /// Guards against concurrent commits.
+ ///
+ /// Set while:
+ /// * prefetching from storage, in preparation for a commit
+ /// * reading a prepare from storage in order to commit
+ /// * compacting storage
+ /// * checkpointing
  committing: bool = false,
 
  /// Whether we are reading a prepare from storage in order to push to the pipeline.
@@ -164,7 +193,7 @@ pub fn Replica(
  ///
  /// After a view change, the old leader's pipeline is left untouched so that it is able to
  /// help the new leader repair, even in the face of local storage faults.
- pipeline: RingBuffer(Prepare, config.pipeline_max) = .{},
+ pipeline: RingBuffer(Prepare, config.pipeline_max, .array) = .{},
 
  /// In some cases, a replica may send a message to itself. We do not submit these messages
  /// to the message bus but rather queue them here for guaranteed immediate delivery, which
@@ -236,18 +265,112 @@ pub fn Replica(
 
  on_change_state: ?fn (replica: *Self) void = null,
 
- pub fn init(
- allocator: Allocator,
+ /// Called when `commit_prepare` finishes committing.
+ commit_callback: ?fn (*Self) void = null,
+
+ /// The prepare message being committed.
+ commit_prepare: ?*Message = null,
+
+ const OpenOptions = struct {
+ replica_count: u8,
+ storage: *Storage,
+ message_pool: *MessagePool,
+ time: Time,
+ state_machine_options: StateMachine.Options,
+ message_bus_options: MessageBus.Options,
+ };
+
+ /// Initializes and opens the provided replica using the options.
+ pub fn open(self: *Self, parent_allocator: std.mem.Allocator, options: OpenOptions) !void {
+ self.static_allocator = StaticAllocator.init(parent_allocator);
+ const allocator = self.static_allocator.allocator();
+
+ self.superblock = try SuperBlock.init(
+ allocator,
+ options.storage,
+ options.message_pool,
+ );
+
+ // Once initialized, the replica is in charge of calling superblock.deinit().
+ var initialized = false;
+ errdefer if (!initialized) self.superblock.deinit(allocator);
+
+ // Open the superblock:
+ self.opened = false;
+ self.superblock.open(superblock_open_callback, &self.superblock_context);
+ while (!self.opened) self.superblock.storage.tick();
+ assert(self.superblock.working.vsr_state.internally_consistent());
+
+ if (self.superblock.working.replica >= options.replica_count) {
+ log.err("{}: open: no address for replica (replica_count={})", .{
+ self.superblock.working.replica,
+ options.replica_count,
+ });
+ return error.NoAddress;
+ }
+
+ // Initialize the replica:
+ try self.init(allocator, .{
+ .cluster = self.superblock.working.cluster,
+ .replica_index = self.superblock.working.replica,
+ .replica_count = options.replica_count,
+ .storage = options.storage,
+ .time = options.time,
+ .message_pool = options.message_pool,
+ .state_machine_options = options.state_machine_options,
+ .message_bus_options = options.message_bus_options,
+ });
+
+ // Disable all dynamic allocation from this point onwards.
+ self.static_allocator.transition_from_init_to_static();
+
+ initialized = true;
+ errdefer self.deinit(allocator);
+
+ // Open the (Forest inside) StateMachine:
+ self.opened = false;
+ self.state_machine.open(state_machine_open_callback);
+ while (!self.opened) {
+ self.grid.tick();
+ self.superblock.storage.tick();
+ }
+ }
+
+ fn superblock_open_callback(superblock_context: *SuperBlock.Context) void {
+ const self = @fieldParentPtr(Self, "superblock_context", superblock_context);
+ assert(!self.opened);
+ self.opened = true;
+ }
+
+ fn state_machine_open_callback(state_machine: *StateMachine) void {
+ const self = @fieldParentPtr(Self, "state_machine", state_machine);
+ assert(!self.opened);
+ self.opened = true;
+ }
+
+ const Options = struct {
  cluster: u32,
  replica_count: u8,
- replica: u8,
- time: *Time,
+ replica_index: u8,
+ time: Time,
  storage: *Storage,
- message_bus: *MessageBus,
- state_machine: *StateMachine,
- ) !Self {
+ message_pool: *MessagePool,
+ // TODO With https://github.com/coilhq/tigerbeetle/issues/71,
+ // the separate message_bus_options won't be necessary.
+ message_bus_options: MessageBus.Options,
+ state_machine_options: StateMachine.Options,
+ };
+
+ /// NOTE: self.superblock must be initialized and opened prior to this call.
+ fn init(self: *Self, allocator: Allocator, options: Options) !void {
+ const replica_count = options.replica_count;
+ const replica_index = options.replica_index;
  assert(replica_count > 0);
- assert(replica < replica_count);
+ assert(replica_index < replica_count);
+
+ assert(self.opened);
+ assert(self.superblock.opened);
+ assert(self.superblock.working.vsr_state.internally_consistent());
 
  const majority = (replica_count / 2) + 1;
  assert(majority <= replica_count);
@@ -277,91 +400,112 @@ pub fn Replica(
  // Flexible quorums are safe if these two quorums intersect so that this relation holds:
  assert(quorum_replication + quorum_view_change > replica_count);
 
- var client_table: ClientTable = .{};
- errdefer client_table.deinit(allocator);
- try client_table.ensureTotalCapacity(allocator, @intCast(u32, config.clients_max));
- assert(client_table.capacity() >= config.clients_max);
+ self.time = options.time;
+ self.clock = try Clock.init(
+ allocator,
+ replica_count,
+ replica_index,
+ &self.time,
+ );
+ errdefer self.clock.deinit(allocator);
 
- const root_prepare = Header.root_prepare(cluster);
+ self.journal = try Journal.init(allocator, options.storage, replica_index);
+ errdefer self.journal.deinit(allocator);
 
- var clock = try Clock.init(
+ self.message_bus = try MessageBus.init(
  allocator,
- replica_count,
- replica,
- time,
+ options.cluster,
+ .{ .replica = options.replica_index },
+ options.message_pool,
+ Self.on_message_from_bus,
+ options.message_bus_options,
  );
- errdefer clock.deinit(allocator);
+ errdefer self.message_bus.deinit(allocator);
+
+ self.grid = try Grid.init(allocator, &self.superblock);
+ errdefer self.grid.deinit(allocator);
 
- const journal = try Journal.init(allocator, storage, replica);
- errdefer journal.deinit(allocator);
+ self.state_machine = try StateMachine.init(
+ allocator,
+ &self.grid,
+ options.state_machine_options,
+ );
+ errdefer self.state_machine.deinit(allocator);
 
  const recovery_nonce = blk: {
  var nonce: [@sizeOf(Nonce)]u8 = undefined;
  var hash = std.crypto.hash.Blake3.init(.{});
- hash.update(std.mem.asBytes(&clock.monotonic()));
- hash.update(&[_]u8{replica});
+ hash.update(std.mem.asBytes(&self.clock.monotonic()));
+ hash.update(&[_]u8{replica_index});
  hash.final(&nonce);
  break :blk @bitCast(Nonce, nonce);
  };
 
- var self = Self{
- .cluster = cluster,
+ self.* = Self{
+ .static_allocator = self.static_allocator,
+ .cluster = options.cluster,
  .replica_count = replica_count,
- .replica = replica,
+ .replica = replica_index,
  .quorum_replication = quorum_replication,
  .quorum_view_change = quorum_view_change,
- .clock = clock,
- .journal = journal,
- .message_bus = message_bus,
- .state_machine = state_machine,
- .client_table = client_table,
- .view = root_prepare.view,
- .view_normal = root_prepare.view,
- .op = root_prepare.op,
- .commit_min = root_prepare.commit,
- .commit_max = root_prepare.commit,
+ // Copy the (already-initialized) time back, to avoid regressing the monotonic
+ // clock guard.
+ .time = self.time,
+ .clock = self.clock,
+ .journal = self.journal,
+ .message_bus = self.message_bus,
+ .state_machine = self.state_machine,
+ .superblock = self.superblock,
+ .grid = self.grid,
+ .opened = self.opened,
+ .view = self.superblock.working.vsr_state.view,
+ .view_normal = self.superblock.working.vsr_state.view_normal,
+ .op = 0,
+ .op_checkpoint = self.superblock.working.vsr_state.commit_min,
+ .commit_min = self.superblock.working.vsr_state.commit_min,
+ .commit_max = self.superblock.working.vsr_state.commit_max,
  .ping_timeout = Timeout{
  .name = "ping_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 100,
  },
  .prepare_timeout = Timeout{
  .name = "prepare_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 50,
  },
  .commit_timeout = Timeout{
  .name = "commit_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 100,
  },
  .normal_status_timeout = Timeout{
  .name = "normal_status_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 500,
  },
  .view_change_status_timeout = Timeout{
  .name = "view_change_status_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 500,
  },
  .view_change_message_timeout = Timeout{
  .name = "view_change_message_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 50,
  },
  .repair_timeout = Timeout{
  .name = "repair_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 50,
  },
  .recovery_timeout = Timeout{
  .name = "recovery_timeout",
- .id = replica,
+ .id = replica_index,
  .after = 200,
  },
  .recovery_nonce = recovery_nonce,
- .prng = std.rand.DefaultPrng.init(replica),
+ .prng = std.rand.DefaultPrng.init(replica_index),
  };
 
  log.debug("{}: init: replica_count={} quorum_view_change={} quorum_replication={}", .{
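The quorum arithmetic asserted near the top of this hunk can be exercised in isolation. A sketch under the assumption that quorum_replication is capped at 3, the way config.quorum_replication_max caps it (the cap value and the exact formulas are assumptions, not read from this diff):

    const std = @import("std");
    const assert = std.debug.assert;

    fn quorums(replica_count: u64) struct { replication: u64, view_change: u64 } {
        const majority = (replica_count / 2) + 1;
        const replication = std.math.min(3, majority); // Assumed quorum_replication_max = 3.
        const view_change = std.math.max(majority, replica_count - replication + 1);
        // Safety: any replication quorum must intersect any view-change quorum.
        assert(replication + view_change > replica_count);
        return .{ .replication = replication, .view_change = view_change };
    }

    test "quorum intersection" {
        const q = quorums(5);
        assert(q.replication == 3 and q.view_change == 3);
    }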
@@ -375,28 +519,24 @@ pub fn Replica(
  // always overallocate capacity by a factor of two.
  log.debug("{}: init: client_table.capacity()={} for config.clients_max={} entries", .{
  self.replica,
- self.client_table.capacity(),
+ self.client_table().capacity(),
  config.clients_max,
  });
 
  assert(self.status == .recovering);
-
- return self;
  }
 
  /// Free all memory and unref all messages held by the replica
  /// This does not deinitialize the StateMachine, MessageBus, Storage, or Time
  pub fn deinit(self: *Self, allocator: Allocator) void {
+ self.static_allocator.transition_from_static_to_deinit();
+
  self.journal.deinit(allocator);
  self.clock.deinit(allocator);
-
- {
- var it = self.client_table.iterator();
- while (it.next()) |entry| {
- self.message_bus.unref(entry.value_ptr.reply);
- }
- self.client_table.deinit(allocator);
- }
+ self.state_machine.deinit(allocator);
+ self.superblock.deinit(allocator);
+ self.grid.deinit(allocator);
+ defer self.message_bus.deinit(allocator);
 
  while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
 
@@ -406,6 +546,15 @@ pub fn Replica(
  self.loopback_queue = null;
  }
 
+ if (self.commit_prepare) |message| {
+ assert(self.committing);
+ assert(self.commit_callback != null);
+ self.message_bus.unref(message);
+ self.commit_prepare = null;
+ } else {
+ assert(self.commit_callback == null);
+ }
+
  for (self.do_view_change_from_all_replicas) |message| {
  if (message) |m| self.message_bus.unref(m);
  }
@@ -415,6 +564,11 @@ pub fn Replica(
  }
  }
 
+ /// The client table records for each client the latest session and the latest committed reply.
+ inline fn client_table(self: *Self) *ClientTable {
+ return &self.superblock.client_table;
+ }
+
  /// Time is measured in logical ticks that are incremented on every call to tick().
  /// This eliminates a dependency on the system time and enables deterministic testing.
  pub fn tick(self: *Self) void {
@@ -424,8 +578,15 @@ pub fn Replica(
  // decrease throughput significantly.
  assert(self.loopback_queue == null);
 
+ // TODO Replica owns Time; should it tick() here instead of Clock?
  self.clock.tick();
 
+ // Storage/IO is ticked by top-level in case of multiple replicas sharing the same IO.
+ // self.journal.storage.tick();
+
+ self.grid.tick();
+ self.message_bus.tick();
+
  if (!self.journal.recovered) {
  if (!self.journal.recovering) self.journal.recover();
  return;
@@ -442,6 +603,10 @@ pub fn Replica(
  // The data file is brand new — no messages have ever been written.
  // Transition to normal status; no need to run the VSR recovery protocol.
  assert(self.journal.faulty.count == 0);
+ assert(self.commit_min == 0);
+ assert(self.commit_max == 0);
+ assert(self.op_checkpoint == 0);
+ assert(self.op == 0);
  self.transition_to_normal_from_recovering_status(0);
  assert(self.status == .normal);
  } else if (self.replica_count == 1) {
@@ -449,8 +614,13 @@ pub fn Replica(
  if (self.journal.faulty.count != 0) @panic("journal is corrupt");
  if (self.committing) return;
  assert(self.op == 0);
+ // TODO Assert that this path isn't taken more than once.
  self.op = self.journal.op_maximum();
- self.commit_ops(self.op);
+ assert(self.op >= self.commit_min);
+ assert(self.op >= self.op_checkpoint);
+ assert(self.op <= self.op_checkpoint_trigger());
+ assert(self.journal.header_with_op(self.op) != null);
+ self.commit_journal(self.op);
  // The recovering→normal transition is deferred until all ops are committed.
  } else {
  // The journal just finished recovery.
@@ -482,6 +652,11 @@ pub fn Replica(
  }
 
  /// Called by the MessageBus to deliver a message to the replica.
+ fn on_message_from_bus(message_bus: *MessageBus, message: *Message) void {
+ const self = @fieldParentPtr(Self, "message_bus", message_bus);
+ self.on_message(message);
+ }
+
  pub fn on_message(self: *Self, message: *Message) void {
  assert(self.loopback_queue == null);
  assert(message.references > 0);
@@ -533,6 +708,7 @@ pub fn Replica(
  .request_start_view => self.on_request_start_view(message),
  .request_prepare => self.on_request_prepare(message),
  .request_headers => self.on_request_headers(message),
+ .request_block => unreachable, // TODO
  .headers => self.on_headers(message),
  .nack_prepare => self.on_nack_prepare(message),
  // A replica should never handle misdirected messages intended for a client:
@@ -543,6 +719,7 @@ pub fn Replica(
  });
  return;
  },
+ .block => unreachable, // TODO
  .reserved => unreachable,
  }
 
@@ -731,7 +908,7 @@ pub fn Replica(
  }
 
  // Verify that the new request will fit in the WAL.
- if (message.header.op >= self.op_checkpoint + config.journal_slot_count) {
+ if (message.header.op > self.op_checkpoint_trigger()) {
  log.debug("{}: on_prepare: ignoring op={} (too far ahead, checkpoint={})", .{
  self.replica,
  message.header.op,
@@ -749,13 +926,15 @@ pub fn Replica(
  assert(message.header.op > self.op_checkpoint);
  assert(message.header.op > self.op);
  assert(message.header.op > self.commit_min);
- assert(message.header.op < self.op_checkpoint + config.journal_slot_count);
+ assert(message.header.op <= self.op_checkpoint_trigger());
 
  if (self.follower()) self.normal_status_timeout.reset();
 
  if (message.header.op > self.op + 1) {
  log.debug("{}: on_prepare: newer op", .{self.replica});
  self.jump_to_newer_op_in_normal_status(message.header);
+ // "`replica.op` exists" invariant is temporarily broken.
+ assert(self.journal.header_with_op(message.header.op - 1) == null);
  }
 
  if (self.journal.previous_entry(message.header)) |previous| {
@@ -782,7 +961,7 @@ pub fn Replica(
 
  if (self.follower()) {
  // A prepare may already be committed if requested by repair() so take the max:
- self.commit_ops(std.math.max(message.header.commit, self.commit_max));
+ self.commit_journal(std.math.max(message.header.commit, self.commit_max));
  assert(self.commit_max >= message.header.commit);
  }
  }
@@ -802,7 +981,10 @@ pub fn Replica(
  assert(prepare.message.header.op <= self.op);
 
  // Wait until we have `f + 1` prepare_ok messages (including ourself) for quorum:
- const threshold = self.quorum_replication;
+ // const threshold = self.quorum_replication;
+ // TODO: When Block recover & state transfer are implemented, this can be removed.
+ const threshold =
+ if (prepare.message.header.op == self.op_checkpoint_trigger()) self.replica_count else self.quorum_replication;
 
  const count = self.count_message_and_receive_quorum_exactly_once(
  &prepare.ok_from_all_replicas,
@@ -867,7 +1049,7 @@ pub fn Replica(
  }
 
  self.normal_status_timeout.reset();
- self.commit_ops(message.header.commit);
+ self.commit_journal(message.header.commit);
  }
 
  fn on_repair(self: *Self, message: *Message) void {
@@ -894,7 +1076,9 @@ pub fn Replica(
  }
 
  if (self.status == .view_change and !self.do_view_change_quorum) {
- log.debug("{}: on_repair: ignoring (view change, waiting for quorum)", .{self.replica});
+ log.debug("{}: on_repair: ignoring (view change, waiting for quorum)", .{
+ self.replica,
+ });
  return;
  }
 
@@ -911,6 +1095,7 @@ pub fn Replica(
 
  if (self.journal.has_clean(message.header)) {
  log.debug("{}: on_repair: ignoring (duplicate)", .{self.replica});
+
  self.send_prepare_ok(message.header);
  defer self.flush_loopback_queue();
  return;
@@ -985,6 +1170,28 @@ pub fn Replica(
  /// informs the other replicas of the completion of the view change by sending
  /// ⟨start_view v, l, n, k⟩ messages to the other replicas, where l is the new log, n is the
  /// op number, and k is the commit number.
+ ///
+ /// For each DVC in the quorum:
+ ///
+ /// * The headers must all belong to the same hash chain. (Gaps are allowed).
+ /// (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
+ /// loaded into the new leader with `replace_header()`, not `repair_header()`).
+ ///
+ /// Across all DVCs in the quorum:
+ ///
+ /// * The headers of every DVC with the same view_normal must agree. In other words:
+ /// dvc₁.headers[i].op == dvc₂.headers[j].op implies
+ /// dvc₁.headers[i].checksum == dvc₂.headers[j].checksum.
+ /// (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
+ /// loaded into the new leader with `replace_header()`, not `repair_header()`).
+ ///
+ /// Perhaps unintuitively, it is safe to advertise a header before its message is prepared
+ /// (e.g. the write is still queued). The header is either:
+ ///
+ /// * committed — so another replica in the quorum must have a copy, according to the quorum
+ /// intersection property. Or,
+ /// * uncommitted — if the header is chosen, but cannot be recovered from any replica, then
+ /// it will be discarded by the nack protocol.
  fn on_do_view_change(self: *Self, message: *Message) void {
  if (self.ignore_view_change_message(message)) return;
 
@@ -1000,8 +1207,9 @@ pub fn Replica(
  // We may receive a `do_view_change` quorum from other replicas, which already have a
  // `start_view_change_quorum`, before we receive a `start_view_change_quorum`:
  if (!self.start_view_change_quorum) {
- log.debug("{}: on_do_view_change: waiting for start_view_change quorum", .{
+ log.debug("{}: on_do_view_change: waiting for start_view_change quorum (view={})", .{
  self.replica,
+ self.view,
  });
  return;
  }
@@ -1023,75 +1231,14 @@ pub fn Replica(
  self.view,
  });
 
- var v: ?u32 = null;
- var k: ?u64 = null;
- var latest = Header.reserved(self.cluster, 0);
-
- for (self.do_view_change_from_all_replicas) |received, replica| {
- if (received) |m| {
- assert(m.header.command == .do_view_change);
- assert(m.header.cluster == self.cluster);
- assert(m.header.replica == replica);
- assert(m.header.view == self.view);
-
- // The latest normal view experienced by this replica:
- // This may be higher than the view in any of the prepare headers.
- var replica_view_normal = @intCast(u32, m.header.timestamp);
- assert(replica_view_normal < m.header.view);
-
- var replica_latest = Header.reserved(self.cluster, 0);
- set_latest_op(self.message_body_as_headers(m), &replica_latest);
- assert(replica_latest.op == m.header.op);
-
- log.debug(
- "{}: on_do_view_change: replica={} v'={} op={} commit={} latest={}",
- .{
- self.replica,
- m.header.replica,
- replica_view_normal,
- m.header.op,
- m.header.commit,
- replica_latest,
- },
- );
-
- if (v == null or replica_view_normal > v.?) {
- v = replica_view_normal;
- latest = replica_latest;
- } else if (replica_view_normal == v.? and replica_latest.op > latest.op) {
- v = replica_view_normal;
- latest = replica_latest;
- }
-
- if (k == null or m.header.commit > k.?) k = m.header.commit;
- }
- }
-
- self.set_latest_op_and_k(&latest, k.?, "on_do_view_change");
-
- // Now that we have the latest op in place, repair any other headers:
- for (self.do_view_change_from_all_replicas) |received| {
- if (received) |m| {
- for (self.message_body_as_headers(m)) |*h| {
- _ = self.repair_header(h);
- }
- }
- }
-
- // Verify that the repairs above have not replaced or advanced the latest op:
- assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
-
  assert(self.start_view_change_quorum);
  assert(!self.do_view_change_quorum);
  self.do_view_change_quorum = true;
 
- self.discard_uncommitted_headers();
+ self.set_log_from_do_view_change_messages();
  assert(self.op >= self.commit_max);
-
- const prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
- if (self.state_machine.prepare_timestamp < prepare_timestamp) {
- self.state_machine.prepare_timestamp = prepare_timestamp;
- }
+ assert(self.state_machine.prepare_timestamp >=
+ self.journal.header_with_op(self.op).?.timestamp);
 
  // Start repairs according to the CTRL protocol:
  assert(!self.repair_timeout.ticking);
@@ -1109,6 +1256,16 @@ pub fn Replica(
  fn on_start_view(self: *Self, message: *const Message) void {
  if (self.ignore_view_change_message(message)) return;
 
+ if (message.header.op > self.op_checkpoint_trigger()) {
+ // This replica is too far behind, i.e. the new `self.op` is too far ahead of the
+ // last checkpoint. If we wrap now, we overwrite un-checkpointed transfers in the WAL,
+ // precluding recovery.
+ //
+ // TODO State transfer. Currently this is unreachable because the
+ // leader won't checkpoint until all replicas are caught up.
+ unreachable;
+ }
+
  assert(self.status == .view_change or self.status == .normal);
  assert(message.header.view >= self.view);
  assert(message.header.replica != self.replica);
@@ -1118,20 +1275,12 @@ pub fn Replica(
 
  assert(self.status == .view_change);
  assert(message.header.view == self.view);
+ assert(message.header.op == op_highest(message_body_as_headers(message)));
 
- var latest = Header.reserved(self.cluster, 0);
- set_latest_op(self.message_body_as_headers(message), &latest);
- assert(latest.op == message.header.op);
-
- self.set_latest_op_and_k(&latest, message.header.commit, "on_start_view");
+ self.set_op_and_commit_max(message.header.op, message.header.commit, "on_start_view");
+ self.replace_headers(message_body_as_headers(message));
 
- // Now that we have the latest op in place, repair any other headers:
- for (self.message_body_as_headers(message)) |*h| {
- _ = self.repair_header(h);
- }
-
- // Verify that the repairs above have not replaced or advanced the latest op:
- assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
+ assert(self.op == message.header.op);
 
  if (self.status == .view_change) {
  self.transition_to_normal_from_view_change_status(message.header.view);
@@ -1142,7 +1291,7 @@ pub fn Replica(
  assert(message.header.view == self.view);
  assert(self.follower());
 
- self.commit_ops(self.commit_max);
+ self.commit_journal(self.commit_max);
 
  self.repair();
  }
@@ -1201,8 +1350,45 @@ pub fn Replica(
  .commit = self.commit_max,
  };
 
- const count_max = 8; // The maximum number of prepare headers to include in the body.
- const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, response);
+ // A recovery response attaches at least as many headers as a DVC message attaches.
+ // To understand why, consider this scenario, where:
+ //
+ // replica_count 3
+ // do_view_change.headers.len 3 (= pipeline_max)
+ // recovery_response.headers.len 2 (!)
+ // replica 0 log 3, 4a, 5a, 6a, 7a, 8a (status=normal, leader)
+ // replica 1 log 3, 4a, 5a, --, --, -- (status=normal, follower)
+ // replica 2 log 3, 4b, 5b, --, --, -- (status=recovering)
+ //
+ // 1. Replica 2 receives a recovery_response quorum.
+ // 2. Replica 2 sets `replica.op` to 8a.
+ // 3. Replica 2 sets its headers from the leader's recovery_response (8a, 7a)
+ // (via `replace_header()`).
+ // 4. Replica 2 transitions to status=normal.
+ // 5. Replica 0 fails (before replica 2 has a chance to repair its hash chain.)
+ // 6. Replica 1 initiates a view change.
+ // 7. Replica 1 collects a DVC quorum:
+ // replica 1: 3, 4a, 5a (view_normal=latest)
+ // replica 2: 5b, 7a, 8a (view_normal=latest)
+ // Replicas 1 and 2 share the highest view_normal, so both sets of headers are canonical.
+ // 8. Replica 1 loads the canonical headers (via `replace_header()`) from both DVCs.
+ // Messages 8a and 7a will be dropped via `do_view_change_op_max()` (due to the
+ // gap at op 6). But there is a conflict at op=5. For correctness, replica 1 must
+ // pick 5a — 5a may be committed by replica 0.
+ // Without replica 0's assistance, replica 1 has no way to pick between 5a/5b.
+ //
+ // Including at least as many headers in the recovery response as the DVC maintains the
+ // invariant: DVCs with the same view_normal must never disagree on the identity of a
+ // message.
+ //
+ // (DVCs can still safely include gaps — but they must be of the form [4a,__,6a],
+ // not [4a,__,6b]).
+ const count = self.copy_latest_headers_and_set_size(
+ 0,
+ self.op,
+ view_change_headers_count,
+ response,
+ );
  assert(count > 0); // We expect that self.op always exists.
  assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
@@ -1258,7 +1444,7 @@ pub fn Replica(
  // receiver's state changed in the mean time.
 
  log.debug(
- "{}: on_recovery_response: replica={} view={}..{} op={}..{} commit={}..{}",
+ "{}: on_recovery_response: replacing response replica={} view={}..{} op={}..{} commit={}..{}",
  .{
  self.replica,
  existing.header.replica,
@@ -1371,17 +1557,41 @@ pub fn Replica(
  // protocol), if the view number indicates that this replica is a leader, it must
  // transition to status=view_change instead of status=normal.
 
- const leader_headers = self.message_body_as_headers(leader_response.?);
+ const leader_headers = message_body_as_headers(leader_response.?);
  assert(leader_headers.len > 0);
 
  const commit = leader_response.?.header.commit;
  {
- var latest = Header.reserved(self.cluster, 0);
- set_latest_op(leader_headers, &latest);
- assert(latest.op == leader_response.?.header.op);
+ const op = op_highest(leader_headers);
+ assert(op == leader_response.?.header.op);
+
+ self.set_op_and_commit_max(op, commit, "on_recovery_response");
+
+ // TODO If the view's primary is >1 WAL ahead of us, these headers could cause
+ // problems. We don't want to jump this far ahead to repair, but we still need to
+ // use the hash chain to figure out which headers to request. Maybe include our
+ // `op_checkpoint` in the recovery (request) message so that the response can give
+ // more useful (i.e. older) headers.
+ self.replace_headers(leader_headers);
+
+ if (self.op < config.journal_slot_count) {
+ if (self.journal.header_with_op(0)) |header| {
+ assert(header.command == .prepare);
+ assert(header.operation == .root);
+ } else {
+ // This is the first wrap of the log, and the root prepare is corrupt.
+ // Repair the root prepare. This is necessary to maintain the invariant that
+ // the op=commit_min exists in-memory.
+ //
+ // op=0 wouldn't have been repaired by replace_headers above, because it is
+ // already "checkpointed".
+ const header = Header.root_prepare(self.cluster);
+ self.journal.set_header_as_dirty(&header);
+ log.debug("{}: on_recovery_response: repair root op", .{self.replica});
+ }
+ }
 
- self.set_latest_op_and_k(&latest, commit, "on_recovery_response");
- assert(self.op == latest.op);
+ assert(self.op == op);
  assert(self.journal.header_with_op(self.op) != null);
  }
 
@@ -1390,30 +1600,7 @@ pub fn Replica(
  assert(self.status == .normal);
  assert(self.follower());
 
- // TODO If the view's primary is >1 WAL ahead of us, these headers could cause
- // problems. We don't want to jump this far ahead to repair, but we still need to use
- // the hash chain to figure out which headers to request. Maybe include our
- // `op_checkpoint` in the recovery (request) message so that the response can give more
- // useful (i.e. older) headers.
- for (leader_headers) |*header| {
- _ = self.repair_header(header);
- }
-
- if (self.op < config.journal_slot_count) {
- if (self.journal.header_with_op(0)) |header| {
- assert(header.command == .prepare);
- assert(header.operation == .root);
- } else {
- // This is the first wrap of the log, and the root prepare is corrupt.
- // Repair the root repair. This is necessary to maintain the invariant that the
- // op=commit_min exists in-memory.
- const header = Header.root_prepare(self.cluster);
- self.journal.set_header_as_dirty(&header);
- log.debug("{}: on_recovery_response: repair root op", .{self.replica});
- }
- }
-
- log.debug("{}: on_recovery_response: responses={} view={} headers={}..{}" ++
+ log.info("{}: on_recovery_response: recovery done responses={} view={} headers={}..{}" ++
  " commit={} dirty={} faulty={}", .{
  self.replica,
  count,
@@ -1429,7 +1616,7 @@ pub fn Replica(
  // `state_machine.commit_timestamp` is updated as messages are committed.
 
  self.reset_quorum_recovery_response();
- self.commit_ops(commit);
+ self.commit_journal(commit);
  self.repair();
  }
 
@@ -1486,28 +1673,18 @@ pub fn Replica(
  checksum,
  });
 
- if (self.journal.header_with_op_and_checksum(op, checksum)) |_| {
- // The header for the target prepare is already in-memory.
- // This is preferable to the `else` case since we have the prepare's
- // `header.size` in-memory, so the read can be (potentially) shorter.
- // TODO Do not reissue the read if we are already reading in order to send
- // to this particular destination replica.
- self.journal.read_prepare(
- on_request_prepare_read,
- op,
- prepare_checksum,
- message.header.replica,
- );
- } else {
- // TODO Do not reissue the read if we are already reading in order to send to
- // this particular destination replica.
- self.journal.read_prepare_with_op_and_checksum(
- on_request_prepare_read,
- op,
- prepare_checksum,
- message.header.replica,
- );
- }
+ // Improve availability by calling `read_prepare_with_op_and_checksum` instead
+ // of `read_prepare` even if `journal.headers` contains the target message.
+ // The latter skips the read when the target prepare is present but dirty (e.g.
+ // it was recovered with decision=fix).
+ // TODO Do not reissue the read if we are already reading in order to send to
+ // this particular destination replica.
+ self.journal.read_prepare_with_op_and_checksum(
+ on_request_prepare_read,
+ op,
+ prepare_checksum,
+ message.header.replica,
+ );
 
  // We have guaranteed the prepare (not safe to nack).
  // Our copy may or may not be valid, but we will try to read & forward it.
@@ -1734,7 +1911,7 @@ pub fn Replica(
 
  var op_min: ?u64 = null;
  var op_max: ?u64 = null;
- for (self.message_body_as_headers(message)) |*h| {
+ for (message_body_as_headers(message)) |*h| {
  if (op_min == null or h.op < op_min.?) op_min = h.op;
  if (op_max == null or h.op > op_max.?) op_max = h.op;
  _ = self.repair_header(h);
@@ -1944,10 +2121,44 @@ pub fn Replica(
  assert(m.header.replica == message.header.replica);
  assert(m.header.view == message.header.view);
  assert(m.header.op == message.header.op);
- assert(m.header.commit == message.header.commit);
  assert(m.header.checksum_body == message.header.checksum_body);
- assert(m.header.checksum == message.header.checksum);
- log.debug("{}: on_{s}: ignoring (duplicate message)", .{ self.replica, command });
+
+ if (message.header.command == .do_view_change) {
+ // Replicas don't resend `do_view_change` messages to themselves.
+ assert(message.header.replica != self.replica);
+ // A replica may resend a `do_view_change` with a different commit if it was
+ // committing originally. Keep the one with the highest commit.
+ // This is *not* necessary for correctness.
+ if (m.header.commit < message.header.commit) {
+ log.debug("{}: on_{s}: replacing (newer message replica={} commit={}..{})", .{
+ self.replica,
+ command,
+ message.header.replica,
+ m.header.commit,
+ message.header.commit,
+ });
+ // TODO(Buggify): skip updating the DVC, since it isn't required for correctness.
+ self.message_bus.unref(m);
+ messages[message.header.replica] = message.ref();
+ } else if (m.header.commit > message.header.commit) {
+ log.debug("{}: on_{s}: ignoring (older message replica={})", .{
+ self.replica,
+ command,
+ message.header.replica,
+ });
+ } else {
+ assert(m.header.checksum == message.header.checksum);
+ }
+ } else {
+ assert(m.header.commit == message.header.commit);
+ assert(m.header.checksum == message.header.checksum);
+ }
+
+ log.debug("{}: on_{s}: ignoring (duplicate message replica={})", .{
+ self.replica,
+ command,
+ message.header.replica,
+ });
  return null;
  }
 
@@ -2004,6 +2215,7 @@ pub fn Replica(
  if (self.replica_count == 2) assert(threshold == 1);
 
  assert(self.status == .view_change);
+ assert(self.replica != message.header.replica);
  },
  .nack_prepare => {
  assert(self.replica_count > 1);
@@ -2011,6 +2223,8 @@ pub fn Replica(
 
  assert(self.status == .view_change);
  assert(self.leader_index(self.view) == self.replica);
+ assert(message.header.replica != self.replica);
+ assert(message.header.op == self.nack_prepare_op.?);
  },
  else => unreachable,
  }
@@ -2065,9 +2279,15 @@ pub fn Replica(
  // In a cluster-of-one, the prepares must always be written to the WAL sequentially
  // (never concurrently). This ensures that there will be no gaps in the WAL during
  // crash recovery.
- log.debug("{}: append: serializing append op={}", .{ self.replica, message.header.op });
+ log.debug("{}: append: serializing append op={}", .{
+ self.replica,
+ message.header.op,
+ });
  } else {
- log.debug("{}: append: appending to journal", .{self.replica});
+ log.debug("{}: append: appending to journal op={}", .{
+ self.replica,
+ message.header.op,
+ });
  self.write_prepare(message, .append);
  }
  }
@@ -2115,9 +2335,9 @@ pub fn Replica(
  }
 
  /// Commit ops up to commit number `commit` (inclusive).
- /// A function which calls `commit_ops()` to set `commit_max` must first call `view_jump()`.
- /// Otherwise, we may fork the log.
- fn commit_ops(self: *Self, commit: u64) void {
+ /// A function which calls `commit_journal()` to set `commit_max` must first call
+ /// `view_jump()`. Otherwise, we may fork the log.
+ fn commit_journal(self: *Self, commit: u64) void {
  // TODO Restrict `view_change` status only to the leader purely as defense-in-depth.
  // Be careful of concurrency when doing this, as successive view changes can happen quickly.
  assert(self.status == .normal or self.status == .view_change or
@@ -2131,9 +2351,9 @@ pub fn Replica(
  if (commit <= self.commit_min) return;
 
  // We must update `commit_max` even if we are already committing, otherwise we will lose
- // information that we should know, and `set_latest_op_and_k()` will catch us out:
+ // information that we should know, and `set_op_and_commit_max()` will catch us out:
  if (commit > self.commit_max) {
- log.debug("{}: commit_ops: advancing commit_max={}..{}", .{
+ log.debug("{}: commit_journal: advancing commit_max={}..{}", .{
  self.replica,
  self.commit_max,
  commit,
@@ -2141,9 +2361,9 @@ pub fn Replica(
  self.commit_max = commit;
  }
 
- // Guard against multiple concurrent invocations of commit_ops():
+ // Guard against multiple concurrent invocations of commit_journal()/commit_pipeline():
  if (self.committing) {
- log.debug("{}: commit_ops: already committing...", .{self.replica});
+ log.debug("{}: commit_journal: already committing...", .{self.replica});
  return;
  }
 
@@ -2160,19 +2380,19 @@ pub fn Replica(
  assert(!self.committing);
  self.committing = true;
 
- self.commit_ops_read();
+ self.commit_journal_next();
  }
 
- fn commit_ops_read(self: *Self) void {
+ fn commit_journal_next(self: *Self) void {
  assert(self.committing);
  assert(self.status == .normal or self.status == .view_change or
  (self.status == .recovering and self.replica_count == 1));
  assert(self.commit_min <= self.commit_max);
  assert(self.commit_min <= self.op);
 
- if (!self.valid_hash_chain("commit_ops_read")) {
- self.committing = false;
+ if (!self.valid_hash_chain("commit_journal_next")) {
  assert(self.replica_count > 1);
+ self.commit_ops_done();
  return;
  }
  assert(self.op >= self.commit_max);
@@ -2182,9 +2402,9 @@ pub fn Replica(
  if (self.commit_min < self.commit_max and self.commit_min < self.op) {
  const op = self.commit_min + 1;
  const checksum = self.journal.header_with_op(op).?.checksum;
- self.journal.read_prepare(commit_ops_commit, op, checksum, null);
+ self.journal.read_prepare(commit_journal_next_callback, op, checksum, null);
  } else {
- self.committing = false;
+ self.commit_ops_done();
  // This is an optimization to expedite the view change before the `repair_timeout`:
  if (self.status == .view_change and self.repairs_allowed()) self.repair();
 
@@ -2194,33 +2414,43 @@ pub fn Replica(
  assert(self.commit_min == self.op);
  self.transition_to_normal_from_recovering_status(0);
  } else {
- // We expect that a cluster-of-one only calls commit_ops() in recovering status.
+ // We expect that a cluster-of-one only calls commit_journal() in recovering status.
  assert(self.replica_count > 1);
  }
  }
  }
 
- fn commit_ops_commit(self: *Self, prepare: ?*Message, destination_replica: ?u8) void {
- assert(destination_replica == null);
-
+ fn commit_journal_next_callback(self: *Self, prepare: ?*Message, destination_replica: ?u8) void {
  assert(self.committing);
- self.committing = false;
+ assert(destination_replica == null);
 
  if (prepare == null) {
- log.debug("{}: commit_ops_commit: prepare == null", .{self.replica});
+ self.commit_ops_done();
+ log.debug("{}: commit_journal_next_callback: prepare == null", .{self.replica});
  if (self.replica_count == 1) @panic("cannot recover corrupt prepare");
  return;
  }
 
+ const slot = self.journal.slot_with_op_and_checksum(
+ prepare.?.header.op,
+ prepare.?.header.checksum,
+ ).?;
+ assert(self.journal.prepare_inhabited[slot.index]);
+ assert(self.journal.prepare_checksums[slot.index] == prepare.?.header.checksum);
+ assert(self.journal.has(prepare.?.header));
+
  switch (self.status) {
  .normal => {},
  .view_change => {
  if (self.leader_index(self.view) != self.replica) {
- log.debug("{}: commit_ops_commit: no longer leader", .{self.replica});
+ self.commit_ops_done();
+ log.debug("{}: commit_journal_next_callback: no longer leader view={}", .{
+ self.replica,
+ self.view,
+ });
  assert(self.replica_count > 1);
  return;
  }
-
  // Only the leader may commit during a view change before starting the new view.
  // Fall through if this is indeed the case.
  },
@@ -2231,31 +2461,194 @@ pub fn Replica(
  }

  const op = self.commit_min + 1;
+ assert(prepare.?.header.op == op);

- if (prepare.?.header.op != op) {
- log.debug("{}: commit_ops_commit: op changed", .{self.replica});
- assert(self.replica_count > 1);
- return;
+ self.commit_op_prefetch(prepare.?, commit_journal_callback);
+ }
+
+ fn commit_journal_callback(self: *Self) void {
+ assert(self.committing);
+ assert(self.commit_min <= self.commit_max);
+ assert(self.commit_min <= self.op);
+
+ self.commit_journal_next();
+ }
+
+ /// Begin the commit path that is common between `commit_pipeline` and `commit_journal`:
+ ///
+ /// 1. prefetch
+ /// 2. commit_op: Update the state machine and the replica's commit_min/commit_max.
+ /// 3. compact
+ /// 4. checkpoint: (Only called when `commit_min == op_checkpoint_trigger`).
+ /// 5. done: Call the `callback` that was passed to `commit_op_prefetch`.
+ fn commit_op_prefetch(
+ self: *Self,
+ prepare: *Message,
+ callback: fn (*Self) void,
+ ) void {
+ assert(self.committing);
+ assert(self.status == .normal or self.status == .view_change or
+ (self.status == .recovering and self.replica_count == 1));
+ assert(self.commit_prepare == null);
+ assert(self.commit_callback == null);
+ assert(prepare.header.command == .prepare);
+ assert(prepare.header.operation != .root);
+ assert(prepare.header.op == self.commit_min + 1);
+ assert(prepare.header.op <= self.op);
+
+ self.commit_prepare = prepare.ref();
+ self.commit_callback = callback;
+ self.state_machine.prefetch(
+ commit_op_prefetch_callback,
+ prepare.header.op,
+ prepare.header.operation.cast(StateMachine),
+ prepare.body(),
+ );
+ }
+
+ fn commit_op_prefetch_callback(state_machine: *StateMachine) void {
+ const self = @fieldParentPtr(Self, "state_machine", state_machine);
+ assert(self.committing);
+ assert(self.commit_prepare != null);
+ assert(self.commit_callback != null);
+ assert(self.commit_prepare.?.header.op == self.commit_min + 1);
+
+ self.commit_op(self.commit_prepare.?);
+ assert(self.commit_min == self.commit_prepare.?.header.op);
+ assert(self.commit_min <= self.commit_max);
+
+ if (self.status == .normal and self.leader()) {
+ const prepare = self.pipeline.pop().?;
+ assert(self.commit_min == self.commit_max);
+ assert(prepare.message.header.checksum == self.commit_prepare.?.header.checksum);
+ assert(prepare.message.header.op == self.commit_min);
+ assert(prepare.message.header.op == self.commit_max);
+ assert(self.prepare_timeout.ticking);
+
+ self.message_bus.unref(prepare.message);
+
+ if (self.pipeline.head_ptr()) |next| {
+ assert(next.message.header.op == self.commit_min + 1);
+ assert(next.message.header.op == self.commit_prepare.?.header.op + 1);
+
+ if (self.replica_count == 1) {
+ // Write the next message in the queue.
+ // A cluster-of-one writes prepares sequentially to avoid gaps in the
+ // WAL caused by reordered writes.
+ log.debug("{}: append: appending to journal op={}", .{
+ self.replica,
+ next.message.header.op,
+ });
+ self.write_prepare(next.message, .append);
+ }
+ } else {
+ // When the pipeline is empty, stop the prepare timeout.
+ // The timeout will be restarted when another entry arrives for the pipeline.
+ self.prepare_timeout.stop();
+ }
  }

- if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
- log.debug("{}: commit_ops_commit: checksum changed", .{self.replica});
- assert(self.replica_count > 1);
- return;
+ self.state_machine.compact(commit_op_compact_callback, self.commit_prepare.?.header.op);
+ }
+
+ fn commit_op_compact_callback(state_machine: *StateMachine) void {
+ const self = @fieldParentPtr(Self, "state_machine", state_machine);
+ assert(self.committing);
+ assert(self.commit_callback != null);
+ assert(self.op_checkpoint == self.superblock.staging.vsr_state.commit_min);
+ assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
+
+ const op = self.commit_prepare.?.header.op;
+ assert(op == self.commit_min);
+
+ if (op == self.op_checkpoint_trigger()) {
+ assert(op == self.op);
+ assert((op + 1) % config.lsm_batch_multiple == 0);
+ log.debug("{}: commit_op_compact_callback: checkpoint start " ++
+ "(op={} current_checkpoint={} next_checkpoint={})", .{
+ self.replica,
+ self.op,
+ self.op_checkpoint,
+ self.op_checkpoint_next(),
+ });
+ self.state_machine.checkpoint(commit_op_checkpoint_state_machine_callback);
+ } else {
+ assert(op < self.op_checkpoint_trigger());
+ self.commit_op_done();
  }
+ }

- self.commit_op(prepare.?);
+ fn commit_op_checkpoint_state_machine_callback(state_machine: *StateMachine) void {
+ const self = @fieldParentPtr(Self, "state_machine", state_machine);
+ assert(self.committing);
+ assert(self.commit_callback != null);
+ assert(self.commit_prepare.?.header.op == self.op);
+ assert(self.commit_prepare.?.header.op == self.commit_min);
+ assert(self.commit_prepare.?.header.op == self.op_checkpoint_trigger());

- assert(self.commit_min == op);
- assert(self.commit_min <= self.commit_max);
- assert(self.commit_min <= self.op);
+ // For the given WAL (journal_slot_count=8, lsm_batch_multiple=2, op=commit_min=7):
+ //
+ // A B C D E
+ // |01|23|45|67|
+ //
+ // The checkpoint is triggered at "E".
+ // At this point, ops 6 and 7 are in the in-memory immutable table.
+ // They will only be compacted to disk in the next bar.
+ // Therefore, only ops "A..D" are committed to disk.
+ // Thus, the SuperBlock's `commit_min` is set to 7-2=5.
+ const vsr_state_new = .{
+ .commit_min = self.op_checkpoint_next(),
+ .commit_max = self.commit_max,
+ .view_normal = self.view_normal,
+ .view = self.view,
+ };
+ assert(VSRState.monotonic(self.superblock.working.vsr_state, vsr_state_new));

- self.committing = true;
- self.commit_ops_read();
+ self.superblock.staging.vsr_state = vsr_state_new;
+ self.superblock.checkpoint(
+ commit_op_checkpoint_superblock_callback,
+ &self.superblock_context,
+ );
+ }
+
+ fn commit_op_checkpoint_superblock_callback(superblock_context: *SuperBlock.Context) void {
+ const self = @fieldParentPtr(Self, "superblock_context", superblock_context);
+ assert(self.committing);
+ assert(self.commit_callback != null);
+ assert(self.commit_prepare.?.header.op == self.op);
+ assert(self.commit_prepare.?.header.op == self.commit_min);
+
+ self.op_checkpoint = self.op_checkpoint_next();
+ assert(self.op_checkpoint == self.commit_min - config.lsm_batch_multiple);
+ assert(self.op_checkpoint == self.superblock.staging.vsr_state.commit_min);
+ assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
+
+ log.debug("{}: commit_op_checkpoint_superblock_callback: checkpoint done (op={} new_checkpoint={})", .{
+ self.replica,
+ self.op,
+ self.op_checkpoint,
+ });
+
+ self.commit_op_done();
+ }
+
+ fn commit_op_done(self: *Self) void {
+ const callback = self.commit_callback.?;
+ assert(self.committing);
+ assert(self.commit_prepare.?.header.op == self.commit_min);
+ assert(self.commit_prepare.?.header.op < self.op_checkpoint_trigger());
+
+ self.message_bus.unref(self.commit_prepare.?);
+ self.commit_prepare = null;
+ self.commit_callback = null;
+ callback(self);
  }

  fn commit_op(self: *Self, prepare: *const Message) void {
  // TODO Can we add more checks around allowing commit_op() during a view change?
+ assert(self.committing);
+ assert(self.commit_prepare.? == prepare);
+ assert(self.commit_callback != null);
  assert(self.status == .normal or self.status == .view_change or
  (self.status == .recovering and self.replica_count == 1));
  assert(prepare.header.command == .prepare);
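This hunk replaces the old synchronous `commit_ops_commit` with a chain of asynchronous callbacks: `commit_op_prefetch` stores the caller's continuation, each completion callback starts the next stage (prefetch, commit_op, compact, checkpoint), and `commit_op_done` clears the stored state before resuming the caller. A minimal standalone sketch of that callback-chaining pattern, with hypothetical simplified names (`Committer`, `on_commit`) rather than the replica's actual types:

```zig
// Sketch of the stored-callback commit chain (hypothetical, simplified).
const std = @import("std");

const Committer = struct {
    committing: bool = false,
    commit_callback: ?fn (*Committer) void = null,

    // Step 1: store the caller's callback, then start the async chain.
    fn commit_op_prefetch(self: *Committer, callback: fn (*Committer) void) void {
        std.debug.assert(self.commit_callback == null);
        self.commit_callback = callback;
        self.prefetch_done(); // A real implementation would yield to I/O here.
    }

    fn prefetch_done(self: *Committer) void {
        // Steps 2 and 3 (commit_op, then compact) would run here.
        self.compact_done();
    }

    fn compact_done(self: *Committer) void {
        // Step 4 (checkpoint) is interposed only when the commit reaches the
        // checkpoint trigger; otherwise fall straight through.
        self.commit_op_done();
    }

    // Step 5: clear the stored state before invoking the callback, so that
    // the callback may immediately begin committing the next op.
    fn commit_op_done(self: *Committer) void {
        const callback = self.commit_callback.?;
        self.commit_callback = null;
        callback(self);
    }
};

fn on_commit(committer: *Committer) void {
    committer.committing = false;
}

pub fn main() void {
    var committer = Committer{ .committing = true };
    committer.commit_op_prefetch(on_commit);
    std.debug.assert(!committer.committing);
}
```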
@@ -2263,10 +2656,12 @@ pub fn Replica(
  assert(prepare.header.op == self.commit_min + 1);
  assert(prepare.header.op <= self.op);

- // If we are a follower committing through `commit_ops()` then a view change may have
- // happened since we last checked in `commit_ops_read()`. However, this would relate to
- // subsequent ops, since by now we have already verified the hash chain for this commit.
+ // If we are a follower committing through `commit_journal()` then a view change may
+ // have happened since we last checked in `commit_journal_next()`. However, this would
+ // relate to subsequent ops, since by now we have already verified the hash chain for
+ // this commit.

+ assert(self.journal.has(prepare.header));
  assert(self.journal.header_with_op(self.commit_min).?.checksum ==
  prepare.header.parent);

@@ -2282,10 +2677,16 @@ pub fn Replica(
  const reply = self.message_bus.get_message();
  defer self.message_bus.unref(reply);

+ log.debug("{}: commit_op: commit_timestamp={} prepare.header.timestamp={}", .{
+ self.replica,
+ self.state_machine.commit_timestamp,
+ prepare.header.timestamp,
+ });
  assert(self.state_machine.commit_timestamp < prepare.header.timestamp);

  const reply_body_size = @intCast(u32, self.state_machine.commit(
  prepare.header.client,
+ prepare.header.op,
  prepare.header.operation.cast(StateMachine),
  prepare.buffer[@sizeOf(Header)..prepare.header.size],
  reply.buffer[@sizeOf(Header)..],
@@ -2310,20 +2711,38 @@ pub fn Replica(
  .replica = prepare.header.replica,
  .view = prepare.header.view,
  .op = prepare.header.op,
+ .timestamp = prepare.header.timestamp,
  .commit = prepare.header.op,
  .size = @sizeOf(Header) + reply_body_size,
  };
- assert(reply.header.timestamp == 0);
  assert(reply.header.epoch == 0);

  reply.header.set_checksum_body(reply.body());
  reply.header.set_checksum();

- if (reply.header.operation == .register) {
- self.create_client_table_entry(reply);
- } else {
- self.update_client_table_entry(reply);
- }
+ if (self.superblock.working.vsr_state.op_compacted(prepare.header.op)) {
+ // We are recovering from a checkpoint. Prior to the crash, the client table was
+ // updated with entries for one bar beyond the op_checkpoint.
+ assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
+ if (self.client_table().get(prepare.header.client)) |entry| {
+ assert(entry.reply.header.command == .reply);
+ assert(entry.reply.header.op >= prepare.header.op);
+ } else {
+ assert(self.client_table().count() == self.client_table().capacity());
+ }
+
+ log.debug("{}: commit_op: skip client table update: prepare.op={} checkpoint={}", .{
+ self.replica,
+ prepare.header.op,
+ self.op_checkpoint,
+ });
+ } else {
+ if (reply.header.operation == .register) {
+ self.create_client_table_entry(reply);
+ } else {
+ self.update_client_table_entry(reply);
+ }
+ }

  if (self.leader_index(self.view) == self.replica) {
  log.debug("{}: commit_op: replying to client: {}", .{ self.replica, reply.header });
@@ -2332,22 +2751,38 @@ pub fn Replica(
  }

  /// Commits, frees and pops as many prepares at the head of the pipeline as have quorum.
+ /// Can be called only when the replica is the leader.
  /// Can be called only when the pipeline has at least one prepare.
- /// Stops the prepare timeout and resets the timeouts counter if the pipeline becomes empty.
  fn commit_pipeline(self: *Self) void {
  assert(self.status == .normal);
  assert(self.leader());
  assert(self.pipeline.count > 0);

- while (self.pipeline.head_ptr()) |prepare| {
- assert(self.pipeline.count > 0);
+ // Guard against multiple concurrent invocations of commit_journal()/commit_pipeline():
+ if (self.committing) {
+ log.debug("{}: commit_pipeline: already committing...", .{self.replica});
+ return;
+ }
+
+ self.committing = true;
+ self.commit_pipeline_next();
+ }
+
+ fn commit_pipeline_next(self: *Self) void {
+ assert(self.committing);
+ assert(self.status == .normal);
+ assert(self.leader());
+
+ if (self.pipeline.head_ptr()) |prepare| {
  assert(self.commit_min == self.commit_max);
- assert(self.commit_max + self.pipeline.count == self.op);
- assert(self.commit_max + 1 == prepare.message.header.op);
+ assert(self.commit_min + 1 == prepare.message.header.op);
+ assert(self.commit_min + self.pipeline.count == self.op);
+ assert(self.journal.has(prepare.message.header));

  if (!prepare.ok_quorum_received) {
  // Eventually handled by on_prepare_timeout().
  log.debug("{}: commit_pipeline: waiting for quorum", .{self.replica});
+ self.commit_ops_done();
  return;
  }

@@ -2355,26 +2790,30 @@ pub fn Replica(
  assert(count >= self.quorum_replication);
  assert(count <= self.replica_count);

- self.commit_op(prepare.message);
-
- assert(self.commit_min == self.commit_max);
- assert(self.commit_max == prepare.message.header.op);
+ self.commit_op_prefetch(prepare.message, commit_pipeline_callback);
+ } else {
+ self.commit_ops_done();
+ }
+ }

- self.message_bus.unref(self.pipeline.pop().?.message);
+ fn commit_pipeline_callback(self: *Self) void {
+ assert(self.committing);
+ assert(self.commit_min <= self.commit_max);
+ assert(self.commit_min <= self.op);

- if (self.replica_count == 1) {
- if (self.pipeline.head_ptr()) |head| {
- // Write the next message in the queue.
- // A cluster-of-one writes prepares sequentially to avoid gaps in the WAL.
- self.write_prepare(head.message, .append);
- // The loop will wrap around and exit when `!ok_quorum_received`.
- }
+ if (self.status == .normal and self.leader()) {
+ if (self.pipeline.head_ptr()) |pipeline_head| {
+ assert(pipeline_head.message.header.op == self.commit_min + 1);
  }
+ self.commit_pipeline_next();
+ } else {
+ self.commit_ops_done();
  }
+ }

- assert(self.prepare_timeout.ticking);
-
- if (self.pipeline.count == 0) self.prepare_timeout.stop();
+ fn commit_ops_done(self: *Self) void {
+ assert(self.committing);
+ self.committing = false;
  }
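Both `commit_journal()` and `commit_pipeline()` now funnel through the shared `committing` flag: whichever path starts first wins, the other backs off, and every exit path ends in `commit_ops_done()`. A minimal sketch of that re-entrancy guard, assuming a single hypothetical global flag and free functions rather than the replica's methods:

```zig
// Sketch of the committing guard shared by both commit entry points.
const std = @import("std");

var committing: bool = false;

fn commit_start() bool {
    if (committing) return false; // Already committing: caller must back off.
    committing = true;
    return true;
}

fn commit_done() void {
    std.debug.assert(committing);
    committing = false;
}

pub fn main() void {
    std.debug.assert(commit_start());
    std.debug.assert(!commit_start()); // A concurrent invocation is rejected.
    commit_done();
    std.debug.assert(commit_start()); // After release, committing may resume.
    commit_done();
}
```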

  fn copy_latest_headers_and_set_size(
@@ -2402,7 +2841,10 @@ pub fn Replica(
  const count = self.journal.copy_latest_headers_between(
  op_min,
  op_max,
- std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..][0..body_size_max]),
+ std.mem.bytesAsSlice(
+ Header,
+ message.buffer[@sizeOf(Header)..][0..body_size_max],
+ ),
  );

  message.header.size = @intCast(u32, @sizeOf(Header) * (1 + count));
@@ -2426,17 +2868,8 @@ pub fn Replica(
  assert(m.header.context == context);
  assert(m.header.replica == replica);
  switch (command) {
- .start_view_change => {
- assert(m.header.replica != self.replica);
- assert(m.header.view == self.view);
- },
  .do_view_change => assert(m.header.view == self.view),
  .recovery_response => assert(m.header.replica != self.replica),
- .nack_prepare => {
- // TODO See if we can restrict this branch further.
- assert(m.header.replica != self.replica);
- assert(m.header.op == self.nack_prepare_op.?);
- },
  else => unreachable,
  }
  count += 1;
@@ -2473,12 +2906,12 @@ pub fn Replica(
  // we do require that all entries have different commit numbers and are iterated.
  // This ensures that we will always pick the entry with the oldest commit number.
  // We also check that a client has only one entry in the hash map (or it's buggy).
- const clients = self.client_table.count();
+ const clients = self.client_table().count();
  assert(clients <= config.clients_max);
  if (clients == config.clients_max) {
  var evictee: ?*Message = null;
  var iterated: usize = 0;
- var iterator = self.client_table.valueIterator();
+ var iterator = self.client_table().iterator();
  while (iterator.next()) |entry| : (iterated += 1) {
  assert(entry.reply.header.command == .reply);
  assert(entry.reply.header.context == 0);
@@ -2503,8 +2936,7 @@ pub fn Replica(
  config.clients_max,
  evictee.?.header.client,
  });
- assert(self.client_table.remove(evictee.?.header.client));
- assert(!self.client_table.contains(evictee.?.header.client));
+ self.client_table().remove(evictee.?.header.client);
  self.message_bus.unref(evictee.?);
  }

@@ -2517,11 +2949,11 @@ pub fn Replica(

  // Any duplicate .register requests should have received the same session number if the
  // client table entry already existed, or been dropped if a session was being committed:
- self.client_table.putAssumeCapacityNoClobber(reply.header.client, .{
+ self.client_table().put(&.{
  .session = session,
  .reply = reply.ref(),
  });
- assert(self.client_table.count() <= config.clients_max);
+ assert(self.client_table().count() <= config.clients_max);
  }

  /// The caller owns the returned message, if any, which has exactly 1 reference.
@@ -2545,19 +2977,16 @@ pub fn Replica(
  // We use the `timestamp` field to send this in addition to the current view number:
  .timestamp = if (command == .do_view_change) self.view_normal else 0,
  .op = self.op,
- .commit = self.commit_max,
+ // See the comment in `on_do_view_change()` for why `commit_min` is crucial:
+ .commit = if (command == .do_view_change) self.commit_min else self.commit_max,
  };

- // CRITICAL: The number of prepare headers to include in the body:
- // We must provide enough headers to cover all uncommitted headers so that the new
- // leader (if we are in a view change) can decide whether to discard uncommitted headers
- // that cannot be repaired because they are gaps, and this must be relative to the
- // cluster as a whole (not relative to the difference between our op and commit number)
- // as otherwise we would break correctness.
- const count_max = config.pipeline_max;
- assert(count_max > 0);
-
- const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, message);
+ const count = self.copy_latest_headers_and_set_size(
+ 0,
+ self.op,
+ view_change_headers_count,
+ message,
+ );
  assert(count > 0); // We expect that self.op always exists.
  assert(@divExact(message.header.size, @sizeOf(Header)) == 1 + count);

@@ -2585,106 +3014,65 @@ pub fn Replica(
  return message.ref();
  }

- /// Discards uncommitted headers during a view change before the new leader starts the view.
- /// This is required to maximize availability in the presence of storage faults.
- /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
+ /// Returns the op of the highest canonical message, according to this replica (the new
+ /// leader) prior to loading the current view change's DVC quorum headers.
+ /// When this replica participated in the last `view_normal`, this is just `replica.op`.
  ///
- /// It's possible for the new leader to have done an op jump in a previous view, and so
- /// introduced a header gap for an op, which was then discarded by another leader during a
- /// newer view change, before surviving into this view as a gap because our latest op was
- /// set as the latest op for the quorum.
+ /// - A *canonical* message was part of the last view_normal.
+ /// - An *uncanonical* message may have been removed/changed by a prior view.
+ /// - Canonical messages do not necessarily survive into the new view, but they take
+ /// precedence over uncanonical messages.
+ /// - Canonical messages may be committed or uncommitted.
  ///
- /// In this case, it may be impossible for the new leader to repair the missing header since
- /// the rest of the cluster may have already discarded it. We therefore iterate over our
- /// uncommitted header gaps and compare them with the quorum of do_view_change messages
- /// received from other replicas, before starting the new view, to discard any that may be
- /// impossible to repair.
+ /// Consider these logs:
  ///
- /// For example, if the old primary replicates ops=7,8,9 (all uncommitted) but only op=9 is
- /// prepared on another replica before the old primary crashes, then this function finds a
- /// gap for ops=7,8 and will attempt to discard ops 7,8,9.
- // TODO To improve availability, potentially call this before the local headers are
- // repaired during the view change, so that we can participate in nacking headers.
- fn discard_uncommitted_headers(self: *Self) void {
+ /// replica 0: 4, 5, 6b, 7b, 8b (commit_min=6b, leader, status=normal, view=X)
+ /// replica 1: 4, 5, 6b, --, -- (commit_min=5, follower, status=normal, view=X)
+ /// replica 2: 4, 5, 6a, --, 8b (view<X)
+ ///
+ /// 1. Replica 0 crashes immediately after committing 6b.
+ /// 2. Replicas 1 and 2 must determine the new chain HEAD.
+ /// 3. 8b is discarded due to the gap in 7.
+ /// 4. To distinguish between 6a and 6b (and safely discard 6a), the new leader trusts ops
+ /// from the DVC(s) with the greatest `view_normal`.
+ fn op_canonical_max(self: *const Self, view_normal_canonical: u64) usize {
+ assert(self.replica_count > 1);
  assert(self.status == .view_change);
  assert(self.leader_index(self.view) == self.replica);
  assert(self.do_view_change_quorum);
  assert(!self.repair_timeout.ticking);
- assert(self.op >= self.commit_max);
- assert(self.replica_count > 1);
- assert(self.op - self.commit_max <= config.journal_slot_count);
+ assert(self.journal.header_with_op(self.op) != null);
+ assert(self.view_normal <= view_normal_canonical);

- const threshold = self.replica_count - self.quorum_replication;
- if (threshold == 0) {
- assert(self.replica_count == 2);
- return;
- }
+ if (self.view_normal == view_normal_canonical) return self.op;

- // Iterating > commit_max does not in itself guarantee that the header is uncommitted.
- // We must also count nacks from the quorum, since the old primary may have committed
- // another op just before crashing, if there was sufficient quorum. Counting nacks
- // ensures that the old primary could not possibly have committed the header.
- var op = self.op;
- while (op > self.commit_max) : (op -= 1) {
- if (self.journal.header_with_op(op) != null) continue;
-
- log.debug("{}: discard_uncommitted_headers: op={} gap", .{ self.replica, op });
-
- var nacks: usize = 0;
- for (self.do_view_change_from_all_replicas) |received, replica| {
- if (received) |m| {
- assert(m.header.command == .do_view_change);
- assert(m.header.cluster == self.cluster);
- assert(m.header.replica == replica);
- assert(m.header.view == self.view);
- assert(m.header.commit <= self.commit_max);
-
- if (replica != self.replica) {
- // Check for a gap in the uncommitted headers from this replica.
- const received_headers = self.message_body_as_headers(m);
- assert(received_headers.len >= 1);
-
- const received_op_min = received_headers[received_headers.len - 1].op;
- const received_op_max = received_headers[0].op;
- assert(received_op_max >= received_op_min);
-
- const nack = for (received_headers) |*h| {
- if (h.op == op) break false;
- } else nack: {
- // Don't nack ops that didn't fit in the message's attached headers.
- break :nack op >= received_op_min;
- };
-
- if (nack) nacks += 1;
- log.debug("{}: discard_uncommitted_headers: replica={} op={} nack={}", .{
- self.replica,
- m.header.replica,
- op,
- nack,
- });
- }
- }
- }
+ const uncanonical_op_count = std.math.min(
+ // Do not reset any ops that we have already committed.
+ self.op - self.commit_min,
+ // The number of uncommitted ops cannot be more than the length of the pipeline.
+ // Do not reset any ops that we did not include in our do_view_change message.
+ config.pipeline_max,
+ );

- log.debug("{}: discard_uncommitted_headers: op={} nacks={} threshold={}", .{
- self.replica,
- op,
- nacks,
- threshold,
- });
+ assert(uncanonical_op_count <= config.pipeline_max);
+ if (uncanonical_op_count == 0) return self.op;

- if (nacks >= threshold) {
- assert(op > self.commit_max);
+ // * When uncanonical_op_count = self.op - self.commit_min,
+ // self.op - uncanonical_op_count = self.commit_min.
+ // * When uncanonical_op_count = config.pipeline_max,
+ // config.pipeline_max < self.op - self.commit_min holds.
+ const canonical_op_max = self.op - uncanonical_op_count;

- self.journal.remove_entries_from(op);
- self.op = op - 1;
+ log.debug("{}: on_do_view_change: not canonical ops={}..{}", .{
+ self.replica,
+ canonical_op_max + 1,
+ self.op,
+ });

- const slot = self.journal.slot_for_op(op);
- assert(self.journal.header_for_op(op) == null);
- assert(!self.journal.dirty.bit(slot));
- assert(!self.journal.faulty.bit(slot));
- }
- }
+ assert(canonical_op_max <= self.op);
+ assert(canonical_op_max >= self.commit_min);
+ assert(canonical_op_max + config.pipeline_max >= self.op);
+ return canonical_op_max;
  }
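To make the clamp in `op_canonical_max` concrete, here is a small worked example of the `std.math.min` arithmetic. The numbers and the `pipeline_max = 8` value are hypothetical, chosen only for illustration:

```zig
// Worked example of the uncanonical-op clamp.
const std = @import("std");

pub fn main() void {
    const pipeline_max: u64 = 8;

    // Case 1: few uncommitted ops. op=10, commit_min=6:
    // uncanonical = min(10 - 6, 8) = 4, so canonical_op_max = 10 - 4 = 6 (= commit_min).
    std.debug.assert(10 - std.math.min(10 - 6, pipeline_max) == 6);

    // Case 2: many uncommitted ops. op=20, commit_min=5:
    // uncanonical = min(15, 8) = 8, so canonical_op_max = 20 - 8 = 12 (> commit_min).
    std.debug.assert(20 - std.math.min(20 - 5, pipeline_max) == 12);
}
```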

  /// Discards uncommitted ops during a view change from after and including `op`.
@@ -2710,8 +3098,8 @@ pub fn Replica(
  self.view,
  });

- self.journal.remove_entries_from(op);
  self.op = op - 1;
+ self.journal.remove_entries_from(op);

  assert(self.journal.header_for_op(op) == null);
  assert(!self.journal.dirty.bit(slot));
@@ -2729,8 +3117,8 @@ pub fn Replica(
  }

  fn flush_loopback_queue(self: *Self) void {
- // There are three cases where a replica will send a message to itself:
- // However, of these three cases, only two cases will call send_message_to_replica().
+ // There are four cases where a replica will send a message to itself:
+ // However, of these four cases, all but one call send_message_to_replica().
  //
  // 1. In on_request(), the leader sends a synchronous prepare to itself, but this is
  // done by calling on_prepare() directly, and subsequent prepare timeout retries will
@@ -2739,6 +3127,8 @@ pub fn Replica(
  // asynchronous prepare_ok to itself.
  // 3. In on_start_view_change(), after receiving a quorum of start_view_change
  // messages, the new leader sends a synchronous do_view_change to itself.
+ // 4. In start_view_as_the_new_leader(), the new leader sends itself a prepare_ok
+ // message for each uncommitted message.
  if (self.loopback_queue) |message| {
  defer self.message_bus.unref(message);

@@ -2891,10 +3281,10 @@ pub fn Replica(

  // Verify that the new request will fit in the WAL.
  // The message's op hasn't been assigned yet, but it will be `self.op + 1`.
- if (self.op + 1 >= self.op_checkpoint + config.journal_slot_count) {
+ if (self.op == self.op_checkpoint_trigger()) {
  log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint={})", .{
  self.replica,
- message.header.op,
+ self.op + 1,
  self.op_checkpoint,
  });
  return true;
@@ -2915,7 +3305,7 @@ pub fn Replica(
  assert(message.header.context == 0 or message.header.operation != .register);
  assert(message.header.request == 0 or message.header.operation != .register);

- if (self.client_table.getPtr(message.header.client)) |entry| {
+ if (self.client_table().get(message.header.client)) |entry| {
  assert(entry.reply.header.command == .reply);
  assert(entry.reply.header.client == message.header.client);

@@ -3105,6 +3495,81 @@ pub fn Replica(
  return false;
  }

+ fn is_repair(self: *const Self, message: *const Message) bool {
+ assert(message.header.command == .prepare);
+
+ if (self.status == .normal) {
+ if (message.header.view < self.view) return true;
+ if (message.header.view == self.view and message.header.op <= self.op) return true;
+ } else if (self.status == .view_change) {
+ if (message.header.view < self.view) return true;
+ // The view has already started or is newer.
+ }
+
+ return false;
+ }
+
+ /// Returns whether the replica is the leader for the current view.
+ /// This may be used only when the replica status is normal.
+ fn leader(self: *const Self) bool {
+ assert(self.status == .normal);
+ return self.leader_index(self.view) == self.replica;
+ }
+
+ /// Returns the index into the configuration of the leader for a given view.
+ fn leader_index(self: *const Self, view: u32) u8 {
+ return @intCast(u8, @mod(view, self.replica_count));
+ }
+
+ /// Advances `op` to where we need to be before `header` can be processed as a prepare.
+ ///
+ /// This function temporarily violates the "replica.op must exist in WAL" invariant.
+ fn jump_to_newer_op_in_normal_status(self: *Self, header: *const Header) void {
+ assert(self.status == .normal);
+ assert(self.follower());
+ assert(header.view == self.view);
+ assert(header.op > self.op + 1);
+ // We may have learned of a higher `commit_max` through a commit message before jumping
+ // to a newer op that is less than `commit_max` but greater than `commit_min`:
+ assert(header.op > self.commit_min);
+ // Never overwrite an op that still needs to be checkpointed.
+ assert(header.op <= self.op_checkpoint_trigger());
+
+ log.debug("{}: jump_to_newer_op: advancing: op={}..{} checksum={}..{}", .{
+ self.replica,
+ self.op,
+ header.op - 1,
+ self.journal.header_with_op(self.op).?.checksum,
+ header.parent,
+ });
+
+ self.op = header.op - 1;
+ assert(self.op >= self.commit_min);
+ assert(self.op + 1 == header.op);
+ assert(self.journal.header_with_op(self.op) == null);
+ }
+
+ fn message_body_as_headers(message: *const Message) []const Header {
+ assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
+ assert(message.header.command == .do_view_change or
+ message.header.command == .start_view or
+ message.header.command == .headers or
+ message.header.command == .recovery_response);
+
+ const headers = std.mem.bytesAsSlice(
+ Header,
+ message.buffer[@sizeOf(Header)..message.header.size],
+ );
+
+ for (headers[0 .. headers.len - 1]) |header, index| {
+ // Headers must be provided in reverse order for the sake of `repair_header()`.
+ // Otherwise, headers may never be repaired where the hash chain never connects.
+ assert(header.op > headers[index + 1].op);
+ }
+
+ return headers;
+ }
+
@@ -3169,70 +3634,83 @@ pub fn Replica(
  return true;
  }

- fn is_repair(self: *const Self, message: *const Message) bool {
- assert(message.header.command == .prepare);
-
- if (self.status == .normal) {
- if (message.header.view < self.view) return true;
- if (message.header.view == self.view and message.header.op <= self.op) return true;
- } else if (self.status == .view_change) {
- if (message.header.view < self.view) return true;
- // The view has already started or is newer.
- }
-
- return false;
- }
+ /// Returns the op that will be `op_checkpoint` after the next checkpoint.
+ ///
+ /// For a replica with journal_slot_count=8 and lsm_batch_multiple=2:
+ ///
+ /// checkpoint() call 0 1 2 3
+ /// op_checkpoint 0 5 11 17
+ /// op_checkpoint_next 5 11 17 23
+ /// op_checkpoint_trigger 7 13 19 25
+ ///
+ /// commit log (ops) │ write-ahead log (slots)
+ /// 0 4 8 2 6 0 4 │ 0---4---
+ /// 0 ─────✓·% │ 01234✓6% initial log fill
+ /// 1 ───────────✓·% │ 890✓2%45 first wrap of log
+ /// 2 ─────────────────✓·% │ 6✓8%0123 second wrap of log
+ /// 3 ───────────────────────✓·% │ 4%67890✓ third wrap of log
+ ///
+ /// Legend:
+ ///
+ /// ─/✓ op on disk at checkpoint
+ /// ·/% op in memory at checkpoint
+ /// ✓ op_checkpoint
+ /// % op_checkpoint_trigger
+ ///
+ fn op_checkpoint_next(self: *const Self) u64 {
+ assert(self.op_checkpoint <= self.commit_min);
+ assert(self.op_checkpoint <= self.op);
+ assert(self.op_checkpoint == 0 or
+ (self.op_checkpoint + 1) % config.lsm_batch_multiple == 0);

- /// Returns whether the replica is the leader for the current view.
- /// This may be used only when the replica status is normal.
- fn leader(self: *Self) bool {
- assert(self.status == .normal);
- return self.leader_index(self.view) == self.replica;
+ const op = if (self.op_checkpoint == 0)
+ // First wrap: op_checkpoint_next = 8-2-1 = 5
+ config.journal_slot_count - config.lsm_batch_multiple - 1
+ else
+ // Second wrap: op_checkpoint_next = 5+8-2 = 11
+ // Third wrap: op_checkpoint_next = 11+8-2 = 17
+ self.op_checkpoint + config.journal_slot_count - config.lsm_batch_multiple;
+ assert((op + 1) % config.lsm_batch_multiple == 0);
+ // The checkpoint always advances.
+ assert(op > self.op_checkpoint);
+
+ return op;
  }

- /// Returns the index into the configuration of the leader for a given view.
- fn leader_index(self: *Self, view: u32) u8 {
- return @intCast(u8, @mod(view, self.replica_count));
+ /// Returns the next op that will trigger a checkpoint.
+ ///
+ /// Receiving and storing an op higher than `op_checkpoint_trigger()` is forbidden; doing so
+ /// would overwrite a message (or the slot of a message) that has not yet been committed and
+ /// checkpointed.
+ ///
+ /// See `op_checkpoint_next` for more detail.
+ fn op_checkpoint_trigger(self: *const Self) u64 {
+ return self.op_checkpoint_next() + config.lsm_batch_multiple;
  }
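The checkpoint arithmetic above can be checked in isolation. This sketch reproduces the documented table (op_checkpoint 0, 5, 11, 17 with triggers 7, 13, 19, 25) using the example configuration from the doc comment, journal_slot_count=8 and lsm_batch_multiple=2:

```zig
// Standalone check of the op_checkpoint_next()/op_checkpoint_trigger() table.
const std = @import("std");

const journal_slot_count: u64 = 8;
const lsm_batch_multiple: u64 = 2;

fn op_checkpoint_next(op_checkpoint: u64) u64 {
    return if (op_checkpoint == 0)
        journal_slot_count - lsm_batch_multiple - 1 // First wrap: 8-2-1 = 5.
    else
        op_checkpoint + journal_slot_count - lsm_batch_multiple; // +6 per wrap.
}

pub fn main() void {
    var op_checkpoint: u64 = 0;
    const next_expected = [_]u64{ 5, 11, 17, 23 };
    const trigger_expected = [_]u64{ 7, 13, 19, 25 };
    for (next_expected) |next, i| {
        std.debug.assert(op_checkpoint_next(op_checkpoint) == next);
        // The trigger trails the next checkpoint by one bar.
        std.debug.assert(next + lsm_batch_multiple == trigger_expected[i]);
        op_checkpoint = next;
    }
}
```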

- /// Advances `op` to where we need to be before `header` can be processed as a prepare:
- fn jump_to_newer_op_in_normal_status(self: *Self, header: *const Header) void {
- assert(self.status == .normal);
- assert(self.follower());
- assert(header.view == self.view);
- assert(header.op > self.op + 1);
- // We may have learned of a higher `commit_max` through a commit message before jumping
- // to a newer op that is less than `commit_max` but greater than `commit_min`:
- assert(header.op > self.commit_min);
- // Never overwrite an op that still needs to be checkpointed.
- assert(header.op - self.op_checkpoint < config.journal_slot_count);
+ /// Finds the header with the highest op number in a slice of headers from a replica.
+ /// The headers must be continuous, in reverse order, all connected, and with no gaps.
+ fn op_highest(headers: []const Header) u64 {
+ assert(headers.len > 0);

- log.debug("{}: jump_to_newer_op: advancing: op={}..{} checksum={}..{}", .{
- self.replica,
- self.op,
- header.op - 1,
- self.journal.header_with_op(self.op).?.checksum,
- header.parent,
- });
+ for (headers) |header, index| {
+ assert(header.valid_checksum());
+ assert(header.invalid() == null);
+ assert(header.command == .prepare);

- self.op = header.op - 1;
- assert(self.op >= self.commit_min);
- assert(self.op + 1 == header.op);
- }
+ if (index > 0) {
+ assert(header.op + 1 == headers[index - 1].op);
+ assert(header.checksum == headers[index - 1].parent);
+ }
+ }

- fn message_body_as_headers(_: *Self, message: *const Message) []Header {
- // TODO Assert message commands that we expect this to be called for.
- assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
- return std.mem.bytesAsSlice(
- Header,
- message.buffer[@sizeOf(Header)..message.header.size],
- );
+ return headers[0].op;
  }
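`op_highest` relies on the headers arriving in descending op order, with each header's checksum chaining into the `parent` of the next-newer header. A simplified illustration using a toy three-field `Header` (hypothetical, not the VSR wire header):

```zig
// Simplified illustration of the reverse-order hash-chain invariant.
const std = @import("std");

const Header = struct {
    op: u64,
    parent: u64,
    checksum: u64,
};

fn op_highest(headers: []const Header) u64 {
    std.debug.assert(headers.len > 0);
    for (headers) |header, index| {
        if (index > 0) {
            // Descending, gapless, and chained: each header's checksum is the
            // parent of the (newer) header just before it in the slice.
            std.debug.assert(header.op + 1 == headers[index - 1].op);
            std.debug.assert(header.checksum == headers[index - 1].parent);
        }
    }
    return headers[0].op;
}

pub fn main() void {
    const headers = [_]Header{
        .{ .op = 9, .parent = 80, .checksum = 90 },
        .{ .op = 8, .parent = 70, .checksum = 80 },
        .{ .op = 7, .parent = 60, .checksum = 70 },
    };
    std.debug.assert(op_highest(&headers) == 9);
}
```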

  /// Panics if immediate neighbors in the same view would have a broken hash chain.
  /// Assumes gaps and does not require that a precedes b.
  fn panic_if_hash_chain_would_break_in_the_same_view(
- self: *Self,
+ self: *const Self,
  a: *const Header,
  b: *const Header,
  ) void {
@@ -3279,7 +3757,7 @@ pub fn Replica(

  var op = self.commit_max + 1;
  var parent = self.journal.header_with_op(self.commit_max).?.checksum;
- var iterator = self.pipeline.iterator();
+ var iterator = self.pipeline.iterator_mutable();
  while (iterator.next_ptr()) |prepare| {
  assert(prepare.message.header.command == .prepare);
  assert(prepare.message.header.op == op);
@@ -3380,10 +3858,7 @@ pub fn Replica(

  // The replica repairs backwards from `commit_max`. But if `commit_max` is too high
  // (>1 WAL ahead), then bound it such that uncommitted WAL entries are not overwritten.
- const commit_max_limit = std.math.min(
- self.commit_max,
- self.op_checkpoint + config.journal_slot_count,
- );
+ const commit_max_limit = std.math.min(self.commit_max, self.op_checkpoint_trigger());

  // Request outstanding committed prepares to advance our op number:
  // This handles the case of an idle cluster, where a follower will not otherwise advance.
@@ -3460,13 +3935,12 @@ pub fn Replica(
  // Commit ops, which may in turn discover faulty prepares and drive more repairs:
  if (self.commit_min < self.commit_max) {
  assert(self.replica_count > 1);
- self.commit_ops(self.commit_max);
+ self.commit_journal(self.commit_max);
  return;
  }

  if (self.status == .view_change and self.leader_index(self.view) == self.replica) {
  if (self.repair_pipeline_op() != null) return self.repair_pipeline();
-
  // Start the view as the new leader:
  self.start_view_as_the_new_leader();
  }
@@ -3505,6 +3979,9 @@ pub fn Replica(
  /// with an older view number may be committed instead of an op with a newer view number:
  /// http://web.stanford.edu/~ouster/cgi-bin/papers/OngaroPhD.pdf.
  ///
+ /// * Do not replace an op belonging to the current WAL wrap with an op belonging to a
+ /// previous wrap. In other words, don't repair checkpointed ops.
+ ///
  fn repair_header(self: *Self, header: *const Header) bool {
  assert(header.valid_checksum());
  assert(header.invalid() == null);
@@ -3517,145 +3994,121 @@ pub fn Replica(
  }

  if (header.op > self.op) {
- log.debug("{}: repair_header: false (advances self.op={})", .{
+ log.debug("{}: repair_header: op={} checksum={} (advances hash chain head)", .{
  self.replica,
- self.op,
+ header.op,
+ header.checksum,
+ });
+ return false;
+ } else if (header.op == self.op and !self.journal.has(header)) {
+ assert(self.journal.header_with_op(self.op) != null);
+ log.debug("{}: repair_header: op={} checksum={} (changes hash chain head)", .{
+ self.replica,
+ header.op,
+ header.checksum,
  });
  return false;
- } else if (header.op == self.op) {
- if (self.journal.header_with_op_and_checksum(self.op, header.checksum)) |_| {
- // Fall through below to check if self.op is uncommitted AND reordered,
- // which we would see by the presence of an earlier op with higher view number,
- // that breaks the chain with self.op. In this case, we must skip the repair to
- // avoid overwriting any overlapping op.
+ }
+
+ if (header.op <= self.op_checkpoint) {
+ if (header.op == 0 and self.op_checkpoint == 0) {
+ // Repairing the root op is allowed until the first checkpoint.
  } else {
- log.debug("{}: repair_header: false (changes self.op={})", .{
+ // Otherwise don't repair checkpointed ops, since their slots now belong to
+ // the next wrap of the WAL.
+ log.debug("{}: repair_header: false (precedes self.op_checkpoint={})", .{
  self.replica,
- self.op,
+ self.op_checkpoint,
  });
  return false;
  }
  }

- if (self.journal.header_for_entry(header)) |existing| {
- assert(existing.op == header.op);
-
- // Do not replace any existing op lightly as doing so may impair durability and even
- // violate correctness by undoing a prepare already acknowledged to the leader:
+ if (self.journal.header_for_prepare(header)) |existing| {
  if (existing.checksum == header.checksum) {
- const slot = self.journal.slot_with_header(header).?;
- if (!self.journal.dirty.bit(slot)) {
- log.debug("{}: repair_header: op={} false (checksum clean)", .{
+ if (self.journal.has_clean(header)) {
+ log.debug("{}: repair_header: op={} checksum={} (checksum clean)", .{
  self.replica,
  header.op,
+ header.checksum,
  });
  return false;
+ } else {
+ log.debug("{}: repair_header: op={} checksum={} (checksum dirty)", .{
+ self.replica,
+ header.op,
+ header.checksum,
+ });
  }
-
- log.debug("{}: repair_header: op={} exists, checksum dirty", .{
- self.replica,
- header.op,
- });
  } else if (existing.view == header.view) {
  // The journal must have wrapped:
- // We expect that the same view and op will have the same checksum.
+ // We expect that the same view and op would have had the same checksum.
  assert(existing.op != header.op);
-
  if (existing.op > header.op) {
- log.debug("{}: repair_header: op={} false (view has newer op)", .{
+ log.debug("{}: repair_header: op={} checksum={} (same view, newer op)", .{
  self.replica,
  header.op,
+ header.checksum,
  });
- return false;
- }
-
- log.debug("{}: repair_header: op={} exists, view has older op", .{
- self.replica,
- header.op,
- });
- } else {
- assert(existing.view != header.view);
- assert(existing.op == header.op or existing.op != header.op);
-
- if (!self.repair_header_would_connect_hash_chain(header)) {
- // We cannot replace this op until we are sure that doing so would not
- // violate any prior commitments made to the leader.
- log.debug("{}: repair_header: op={} false (exists)", .{
+ } else {
+ log.debug("{}: repair_header: op={} checksum={} (same view, older op)", .{
  self.replica,
  header.op,
+ header.checksum,
  });
- return false;
  }
+ } else {
+ assert(existing.view != header.view);
+ assert(existing.op == header.op or existing.op != header.op);

- log.debug("{}: repair_header: op={} exists, connects hash chain", .{
+ log.debug("{}: repair_header: op={} checksum={} (different view)", .{
  self.replica,
  header.op,
+ header.checksum,
  });
  }
  } else {
- log.debug("{}: repair_header: op={} gap", .{ self.replica, header.op });
- }
-
- // Caveat: Do not repair an existing op or gap if doing so would break the hash chain:
- if (self.repair_header_would_break_hash_chain_with_next_entry(header)) {
- log.debug("{}: repair_header: op={} false (breaks hash chain)", .{
+ log.debug("{}: repair_header: op={} checksum={} (gap)", .{
  self.replica,
  header.op,
+ header.checksum,
  });
- return false;
  }

- // TODO Snapshots: Skip if this header is already snapshotted.
-
  assert(header.op < self.op or
  self.journal.header_with_op(self.op).?.checksum == header.checksum);

- self.journal.set_header_as_dirty(header);
- return true;
- }
-
- /// If we repair this header, then would this break the hash chain only to our immediate right?
- /// This offers a weak guarantee compared to `repair_header_would_connect_hash_chain()` below.
- /// However, this is useful for allowing repairs when the hash chain is sparse.
- fn repair_header_would_break_hash_chain_with_next_entry(
- self: *Self,
- header: *const Header,
- ) bool {
- if (self.journal.previous_entry(header)) |previous| {
- self.panic_if_hash_chain_would_break_in_the_same_view(previous, header);
+ if (!self.repair_header_would_connect_hash_chain(header)) {
+ // We cannot replace this op until we are sure that this would not:
+ // 1. undermine any prior prepare_ok guarantee made to the primary, and
+ // 2. leak stale ops back into our in-memory headers (and so into a view change).
+ log.debug("{}: repair_header: op={} checksum={} (disconnected from hash chain)", .{
+ self.replica,
+ header.op,
+ header.checksum,
+ });
+ return false;
  }

- if (self.journal.next_entry(header)) |next| {
- self.panic_if_hash_chain_would_break_in_the_same_view(header, next);
-
- if (header.checksum == next.parent) {
- assert(header.view <= next.view);
- assert(header.op + 1 == next.op);
- // We don't break with `next` but this is no guarantee that `next` does not
- // break.
- return false;
- } else {
- // If the journal has wrapped, then err in favor of a break regardless of op
- // order:
- return true;
+ if (header.op <= self.commit_min) {
+ if (self.journal.header_with_op(header.op)) |existing| {
+ // If we already committed this op, the repair must be the identical message.
+ assert(header.checksum == existing.checksum);
  }
  }

- // We are not completely sure since there is no entry to the immediate right:
- return false;
+ self.journal.set_header_as_dirty(header);
+ return true;
  }

- /// If we repair this header, then would this connect the hash chain through to the latest
- /// op? This offers a strong guarantee that may be used to replace or overlap an existing
- /// op.
+ /// If we repair this header, would this connect the hash chain through to the latest op?
+ /// This offers a strong guarantee that may be used to replace an existing op.
  ///
  /// Here is an example of what could go wrong if we did not check for complete connection:
  ///
  /// 1. We do a prepare that's going to be committed.
- /// 2. We do a stale prepare to the right of this, ignoring the hash chain break to the
- /// left.
- /// 3. We do another stale prepare that replaces the first op because it connects to the
- /// second.
+ /// 2. We do a stale prepare to the right, ignoring the hash chain break to the left.
+ /// 3. We do another stale prepare that replaces the first since it connects to the second.
  ///
  /// This would violate our quorum replication commitment to the leader.
  /// The mistake in this example was not that we ignored the break to the left, which we must
@@ -4086,6 +4539,55 @@ pub fn Replica(
  }
  }

+ /// The caller must ensure that the headers are trustworthy.
+ ///
+ /// Asserts that sequential ops are hash-chained. (Gaps are permitted).
+ fn replace_headers(self: *Self, headers: []const Header) void {
+ for (headers) |*header, i| {
+ if (i > 0) {
+ const next = &headers[i - 1];
+ assert(next.view >= header.view);
+ if (next.op == header.op + 1) {
+ assert(next.parent == header.checksum);
+ } else {
+ assert(next.op > header.op);
+ }
+ }
+
+ self.replace_header(header);
+ }
+ }
+
+ /// Replaces the header if the header is different and not already committed.
+ /// The caller must ensure that the header is trustworthy.
+ fn replace_header(self: *Self, header: *const Header) void {
+ assert(self.op_checkpoint <= self.commit_min);
+ assert(header.command == .prepare);
+ assert(header.op <= self.op); // Never advance the op.
+ assert(header.op <= self.op_checkpoint_trigger());
+
+ if (header.op <= self.commit_min) {
+ if (self.journal.header_with_op(header.op)) |existing_header| {
+ assert(existing_header.checksum == header.checksum);
+ return;
+ } else {
+ if (header.op <= self.op_checkpoint) {
+ // Never replace a checkpointed op — those slots are needed by the following
+ // WAL wrap.
+ return;
+ } else {
+ // If an op is committed but not checkpointed, we must still have the header.
+ @panic("missing committed, uncheckpointed header");
+ }
+ }
+ }
+
+ // Do not set an op as dirty if we already have it exactly because:
+ // 1. this would trigger a repair and delay the view change, or worse,
+ // 2. prevent repairs to another replica when we have the op.
+ if (!self.journal.has(header)) self.journal.set_header_as_dirty(header);
+ }
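`replace_header` reduces to a small decision per header: committed-and-still-held headers are kept as-is, committed-but-checkpointed headers are never touched (their slots belong to the next WAL wrap), and everything else is marked dirty for repair. A standalone sketch of that decision logic, hypothetical and simplified to pure values:

```zig
// Simplified decision table for replace_header() (hypothetical).
const std = @import("std");

const Decision = enum { keep_existing, skip_checkpointed, set_dirty };

fn replace_decision(
    op: u64,
    commit_min: u64,
    op_checkpoint: u64,
    have_identical_header: bool,
) Decision {
    if (op <= commit_min) {
        // Committed: either we still hold the identical header (keep it),
        // or the op is checkpointed and its slot belongs to the next wrap.
        if (have_identical_header) return .keep_existing;
        std.debug.assert(op <= op_checkpoint); // Otherwise: missing committed header.
        return .skip_checkpointed;
    }
    return .set_dirty;
}

pub fn main() void {
    // op=3 committed and still held: nothing to do.
    std.debug.assert(replace_decision(3, 5, 2, true) == .keep_existing);
    // op=2 committed, header gone, but checkpointed: never repair it.
    std.debug.assert(replace_decision(2, 5, 2, false) == .skip_checkpointed);
    // op=7 uncommitted: mark dirty so that repair fetches the prepare.
    std.debug.assert(replace_decision(7, 5, 2, false) == .set_dirty);
}
```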
4590
+
4089
4591
  /// Replicates to the next replica in the configuration (until we get back to the leader):
4090
4592
  /// Replication starts and ends with the leader, we never forward back to the leader.
4091
4593
  /// Does not flood the network with prepares that have already committed.
@@ -4149,7 +4651,7 @@ pub fn Replica(
4149
4651
  assert(replica < self.replica_count);
4150
4652
  }
4151
4653
 
4152
- counter.setIntersection(quorum_counter_null);
4654
+ counter.* = quorum_counter_null;
4153
4655
  assert(counter.count() == 0);
4154
4656
 
4155
4657
  var replica: usize = 0;
@@ -4168,6 +4670,20 @@ pub fn Replica(
             self.nack_prepare_op = null;
         }

+        fn reset_quorum_prepare_ok(self: *Self) void {
+            // "prepare_ok"s from previous views are not valid, even if the pipeline entry
+            // is reused after a cycle of view changes. In other words, when a view change
+            // cycles around, so that the original primary becomes the primary of a new
+            // view, pipeline entries may be reused. However, the pipeline's prepare_ok
+            // quorums must not be reused, since the replicas that sent them may have
+            // swapped them out during a previous view change.
+            var iterator = self.pipeline.iterator_mutable();
+            while (iterator.next_ptr()) |prepare| {
+                prepare.ok_quorum_received = false;
+                prepare.ok_from_all_replicas = quorum_counter_null;
+                assert(prepare.ok_from_all_replicas.count() == 0);
+            }
+        }
+
         fn reset_quorum_start_view_change(self: *Self) void {
             self.reset_quorum_counter(&self.start_view_change_from_other_replicas);
             self.start_view_change_quorum = false;
@@ -4296,8 +4812,15 @@ pub fn Replica(
             assert(message.header.command == .do_view_change);
             assert(message.header.view == self.view);
             assert(message.header.op == self.op);
-            assert(message.header.op == self.message_body_as_headers(message)[0].op);
-            assert(message.header.commit == self.commit_max);
+            assert(message.header.op == message_body_as_headers(message)[0].op);
+            // Each replica must advertise its own commit number, so that the new primary
+            // can know which headers must be replaced in its log. Otherwise, a gap in the
+            // log may prevent the new primary from repairing its log, resulting in the log
+            // being forked if the new primary also discards uncommitted operations.
+            // It is also safe not to use `commit_max` here because the new primary will
+            // assume that operations after the highest `commit_min` may yet have been
+            // committed before the old primary crashed. The new primary will use the NACK
+            // protocol to be sure of a discard.
+            assert(message.header.commit == self.commit_min);

             self.send_message_to_replica(self.leader_index(self.view), message);
         }
@@ -4389,6 +4912,7 @@ pub fn Replica(
                 .prepare_ok => {
                     assert(self.status == .normal);
                     assert(message.header.view == self.view);
+                    assert(message.header.op <= self.op_checkpoint_trigger());
                     // We must only ever send a prepare_ok to the latest leader of the active view:
                     // We must never straddle views by sending to a leader in an older view.
                     // Otherwise, we would be enabling a partitioned leader to commit.
@@ -4407,6 +4931,7 @@ pub fn Replica(
                     assert(message.header.view == self.view);
                     assert(message.header.replica == self.replica);
                     assert(message.header.op == self.op);
+                    assert(message.header.commit == self.commit_min);
                     assert(replica == self.leader_index(self.view));
                 },
                 .start_view => switch (self.status) {
@@ -4479,46 +5004,13 @@ pub fn Replica(
             }
         }

-        /// Finds the header with the highest op number in a slice of headers from a replica.
-        /// Searches only by op number to find the highest `self.op` for the replica.
-        fn set_latest_op(headers: []const Header, latest: *Header) void {
-            switch (latest.command) {
-                .reserved, .prepare => assert(latest.valid_checksum()),
-                else => unreachable,
-            }
-
-            for (headers) |header| {
-                assert(header.valid_checksum());
-                assert(header.invalid() == null);
-                assert(header.command == .prepare);
-
-                if (latest.command == .reserved or header.op > latest.op) {
-                    // We are simply trying to find the latest `self.op` in the replica's log.
-                    // We therefore do not compare views here.
-                    latest.* = header;
-                }
-            }
-        }
-
-        fn set_latest_op_and_k(
-            self: *Self,
-            latest: *const Header,
-            k: u64,
-            method: []const u8,
-        ) void {
+        fn set_op_and_commit_max(self: *Self, op: u64, commit_max: u64, method: []const u8) void {
             assert(self.status == .view_change or self.status == .recovering);
             assert(self.journal.recovered);
-            assert(latest.valid_checksum());
-            assert(latest.invalid() == null);
-            assert(latest.command == .prepare);
-            assert(latest.cluster == self.cluster);

             switch (self.status) {
                 .normal => unreachable,
-                .view_change => {
-                    // The view may have started already, so we can have a prepare in the same view:
-                    assert(latest.view <= self.view);
-                },
+                .view_change => {},
                 .recovering => {
                     // The replica's view hasn't been set yet.
                     // It will be set shortly, when we transition to normal status.
@@ -4526,73 +5018,406 @@ pub fn Replica(
                 },
             }

-            log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={}", .{
-                self.replica,
-                method,
-                self.view,
-                self.op,
-                latest.op,
-                self.commit_max,
-                k,
-                latest.checksum,
-            });
-
-            // Uncommitted ops may not survive a view change so we must assert `latest.op` against
+            // Uncommitted ops may not survive a view change so we must assert `op` against
             // `commit_max` and not `self.op`. However, committed ops (`commit_max`) must survive:
-            assert(latest.op >= self.commit_max);
-            assert(latest.op >= latest.commit);
-            assert(latest.op >= k);
-            // We expect that `commit_max` (and `commit_min`) may be greater than `latest.commit`
-            // because `latest.commit` is the commit number at the time the `latest.op` prepared.
-            // We expect that `commit_max` (and `commit_min`) may also be greater even than `k`
-            // because we may be the old leader joining towards the end of the view change and we
-            // may have committed the `latest.op` already. However, this is bounded by pipelining.
-            // The intersection property only requires that all "possibly" committed operations must
-            // survive into the new view so that they can then be committed by the new leader. This
-            // guarantees that if the old leader "possibly" committed the operation, then the new
-            // leader will also commit the operation.
-            if (k < self.commit_max and self.commit_min == self.commit_max) {
+            assert(op >= self.commit_max);
+            assert(op >= commit_max);
+            assert(op <= self.op_checkpoint_trigger());
+
+            // We expect that our commit numbers may also be greater even than `commit_max`
+            // because we may be the old leader joining towards the end of the view change
+            // and we may have committed `op` already. However, this is bounded by
+            // pipelining.
+            // The intersection property only requires that all possibly committed
+            // operations must survive into the new view so that they can then be committed
+            // by the new leader. This guarantees that if the old leader possibly committed
+            // the operation, then the new leader will also commit the operation.
+            if (commit_max < self.commit_max and self.commit_min == self.commit_max) {
                 log.debug("{}: {s}: k={} < commit_max={} and commit_min == commit_max", .{
                     self.replica,
                     method,
-                    k,
+                    commit_max,
                     self.commit_max,
                 });
             }
-            assert(k >= latest.commit);
-            assert(k >= self.commit_max - std.math.min(config.pipeline_max, self.commit_max));
+
+            assert(commit_max >=
+                self.commit_max - std.math.min(config.pipeline_max, self.commit_max));

             assert(self.commit_min <= self.commit_max);
             assert(self.op >= self.commit_max or self.op < self.commit_max);

-            self.op = latest.op;
+            const previous_op = self.op;
+            const previous_commit_max = self.commit_max;
+
+            self.op = op;
+            self.journal.remove_entries_from(self.op + 1);
+
             // Crucially, we must never rewind `commit_max` (and then `commit_min`) because
             // `commit_min` represents what we have already applied to our state machine:
-            self.commit_max = std.math.max(self.commit_max, k);
+            self.commit_max = std.math.max(self.commit_max, commit_max);

             assert(self.commit_min <= self.commit_max);
-            assert(self.op >= self.commit_max);
+            assert(self.commit_max <= self.op);

-            // Do not set the latest op as dirty if we already have it exactly:
-            // Otherwise, this would trigger a repair and delay the view change, or worse, it would
-            // prevent us from assisting another replica to recover when we do in fact have the op.
-            if (self.journal.has(latest)) {
-                log.debug("{}: {s}: latest op exists exactly", .{ self.replica, method });
-            } else {
-                self.journal.set_header_as_dirty(latest);
+            log.debug("{}: {s}: view={} op={}..{} commit={}..{}", .{
+                self.replica,
+                method,
+                self.view,
+                previous_op,
+                self.op,
+                previous_commit_max,
+                self.commit_max,
+            });
+        }
+
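
A quick worked check of the pipeline bound asserted in set_op_and_commit_max() above: a DVC's commit number may trail ours by at most the pipeline depth, and the `min` term keeps the subtraction from underflowing early in the log. The constants below are illustrative assumptions (`pipeline_max = 8` stands in for `config.pipeline_max`); this is a sketch, not the package's test suite.

    const std = @import("std");

    test "a DVC's commit_max may trail ours by at most the pipeline depth" {
        const pipeline_max: u64 = 8; // Assumed value; stands in for config.pipeline_max.

        // Mid-log: the lowest commit_max another replica may legitimately report.
        const our_commit_max: u64 = 100;
        const floor = our_commit_max - std.math.min(pipeline_max, our_commit_max);
        try std.testing.expectEqual(@as(u64, 92), floor);

        // Early in the log: the min() term prevents the subtraction from underflowing.
        const floor_small = @as(u64, 3) - std.math.min(pipeline_max, 3);
        try std.testing.expectEqual(@as(u64, 0), floor_small);
    }
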
+        /// Load the new view's headers from the DVC quorum.
+        ///
+        /// The iteration order of DVCs for repair does not impact the final result.
+        /// In other words, you can't end up in a situation with a DVC quorum like:
+        ///
+        ///   replica   headers       commit_min
+        ///         0   4 5 _ _ 8      4  (new leader; handling DVC quorum)
+        ///         1   4 _ 6 _ 8      4
+        ///         2   4 _ _ 7 8      4
+        ///         3   (4 5 6 7 8)    8  (didn't participate in view change)
+        ///         4   (4 5 6 7 8)    8  (didn't participate in view change)
+        ///
+        /// where the new leader's headers depend on which of replica 1's and 2's DVC is
+        /// used for repair before the other (i.e. whether they repair op 6 or 7 first).
+        ///
+        /// For the above case to occur, replicas 0, 1, and 2 must all share the highest
+        /// `view_normal`. And since they share the latest `view_normal`, ops 5, 6, and 7
+        /// were just installed by `replace_header`, which is order-independent (it doesn't
+        /// use the hash chain).
+        ///
+        /// (If replica 0's view_normal was greater than 1/2's, then replica 0 must have all
+        /// headers from previous views. Which means 6,7 are from the current view. But since
+        /// replica 0 doesn't have 6/7, then replica 1/2 must share the latest view_normal. ∎)
+        fn set_log_from_do_view_change_messages(self: *Self) void {
+            assert(self.status == .view_change);
+            assert(self.leader_index(self.view) == self.replica);
+            assert(self.replica_count > 1);
+            assert(self.start_view_change_quorum);
+            assert(self.do_view_change_quorum);
+
+            const do_view_change_head = self.do_view_change_quorum_head();
+            assert(do_view_change_head.view_normal >= self.view_normal);
+            assert(do_view_change_head.op >= self.commit_min);
+            assert(do_view_change_head.op >= do_view_change_head.commit_min_max);
+            assert(do_view_change_head.commit_min_max >= self.commit_min);
+
+            // The `prepare_timestamp` prevents a primary's own clock from running backwards.
+            // Therefore, `prepare_timestamp`:
+            // 1. is advanced if behind the cluster, but never reset if ahead of the cluster,
+            // 2. may not always reflect the timestamp of the latest prepared op, and
+            // 3. should be advanced before discarding the timestamps of any uncommitted headers.
+            if (self.state_machine.prepare_timestamp < do_view_change_head.timestamp) {
+                self.state_machine.prepare_timestamp = do_view_change_head.timestamp;
             }

-            assert(self.op == latest.op);
-            self.journal.remove_entries_from(self.op + 1);
-            assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
+            const view_normal_canonical = do_view_change_head.view_normal;
+            // `op_canonical` must be computed before calling `set_op_and_commit_max()`,
+            // since that may change `replica.op`.
+            //
+            // Don't remove the uncanonical headers yet — even though the removed headers
+            // are a subset of the DVC headers, removing and then adding them back would
+            // cause clean headers to become dirty.
+            const op_canonical = self.op_canonical_max(view_normal_canonical);
+            assert(op_canonical <= self.op);
+            assert(op_canonical >= self.op -| config.pipeline_max);
+            assert(op_canonical >= self.commit_min);
+
+            if (do_view_change_head.op > self.op_checkpoint_trigger()) {
+                // This replica is too far behind, i.e. the new `self.op` is too far ahead
+                // of the last checkpoint. If we wrap now, we overwrite un-checkpointed
+                // transfers in the WAL, precluding recovery.
+                //
+                // TODO State transfer. Currently this is unreachable because the
+                // leader won't checkpoint until all replicas are caught up.
+                unreachable;
+            }
+
+            self.set_op_and_commit_max(
+                do_view_change_head.op,
+                // `set_op_and_commit_max()` expects the highest commit_max that we know of.
+                // But DVCs include the replica's `commit_min`, not `commit_max`.
+                std.math.max(
+                    self.commit_max,
+                    do_view_change_head.commit_min_max,
+                ),
+                "on_do_view_change",
+            );
+            // The "`replica.op` exists" invariant may be broken until after the canonical
+            // DVC headers are installed.
+
+            // First, set all the canonical headers from the replica(s) with the highest
+            // `view_normal`:
+            for (self.do_view_change_from_all_replicas) |received| {
+                if (received) |message| {
+                    const view_normal = @intCast(u32, message.header.timestamp);
+                    // The view in which this replica's status was normal must be before
+                    // this view.
+                    assert(view_normal < message.header.view);
+
+                    if (view_normal < view_normal_canonical) continue;
+                    assert(view_normal == view_normal_canonical);
+
+                    const message_headers = message_body_as_headers(message);
+                    for (message_headers) |*header| {
+                        log.debug(
+                            "{}: on_do_view_change: canonical: replica={} op={} checksum={}",
+                            .{
+                                self.replica,
+                                message.header.replica,
+                                header.op,
+                                header.checksum,
+                            },
+                        );
+                    }
+                    self.replace_headers(message_headers);
+                }
+            }
+
+            // Since we used do_view_change_head to set the replica.op, it must have been
+            // loaded into the headers (if it wasn't present already).
+            assert(self.journal.header_with_op(self.op) != null);
+
+            // Now that the canonical headers are all in place, repair any other headers:
+            for (self.do_view_change_from_all_replicas) |received| {
+                if (received) |message| {
+                    const view_normal = @intCast(u32, message.header.timestamp);
+                    assert(view_normal < message.header.view);
+
+                    if (view_normal == view_normal_canonical) continue;
+                    assert(view_normal < view_normal_canonical);
+
+                    for (message_body_as_headers(message)) |*header| {
+                        // We must trust headers that other replicas have committed, because
+                        // repair_header() will not repair a header if the hash chain has a gap.
+                        if (header.op <= message.header.commit) {
+                            log.debug(
+                                "{}: on_do_view_change: committed: replica={} op={} checksum={}",
+                                .{
+                                    self.replica,
+                                    message.header.replica,
+                                    header.op,
+                                    header.checksum,
+                                },
+                            );
+                            self.replace_header(header);
+                        } else {
+                            _ = self.repair_header(header);
+                        }
+                    }
+                }
+            }
+
+            const op_max = self.do_view_change_op_max(op_canonical);
+            assert(op_max <= self.op);
+            assert(op_max >= self.commit_min);
+            if (op_max != self.op) {
+                log.debug("{}: set_log_from_do_view_change_messages: discard op={}..{}", .{
+                    self.replica,
+                    op_max + 1,
+                    self.op,
+                });
+                self.journal.remove_entries_from(op_max + 1);
+                self.op = op_max;
+            }
+            assert(self.journal.header_with_op(self.op) != null);
         }

-        fn start_view_as_the_new_leader(self: *Self) void {
+        fn do_view_change_quorum_head(self: *const Self) struct {
+            /// The highest `view_normal` of any DVC.
+            ///
+            /// The headers bundled with DVCs with the highest `view_normal` are canonical,
+            /// since the replica has knowledge of previous view changes in which headers
+            /// were replaced.
+            view_normal: u32,
+            /// The highest `commit_min` from any DVC (this is not a `commit_max`).
+            commit_min_max: u64,
+            /// The highest `op` from a DVC with the highest `view_normal`.
+            op: u64,
+            /// The highest timestamp from any DVC.
+            timestamp: u64,
+        } {
             assert(self.status == .view_change);
             assert(self.leader_index(self.view) == self.replica);
+            assert(self.replica_count > 1);
+            assert(self.start_view_change_quorum);
             assert(self.do_view_change_quorum);
+            assert(self.do_view_change_from_all_replicas[self.replica] != null);

-            assert(!self.committing);
+            var v: ?u32 = null; // The highest `view_normal` from any replica.
+            var n: ?u64 = null; // The highest `op` for the highest `view_normal` from any replica.
+            var k: ?u64 = null; // The highest `commit_min` from any replica.
+            var t: ?u64 = null; // The highest `timestamp` from any replica.
+
+            for (self.do_view_change_from_all_replicas) |received, replica| {
+                if (received) |message| {
+                    assert(message.header.command == .do_view_change);
+                    assert(message.header.cluster == self.cluster);
+                    assert(message.header.replica == replica);
+                    assert(message.header.view == self.view);
+                    assert(message.header.op >= message.header.commit);
+                    assert(message.header.op - message.header.commit <= config.journal_slot_count);
+
+                    // The view when this replica was last in normal status, which:
+                    // * may be higher than the view in any of the prepare headers.
+                    // * must be lower than the view of this view change.
+                    const view_normal = @intCast(u32, message.header.timestamp);
+                    assert(view_normal < message.header.view);
+
+                    if (replica == self.replica) {
+                        assert(view_normal == self.view_normal);
+                        assert(message.header.op == self.op);
+                        // We may have a newer commit than our DVC due to async commits (see below).
+                        assert(message.header.commit <= self.commit_min);
+                    }
+
+                    log.debug(
+                        "{}: on_do_view_change: " ++
+                            "replica={} view_normal={} op={} commit_min={}",
+                        .{
+                            self.replica,
+                            message.header.replica,
+                            view_normal,
+                            message.header.op,
+                            message.header.commit, // The `commit_min` of the replica.
+                        },
+                    );
+
+                    if (v == null or view_normal > v.?) {
+                        v = view_normal;
+                        n = message.header.op;
+                    } else if (view_normal == v.? and message.header.op > n.?) {
+                        n = message.header.op;
+                    }
+
+                    if (k == null or message.header.commit > k.?) k = message.header.commit;
+
+                    const message_headers = message_body_as_headers(message);
+                    if (t == null or t.? < message_headers[0].timestamp) {
+                        t = message_headers[0].timestamp;
+                    }
+                }
+            }
+
+            // Consider the case:
+            // 1. Start committing op=N…M.
+            // 2. Send `do_view_change` to self.
+            // 3. Finish committing op=N…M.
+            // 4. Remaining `do_view_change` messages arrive, completing the quorum.
+            // In this scenario, our own DVC's commit is `N-1`, but `commit_min=M`.
+            // Don't let the commit backtrack.
+            if (k.? < self.commit_min) {
+                assert(self.commit_min >
+                    self.do_view_change_from_all_replicas[self.replica].?.header.commit);
+                log.debug("{}: on_do_view_change: bump commit_min view={} commit={}..{}", .{
+                    self.replica,
+                    self.view,
+                    k.?,
+                    self.commit_min,
+                });
+                k = self.commit_min;
+            }
+
+            assert(v.? >= self.view_normal);
+            assert(k.? >= self.commit_min);
+
+            return .{
+                .view_normal = v.?,
+                .commit_min_max = k.?,
+                .op = n.?,
+                .timestamp = t.?,
+            };
+        }
+
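
To see how the head is selected, here is a minimal sketch of the v/n/k/t fold above. `Dvc` and `quorum_head` are hypothetical simplifications that assume a non-empty quorum and omit the self-DVC special case and the commit_min bump handled above: `op` must come from a DVC with the highest `view_normal`, while the `commit_min` maximum and the timestamp maximum may come from any DVC.

    const std = @import("std");

    // Hypothetical, simplified DVC summary: only the fields the fold inspects.
    const Dvc = struct { view_normal: u32, op: u64, commit_min: u64, timestamp: u64 };

    // v = highest view_normal; n = highest op among DVCs with that view_normal;
    // k = highest commit_min from any DVC; t = highest timestamp from any DVC.
    fn quorum_head(dvcs: []const Dvc) Dvc {
        var head: ?Dvc = null; // Assumes dvcs is non-empty (a quorum always is).
        for (dvcs) |dvc| {
            if (head) |*h| {
                if (dvc.view_normal > h.view_normal) {
                    h.view_normal = dvc.view_normal;
                    h.op = dvc.op;
                } else if (dvc.view_normal == h.view_normal and dvc.op > h.op) {
                    h.op = dvc.op;
                }
                if (dvc.commit_min > h.commit_min) h.commit_min = dvc.commit_min;
                if (dvc.timestamp > h.timestamp) h.timestamp = dvc.timestamp;
            } else head = dvc;
        }
        return head.?;
    }

    test "op comes from the highest view_normal; commit_min may come from anywhere" {
        const head = quorum_head(&[_]Dvc{
            .{ .view_normal = 3, .op = 8, .commit_min = 4, .timestamp = 70 },
            .{ .view_normal = 5, .op = 6, .commit_min = 5, .timestamp = 90 },
            .{ .view_normal = 5, .op = 7, .commit_min = 6, .timestamp = 80 },
        });
        try std.testing.expectEqual(@as(u32, 5), head.view_normal);
        try std.testing.expectEqual(@as(u64, 7), head.op); // Not 8: its view_normal=3 is stale.
        try std.testing.expectEqual(@as(u64, 6), head.commit_min);
        try std.testing.expectEqual(@as(u64, 90), head.timestamp);
    }
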
+        /// Identify headers to discard during a view change before the primary starts the view.
+        /// This is required to maximize availability in the presence of storage faults.
+        /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
+        ///
+        /// Returns the highest op that:
+        /// - precedes any hash chain breaks in the uncanonical headers, and
+        /// - precedes any gaps in the uncommitted headers.
+        ///
+        /// Breaks:
+        ///
+        /// If there is a hash chain break, none of the headers from the canonical DVCs
+        /// replaced the broken (leftover uncanonical) op.
+        /// Removing these is necessary for correctness and liveness, to ensure that
+        /// disconnected headers do not remain in place in lieu of gaps.
+        ///
+        /// Gaps:
+        ///
+        /// It is possible for the new primary to have done an op jump in a previous view,
+        /// introducing a header gap for an op. That gap may then have been discarded by
+        /// another primary during a view change, yet survive into this view as a gap,
+        /// because our latest op was set as the latest op for the quorum.
+        ///
+        /// In this case, it may be impossible for the new primary to repair the missing
+        /// header, as the rest of the cluster may have already discarded it. We therefore
+        /// iterate over our uncommitted header gaps to discard any that may be impossible
+        /// to repair.
+        ///
+        /// For example, if the old primary replicates ops=7,8,9 (all uncommitted) but only
+        /// op=9 is prepared on another replica before the old primary crashes, then this
+        /// function finds a gap for ops=7,8 and will attempt to discard ops 7,8,9.
+        fn do_view_change_op_max(self: *const Self, op_canonical: u64) u64 {
+            assert(self.replica_count > 1);
+            assert(self.status == .view_change);
+            assert(self.leader_index(self.view) == self.replica);
+            assert(self.do_view_change_quorum);
+            assert(!self.repair_timeout.ticking);
+            assert(self.op >= self.commit_max);
+            // At least one replica in the new quorum committed in the new replica.op's
+            // WAL wrap — wrapping implies a checkpoint (which implies a commit).
+            assert(self.op - self.commit_max <= config.journal_slot_count);
+            assert(self.op - self.commit_min <= config.journal_slot_count);
+
+            assert(op_canonical <= self.op);
+            assert(op_canonical >= self.commit_min);
+
+            // Any uncanonical ops remaining either:
+            // * connect to the hash chain on the right, or
+            // * do not connect on the right (hash chain break).
+            //
+            // If there is a hash chain break, none of the headers from the canonical DVCs
+            // replaced the broken op. It is truncated like a gap.
+            //
+            // Removing these is necessary for correctness and liveness, to ensure that
+            // disconnected headers do not remain in place in lieu of gaps.
+            const op_before_break = blk: {
+                var op: u64 = op_canonical;
+                while (op < self.op) : (op += 1) {
+                    if (self.journal.header_with_op(op)) |header| {
+                        if (self.journal.header_with_op(op + 1)) |next| {
+                            // Broken hash chain.
+                            if (header.checksum != next.parent) break :blk op;
+                        }
+                    }
+                } else break :blk self.op;
+            };
+
+            // Find the beginning of the lowest gap.
+            //
+            // While op > commit_max does not in itself guarantee that an op is uncommitted
+            // (the old primary may have committed the op shortly before crashing),
+            // nevertheless, if it had been committed it would have survived into the new
+            // view as a header, not a gap.
+            const op_before_gap = blk: {
+                // An op cannot be uncommitted if it is definitely outside the pipeline.
+                const op_committed = std.math.max(self.commit_max, self.op -| config.pipeline_max);
+                assert(op_committed <= self.op);
+
+                var op = op_committed;
+                while (op < self.op) : (op += 1) {
+                    if (self.journal.header_with_op(op + 1) == null) break :blk op;
+                } else break :blk self.op;
+            };
+
+            return std.math.min(op_before_break, op_before_gap);
+        }
+
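
The ops=7,8,9 example from the doc comment can be traced with a small sketch. `op_before_gap` below mirrors the gap scan, modelling a hypothetical journal as a slice of optional checksums (index = op, null = no header) in place of `journal.header_with_op()`:

    const std = @import("std");

    // Hypothetical journal: index = op, null = gap (no header for that op).
    fn op_before_gap(journal: []const ?u64, op_committed: usize, op_head: usize) usize {
        var op = op_committed;
        while (op < op_head) : (op += 1) {
            if (journal[op + 1] == null) return op;
        }
        return op_head;
    }

    test "ops 7,8,9 uncommitted; only op 9 survives, so truncate back to op 6" {
        // Headers exist for ops 0..6 and 9; ops 7,8 were never prepared here.
        const journal = [_]?u64{ 0, 1, 2, 3, 4, 5, 6, null, null, 9 };
        // With commit_max=6 the scan starts at op 6 and stops at the gap before op 7,
        // so do_view_change_op_max() would discard ops 7..9 via remove_entries_from():
        try std.testing.expectEqual(@as(usize, 6), op_before_gap(&journal, 6, 9));
    }
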
+        fn start_view_as_the_new_leader(self: *Self) void {
+            assert(self.status == .view_change);
+            assert(self.leader_index(self.view) == self.replica);
+            assert(self.do_view_change_quorum);
             assert(!self.repairing_pipeline);

             assert(self.commit_min == self.commit_max);
@@ -4630,6 +5455,9 @@ pub fn Replica(
         fn transition_to_normal_from_recovering_status(self: *Self, new_view: u32) void {
             assert(self.status == .recovering);
             assert(self.view == 0);
+            assert(!self.committing);
+            assert(self.replica_count > 1 or new_view == 0);
+            assert(self.journal.header_with_op(self.op) != null);
             self.view = new_view;
             self.view_normal = new_view;
             self.status = .normal;
@@ -4679,6 +5507,7 @@ pub fn Replica(
             // For example, this could happen after a state transfer triggered by an op jump.
             assert(self.status == .view_change);
             assert(new_view >= self.view);
+            assert(self.journal.header_with_op(self.op) != null);
             self.view = new_view;
             self.view_normal = new_view;
             self.status = .normal;
@@ -4724,6 +5553,7 @@ pub fn Replica(
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();
+            self.reset_quorum_prepare_ok();

             assert(self.start_view_change_quorum == false);
             assert(self.do_view_change_quorum == false);
@@ -4763,6 +5593,7 @@ pub fn Replica(
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();
+            self.reset_quorum_prepare_ok();

             assert(self.start_view_change_quorum == false);
             assert(self.do_view_change_quorum == false);
@@ -4780,7 +5611,7 @@ pub fn Replica(
             assert(reply.header.commit > 0);
             assert(reply.header.request > 0);

-            if (self.client_table.getPtr(reply.header.client)) |entry| {
+            if (self.client_table().get(reply.header.client)) |entry| {
                 assert(entry.reply.header.command == .reply);
                 assert(entry.reply.header.context == 0);
                 assert(entry.reply.header.op == entry.reply.header.commit);
@@ -4868,12 +5699,16 @@ pub fn Replica(
             }
         }

         fn verify_pipeline(self: *Self) void {
+            assert(self.status == .view_change);
+
             var op = self.commit_max + 1;
             var parent = self.journal.header_with_op(self.commit_max).?.checksum;

             var iterator = self.pipeline.iterator();
             while (iterator.next_ptr()) |prepare| {
                 assert(prepare.message.header.command == .prepare);
+                assert(!prepare.ok_quorum_received);
+                assert(prepare.ok_from_all_replicas.count() == 0);

                 log.debug("{}: verify_pipeline: op={} checksum={x} parent={x}", .{
                     self.replica,
@@ -4971,6 +5806,12 @@ pub fn Replica(
             assert(message.header.view <= self.view);
             assert(message.header.op <= self.op);

+            if (message.header.op == self.op_checkpoint) {
+                assert(message.header.op == 0);
+            } else {
+                assert(message.header.op > self.op_checkpoint);
+            }
+
             if (!self.journal.has(message.header)) {
                 log.debug("{}: write_prepare: ignoring op={} checksum={} (header changed)", .{
                     self.replica,
@@ -5013,3 +5854,113 @@ pub fn Replica(
         }
     };
 }
+
+/// Initialize the TigerBeetle replica's data file.
+pub fn format(
+    comptime Storage: type,
+    allocator: std.mem.Allocator,
+    cluster: u32,
+    replica: u8,
+    storage: *Storage,
+    superblock: *vsr.SuperBlockType(Storage),
+) !void {
+    const ReplicaFormat = ReplicaFormatType(Storage);
+    var replica_format = ReplicaFormat{};
+
+    try replica_format.format_wal(allocator, cluster, storage);
+    assert(!replica_format.formatting);
+
+    superblock.format(
+        ReplicaFormat.format_superblock_callback,
+        &replica_format.superblock_context,
+        .{
+            .cluster = cluster,
+            .replica = replica,
+            .size_max = config.size_max, // This can later become a runtime arg, to cap storage.
+        },
+    );
+
+    replica_format.formatting = true;
+    while (replica_format.formatting) storage.tick();
+}
+
+fn ReplicaFormatType(comptime Storage: type) type {
+    const SuperBlock = vsr.SuperBlockType(Storage);
+    return struct {
+        const Self = @This();
+
+        formatting: bool = false,
+        superblock_context: SuperBlock.Context = undefined,
+        wal_write: Storage.Write = undefined,
+
+        fn format_wal(
+            self: *Self,
+            allocator: std.mem.Allocator,
+            cluster: u32,
+            storage: *Storage,
+        ) !void {
+            const header_zeroes = [_]u8{0} ** @sizeOf(Header);
+            const wal_write_size_max = 4 * 1024 * 1024;
+            assert(wal_write_size_max % config.sector_size == 0);
+
+            // Direct I/O requires the buffer to be sector-aligned.
+            var wal_buffer = try allocator.allocAdvanced(
+                u8,
+                config.sector_size,
+                wal_write_size_max,
+                .exact,
+            );
+            defer allocator.free(wal_buffer);
+
+            // The logical offset *within the WAL*.
+            var wal_offset: u64 = 0;
+            while (wal_offset < config.journal_size_max) {
+                const size = format_journal(cluster, wal_offset, wal_buffer);
+                assert(size % config.sector_size == 0);
+                assert(size > 0);
+
+                for (std.mem.bytesAsSlice(Header, wal_buffer[0..size])) |*header| {
+                    if (std.mem.eql(u8, std.mem.asBytes(header), &header_zeroes)) {
+                        // This is the (empty) body of a reserved or root Prepare.
+                    } else {
+                        // This is either a Prepare's header or a redundant header.
+                        assert(header.valid_checksum());
+                        if (header.op == 0) {
+                            assert(header.command == .prepare);
+                            assert(header.operation == .root);
+                        } else {
+                            assert(header.command == .reserved);
+                            assert(header.operation == .reserved);
+                        }
+                    }
+                }
+
+                storage.write_sectors(
+                    format_wal_sectors_callback,
+                    &self.wal_write,
+                    wal_buffer[0..size],
+                    .wal,
+                    wal_offset,
+                );
+                self.formatting = true;
+                while (self.formatting) storage.tick();
+                wal_offset += size;
+            }
+
+            // There is nothing left to write.
+            assert(format_journal(cluster, wal_offset, wal_buffer) == 0);
+        }
+
+        fn format_wal_sectors_callback(write: *Storage.Write) void {
+            const self = @fieldParentPtr(Self, "wal_write", write);
+            assert(self.formatting);
+            self.formatting = false;
+        }
+
+        fn format_superblock_callback(superblock_context: *SuperBlock.Context) void {
+            const self = @fieldParentPtr(Self, "superblock_context", superblock_context);
+            assert(self.formatting);
+            self.formatting = false;
+        }
+    };
+}
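
Both format_wal() and format() rely on the same completion convention: submit one storage operation, then poll `storage.tick()` until the callback clears `formatting`. A minimal sketch of that pattern follows; `FakeStorage` and `on_write` are hypothetical (the real `Storage` performs actual asynchronous I/O), so this illustrates the control flow only.

    const std = @import("std");
    const assert = std.debug.assert;

    // Hypothetical single-slot storage: completions fire from tick(), never inline.
    const FakeStorage = struct {
        pending: ?fn (*FakeStorage) void = null,

        fn write_sectors(self: *FakeStorage, callback: fn (*FakeStorage) void) void {
            assert(self.pending == null); // Only one WAL write in flight at a time.
            self.pending = callback;
        }

        fn tick(self: *FakeStorage) void {
            if (self.pending) |callback| {
                self.pending = null;
                callback(self);
            }
        }
    };

    var formatting: bool = false;

    fn on_write(_: *FakeStorage) void {
        assert(formatting);
        formatting = false; // The callback is the only thing that ends the polling loop.
    }

    test "tick until the write completes" {
        var storage = FakeStorage{};
        storage.write_sectors(on_write);
        formatting = true;
        while (formatting) storage.tick(); // Mirrors format()'s polling loop.
        assert(storage.pending == null);
    }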