npm - tigerbeetle-node - Versions diffs - 0.11.6 → 0.11.8 - Mend

tigerbeetle-node 0.11.6 → 0.11.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)

package/dist/.client.node.sha256 +1 -1
package/package.json +1 -1
package/src/tigerbeetle/scripts/benchmark.bat +1 -2
package/src/tigerbeetle/scripts/benchmark.sh +1 -2
package/src/tigerbeetle/scripts/install.bat +7 -0
package/src/tigerbeetle/scripts/install.sh +2 -3
package/src/tigerbeetle/src/benchmark.zig +3 -3
package/src/tigerbeetle/src/ewah.zig +6 -5
package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
package/src/tigerbeetle/src/io/darwin.zig +19 -0
package/src/tigerbeetle/src/io/linux.zig +8 -0
package/src/tigerbeetle/src/io/windows.zig +20 -2
package/src/tigerbeetle/src/iops.zig +7 -1
package/src/tigerbeetle/src/lsm/compaction.zig +18 -29
package/src/tigerbeetle/src/lsm/forest_fuzz.zig +9 -5
package/src/tigerbeetle/src/lsm/grid.zig +267 -267
package/src/tigerbeetle/src/lsm/level_iterator.zig +18 -1
package/src/tigerbeetle/src/lsm/manifest.zig +29 -1
package/src/tigerbeetle/src/lsm/manifest_log.zig +5 -5
package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +2 -2
package/src/tigerbeetle/src/lsm/set_associative_cache.zig +26 -70
package/src/tigerbeetle/src/lsm/table.zig +42 -0
package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -0
package/src/tigerbeetle/src/lsm/table_mutable.zig +1 -1
package/src/tigerbeetle/src/lsm/test.zig +2 -3
package/src/tigerbeetle/src/lsm/tree.zig +27 -6
package/src/tigerbeetle/src/lsm/tree_fuzz.zig +1 -1
package/src/tigerbeetle/src/simulator.zig +0 -5
package/src/tigerbeetle/src/storage.zig +58 -6
package/src/tigerbeetle/src/test/cluster.zig +3 -0
package/src/tigerbeetle/src/test/state_checker.zig +1 -1
package/src/tigerbeetle/src/test/storage.zig +22 -1
package/src/tigerbeetle/src/tracer.zig +50 -28
package/src/tigerbeetle/src/unit_tests.zig +9 -4
package/src/tigerbeetle/src/vopr.zig +4 -4
package/src/tigerbeetle/src/vsr/client.zig +11 -7
package/src/tigerbeetle/src/vsr/journal.zig +153 -93
package/src/tigerbeetle/src/vsr/replica.zig +10 -20
package/src/tigerbeetle/src/vsr/superblock.zig +19 -16
package/src/tigerbeetle/src/vsr/superblock_free_set.zig +114 -93
package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +1 -3
package/src/tigerbeetle/src/vsr.zig +55 -8
package/src/tigerbeetle/src/c/tb_client/context.zig +0 -304
package/src/tigerbeetle/src/c/tb_client/echo_client.zig +0 -108
package/src/tigerbeetle/src/c/tb_client/packet.zig +0 -80
package/src/tigerbeetle/src/c/tb_client/signal.zig +0 -286
package/src/tigerbeetle/src/c/tb_client/thread.zig +0 -88
package/src/tigerbeetle/src/c/tb_client.h +0 -220
package/src/tigerbeetle/src/c/tb_client.zig +0 -177
package/src/tigerbeetle/src/c/tb_client_header.zig +0 -218
package/src/tigerbeetle/src/c/tb_client_header_test.zig +0 -135
package/src/tigerbeetle/src/c/test.zig +0 -371
package/src/tigerbeetle/src/cli.zig +0 -399
package/src/tigerbeetle/src/main.zig +0 -242

package/src/tigerbeetle/src/vsr/journal.zig CHANGED

@@ -46,8 +46,10 @@ const Ring = enum {
 };
 const headers_per_sector = @divExact(constants.sector_size, @sizeOf(Header));
+const headers_per_message = @divExact(constants.message_size_max, @sizeOf(Header));
 comptime {
     assert(headers_per_sector > 0);
+    assert(headers_per_message > 0);
 }
 /// A slot is an index within:
@@ -74,15 +76,15 @@ const SlotRange = struct {
     /// * `head < tail` → `  head··tail  `
     /// * `head > tail` → `··tail  head··` (The range wraps around).
     /// * `head = tail` → panic            (Caller must handle this case separately).
-    fn contains(self: *const SlotRange, slot: Slot) bool {
+    fn contains(range: *const SlotRange, slot: Slot) bool {
         // To avoid confusion, the empty range must be checked separately by the caller.
-        assert(self.head.index != self.tail.index);
+        assert(range.head.index != range.tail.index);
-        if (self.head.index < self.tail.index) {
-            return self.head.index <= slot.index and slot.index <= self.tail.index;
+        if (range.head.index < range.tail.index) {
+            return range.head.index <= slot.index and slot.index <= range.tail.index;
         }
-        if (self.head.index > self.tail.index) {
-            return slot.index <= self.tail.index or self.head.index <= slot.index;
+        if (range.head.index > range.tail.index) {
+            return slot.index <= range.tail.index or range.head.index <= slot.index;
         }
         unreachable;
     }
@@ -180,6 +182,8 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
             }
         };
+        const HeaderChunks = std.StaticBitSet(util.div_ceil(slot_count, headers_per_message));
         storage: *Storage,
         replica: u8,
@@ -212,6 +216,11 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
         /// The buffers belong to the IOP at the corresponding index in IOPS.
         headers_iops: *align(constants.sector_size) [constants.journal_iops_write_max][constants.sector_size]u8,
+        /// A set bit indicates a chunk of redundant headers that no read has been issued to yet.
+        header_chunks_requested: HeaderChunks = HeaderChunks.initFull(),
+        /// A set bit indicates a chunk of redundant headers that has been recovered.
+        header_chunks_recovered: HeaderChunks = HeaderChunks.initEmpty(),
         /// Statically allocated read IO operation context data.
         reads: IOPS(Read, constants.journal_iops_read_max) = .{},
@@ -272,13 +281,11 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
             errdefer allocator.free(headers_redundant);
             for (headers_redundant) |*header| header.* = undefined;
-            var dirty = try BitSet.init(allocator, slot_count);
+            var dirty = try BitSet.init_full(allocator, slot_count);
             errdefer dirty.deinit(allocator);
-            for (headers) |_, index| dirty.set(Slot{ .index = index });
-            var faulty = try BitSet.init(allocator, slot_count);
+            var faulty = try BitSet.init_full(allocator, slot_count);
             errdefer faulty.deinit(allocator);
-            for (headers) |_, index| faulty.set(Slot{ .index = index });
             var prepare_checksums = try allocator.alloc(u128, slot_count);
             errdefer allocator.free(prepare_checksums);
@@ -915,47 +922,58 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
             assert(journal.status == .init);
             assert(journal.dirty.count == slot_count);
             assert(journal.faulty.count == slot_count);
+            assert(journal.reads.executing() == 0);
+            assert(journal.writes.executing() == 0);
+            assert(journal.header_chunks_requested.count() == HeaderChunks.bit_length);
+            assert(journal.header_chunks_recovered.count() == 0);
             journal.status = .{ .recovering = callback };
             log.debug("{}: recover: recovering", .{journal.replica});
-            journal.recover_headers(0);
+            var available: usize = journal.reads.available();
+            while (available > 0) : (available -= 1) journal.recover_headers();
+            assert(journal.header_chunks_recovered.count() == 0);
+            assert(journal.header_chunks_requested.count() ==
+                HeaderChunks.bit_length - journal.reads.executing());
         }
-        fn recover_headers(journal: *Journal, offset: u64) void {
+        fn recover_headers(journal: *Journal) void {
             const replica = @fieldParentPtr(Replica, "journal", journal);
             assert(journal.status == .recovering);
-            assert(journal.dirty.count == slot_count);
-            assert(journal.faulty.count == slot_count);
+            assert(journal.reads.available() > 0);
-            if (offset == headers_size) {
+            if (journal.header_chunks_recovered.count() == HeaderChunks.bit_length) {
+                assert(journal.header_chunks_requested.count() == 0);
                 log.debug("{}: recover_headers: complete", .{journal.replica});
-                journal.recover_prepares(Slot{ .index = 0 });
+                journal.recover_prepares();
                 return;
             }
-            assert(offset < headers_size);
+            const chunk_index = journal.header_chunks_requested.findFirstSet() orelse return;
+            assert(!journal.header_chunks_recovered.isSet(chunk_index));
             const message = replica.message_bus.get_message();
             defer replica.message_bus.unref(message);
-            // We expect that no other process is issuing reads while we are recovering.
-            assert(journal.reads.executing() == 0);
-            const read = journal.reads.acquire() orelse unreachable;
-            read.* = .{
+            const chunk_read = journal.reads.acquire() orelse unreachable;
+            chunk_read.* = .{
                 .journal = journal,
                 .completion = undefined,
                 .message = message.ref(),
                 .callback = undefined,
-                .op = undefined,
-                .checksum = offset,
+                .op = chunk_index,
+                .checksum = undefined,
                 .destination_replica = null,
             };
+            const offset = constants.message_size_max * chunk_index;
+            assert(offset < headers_size);
             const buffer = recover_headers_buffer(message, offset);
             assert(buffer.len > 0);
+            assert(buffer.len <= constants.message_size_max);
+            assert(buffer.len + offset <= headers_size);
             log.debug("{}: recover_headers: offset={} size={} recovering", .{
                 journal.replica,
@@ -963,9 +981,10 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
                 buffer.len,
             });
+            journal.header_chunks_requested.unset(chunk_index);
             journal.storage.read_sectors(
                 recover_headers_callback,
-                &read.completion,
+                &chunk_read.completion,
                 buffer,
                 .wal_headers,
                 offset,
@@ -973,69 +992,94 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
         }
         fn recover_headers_callback(completion: *Storage.Read) void {
-            const read = @fieldParentPtr(Journal.Read, "completion", completion);
-            const journal = read.journal;
+            const chunk_read = @fieldParentPtr(Journal.Read, "completion", completion);
+            const journal = chunk_read.journal;
             const replica = @fieldParentPtr(Replica, "journal", journal);
-            const message = read.message;
+            assert(journal.status == .recovering);
+            assert(chunk_read.destination_replica == null);
-            const offset = @intCast(u64, read.checksum);
-            const buffer = recover_headers_buffer(message, offset);
+            const chunk_index = chunk_read.op;
+            assert(!journal.header_chunks_requested.isSet(chunk_index));
+            assert(!journal.header_chunks_recovered.isSet(chunk_index));
+            const chunk_buffer = recover_headers_buffer(
+                chunk_read.message,
+                chunk_index * constants.message_size_max,
+            );
+            assert(chunk_buffer.len >= @sizeOf(Header));
+            assert(chunk_buffer.len % @sizeOf(Header) == 0);
             log.debug("{}: recover_headers: offset={} size={} recovered", .{
                 journal.replica,
-                offset,
-                buffer.len,
+                chunk_index * constants.message_size_max,
+                chunk_buffer.len,
             });
-            assert(journal.status == .recovering);
-            assert(offset % @sizeOf(Header) == 0);
-            assert(buffer.len >= @sizeOf(Header));
-            assert(buffer.len % @sizeOf(Header) == 0);
-            assert(read.destination_replica == null);
-            assert(journal.dirty.count == slot_count);
-            assert(journal.faulty.count == slot_count);
             // Directly store all the redundant headers in `journal.headers_redundant` (including any
             // that are invalid or corrupt). As the prepares are recovered, these will be replaced
             // or removed as necessary.
-            const buffer_headers = std.mem.bytesAsSlice(Header, buffer);
+            const chunk_headers = std.mem.bytesAsSlice(Header, chunk_buffer);
             util.copy_disjoint(
                 .exact,
                 Header,
-                journal.headers_redundant[@divExact(offset, @sizeOf(Header))..][0..buffer_headers.len],
-                buffer_headers,
+                journal.headers_redundant[chunk_index * headers_per_message ..][0..chunk_headers.len],
+                chunk_headers,
             );
-            const offset_next = offset + buffer.len;
             // We must release before we call `recover_headers()` in case Storage is synchronous.
             // Otherwise, we would run out of messages and reads.
-            replica.message_bus.unref(read.message);
-            journal.reads.release(read);
+            replica.message_bus.unref(chunk_read.message);
+            journal.reads.release(chunk_read);
-            journal.recover_headers(offset_next);
+            journal.header_chunks_recovered.set(chunk_index);
+            journal.recover_headers();
         }
         fn recover_headers_buffer(message: *Message, offset: u64) []align(@alignOf(Header)) u8 {
-            const max = std.math.min(message.buffer.len, headers_size - offset);
+            const max = std.math.min(constants.message_size_max, headers_size - offset);
             assert(max % constants.sector_size == 0);
             assert(max % @sizeOf(Header) == 0);
             return message.buffer[0..max];
         }
-        fn recover_prepares(journal: *Journal, slot: Slot) void {
-            const replica = @fieldParentPtr(Replica, "journal", journal);
+        /// Recover the prepares ring. Reads are issued concurrently.
+        /// - `dirty` is initially full.
+        ///   Bits are cleared when a read is issued to the slot.
+        ///   All bits are set again before recover_slots() is called.
+        /// - `faulty` is initially full.
+        ///   Bits are cleared when the slot's read finishes.
+        ///   All bits are set again before recover_slots() is called.
+        /// - The prepare's headers are loaded into `journal.headers`.
+        fn recover_prepares(journal: *Journal) void {
             assert(journal.status == .recovering);
             assert(journal.dirty.count == slot_count);
             assert(journal.faulty.count == slot_count);
-            // We expect that no other process is issuing reads while we are recovering.
             assert(journal.reads.executing() == 0);
+            assert(journal.writes.executing() == 0);
-            if (slot.index == slot_count) {
-                journal.recover_slots();
-                return;
+            var available: usize = journal.reads.available();
+            while (available > 0) : (available -= 1) journal.recover_prepare();
+            assert(journal.writes.executing() == 0);
+            assert(journal.reads.executing() > 0);
+            assert(journal.reads.executing() + journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
+        }
+        fn recover_prepare(journal: *Journal) void {
+            const replica = @fieldParentPtr(Replica, "journal", journal);
+            assert(journal.status == .recovering);
+            assert(journal.reads.available() > 0);
+            assert(journal.dirty.count <= journal.faulty.count);
+            if (journal.faulty.count == 0) {
+                for (journal.headers) |_, index| journal.dirty.set(Slot{ .index = index });
+                for (journal.headers) |_, index| journal.faulty.set(Slot{ .index = index });
+                return journal.recover_slots();
             }
-            assert(slot.index < slot_count);
+            const slot_index = journal.dirty.bits.findFirstSet() orelse return;
+            const slot = Slot{ .index = slot_index };
             const message = replica.message_bus.get_message();
             defer replica.message_bus.unref(message);
@@ -1045,18 +1089,19 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
                 .completion = undefined,
                 .message = message.ref(),
                 .callback = undefined,
-                .op = undefined,
-                .checksum = slot.index,
+                .op = slot.index,
+                .checksum = undefined,
                 .destination_replica = null,
             };
-            log.debug("{}: recover_prepares: recovering slot={}", .{
+            log.debug("{}: recover_prepare: recovering slot={}", .{
                 journal.replica,
                 slot.index,
             });
+            journal.dirty.clear(slot);
             journal.storage.read_sectors(
-                recover_prepares_callback,
+                recover_prepare_callback,
                 &read.completion,
                 // We load the entire message to verify that it isn't torn or corrupt.
                 // We don't know the message's size, so use the entire buffer.
@@ -1066,18 +1111,19 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
             );
         }
-        fn recover_prepares_callback(completion: *Storage.Read) void {
+        fn recover_prepare_callback(completion: *Storage.Read) void {
             const read = @fieldParentPtr(Journal.Read, "completion", completion);
             const journal = read.journal;
             const replica = @fieldParentPtr(Replica, "journal", journal);
             assert(journal.status == .recovering);
-            assert(journal.dirty.count == slot_count);
-            assert(journal.faulty.count == slot_count);
+            assert(journal.dirty.count <= journal.faulty.count);
             assert(read.destination_replica == null);
-            const slot = Slot{ .index = @intCast(u64, read.checksum) };
+            const slot = Slot{ .index = @intCast(u64, read.op) };
             assert(slot.index < slot_count);
+            assert(!journal.dirty.bit(slot));
+            assert(journal.faulty.bit(slot));
             // Check `valid_checksum_body` here rather than in `recover_done` so that we don't need
             // to hold onto the whole message (just the header).
@@ -1090,7 +1136,8 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
             replica.message_bus.unref(read.message);
             journal.reads.release(read);
-            journal.recover_prepares(Slot{ .index = slot.index + 1 });
+            journal.faulty.clear(slot);
+            journal.recover_prepare();
         }
         /// When in doubt about whether a particular message was received, it must be marked as
@@ -1431,19 +1478,27 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
         }
         fn recover_fix_callback(write: *Journal.Write) void {
+            const journal = write.journal;
+            assert(journal.status == .recovering);
             assert(write.trigger == .fix);
-            write.journal.recover_fix();
+            journal.writes.release(write);
+            journal.recover_fix();
         }
         fn recover_done(journal: *Journal) void {
-            const replica = @fieldParentPtr(Replica, "journal", journal);
-            const callback = journal.status.recovering;
-            journal.status = .recovered;
+            assert(journal.status == .recovering);
+            assert(journal.reads.executing() == 0);
+            assert(journal.writes.executing() == 0);
             assert(journal.dirty.count <= slot_count);
             assert(journal.faulty.count <= slot_count);
             assert(journal.faulty.count == journal.dirty.count);
+            assert(journal.header_chunks_requested.count() == 0);
+            assert(journal.header_chunks_recovered.count() == HeaderChunks.bit_length);
+            const replica = @fieldParentPtr(Replica, "journal", journal);
+            const callback = journal.status.recovering;
+            journal.status = .recovered;
             // Abort if all slots are faulty, since something is very wrong.
             if (journal.faulty.count == slot_count) @panic("WAL is completely corrupt");
@@ -2112,9 +2167,9 @@ const Case = struct {
         };
     }
-    fn check(self: *const Case, parameters: [9]bool) !bool {
+    fn check(case: *const Case, parameters: [9]bool) !bool {
         for (parameters) |b, i| {
-            switch (self.pattern[i]) {
+            switch (case.pattern[i]) {
                 .any => {},
                 .is_false => if (b) return false,
                 .is_true => if (!b) return false,
@@ -2125,12 +2180,12 @@ const Case = struct {
         return true;
     }
-    fn decision(self: *const Case, replica_count: u8) RecoveryDecision {
+    fn decision(case: *const Case, replica_count: u8) RecoveryDecision {
         assert(replica_count > 0);
         if (replica_count == 1) {
-            return self.decision_single;
+            return case.decision_single;
         } else {
-            return self.decision_multiple;
+            return case.decision_multiple;
         }
     }
 };
@@ -2227,36 +2282,41 @@ pub const BitSet = struct {
     /// The number of bits set (updated incrementally as bits are set or cleared):
     count: u64 = 0,
-    fn init(allocator: Allocator, count: usize) !BitSet {
-        const bits = try std.DynamicBitSetUnmanaged.initEmpty(allocator, count);
+    fn init_full(allocator: Allocator, count: usize) !BitSet {
+        const bits = try std.DynamicBitSetUnmanaged.initFull(allocator, count);
         errdefer bits.deinit(allocator);
-        return BitSet{ .bits = bits };
+        return BitSet{
+            .bits = bits,
+            .count = count,
+        };
     }
-    fn deinit(self: *BitSet, allocator: Allocator) void {
-        self.bits.deinit(allocator);
+    fn deinit(bit_set: *BitSet, allocator: Allocator) void {
+        assert(bit_set.count == bit_set.bits.count());
+        bit_set.bits.deinit(allocator);
     }
     /// Clear the bit for a slot (idempotent):
-    pub fn clear(self: *BitSet, slot: Slot) void {
-        if (self.bits.isSet(slot.index)) {
-            self.bits.unset(slot.index);
-            self.count -= 1;
+    pub fn clear(bit_set: *BitSet, slot: Slot) void {
+        if (bit_set.bits.isSet(slot.index)) {
+            bit_set.bits.unset(slot.index);
+            bit_set.count -= 1;
         }
     }
     /// Whether the bit for a slot is set:
-    pub fn bit(self: *const BitSet, slot: Slot) bool {
-        return self.bits.isSet(slot.index);
+    pub fn bit(bit_set: *const BitSet, slot: Slot) bool {
+        return bit_set.bits.isSet(slot.index);
     }
     /// Set the bit for a slot (idempotent):
-    pub fn set(self: *BitSet, slot: Slot) void {
-        if (!self.bits.isSet(slot.index)) {
-            self.bits.set(slot.index);
-            self.count += 1;
-            assert(self.count <= self.bits.bit_length);
+    pub fn set(bit_set: *BitSet, slot: Slot) void {
+        if (!bit_set.bits.isSet(slot.index)) {
+            bit_set.bits.set(slot.index);
+            bit_set.count += 1;
+            assert(bit_set.count <= bit_set.bits.bit_length);
         }
     }
 };

package/src/tigerbeetle/src/vsr/replica.zig CHANGED

@@ -311,8 +311,8 @@ pub fn ReplicaType(
                 allocator,
                 .{
                     .storage = options.storage,
-                    .message_pool = options.message_pool,
                     .storage_size_limit = options.storage_size_limit,
+                    .message_pool = options.message_pool,
                 },
             );
@@ -356,7 +356,7 @@ pub fn ReplicaType(
             self.opened = false;
             self.state_machine.open(state_machine_open_callback);
             while (!self.opened) {
-                self.grid.tick();
+                // self.grid.tick();
                 self.superblock.storage.tick();
             }
@@ -427,22 +427,12 @@ pub fn ReplicaType(
             assert(self.superblock.opened);
             assert(self.superblock.working.vsr_state.internally_consistent());
-            const majority = (replica_count / 2) + 1;
-            assert(majority <= replica_count);
-            assert(constants.quorum_replication_max >= 2);
-            const quorum_replication = std.math.min(constants.quorum_replication_max, majority);
-            assert(quorum_replication >= 2 or quorum_replication == replica_count);
-            const quorum_view_change = std.math.max(
-                replica_count - quorum_replication + 1,
-                majority,
-            );
-            // The view change quorum may be more expensive to make the replication quorum cheaper.
-            // The insight is that the replication phase is by far more common than the view change.
-            // This trade-off allows us to optimize for the common case.
-            // See the comments in `constants.zig` for further explanation.
-            assert(quorum_view_change >= majority);
+            const quorums = vsr.quorums(replica_count);
+            const quorum_replication = quorums.replication;
+            const quorum_view_change = quorums.view_change;
+            assert(quorum_replication <= replica_count);
+            assert(quorum_view_change <= replica_count);
+            assert(quorum_view_change + quorum_replication >= replica_count);
             if (replica_count <= 2) {
                 assert(quorum_replication == replica_count);
@@ -639,7 +629,7 @@ pub fn ReplicaType(
             // TODO Replica owns Time; should it tick() here instead of Clock?
             self.clock.tick();
-            self.grid.tick();
+            // self.grid.tick();
             self.message_bus.tick();
             if (self.status == .recovering) {
@@ -4440,7 +4430,7 @@ pub fn ReplicaType(
                         // - or (indistinguishably) this might originally have been an op greater
                         //   than replica.op, which was truncated, but is now corrupt.
                         //
-                        // we don't try to repair this op because the slot belongs (or will soon
+                        // We don't try to repair this op because the slot belongs (or will soon
                         // belong) to a newer op, from the new WAL wrap. Additionally, we may not
                         // still have access to its surrounding commits to verify the hash chain.
                         assert(op <= self.commit_min);

package/src/tigerbeetle/src/vsr/superblock.zig CHANGED

@@ -325,16 +325,14 @@ pub const superblock_trailer_free_set_size_max = blk: {
     const encode_size_max = SuperBlockFreeSet.encode_size_max(block_count_max);
     assert(encode_size_max > 0);
-    // Round up to the nearest sector:
-    break :blk div_ceil(encode_size_max, constants.sector_size) * constants.sector_size;
+    break :blk vsr.sector_ceil(encode_size_max);
 };
 pub const superblock_trailer_client_table_size_max = blk: {
     const encode_size_max = SuperBlockClientTable.encode_size_max;
     assert(encode_size_max > 0);
-    // Round up to the nearest sector:
-    break :blk div_ceil(encode_size_max, constants.sector_size) * constants.sector_size;
+    break :blk vsr.sector_ceil(encode_size_max);
 };
 pub const data_file_size_min = blk: {
@@ -352,15 +350,15 @@ const block_count_max = blk: {
     // The size of a freeset is related to the number of blocks it must store.
     // Maximize the number of grid blocks.
-    var shard_count = @divFloor(size, constants.block_size * SuperBlockFreeSet.shard_size);
+    var shard_count = @divFloor(size, constants.block_size * SuperBlockFreeSet.shard_bits);
     while (true) : (shard_count -= 1) {
-        const block_count = shard_count * SuperBlockFreeSet.shard_size;
+        const block_count = shard_count * SuperBlockFreeSet.shard_bits;
         const grid_size = block_count * constants.block_size;
         const free_set_size = vsr.sector_ceil(SuperBlockFreeSet.encode_size_max(block_count));
         const free_sets_size = constants.superblock_copies * free_set_size;
         if (free_sets_size + grid_size <= size) break;
     }
-    break :blk shard_count * SuperBlockFreeSet.shard_size;
+    break :blk shard_count * SuperBlockFreeSet.shard_bits;
 };
 comptime {
@@ -495,9 +493,9 @@ pub fn SuperBlockType(comptime Storage: type) type {
             const shard_count_limit = @intCast(usize, @divFloor(
                 options.storage_size_limit - data_file_size_min,
-                constants.block_size * FreeSet.shard_size,
+                constants.block_size * FreeSet.shard_bits,
             ));
-            const block_count_limit = shard_count_limit * FreeSet.shard_size;
+            const block_count_limit = shard_count_limit * FreeSet.shard_bits;
             assert(block_count_limit <= block_count_max);
             const a = try allocator.allocAdvanced(SuperBlockSector, constants.sector_size, 1, .exact);
@@ -524,9 +522,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
             );
             errdefer manifest.deinit(allocator);
-            // TODO Allocate a FreeSet (and write buffer) when storage_size_limit is small.
-            // Right now we can allocate blocks outside of the limit.
-            var free_set = try FreeSet.init(allocator, block_count_max);
+            var free_set = try FreeSet.init(allocator, block_count_limit);
             errdefer free_set.deinit(allocator);
             var client_table = try ClientTable.init(allocator, options.message_pool);
@@ -543,7 +539,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
             const free_set_buffer = try allocator.allocAdvanced(
                 u8,
                 constants.sector_size,
-                superblock_trailer_free_set_size_max,
+                SuperBlockFreeSet.encode_size_max(block_count_limit),
                 .exact,
             );
             errdefer allocator.free(free_set_buffer);
@@ -799,7 +795,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
         fn write_staging_encode_free_set(superblock: *SuperBlock) void {
             const staging: *SuperBlockSector = superblock.staging;
-            const encode_size_max = FreeSet.encode_size_max(block_count_max);
+            const encode_size_max = FreeSet.encode_size_max(superblock.block_count_limit);
             const target = superblock.free_set_buffer[0..encode_size_max];
             superblock.free_set.include_staging();
@@ -816,7 +812,14 @@ pub fn SuperBlockType(comptime Storage: type) type {
             assert(staging.storage_size <= staging.storage_size_max);
             assert(staging.storage_size <= superblock.storage_size_limit);
-            staging.free_set_size = @intCast(u32, superblock.free_set.encode(target));
+            if (superblock.free_set.count_acquired() == 0) {
+                // EWAH encodes a zero-length bitset to an empty slice anyway, but handle this
+                // condition separately so that during formatting it doesn't depend on the choice
+                // of storage_size_limit.
+                staging.free_set_size = 0;
+            } else {
+                staging.free_set_size = @intCast(u32, superblock.free_set.encode(target));
+            }
             staging.free_set_checksum = vsr.checksum(target[0..staging.free_set_size]);
         }
@@ -1116,7 +1119,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
                     assert(working.sequence == 1);
                     assert(working.storage_size == data_file_size_min);
                     assert(working.manifest_size == 0);
-                    assert(working.free_set_size == 8);
+                    assert(working.free_set_size == 0);
                     assert(working.client_table_size == 4);
                     assert(working.vsr_state.commit_min_checksum ==
                         vsr.Header.root_prepare(working.cluster).checksum);