tigerbeetle-node 0.11.6 → 0.11.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/dist/.client.node.sha256 +1 -1
  2. package/package.json +1 -1
  3. package/src/tigerbeetle/scripts/benchmark.bat +1 -2
  4. package/src/tigerbeetle/scripts/benchmark.sh +1 -2
  5. package/src/tigerbeetle/scripts/install.bat +7 -0
  6. package/src/tigerbeetle/scripts/install.sh +2 -3
  7. package/src/tigerbeetle/src/benchmark.zig +3 -3
  8. package/src/tigerbeetle/src/ewah.zig +6 -5
  9. package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
  10. package/src/tigerbeetle/src/io/darwin.zig +19 -0
  11. package/src/tigerbeetle/src/io/linux.zig +8 -0
  12. package/src/tigerbeetle/src/io/windows.zig +20 -2
  13. package/src/tigerbeetle/src/iops.zig +7 -1
  14. package/src/tigerbeetle/src/lsm/compaction.zig +18 -29
  15. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +9 -5
  16. package/src/tigerbeetle/src/lsm/grid.zig +267 -267
  17. package/src/tigerbeetle/src/lsm/level_iterator.zig +18 -1
  18. package/src/tigerbeetle/src/lsm/manifest.zig +29 -1
  19. package/src/tigerbeetle/src/lsm/manifest_log.zig +5 -5
  20. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +2 -2
  21. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +26 -70
  22. package/src/tigerbeetle/src/lsm/table.zig +42 -0
  23. package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -0
  24. package/src/tigerbeetle/src/lsm/table_mutable.zig +1 -1
  25. package/src/tigerbeetle/src/lsm/test.zig +2 -3
  26. package/src/tigerbeetle/src/lsm/tree.zig +27 -6
  27. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +1 -1
  28. package/src/tigerbeetle/src/simulator.zig +0 -5
  29. package/src/tigerbeetle/src/storage.zig +58 -6
  30. package/src/tigerbeetle/src/test/cluster.zig +3 -0
  31. package/src/tigerbeetle/src/test/state_checker.zig +1 -1
  32. package/src/tigerbeetle/src/test/storage.zig +22 -1
  33. package/src/tigerbeetle/src/tracer.zig +50 -28
  34. package/src/tigerbeetle/src/unit_tests.zig +9 -4
  35. package/src/tigerbeetle/src/vopr.zig +4 -4
  36. package/src/tigerbeetle/src/vsr/client.zig +11 -7
  37. package/src/tigerbeetle/src/vsr/journal.zig +153 -93
  38. package/src/tigerbeetle/src/vsr/replica.zig +10 -20
  39. package/src/tigerbeetle/src/vsr/superblock.zig +19 -16
  40. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +114 -93
  41. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
  42. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +1 -3
  43. package/src/tigerbeetle/src/vsr.zig +55 -8
  44. package/src/tigerbeetle/src/c/tb_client/context.zig +0 -304
  45. package/src/tigerbeetle/src/c/tb_client/echo_client.zig +0 -108
  46. package/src/tigerbeetle/src/c/tb_client/packet.zig +0 -80
  47. package/src/tigerbeetle/src/c/tb_client/signal.zig +0 -286
  48. package/src/tigerbeetle/src/c/tb_client/thread.zig +0 -88
  49. package/src/tigerbeetle/src/c/tb_client.h +0 -220
  50. package/src/tigerbeetle/src/c/tb_client.zig +0 -177
  51. package/src/tigerbeetle/src/c/tb_client_header.zig +0 -218
  52. package/src/tigerbeetle/src/c/tb_client_header_test.zig +0 -135
  53. package/src/tigerbeetle/src/c/test.zig +0 -371
  54. package/src/tigerbeetle/src/cli.zig +0 -399
  55. package/src/tigerbeetle/src/main.zig +0 -242
@@ -46,8 +46,10 @@ const Ring = enum {
46
46
  };
47
47
 
48
48
  const headers_per_sector = @divExact(constants.sector_size, @sizeOf(Header));
49
+ const headers_per_message = @divExact(constants.message_size_max, @sizeOf(Header));
49
50
  comptime {
50
51
  assert(headers_per_sector > 0);
52
+ assert(headers_per_message > 0);
51
53
  }
52
54
 
53
55
  /// A slot is an index within:
@@ -74,15 +76,15 @@ const SlotRange = struct {
74
76
  /// * `head < tail` → ` head··tail `
75
77
  /// * `head > tail` → `··tail head··` (The range wraps around).
76
78
  /// * `head = tail` → panic (Caller must handle this case separately).
77
- fn contains(self: *const SlotRange, slot: Slot) bool {
79
+ fn contains(range: *const SlotRange, slot: Slot) bool {
78
80
  // To avoid confusion, the empty range must be checked separately by the caller.
79
- assert(self.head.index != self.tail.index);
81
+ assert(range.head.index != range.tail.index);
80
82
 
81
- if (self.head.index < self.tail.index) {
82
- return self.head.index <= slot.index and slot.index <= self.tail.index;
83
+ if (range.head.index < range.tail.index) {
84
+ return range.head.index <= slot.index and slot.index <= range.tail.index;
83
85
  }
84
- if (self.head.index > self.tail.index) {
85
- return slot.index <= self.tail.index or self.head.index <= slot.index;
86
+ if (range.head.index > range.tail.index) {
87
+ return slot.index <= range.tail.index or range.head.index <= slot.index;
86
88
  }
87
89
  unreachable;
88
90
  }
@@ -180,6 +182,8 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
180
182
  }
181
183
  };
182
184
 
185
+ const HeaderChunks = std.StaticBitSet(util.div_ceil(slot_count, headers_per_message));
186
+
183
187
  storage: *Storage,
184
188
  replica: u8,
185
189
 
@@ -212,6 +216,11 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
212
216
  /// The buffers belong to the IOP at the corresponding index in IOPS.
213
217
  headers_iops: *align(constants.sector_size) [constants.journal_iops_write_max][constants.sector_size]u8,
214
218
 
219
+ /// A set bit indicates a chunk of redundant headers that no read has been issued to yet.
220
+ header_chunks_requested: HeaderChunks = HeaderChunks.initFull(),
221
+ /// A set bit indicates a chunk of redundant headers that has been recovered.
222
+ header_chunks_recovered: HeaderChunks = HeaderChunks.initEmpty(),
223
+
215
224
  /// Statically allocated read IO operation context data.
216
225
  reads: IOPS(Read, constants.journal_iops_read_max) = .{},
217
226
 
@@ -272,13 +281,11 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
272
281
  errdefer allocator.free(headers_redundant);
273
282
  for (headers_redundant) |*header| header.* = undefined;
274
283
 
275
- var dirty = try BitSet.init(allocator, slot_count);
284
+ var dirty = try BitSet.init_full(allocator, slot_count);
276
285
  errdefer dirty.deinit(allocator);
277
- for (headers) |_, index| dirty.set(Slot{ .index = index });
278
286
 
279
- var faulty = try BitSet.init(allocator, slot_count);
287
+ var faulty = try BitSet.init_full(allocator, slot_count);
280
288
  errdefer faulty.deinit(allocator);
281
- for (headers) |_, index| faulty.set(Slot{ .index = index });
282
289
 
283
290
  var prepare_checksums = try allocator.alloc(u128, slot_count);
284
291
  errdefer allocator.free(prepare_checksums);
@@ -915,47 +922,58 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
915
922
  assert(journal.status == .init);
916
923
  assert(journal.dirty.count == slot_count);
917
924
  assert(journal.faulty.count == slot_count);
925
+ assert(journal.reads.executing() == 0);
926
+ assert(journal.writes.executing() == 0);
927
+ assert(journal.header_chunks_requested.count() == HeaderChunks.bit_length);
928
+ assert(journal.header_chunks_recovered.count() == 0);
918
929
 
919
930
  journal.status = .{ .recovering = callback };
920
-
921
931
  log.debug("{}: recover: recovering", .{journal.replica});
922
932
 
923
- journal.recover_headers(0);
933
+ var available: usize = journal.reads.available();
934
+ while (available > 0) : (available -= 1) journal.recover_headers();
935
+
936
+ assert(journal.header_chunks_recovered.count() == 0);
937
+ assert(journal.header_chunks_requested.count() ==
938
+ HeaderChunks.bit_length - journal.reads.executing());
924
939
  }
925
940
 
926
- fn recover_headers(journal: *Journal, offset: u64) void {
941
+ fn recover_headers(journal: *Journal) void {
927
942
  const replica = @fieldParentPtr(Replica, "journal", journal);
928
-
929
943
  assert(journal.status == .recovering);
930
- assert(journal.dirty.count == slot_count);
931
- assert(journal.faulty.count == slot_count);
944
+ assert(journal.reads.available() > 0);
932
945
 
933
- if (offset == headers_size) {
946
+ if (journal.header_chunks_recovered.count() == HeaderChunks.bit_length) {
947
+ assert(journal.header_chunks_requested.count() == 0);
934
948
  log.debug("{}: recover_headers: complete", .{journal.replica});
935
- journal.recover_prepares(Slot{ .index = 0 });
949
+ journal.recover_prepares();
936
950
  return;
937
951
  }
938
- assert(offset < headers_size);
952
+
953
+ const chunk_index = journal.header_chunks_requested.findFirstSet() orelse return;
954
+ assert(!journal.header_chunks_recovered.isSet(chunk_index));
939
955
 
940
956
  const message = replica.message_bus.get_message();
941
957
  defer replica.message_bus.unref(message);
942
958
 
943
- // We expect that no other process is issuing reads while we are recovering.
944
- assert(journal.reads.executing() == 0);
945
-
946
- const read = journal.reads.acquire() orelse unreachable;
947
- read.* = .{
959
+ const chunk_read = journal.reads.acquire() orelse unreachable;
960
+ chunk_read.* = .{
948
961
  .journal = journal,
949
962
  .completion = undefined,
950
963
  .message = message.ref(),
951
964
  .callback = undefined,
952
- .op = undefined,
953
- .checksum = offset,
965
+ .op = chunk_index,
966
+ .checksum = undefined,
954
967
  .destination_replica = null,
955
968
  };
956
969
 
970
+ const offset = constants.message_size_max * chunk_index;
971
+ assert(offset < headers_size);
972
+
957
973
  const buffer = recover_headers_buffer(message, offset);
958
974
  assert(buffer.len > 0);
975
+ assert(buffer.len <= constants.message_size_max);
976
+ assert(buffer.len + offset <= headers_size);
959
977
 
960
978
  log.debug("{}: recover_headers: offset={} size={} recovering", .{
961
979
  journal.replica,
@@ -963,9 +981,10 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
963
981
  buffer.len,
964
982
  });
965
983
 
984
+ journal.header_chunks_requested.unset(chunk_index);
966
985
  journal.storage.read_sectors(
967
986
  recover_headers_callback,
968
- &read.completion,
987
+ &chunk_read.completion,
969
988
  buffer,
970
989
  .wal_headers,
971
990
  offset,
@@ -973,69 +992,94 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
973
992
  }
974
993
 
975
994
  fn recover_headers_callback(completion: *Storage.Read) void {
976
- const read = @fieldParentPtr(Journal.Read, "completion", completion);
977
- const journal = read.journal;
995
+ const chunk_read = @fieldParentPtr(Journal.Read, "completion", completion);
996
+ const journal = chunk_read.journal;
978
997
  const replica = @fieldParentPtr(Replica, "journal", journal);
979
- const message = read.message;
998
+ assert(journal.status == .recovering);
999
+ assert(chunk_read.destination_replica == null);
980
1000
 
981
- const offset = @intCast(u64, read.checksum);
982
- const buffer = recover_headers_buffer(message, offset);
1001
+ const chunk_index = chunk_read.op;
1002
+ assert(!journal.header_chunks_requested.isSet(chunk_index));
1003
+ assert(!journal.header_chunks_recovered.isSet(chunk_index));
1004
+
1005
+ const chunk_buffer = recover_headers_buffer(
1006
+ chunk_read.message,
1007
+ chunk_index * constants.message_size_max,
1008
+ );
1009
+ assert(chunk_buffer.len >= @sizeOf(Header));
1010
+ assert(chunk_buffer.len % @sizeOf(Header) == 0);
983
1011
 
984
1012
  log.debug("{}: recover_headers: offset={} size={} recovered", .{
985
1013
  journal.replica,
986
- offset,
987
- buffer.len,
1014
+ chunk_index * constants.message_size_max,
1015
+ chunk_buffer.len,
988
1016
  });
989
1017
 
990
- assert(journal.status == .recovering);
991
- assert(offset % @sizeOf(Header) == 0);
992
- assert(buffer.len >= @sizeOf(Header));
993
- assert(buffer.len % @sizeOf(Header) == 0);
994
- assert(read.destination_replica == null);
995
- assert(journal.dirty.count == slot_count);
996
- assert(journal.faulty.count == slot_count);
997
-
998
1018
  // Directly store all the redundant headers in `journal.headers_redundant` (including any
999
1019
  // that are invalid or corrupt). As the prepares are recovered, these will be replaced
1000
1020
  // or removed as necessary.
1001
- const buffer_headers = std.mem.bytesAsSlice(Header, buffer);
1021
+ const chunk_headers = std.mem.bytesAsSlice(Header, chunk_buffer);
1002
1022
  util.copy_disjoint(
1003
1023
  .exact,
1004
1024
  Header,
1005
- journal.headers_redundant[@divExact(offset, @sizeOf(Header))..][0..buffer_headers.len],
1006
- buffer_headers,
1025
+ journal.headers_redundant[chunk_index * headers_per_message ..][0..chunk_headers.len],
1026
+ chunk_headers,
1007
1027
  );
1008
1028
 
1009
- const offset_next = offset + buffer.len;
1010
1029
  // We must release before we call `recover_headers()` in case Storage is synchronous.
1011
1030
  // Otherwise, we would run out of messages and reads.
1012
- replica.message_bus.unref(read.message);
1013
- journal.reads.release(read);
1031
+ replica.message_bus.unref(chunk_read.message);
1032
+ journal.reads.release(chunk_read);
1014
1033
 
1015
- journal.recover_headers(offset_next);
1034
+ journal.header_chunks_recovered.set(chunk_index);
1035
+ journal.recover_headers();
1016
1036
  }
1017
1037
 
1018
1038
  fn recover_headers_buffer(message: *Message, offset: u64) []align(@alignOf(Header)) u8 {
1019
- const max = std.math.min(message.buffer.len, headers_size - offset);
1039
+ const max = std.math.min(constants.message_size_max, headers_size - offset);
1020
1040
  assert(max % constants.sector_size == 0);
1021
1041
  assert(max % @sizeOf(Header) == 0);
1022
1042
  return message.buffer[0..max];
1023
1043
  }
1024
1044
 
1025
- fn recover_prepares(journal: *Journal, slot: Slot) void {
1026
- const replica = @fieldParentPtr(Replica, "journal", journal);
1045
+ /// Recover the prepares ring. Reads are issued concurrently.
1046
+ /// - `dirty` is initially full.
1047
+ /// Bits are cleared when a read is issued to the slot.
1048
+ /// All bits are set again before recover_slots() is called.
1049
+ /// - `faulty` is initially full.
1050
+ /// Bits are cleared when the slot's read finishes.
1051
+ /// All bits are set again before recover_slots() is called.
1052
+ /// - The prepare's headers are loaded into `journal.headers`.
1053
+ fn recover_prepares(journal: *Journal) void {
1027
1054
  assert(journal.status == .recovering);
1028
1055
  assert(journal.dirty.count == slot_count);
1029
1056
  assert(journal.faulty.count == slot_count);
1030
- // We expect that no other process is issuing reads while we are recovering.
1031
1057
  assert(journal.reads.executing() == 0);
1058
+ assert(journal.writes.executing() == 0);
1032
1059
 
1033
- if (slot.index == slot_count) {
1034
- journal.recover_slots();
1035
- return;
1060
+ var available: usize = journal.reads.available();
1061
+ while (available > 0) : (available -= 1) journal.recover_prepare();
1062
+
1063
+ assert(journal.writes.executing() == 0);
1064
+ assert(journal.reads.executing() > 0);
1065
+ assert(journal.reads.executing() + journal.dirty.count == slot_count);
1066
+ assert(journal.faulty.count == slot_count);
1067
+ }
1068
+
1069
+ fn recover_prepare(journal: *Journal) void {
1070
+ const replica = @fieldParentPtr(Replica, "journal", journal);
1071
+ assert(journal.status == .recovering);
1072
+ assert(journal.reads.available() > 0);
1073
+ assert(journal.dirty.count <= journal.faulty.count);
1074
+
1075
+ if (journal.faulty.count == 0) {
1076
+ for (journal.headers) |_, index| journal.dirty.set(Slot{ .index = index });
1077
+ for (journal.headers) |_, index| journal.faulty.set(Slot{ .index = index });
1078
+ return journal.recover_slots();
1036
1079
  }
1037
- assert(slot.index < slot_count);
1038
1080
 
1081
+ const slot_index = journal.dirty.bits.findFirstSet() orelse return;
1082
+ const slot = Slot{ .index = slot_index };
1039
1083
  const message = replica.message_bus.get_message();
1040
1084
  defer replica.message_bus.unref(message);
1041
1085
 
@@ -1045,18 +1089,19 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
1045
1089
  .completion = undefined,
1046
1090
  .message = message.ref(),
1047
1091
  .callback = undefined,
1048
- .op = undefined,
1049
- .checksum = slot.index,
1092
+ .op = slot.index,
1093
+ .checksum = undefined,
1050
1094
  .destination_replica = null,
1051
1095
  };
1052
1096
 
1053
- log.debug("{}: recover_prepares: recovering slot={}", .{
1097
+ log.debug("{}: recover_prepare: recovering slot={}", .{
1054
1098
  journal.replica,
1055
1099
  slot.index,
1056
1100
  });
1057
1101
 
1102
+ journal.dirty.clear(slot);
1058
1103
  journal.storage.read_sectors(
1059
- recover_prepares_callback,
1104
+ recover_prepare_callback,
1060
1105
  &read.completion,
1061
1106
  // We load the entire message to verify that it isn't torn or corrupt.
1062
1107
  // We don't know the message's size, so use the entire buffer.
@@ -1066,18 +1111,19 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
1066
1111
  );
1067
1112
  }
1068
1113
 
1069
- fn recover_prepares_callback(completion: *Storage.Read) void {
1114
+ fn recover_prepare_callback(completion: *Storage.Read) void {
1070
1115
  const read = @fieldParentPtr(Journal.Read, "completion", completion);
1071
1116
  const journal = read.journal;
1072
1117
  const replica = @fieldParentPtr(Replica, "journal", journal);
1073
1118
 
1074
1119
  assert(journal.status == .recovering);
1075
- assert(journal.dirty.count == slot_count);
1076
- assert(journal.faulty.count == slot_count);
1120
+ assert(journal.dirty.count <= journal.faulty.count);
1077
1121
  assert(read.destination_replica == null);
1078
1122
 
1079
- const slot = Slot{ .index = @intCast(u64, read.checksum) };
1123
+ const slot = Slot{ .index = @intCast(u64, read.op) };
1080
1124
  assert(slot.index < slot_count);
1125
+ assert(!journal.dirty.bit(slot));
1126
+ assert(journal.faulty.bit(slot));
1081
1127
 
1082
1128
  // Check `valid_checksum_body` here rather than in `recover_done` so that we don't need
1083
1129
  // to hold onto the whole message (just the header).
@@ -1090,7 +1136,8 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
1090
1136
  replica.message_bus.unref(read.message);
1091
1137
  journal.reads.release(read);
1092
1138
 
1093
- journal.recover_prepares(Slot{ .index = slot.index + 1 });
1139
+ journal.faulty.clear(slot);
1140
+ journal.recover_prepare();
1094
1141
  }
1095
1142
 
1096
1143
  /// When in doubt about whether a particular message was received, it must be marked as
@@ -1431,19 +1478,27 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
1431
1478
  }
1432
1479
 
1433
1480
  fn recover_fix_callback(write: *Journal.Write) void {
1481
+ const journal = write.journal;
1482
+ assert(journal.status == .recovering);
1434
1483
  assert(write.trigger == .fix);
1435
- write.journal.recover_fix();
1484
+
1485
+ journal.writes.release(write);
1486
+ journal.recover_fix();
1436
1487
  }
1437
1488
 
1438
1489
  fn recover_done(journal: *Journal) void {
1439
- const replica = @fieldParentPtr(Replica, "journal", journal);
1440
-
1441
- const callback = journal.status.recovering;
1442
- journal.status = .recovered;
1443
-
1490
+ assert(journal.status == .recovering);
1491
+ assert(journal.reads.executing() == 0);
1492
+ assert(journal.writes.executing() == 0);
1444
1493
  assert(journal.dirty.count <= slot_count);
1445
1494
  assert(journal.faulty.count <= slot_count);
1446
1495
  assert(journal.faulty.count == journal.dirty.count);
1496
+ assert(journal.header_chunks_requested.count() == 0);
1497
+ assert(journal.header_chunks_recovered.count() == HeaderChunks.bit_length);
1498
+
1499
+ const replica = @fieldParentPtr(Replica, "journal", journal);
1500
+ const callback = journal.status.recovering;
1501
+ journal.status = .recovered;
1447
1502
 
1448
1503
  // Abort if all slots are faulty, since something is very wrong.
1449
1504
  if (journal.faulty.count == slot_count) @panic("WAL is completely corrupt");
@@ -2112,9 +2167,9 @@ const Case = struct {
2112
2167
  };
2113
2168
  }
2114
2169
 
2115
- fn check(self: *const Case, parameters: [9]bool) !bool {
2170
+ fn check(case: *const Case, parameters: [9]bool) !bool {
2116
2171
  for (parameters) |b, i| {
2117
- switch (self.pattern[i]) {
2172
+ switch (case.pattern[i]) {
2118
2173
  .any => {},
2119
2174
  .is_false => if (b) return false,
2120
2175
  .is_true => if (!b) return false,
@@ -2125,12 +2180,12 @@ const Case = struct {
2125
2180
  return true;
2126
2181
  }
2127
2182
 
2128
- fn decision(self: *const Case, replica_count: u8) RecoveryDecision {
2183
+ fn decision(case: *const Case, replica_count: u8) RecoveryDecision {
2129
2184
  assert(replica_count > 0);
2130
2185
  if (replica_count == 1) {
2131
- return self.decision_single;
2186
+ return case.decision_single;
2132
2187
  } else {
2133
- return self.decision_multiple;
2188
+ return case.decision_multiple;
2134
2189
  }
2135
2190
  }
2136
2191
  };
@@ -2227,36 +2282,41 @@ pub const BitSet = struct {
2227
2282
  /// The number of bits set (updated incrementally as bits are set or cleared):
2228
2283
  count: u64 = 0,
2229
2284
 
2230
- fn init(allocator: Allocator, count: usize) !BitSet {
2231
- const bits = try std.DynamicBitSetUnmanaged.initEmpty(allocator, count);
2285
+ fn init_full(allocator: Allocator, count: usize) !BitSet {
2286
+ const bits = try std.DynamicBitSetUnmanaged.initFull(allocator, count);
2232
2287
  errdefer bits.deinit(allocator);
2233
2288
 
2234
- return BitSet{ .bits = bits };
2289
+ return BitSet{
2290
+ .bits = bits,
2291
+ .count = count,
2292
+ };
2235
2293
  }
2236
2294
 
2237
- fn deinit(self: *BitSet, allocator: Allocator) void {
2238
- self.bits.deinit(allocator);
2295
+ fn deinit(bit_set: *BitSet, allocator: Allocator) void {
2296
+ assert(bit_set.count == bit_set.bits.count());
2297
+
2298
+ bit_set.bits.deinit(allocator);
2239
2299
  }
2240
2300
 
2241
2301
  /// Clear the bit for a slot (idempotent):
2242
- pub fn clear(self: *BitSet, slot: Slot) void {
2243
- if (self.bits.isSet(slot.index)) {
2244
- self.bits.unset(slot.index);
2245
- self.count -= 1;
2302
+ pub fn clear(bit_set: *BitSet, slot: Slot) void {
2303
+ if (bit_set.bits.isSet(slot.index)) {
2304
+ bit_set.bits.unset(slot.index);
2305
+ bit_set.count -= 1;
2246
2306
  }
2247
2307
  }
2248
2308
 
2249
2309
  /// Whether the bit for a slot is set:
2250
- pub fn bit(self: *const BitSet, slot: Slot) bool {
2251
- return self.bits.isSet(slot.index);
2310
+ pub fn bit(bit_set: *const BitSet, slot: Slot) bool {
2311
+ return bit_set.bits.isSet(slot.index);
2252
2312
  }
2253
2313
 
2254
2314
  /// Set the bit for a slot (idempotent):
2255
- pub fn set(self: *BitSet, slot: Slot) void {
2256
- if (!self.bits.isSet(slot.index)) {
2257
- self.bits.set(slot.index);
2258
- self.count += 1;
2259
- assert(self.count <= self.bits.bit_length);
2315
+ pub fn set(bit_set: *BitSet, slot: Slot) void {
2316
+ if (!bit_set.bits.isSet(slot.index)) {
2317
+ bit_set.bits.set(slot.index);
2318
+ bit_set.count += 1;
2319
+ assert(bit_set.count <= bit_set.bits.bit_length);
2260
2320
  }
2261
2321
  }
2262
2322
  };
@@ -311,8 +311,8 @@ pub fn ReplicaType(
311
311
  allocator,
312
312
  .{
313
313
  .storage = options.storage,
314
- .message_pool = options.message_pool,
315
314
  .storage_size_limit = options.storage_size_limit,
315
+ .message_pool = options.message_pool,
316
316
  },
317
317
  );
318
318
 
@@ -356,7 +356,7 @@ pub fn ReplicaType(
356
356
  self.opened = false;
357
357
  self.state_machine.open(state_machine_open_callback);
358
358
  while (!self.opened) {
359
- self.grid.tick();
359
+ // self.grid.tick();
360
360
  self.superblock.storage.tick();
361
361
  }
362
362
 
@@ -427,22 +427,12 @@ pub fn ReplicaType(
427
427
  assert(self.superblock.opened);
428
428
  assert(self.superblock.working.vsr_state.internally_consistent());
429
429
 
430
- const majority = (replica_count / 2) + 1;
431
- assert(majority <= replica_count);
432
-
433
- assert(constants.quorum_replication_max >= 2);
434
- const quorum_replication = std.math.min(constants.quorum_replication_max, majority);
435
- assert(quorum_replication >= 2 or quorum_replication == replica_count);
436
-
437
- const quorum_view_change = std.math.max(
438
- replica_count - quorum_replication + 1,
439
- majority,
440
- );
441
- // The view change quorum may be more expensive to make the replication quorum cheaper.
442
- // The insight is that the replication phase is by far more common than the view change.
443
- // This trade-off allows us to optimize for the common case.
444
- // See the comments in `constants.zig` for further explanation.
445
- assert(quorum_view_change >= majority);
430
+ const quorums = vsr.quorums(replica_count);
431
+ const quorum_replication = quorums.replication;
432
+ const quorum_view_change = quorums.view_change;
433
+ assert(quorum_replication <= replica_count);
434
+ assert(quorum_view_change <= replica_count);
435
+ assert(quorum_view_change + quorum_replication >= replica_count);
446
436
 
447
437
  if (replica_count <= 2) {
448
438
  assert(quorum_replication == replica_count);
@@ -639,7 +629,7 @@ pub fn ReplicaType(
639
629
 
640
630
  // TODO Replica owns Time; should it tick() here instead of Clock?
641
631
  self.clock.tick();
642
- self.grid.tick();
632
+ // self.grid.tick();
643
633
  self.message_bus.tick();
644
634
 
645
635
  if (self.status == .recovering) {
@@ -4440,7 +4430,7 @@ pub fn ReplicaType(
4440
4430
  // - or (indistinguishably) this might originally have been an op greater
4441
4431
  // than replica.op, which was truncated, but is now corrupt.
4442
4432
  //
4443
- // we don't try to repair this op because the slot belongs (or will soon
4433
+ // We don't try to repair this op because the slot belongs (or will soon
4444
4434
  // belong) to a newer op, from the new WAL wrap. Additionally, we may not
4445
4435
  // still have access to its surrounding commits to verify the hash chain.
4446
4436
  assert(op <= self.commit_min);
@@ -325,16 +325,14 @@ pub const superblock_trailer_free_set_size_max = blk: {
325
325
  const encode_size_max = SuperBlockFreeSet.encode_size_max(block_count_max);
326
326
  assert(encode_size_max > 0);
327
327
 
328
- // Round up to the nearest sector:
329
- break :blk div_ceil(encode_size_max, constants.sector_size) * constants.sector_size;
328
+ break :blk vsr.sector_ceil(encode_size_max);
330
329
  };
331
330
 
332
331
  pub const superblock_trailer_client_table_size_max = blk: {
333
332
  const encode_size_max = SuperBlockClientTable.encode_size_max;
334
333
  assert(encode_size_max > 0);
335
334
 
336
- // Round up to the nearest sector:
337
- break :blk div_ceil(encode_size_max, constants.sector_size) * constants.sector_size;
335
+ break :blk vsr.sector_ceil(encode_size_max);
338
336
  };
339
337
 
340
338
  pub const data_file_size_min = blk: {
@@ -352,15 +350,15 @@ const block_count_max = blk: {
352
350
  // The size of a freeset is related to the number of blocks it must store.
353
351
  // Maximize the number of grid blocks.
354
352
 
355
- var shard_count = @divFloor(size, constants.block_size * SuperBlockFreeSet.shard_size);
353
+ var shard_count = @divFloor(size, constants.block_size * SuperBlockFreeSet.shard_bits);
356
354
  while (true) : (shard_count -= 1) {
357
- const block_count = shard_count * SuperBlockFreeSet.shard_size;
355
+ const block_count = shard_count * SuperBlockFreeSet.shard_bits;
358
356
  const grid_size = block_count * constants.block_size;
359
357
  const free_set_size = vsr.sector_ceil(SuperBlockFreeSet.encode_size_max(block_count));
360
358
  const free_sets_size = constants.superblock_copies * free_set_size;
361
359
  if (free_sets_size + grid_size <= size) break;
362
360
  }
363
- break :blk shard_count * SuperBlockFreeSet.shard_size;
361
+ break :blk shard_count * SuperBlockFreeSet.shard_bits;
364
362
  };
365
363
 
366
364
  comptime {
@@ -495,9 +493,9 @@ pub fn SuperBlockType(comptime Storage: type) type {
495
493
 
496
494
  const shard_count_limit = @intCast(usize, @divFloor(
497
495
  options.storage_size_limit - data_file_size_min,
498
- constants.block_size * FreeSet.shard_size,
496
+ constants.block_size * FreeSet.shard_bits,
499
497
  ));
500
- const block_count_limit = shard_count_limit * FreeSet.shard_size;
498
+ const block_count_limit = shard_count_limit * FreeSet.shard_bits;
501
499
  assert(block_count_limit <= block_count_max);
502
500
 
503
501
  const a = try allocator.allocAdvanced(SuperBlockSector, constants.sector_size, 1, .exact);
@@ -524,9 +522,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
524
522
  );
525
523
  errdefer manifest.deinit(allocator);
526
524
 
527
- // TODO Allocate a FreeSet (and write buffer) when storage_size_limit is small.
528
- // Right now we can allocate blocks outside of the limit.
529
- var free_set = try FreeSet.init(allocator, block_count_max);
525
+ var free_set = try FreeSet.init(allocator, block_count_limit);
530
526
  errdefer free_set.deinit(allocator);
531
527
 
532
528
  var client_table = try ClientTable.init(allocator, options.message_pool);
@@ -543,7 +539,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
543
539
  const free_set_buffer = try allocator.allocAdvanced(
544
540
  u8,
545
541
  constants.sector_size,
546
- superblock_trailer_free_set_size_max,
542
+ SuperBlockFreeSet.encode_size_max(block_count_limit),
547
543
  .exact,
548
544
  );
549
545
  errdefer allocator.free(free_set_buffer);
@@ -799,7 +795,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
799
795
 
800
796
  fn write_staging_encode_free_set(superblock: *SuperBlock) void {
801
797
  const staging: *SuperBlockSector = superblock.staging;
802
- const encode_size_max = FreeSet.encode_size_max(block_count_max);
798
+ const encode_size_max = FreeSet.encode_size_max(superblock.block_count_limit);
803
799
  const target = superblock.free_set_buffer[0..encode_size_max];
804
800
 
805
801
  superblock.free_set.include_staging();
@@ -816,7 +812,14 @@ pub fn SuperBlockType(comptime Storage: type) type {
816
812
  assert(staging.storage_size <= staging.storage_size_max);
817
813
  assert(staging.storage_size <= superblock.storage_size_limit);
818
814
 
819
- staging.free_set_size = @intCast(u32, superblock.free_set.encode(target));
815
+ if (superblock.free_set.count_acquired() == 0) {
816
+ // EWAH encodes a zero-length bitset to an empty slice anyway, but handle this
817
+ // condition separately so that during formatting it doesn't depend on the choice
818
+ // of storage_size_limit.
819
+ staging.free_set_size = 0;
820
+ } else {
821
+ staging.free_set_size = @intCast(u32, superblock.free_set.encode(target));
822
+ }
820
823
  staging.free_set_checksum = vsr.checksum(target[0..staging.free_set_size]);
821
824
  }
822
825
 
@@ -1116,7 +1119,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
1116
1119
  assert(working.sequence == 1);
1117
1120
  assert(working.storage_size == data_file_size_min);
1118
1121
  assert(working.manifest_size == 0);
1119
- assert(working.free_set_size == 8);
1122
+ assert(working.free_set_size == 0);
1120
1123
  assert(working.client_table_size == 4);
1121
1124
  assert(working.vsr_state.commit_min_checksum ==
1122
1125
  vsr.Header.root_prepare(working.cluster).checksum);