tigerbeetle-node 0.11.6 → 0.11.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.client.node.sha256 +1 -1
- package/package.json +1 -1
- package/src/tigerbeetle/scripts/benchmark.bat +1 -2
- package/src/tigerbeetle/scripts/benchmark.sh +1 -2
- package/src/tigerbeetle/scripts/install.bat +7 -0
- package/src/tigerbeetle/scripts/install.sh +2 -3
- package/src/tigerbeetle/src/benchmark.zig +3 -3
- package/src/tigerbeetle/src/ewah.zig +6 -5
- package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
- package/src/tigerbeetle/src/io/darwin.zig +19 -0
- package/src/tigerbeetle/src/io/linux.zig +8 -0
- package/src/tigerbeetle/src/io/windows.zig +20 -2
- package/src/tigerbeetle/src/iops.zig +7 -1
- package/src/tigerbeetle/src/lsm/compaction.zig +18 -29
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +9 -5
- package/src/tigerbeetle/src/lsm/grid.zig +267 -267
- package/src/tigerbeetle/src/lsm/level_iterator.zig +18 -1
- package/src/tigerbeetle/src/lsm/manifest.zig +29 -1
- package/src/tigerbeetle/src/lsm/manifest_log.zig +5 -5
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +2 -2
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +26 -70
- package/src/tigerbeetle/src/lsm/table.zig +42 -0
- package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -0
- package/src/tigerbeetle/src/lsm/table_mutable.zig +1 -1
- package/src/tigerbeetle/src/lsm/test.zig +2 -3
- package/src/tigerbeetle/src/lsm/tree.zig +27 -6
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +1 -1
- package/src/tigerbeetle/src/simulator.zig +0 -5
- package/src/tigerbeetle/src/storage.zig +58 -6
- package/src/tigerbeetle/src/test/cluster.zig +3 -0
- package/src/tigerbeetle/src/test/state_checker.zig +1 -1
- package/src/tigerbeetle/src/test/storage.zig +22 -1
- package/src/tigerbeetle/src/tracer.zig +50 -28
- package/src/tigerbeetle/src/unit_tests.zig +9 -4
- package/src/tigerbeetle/src/vopr.zig +4 -4
- package/src/tigerbeetle/src/vsr/client.zig +11 -7
- package/src/tigerbeetle/src/vsr/journal.zig +153 -93
- package/src/tigerbeetle/src/vsr/replica.zig +10 -20
- package/src/tigerbeetle/src/vsr/superblock.zig +19 -16
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +114 -93
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +1 -3
- package/src/tigerbeetle/src/vsr.zig +55 -8
- package/src/tigerbeetle/src/c/tb_client/context.zig +0 -304
- package/src/tigerbeetle/src/c/tb_client/echo_client.zig +0 -108
- package/src/tigerbeetle/src/c/tb_client/packet.zig +0 -80
- package/src/tigerbeetle/src/c/tb_client/signal.zig +0 -286
- package/src/tigerbeetle/src/c/tb_client/thread.zig +0 -88
- package/src/tigerbeetle/src/c/tb_client.h +0 -220
- package/src/tigerbeetle/src/c/tb_client.zig +0 -177
- package/src/tigerbeetle/src/c/tb_client_header.zig +0 -218
- package/src/tigerbeetle/src/c/tb_client_header_test.zig +0 -135
- package/src/tigerbeetle/src/c/test.zig +0 -371
- package/src/tigerbeetle/src/cli.zig +0 -399
- package/src/tigerbeetle/src/main.zig +0 -242
|
@@ -46,8 +46,10 @@ const Ring = enum {
|
|
|
46
46
|
};
|
|
47
47
|
|
|
48
48
|
const headers_per_sector = @divExact(constants.sector_size, @sizeOf(Header));
|
|
49
|
+
const headers_per_message = @divExact(constants.message_size_max, @sizeOf(Header));
|
|
49
50
|
comptime {
|
|
50
51
|
assert(headers_per_sector > 0);
|
|
52
|
+
assert(headers_per_message > 0);
|
|
51
53
|
}
|
|
52
54
|
|
|
53
55
|
/// A slot is an index within:
|
|
@@ -74,15 +76,15 @@ const SlotRange = struct {
|
|
|
74
76
|
/// * `head < tail` → ` head··tail `
|
|
75
77
|
/// * `head > tail` → `··tail head··` (The range wraps around).
|
|
76
78
|
/// * `head = tail` → panic (Caller must handle this case separately).
|
|
77
|
-
fn contains(
|
|
79
|
+
fn contains(range: *const SlotRange, slot: Slot) bool {
|
|
78
80
|
// To avoid confusion, the empty range must be checked separately by the caller.
|
|
79
|
-
assert(
|
|
81
|
+
assert(range.head.index != range.tail.index);
|
|
80
82
|
|
|
81
|
-
if (
|
|
82
|
-
return
|
|
83
|
+
if (range.head.index < range.tail.index) {
|
|
84
|
+
return range.head.index <= slot.index and slot.index <= range.tail.index;
|
|
83
85
|
}
|
|
84
|
-
if (
|
|
85
|
-
return slot.index <=
|
|
86
|
+
if (range.head.index > range.tail.index) {
|
|
87
|
+
return slot.index <= range.tail.index or range.head.index <= slot.index;
|
|
86
88
|
}
|
|
87
89
|
unreachable;
|
|
88
90
|
}
|
|
@@ -180,6 +182,8 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
180
182
|
}
|
|
181
183
|
};
|
|
182
184
|
|
|
185
|
+
const HeaderChunks = std.StaticBitSet(util.div_ceil(slot_count, headers_per_message));
|
|
186
|
+
|
|
183
187
|
storage: *Storage,
|
|
184
188
|
replica: u8,
|
|
185
189
|
|
|
@@ -212,6 +216,11 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
212
216
|
/// The buffers belong to the IOP at the corresponding index in IOPS.
|
|
213
217
|
headers_iops: *align(constants.sector_size) [constants.journal_iops_write_max][constants.sector_size]u8,
|
|
214
218
|
|
|
219
|
+
/// A set bit indicates a chunk of redundant headers that no read has been issued to yet.
|
|
220
|
+
header_chunks_requested: HeaderChunks = HeaderChunks.initFull(),
|
|
221
|
+
/// A set bit indicates a chunk of redundant headers that has been recovered.
|
|
222
|
+
header_chunks_recovered: HeaderChunks = HeaderChunks.initEmpty(),
|
|
223
|
+
|
|
215
224
|
/// Statically allocated read IO operation context data.
|
|
216
225
|
reads: IOPS(Read, constants.journal_iops_read_max) = .{},
|
|
217
226
|
|
|
@@ -272,13 +281,11 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
272
281
|
errdefer allocator.free(headers_redundant);
|
|
273
282
|
for (headers_redundant) |*header| header.* = undefined;
|
|
274
283
|
|
|
275
|
-
var dirty = try BitSet.
|
|
284
|
+
var dirty = try BitSet.init_full(allocator, slot_count);
|
|
276
285
|
errdefer dirty.deinit(allocator);
|
|
277
|
-
for (headers) |_, index| dirty.set(Slot{ .index = index });
|
|
278
286
|
|
|
279
|
-
var faulty = try BitSet.
|
|
287
|
+
var faulty = try BitSet.init_full(allocator, slot_count);
|
|
280
288
|
errdefer faulty.deinit(allocator);
|
|
281
|
-
for (headers) |_, index| faulty.set(Slot{ .index = index });
|
|
282
289
|
|
|
283
290
|
var prepare_checksums = try allocator.alloc(u128, slot_count);
|
|
284
291
|
errdefer allocator.free(prepare_checksums);
|
|
@@ -915,47 +922,58 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
915
922
|
assert(journal.status == .init);
|
|
916
923
|
assert(journal.dirty.count == slot_count);
|
|
917
924
|
assert(journal.faulty.count == slot_count);
|
|
925
|
+
assert(journal.reads.executing() == 0);
|
|
926
|
+
assert(journal.writes.executing() == 0);
|
|
927
|
+
assert(journal.header_chunks_requested.count() == HeaderChunks.bit_length);
|
|
928
|
+
assert(journal.header_chunks_recovered.count() == 0);
|
|
918
929
|
|
|
919
930
|
journal.status = .{ .recovering = callback };
|
|
920
|
-
|
|
921
931
|
log.debug("{}: recover: recovering", .{journal.replica});
|
|
922
932
|
|
|
923
|
-
journal.
|
|
933
|
+
var available: usize = journal.reads.available();
|
|
934
|
+
while (available > 0) : (available -= 1) journal.recover_headers();
|
|
935
|
+
|
|
936
|
+
assert(journal.header_chunks_recovered.count() == 0);
|
|
937
|
+
assert(journal.header_chunks_requested.count() ==
|
|
938
|
+
HeaderChunks.bit_length - journal.reads.executing());
|
|
924
939
|
}
|
|
925
940
|
|
|
926
|
-
fn recover_headers(journal: *Journal
|
|
941
|
+
fn recover_headers(journal: *Journal) void {
|
|
927
942
|
const replica = @fieldParentPtr(Replica, "journal", journal);
|
|
928
|
-
|
|
929
943
|
assert(journal.status == .recovering);
|
|
930
|
-
assert(journal.
|
|
931
|
-
assert(journal.faulty.count == slot_count);
|
|
944
|
+
assert(journal.reads.available() > 0);
|
|
932
945
|
|
|
933
|
-
if (
|
|
946
|
+
if (journal.header_chunks_recovered.count() == HeaderChunks.bit_length) {
|
|
947
|
+
assert(journal.header_chunks_requested.count() == 0);
|
|
934
948
|
log.debug("{}: recover_headers: complete", .{journal.replica});
|
|
935
|
-
journal.recover_prepares(
|
|
949
|
+
journal.recover_prepares();
|
|
936
950
|
return;
|
|
937
951
|
}
|
|
938
|
-
|
|
952
|
+
|
|
953
|
+
const chunk_index = journal.header_chunks_requested.findFirstSet() orelse return;
|
|
954
|
+
assert(!journal.header_chunks_recovered.isSet(chunk_index));
|
|
939
955
|
|
|
940
956
|
const message = replica.message_bus.get_message();
|
|
941
957
|
defer replica.message_bus.unref(message);
|
|
942
958
|
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
const read = journal.reads.acquire() orelse unreachable;
|
|
947
|
-
read.* = .{
|
|
959
|
+
const chunk_read = journal.reads.acquire() orelse unreachable;
|
|
960
|
+
chunk_read.* = .{
|
|
948
961
|
.journal = journal,
|
|
949
962
|
.completion = undefined,
|
|
950
963
|
.message = message.ref(),
|
|
951
964
|
.callback = undefined,
|
|
952
|
-
.op =
|
|
953
|
-
.checksum =
|
|
965
|
+
.op = chunk_index,
|
|
966
|
+
.checksum = undefined,
|
|
954
967
|
.destination_replica = null,
|
|
955
968
|
};
|
|
956
969
|
|
|
970
|
+
const offset = constants.message_size_max * chunk_index;
|
|
971
|
+
assert(offset < headers_size);
|
|
972
|
+
|
|
957
973
|
const buffer = recover_headers_buffer(message, offset);
|
|
958
974
|
assert(buffer.len > 0);
|
|
975
|
+
assert(buffer.len <= constants.message_size_max);
|
|
976
|
+
assert(buffer.len + offset <= headers_size);
|
|
959
977
|
|
|
960
978
|
log.debug("{}: recover_headers: offset={} size={} recovering", .{
|
|
961
979
|
journal.replica,
|
|
@@ -963,9 +981,10 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
963
981
|
buffer.len,
|
|
964
982
|
});
|
|
965
983
|
|
|
984
|
+
journal.header_chunks_requested.unset(chunk_index);
|
|
966
985
|
journal.storage.read_sectors(
|
|
967
986
|
recover_headers_callback,
|
|
968
|
-
&
|
|
987
|
+
&chunk_read.completion,
|
|
969
988
|
buffer,
|
|
970
989
|
.wal_headers,
|
|
971
990
|
offset,
|
|
@@ -973,69 +992,94 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
973
992
|
}
|
|
974
993
|
|
|
975
994
|
fn recover_headers_callback(completion: *Storage.Read) void {
|
|
976
|
-
const
|
|
977
|
-
const journal =
|
|
995
|
+
const chunk_read = @fieldParentPtr(Journal.Read, "completion", completion);
|
|
996
|
+
const journal = chunk_read.journal;
|
|
978
997
|
const replica = @fieldParentPtr(Replica, "journal", journal);
|
|
979
|
-
|
|
998
|
+
assert(journal.status == .recovering);
|
|
999
|
+
assert(chunk_read.destination_replica == null);
|
|
980
1000
|
|
|
981
|
-
const
|
|
982
|
-
|
|
1001
|
+
const chunk_index = chunk_read.op;
|
|
1002
|
+
assert(!journal.header_chunks_requested.isSet(chunk_index));
|
|
1003
|
+
assert(!journal.header_chunks_recovered.isSet(chunk_index));
|
|
1004
|
+
|
|
1005
|
+
const chunk_buffer = recover_headers_buffer(
|
|
1006
|
+
chunk_read.message,
|
|
1007
|
+
chunk_index * constants.message_size_max,
|
|
1008
|
+
);
|
|
1009
|
+
assert(chunk_buffer.len >= @sizeOf(Header));
|
|
1010
|
+
assert(chunk_buffer.len % @sizeOf(Header) == 0);
|
|
983
1011
|
|
|
984
1012
|
log.debug("{}: recover_headers: offset={} size={} recovered", .{
|
|
985
1013
|
journal.replica,
|
|
986
|
-
|
|
987
|
-
|
|
1014
|
+
chunk_index * constants.message_size_max,
|
|
1015
|
+
chunk_buffer.len,
|
|
988
1016
|
});
|
|
989
1017
|
|
|
990
|
-
assert(journal.status == .recovering);
|
|
991
|
-
assert(offset % @sizeOf(Header) == 0);
|
|
992
|
-
assert(buffer.len >= @sizeOf(Header));
|
|
993
|
-
assert(buffer.len % @sizeOf(Header) == 0);
|
|
994
|
-
assert(read.destination_replica == null);
|
|
995
|
-
assert(journal.dirty.count == slot_count);
|
|
996
|
-
assert(journal.faulty.count == slot_count);
|
|
997
|
-
|
|
998
1018
|
// Directly store all the redundant headers in `journal.headers_redundant` (including any
|
|
999
1019
|
// that are invalid or corrupt). As the prepares are recovered, these will be replaced
|
|
1000
1020
|
// or removed as necessary.
|
|
1001
|
-
const
|
|
1021
|
+
const chunk_headers = std.mem.bytesAsSlice(Header, chunk_buffer);
|
|
1002
1022
|
util.copy_disjoint(
|
|
1003
1023
|
.exact,
|
|
1004
1024
|
Header,
|
|
1005
|
-
journal.headers_redundant[
|
|
1006
|
-
|
|
1025
|
+
journal.headers_redundant[chunk_index * headers_per_message ..][0..chunk_headers.len],
|
|
1026
|
+
chunk_headers,
|
|
1007
1027
|
);
|
|
1008
1028
|
|
|
1009
|
-
const offset_next = offset + buffer.len;
|
|
1010
1029
|
// We must release before we call `recover_headers()` in case Storage is synchronous.
|
|
1011
1030
|
// Otherwise, we would run out of messages and reads.
|
|
1012
|
-
replica.message_bus.unref(
|
|
1013
|
-
journal.reads.release(
|
|
1031
|
+
replica.message_bus.unref(chunk_read.message);
|
|
1032
|
+
journal.reads.release(chunk_read);
|
|
1014
1033
|
|
|
1015
|
-
journal.
|
|
1034
|
+
journal.header_chunks_recovered.set(chunk_index);
|
|
1035
|
+
journal.recover_headers();
|
|
1016
1036
|
}
|
|
1017
1037
|
|
|
1018
1038
|
fn recover_headers_buffer(message: *Message, offset: u64) []align(@alignOf(Header)) u8 {
|
|
1019
|
-
const max = std.math.min(
|
|
1039
|
+
const max = std.math.min(constants.message_size_max, headers_size - offset);
|
|
1020
1040
|
assert(max % constants.sector_size == 0);
|
|
1021
1041
|
assert(max % @sizeOf(Header) == 0);
|
|
1022
1042
|
return message.buffer[0..max];
|
|
1023
1043
|
}
|
|
1024
1044
|
|
|
1025
|
-
|
|
1026
|
-
|
|
1045
|
+
/// Recover the prepares ring. Reads are issued concurrently.
|
|
1046
|
+
/// - `dirty` is initially full.
|
|
1047
|
+
/// Bits are cleared when a read is issued to the slot.
|
|
1048
|
+
/// All bits are set again before recover_slots() is called.
|
|
1049
|
+
/// - `faulty` is initially full.
|
|
1050
|
+
/// Bits are cleared when the slot's read finishes.
|
|
1051
|
+
/// All bits are set again before recover_slots() is called.
|
|
1052
|
+
/// - The prepare's headers are loaded into `journal.headers`.
|
|
1053
|
+
fn recover_prepares(journal: *Journal) void {
|
|
1027
1054
|
assert(journal.status == .recovering);
|
|
1028
1055
|
assert(journal.dirty.count == slot_count);
|
|
1029
1056
|
assert(journal.faulty.count == slot_count);
|
|
1030
|
-
// We expect that no other process is issuing reads while we are recovering.
|
|
1031
1057
|
assert(journal.reads.executing() == 0);
|
|
1058
|
+
assert(journal.writes.executing() == 0);
|
|
1032
1059
|
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1060
|
+
var available: usize = journal.reads.available();
|
|
1061
|
+
while (available > 0) : (available -= 1) journal.recover_prepare();
|
|
1062
|
+
|
|
1063
|
+
assert(journal.writes.executing() == 0);
|
|
1064
|
+
assert(journal.reads.executing() > 0);
|
|
1065
|
+
assert(journal.reads.executing() + journal.dirty.count == slot_count);
|
|
1066
|
+
assert(journal.faulty.count == slot_count);
|
|
1067
|
+
}
|
|
1068
|
+
|
|
1069
|
+
fn recover_prepare(journal: *Journal) void {
|
|
1070
|
+
const replica = @fieldParentPtr(Replica, "journal", journal);
|
|
1071
|
+
assert(journal.status == .recovering);
|
|
1072
|
+
assert(journal.reads.available() > 0);
|
|
1073
|
+
assert(journal.dirty.count <= journal.faulty.count);
|
|
1074
|
+
|
|
1075
|
+
if (journal.faulty.count == 0) {
|
|
1076
|
+
for (journal.headers) |_, index| journal.dirty.set(Slot{ .index = index });
|
|
1077
|
+
for (journal.headers) |_, index| journal.faulty.set(Slot{ .index = index });
|
|
1078
|
+
return journal.recover_slots();
|
|
1036
1079
|
}
|
|
1037
|
-
assert(slot.index < slot_count);
|
|
1038
1080
|
|
|
1081
|
+
const slot_index = journal.dirty.bits.findFirstSet() orelse return;
|
|
1082
|
+
const slot = Slot{ .index = slot_index };
|
|
1039
1083
|
const message = replica.message_bus.get_message();
|
|
1040
1084
|
defer replica.message_bus.unref(message);
|
|
1041
1085
|
|
|
@@ -1045,18 +1089,19 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1045
1089
|
.completion = undefined,
|
|
1046
1090
|
.message = message.ref(),
|
|
1047
1091
|
.callback = undefined,
|
|
1048
|
-
.op =
|
|
1049
|
-
.checksum =
|
|
1092
|
+
.op = slot.index,
|
|
1093
|
+
.checksum = undefined,
|
|
1050
1094
|
.destination_replica = null,
|
|
1051
1095
|
};
|
|
1052
1096
|
|
|
1053
|
-
log.debug("{}:
|
|
1097
|
+
log.debug("{}: recover_prepare: recovering slot={}", .{
|
|
1054
1098
|
journal.replica,
|
|
1055
1099
|
slot.index,
|
|
1056
1100
|
});
|
|
1057
1101
|
|
|
1102
|
+
journal.dirty.clear(slot);
|
|
1058
1103
|
journal.storage.read_sectors(
|
|
1059
|
-
|
|
1104
|
+
recover_prepare_callback,
|
|
1060
1105
|
&read.completion,
|
|
1061
1106
|
// We load the entire message to verify that it isn't torn or corrupt.
|
|
1062
1107
|
// We don't know the message's size, so use the entire buffer.
|
|
@@ -1066,18 +1111,19 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1066
1111
|
);
|
|
1067
1112
|
}
|
|
1068
1113
|
|
|
1069
|
-
fn
|
|
1114
|
+
fn recover_prepare_callback(completion: *Storage.Read) void {
|
|
1070
1115
|
const read = @fieldParentPtr(Journal.Read, "completion", completion);
|
|
1071
1116
|
const journal = read.journal;
|
|
1072
1117
|
const replica = @fieldParentPtr(Replica, "journal", journal);
|
|
1073
1118
|
|
|
1074
1119
|
assert(journal.status == .recovering);
|
|
1075
|
-
assert(journal.dirty.count
|
|
1076
|
-
assert(journal.faulty.count == slot_count);
|
|
1120
|
+
assert(journal.dirty.count <= journal.faulty.count);
|
|
1077
1121
|
assert(read.destination_replica == null);
|
|
1078
1122
|
|
|
1079
|
-
const slot = Slot{ .index = @intCast(u64, read.
|
|
1123
|
+
const slot = Slot{ .index = @intCast(u64, read.op) };
|
|
1080
1124
|
assert(slot.index < slot_count);
|
|
1125
|
+
assert(!journal.dirty.bit(slot));
|
|
1126
|
+
assert(journal.faulty.bit(slot));
|
|
1081
1127
|
|
|
1082
1128
|
// Check `valid_checksum_body` here rather than in `recover_done` so that we don't need
|
|
1083
1129
|
// to hold onto the whole message (just the header).
|
|
@@ -1090,7 +1136,8 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1090
1136
|
replica.message_bus.unref(read.message);
|
|
1091
1137
|
journal.reads.release(read);
|
|
1092
1138
|
|
|
1093
|
-
journal.
|
|
1139
|
+
journal.faulty.clear(slot);
|
|
1140
|
+
journal.recover_prepare();
|
|
1094
1141
|
}
|
|
1095
1142
|
|
|
1096
1143
|
/// When in doubt about whether a particular message was received, it must be marked as
|
|
@@ -1431,19 +1478,27 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1431
1478
|
}
|
|
1432
1479
|
|
|
1433
1480
|
fn recover_fix_callback(write: *Journal.Write) void {
|
|
1481
|
+
const journal = write.journal;
|
|
1482
|
+
assert(journal.status == .recovering);
|
|
1434
1483
|
assert(write.trigger == .fix);
|
|
1435
|
-
|
|
1484
|
+
|
|
1485
|
+
journal.writes.release(write);
|
|
1486
|
+
journal.recover_fix();
|
|
1436
1487
|
}
|
|
1437
1488
|
|
|
1438
1489
|
fn recover_done(journal: *Journal) void {
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
journal.status = .recovered;
|
|
1443
|
-
|
|
1490
|
+
assert(journal.status == .recovering);
|
|
1491
|
+
assert(journal.reads.executing() == 0);
|
|
1492
|
+
assert(journal.writes.executing() == 0);
|
|
1444
1493
|
assert(journal.dirty.count <= slot_count);
|
|
1445
1494
|
assert(journal.faulty.count <= slot_count);
|
|
1446
1495
|
assert(journal.faulty.count == journal.dirty.count);
|
|
1496
|
+
assert(journal.header_chunks_requested.count() == 0);
|
|
1497
|
+
assert(journal.header_chunks_recovered.count() == HeaderChunks.bit_length);
|
|
1498
|
+
|
|
1499
|
+
const replica = @fieldParentPtr(Replica, "journal", journal);
|
|
1500
|
+
const callback = journal.status.recovering;
|
|
1501
|
+
journal.status = .recovered;
|
|
1447
1502
|
|
|
1448
1503
|
// Abort if all slots are faulty, since something is very wrong.
|
|
1449
1504
|
if (journal.faulty.count == slot_count) @panic("WAL is completely corrupt");
|
|
@@ -2112,9 +2167,9 @@ const Case = struct {
|
|
|
2112
2167
|
};
|
|
2113
2168
|
}
|
|
2114
2169
|
|
|
2115
|
-
fn check(
|
|
2170
|
+
fn check(case: *const Case, parameters: [9]bool) !bool {
|
|
2116
2171
|
for (parameters) |b, i| {
|
|
2117
|
-
switch (
|
|
2172
|
+
switch (case.pattern[i]) {
|
|
2118
2173
|
.any => {},
|
|
2119
2174
|
.is_false => if (b) return false,
|
|
2120
2175
|
.is_true => if (!b) return false,
|
|
@@ -2125,12 +2180,12 @@ const Case = struct {
|
|
|
2125
2180
|
return true;
|
|
2126
2181
|
}
|
|
2127
2182
|
|
|
2128
|
-
fn decision(
|
|
2183
|
+
fn decision(case: *const Case, replica_count: u8) RecoveryDecision {
|
|
2129
2184
|
assert(replica_count > 0);
|
|
2130
2185
|
if (replica_count == 1) {
|
|
2131
|
-
return
|
|
2186
|
+
return case.decision_single;
|
|
2132
2187
|
} else {
|
|
2133
|
-
return
|
|
2188
|
+
return case.decision_multiple;
|
|
2134
2189
|
}
|
|
2135
2190
|
}
|
|
2136
2191
|
};
|
|
@@ -2227,36 +2282,41 @@ pub const BitSet = struct {
|
|
|
2227
2282
|
/// The number of bits set (updated incrementally as bits are set or cleared):
|
|
2228
2283
|
count: u64 = 0,
|
|
2229
2284
|
|
|
2230
|
-
fn
|
|
2231
|
-
const bits = try std.DynamicBitSetUnmanaged.
|
|
2285
|
+
fn init_full(allocator: Allocator, count: usize) !BitSet {
|
|
2286
|
+
const bits = try std.DynamicBitSetUnmanaged.initFull(allocator, count);
|
|
2232
2287
|
errdefer bits.deinit(allocator);
|
|
2233
2288
|
|
|
2234
|
-
return BitSet{
|
|
2289
|
+
return BitSet{
|
|
2290
|
+
.bits = bits,
|
|
2291
|
+
.count = count,
|
|
2292
|
+
};
|
|
2235
2293
|
}
|
|
2236
2294
|
|
|
2237
|
-
fn deinit(
|
|
2238
|
-
|
|
2295
|
+
fn deinit(bit_set: *BitSet, allocator: Allocator) void {
|
|
2296
|
+
assert(bit_set.count == bit_set.bits.count());
|
|
2297
|
+
|
|
2298
|
+
bit_set.bits.deinit(allocator);
|
|
2239
2299
|
}
|
|
2240
2300
|
|
|
2241
2301
|
/// Clear the bit for a slot (idempotent):
|
|
2242
|
-
pub fn clear(
|
|
2243
|
-
if (
|
|
2244
|
-
|
|
2245
|
-
|
|
2302
|
+
pub fn clear(bit_set: *BitSet, slot: Slot) void {
|
|
2303
|
+
if (bit_set.bits.isSet(slot.index)) {
|
|
2304
|
+
bit_set.bits.unset(slot.index);
|
|
2305
|
+
bit_set.count -= 1;
|
|
2246
2306
|
}
|
|
2247
2307
|
}
|
|
2248
2308
|
|
|
2249
2309
|
/// Whether the bit for a slot is set:
|
|
2250
|
-
pub fn bit(
|
|
2251
|
-
return
|
|
2310
|
+
pub fn bit(bit_set: *const BitSet, slot: Slot) bool {
|
|
2311
|
+
return bit_set.bits.isSet(slot.index);
|
|
2252
2312
|
}
|
|
2253
2313
|
|
|
2254
2314
|
/// Set the bit for a slot (idempotent):
|
|
2255
|
-
pub fn set(
|
|
2256
|
-
if (!
|
|
2257
|
-
|
|
2258
|
-
|
|
2259
|
-
assert(
|
|
2315
|
+
pub fn set(bit_set: *BitSet, slot: Slot) void {
|
|
2316
|
+
if (!bit_set.bits.isSet(slot.index)) {
|
|
2317
|
+
bit_set.bits.set(slot.index);
|
|
2318
|
+
bit_set.count += 1;
|
|
2319
|
+
assert(bit_set.count <= bit_set.bits.bit_length);
|
|
2260
2320
|
}
|
|
2261
2321
|
}
|
|
2262
2322
|
};
|
|
@@ -311,8 +311,8 @@ pub fn ReplicaType(
|
|
|
311
311
|
allocator,
|
|
312
312
|
.{
|
|
313
313
|
.storage = options.storage,
|
|
314
|
-
.message_pool = options.message_pool,
|
|
315
314
|
.storage_size_limit = options.storage_size_limit,
|
|
315
|
+
.message_pool = options.message_pool,
|
|
316
316
|
},
|
|
317
317
|
);
|
|
318
318
|
|
|
@@ -356,7 +356,7 @@ pub fn ReplicaType(
|
|
|
356
356
|
self.opened = false;
|
|
357
357
|
self.state_machine.open(state_machine_open_callback);
|
|
358
358
|
while (!self.opened) {
|
|
359
|
-
self.grid.tick();
|
|
359
|
+
// self.grid.tick();
|
|
360
360
|
self.superblock.storage.tick();
|
|
361
361
|
}
|
|
362
362
|
|
|
@@ -427,22 +427,12 @@ pub fn ReplicaType(
|
|
|
427
427
|
assert(self.superblock.opened);
|
|
428
428
|
assert(self.superblock.working.vsr_state.internally_consistent());
|
|
429
429
|
|
|
430
|
-
const
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
assert(
|
|
434
|
-
|
|
435
|
-
assert(
|
|
436
|
-
|
|
437
|
-
const quorum_view_change = std.math.max(
|
|
438
|
-
replica_count - quorum_replication + 1,
|
|
439
|
-
majority,
|
|
440
|
-
);
|
|
441
|
-
// The view change quorum may be more expensive to make the replication quorum cheaper.
|
|
442
|
-
// The insight is that the replication phase is by far more common than the view change.
|
|
443
|
-
// This trade-off allows us to optimize for the common case.
|
|
444
|
-
// See the comments in `constants.zig` for further explanation.
|
|
445
|
-
assert(quorum_view_change >= majority);
|
|
430
|
+
const quorums = vsr.quorums(replica_count);
|
|
431
|
+
const quorum_replication = quorums.replication;
|
|
432
|
+
const quorum_view_change = quorums.view_change;
|
|
433
|
+
assert(quorum_replication <= replica_count);
|
|
434
|
+
assert(quorum_view_change <= replica_count);
|
|
435
|
+
assert(quorum_view_change + quorum_replication >= replica_count);
|
|
446
436
|
|
|
447
437
|
if (replica_count <= 2) {
|
|
448
438
|
assert(quorum_replication == replica_count);
|
|
@@ -639,7 +629,7 @@ pub fn ReplicaType(
|
|
|
639
629
|
|
|
640
630
|
// TODO Replica owns Time; should it tick() here instead of Clock?
|
|
641
631
|
self.clock.tick();
|
|
642
|
-
self.grid.tick();
|
|
632
|
+
// self.grid.tick();
|
|
643
633
|
self.message_bus.tick();
|
|
644
634
|
|
|
645
635
|
if (self.status == .recovering) {
|
|
@@ -4440,7 +4430,7 @@ pub fn ReplicaType(
|
|
|
4440
4430
|
// - or (indistinguishably) this might originally have been an op greater
|
|
4441
4431
|
// than replica.op, which was truncated, but is now corrupt.
|
|
4442
4432
|
//
|
|
4443
|
-
//
|
|
4433
|
+
// We don't try to repair this op because the slot belongs (or will soon
|
|
4444
4434
|
// belong) to a newer op, from the new WAL wrap. Additionally, we may not
|
|
4445
4435
|
// still have access to its surrounding commits to verify the hash chain.
|
|
4446
4436
|
assert(op <= self.commit_min);
|
|
@@ -325,16 +325,14 @@ pub const superblock_trailer_free_set_size_max = blk: {
|
|
|
325
325
|
const encode_size_max = SuperBlockFreeSet.encode_size_max(block_count_max);
|
|
326
326
|
assert(encode_size_max > 0);
|
|
327
327
|
|
|
328
|
-
|
|
329
|
-
break :blk div_ceil(encode_size_max, constants.sector_size) * constants.sector_size;
|
|
328
|
+
break :blk vsr.sector_ceil(encode_size_max);
|
|
330
329
|
};
|
|
331
330
|
|
|
332
331
|
pub const superblock_trailer_client_table_size_max = blk: {
|
|
333
332
|
const encode_size_max = SuperBlockClientTable.encode_size_max;
|
|
334
333
|
assert(encode_size_max > 0);
|
|
335
334
|
|
|
336
|
-
|
|
337
|
-
break :blk div_ceil(encode_size_max, constants.sector_size) * constants.sector_size;
|
|
335
|
+
break :blk vsr.sector_ceil(encode_size_max);
|
|
338
336
|
};
|
|
339
337
|
|
|
340
338
|
pub const data_file_size_min = blk: {
|
|
@@ -352,15 +350,15 @@ const block_count_max = blk: {
|
|
|
352
350
|
// The size of a freeset is related to the number of blocks it must store.
|
|
353
351
|
// Maximize the number of grid blocks.
|
|
354
352
|
|
|
355
|
-
var shard_count = @divFloor(size, constants.block_size * SuperBlockFreeSet.
|
|
353
|
+
var shard_count = @divFloor(size, constants.block_size * SuperBlockFreeSet.shard_bits);
|
|
356
354
|
while (true) : (shard_count -= 1) {
|
|
357
|
-
const block_count = shard_count * SuperBlockFreeSet.
|
|
355
|
+
const block_count = shard_count * SuperBlockFreeSet.shard_bits;
|
|
358
356
|
const grid_size = block_count * constants.block_size;
|
|
359
357
|
const free_set_size = vsr.sector_ceil(SuperBlockFreeSet.encode_size_max(block_count));
|
|
360
358
|
const free_sets_size = constants.superblock_copies * free_set_size;
|
|
361
359
|
if (free_sets_size + grid_size <= size) break;
|
|
362
360
|
}
|
|
363
|
-
break :blk shard_count * SuperBlockFreeSet.
|
|
361
|
+
break :blk shard_count * SuperBlockFreeSet.shard_bits;
|
|
364
362
|
};
|
|
365
363
|
|
|
366
364
|
comptime {
|
|
@@ -495,9 +493,9 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
495
493
|
|
|
496
494
|
const shard_count_limit = @intCast(usize, @divFloor(
|
|
497
495
|
options.storage_size_limit - data_file_size_min,
|
|
498
|
-
constants.block_size * FreeSet.
|
|
496
|
+
constants.block_size * FreeSet.shard_bits,
|
|
499
497
|
));
|
|
500
|
-
const block_count_limit = shard_count_limit * FreeSet.
|
|
498
|
+
const block_count_limit = shard_count_limit * FreeSet.shard_bits;
|
|
501
499
|
assert(block_count_limit <= block_count_max);
|
|
502
500
|
|
|
503
501
|
const a = try allocator.allocAdvanced(SuperBlockSector, constants.sector_size, 1, .exact);
|
|
@@ -524,9 +522,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
524
522
|
);
|
|
525
523
|
errdefer manifest.deinit(allocator);
|
|
526
524
|
|
|
527
|
-
|
|
528
|
-
// Right now we can allocate blocks outside of the limit.
|
|
529
|
-
var free_set = try FreeSet.init(allocator, block_count_max);
|
|
525
|
+
var free_set = try FreeSet.init(allocator, block_count_limit);
|
|
530
526
|
errdefer free_set.deinit(allocator);
|
|
531
527
|
|
|
532
528
|
var client_table = try ClientTable.init(allocator, options.message_pool);
|
|
@@ -543,7 +539,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
543
539
|
const free_set_buffer = try allocator.allocAdvanced(
|
|
544
540
|
u8,
|
|
545
541
|
constants.sector_size,
|
|
546
|
-
|
|
542
|
+
SuperBlockFreeSet.encode_size_max(block_count_limit),
|
|
547
543
|
.exact,
|
|
548
544
|
);
|
|
549
545
|
errdefer allocator.free(free_set_buffer);
|
|
@@ -799,7 +795,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
799
795
|
|
|
800
796
|
fn write_staging_encode_free_set(superblock: *SuperBlock) void {
|
|
801
797
|
const staging: *SuperBlockSector = superblock.staging;
|
|
802
|
-
const encode_size_max = FreeSet.encode_size_max(
|
|
798
|
+
const encode_size_max = FreeSet.encode_size_max(superblock.block_count_limit);
|
|
803
799
|
const target = superblock.free_set_buffer[0..encode_size_max];
|
|
804
800
|
|
|
805
801
|
superblock.free_set.include_staging();
|
|
@@ -816,7 +812,14 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
816
812
|
assert(staging.storage_size <= staging.storage_size_max);
|
|
817
813
|
assert(staging.storage_size <= superblock.storage_size_limit);
|
|
818
814
|
|
|
819
|
-
|
|
815
|
+
if (superblock.free_set.count_acquired() == 0) {
|
|
816
|
+
// EWAH encodes a zero-length bitset to an empty slice anyway, but handle this
|
|
817
|
+
// condition separately so that during formatting it doesn't depend on the choice
|
|
818
|
+
// of storage_size_limit.
|
|
819
|
+
staging.free_set_size = 0;
|
|
820
|
+
} else {
|
|
821
|
+
staging.free_set_size = @intCast(u32, superblock.free_set.encode(target));
|
|
822
|
+
}
|
|
820
823
|
staging.free_set_checksum = vsr.checksum(target[0..staging.free_set_size]);
|
|
821
824
|
}
|
|
822
825
|
|
|
@@ -1116,7 +1119,7 @@ pub fn SuperBlockType(comptime Storage: type) type {
|
|
|
1116
1119
|
assert(working.sequence == 1);
|
|
1117
1120
|
assert(working.storage_size == data_file_size_min);
|
|
1118
1121
|
assert(working.manifest_size == 0);
|
|
1119
|
-
assert(working.free_set_size ==
|
|
1122
|
+
assert(working.free_set_size == 0);
|
|
1120
1123
|
assert(working.client_table_size == 4);
|
|
1121
1124
|
assert(working.vsr_state.commit_min_checksum ==
|
|
1122
1125
|
vsr.Header.root_prepare(working.cluster).checksum);
|