tigerbeetle-node 0.11.8 → 0.11.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.client.node.sha256 +1 -1
- package/package.json +4 -3
- package/scripts/build_lib.sh +41 -0
- package/src/node.zig +1 -1
- package/src/tigerbeetle/scripts/validate_docs.sh +7 -1
- package/src/tigerbeetle/src/benchmark.zig +3 -3
- package/src/tigerbeetle/src/config.zig +31 -16
- package/src/tigerbeetle/src/constants.zig +48 -9
- package/src/tigerbeetle/src/ewah.zig +5 -5
- package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
- package/src/tigerbeetle/src/lsm/binary_search.zig +1 -1
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +1 -1
- package/src/tigerbeetle/src/lsm/compaction.zig +34 -21
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +84 -104
- package/src/tigerbeetle/src/lsm/grid.zig +19 -13
- package/src/tigerbeetle/src/lsm/manifest_log.zig +8 -10
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +18 -13
- package/src/tigerbeetle/src/lsm/merge_iterator.zig +1 -1
- package/src/tigerbeetle/src/lsm/segmented_array.zig +17 -17
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +1 -1
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +1 -1
- package/src/tigerbeetle/src/lsm/table.zig +8 -20
- package/src/tigerbeetle/src/lsm/table_immutable.zig +1 -1
- package/src/tigerbeetle/src/lsm/table_iterator.zig +3 -3
- package/src/tigerbeetle/src/lsm/table_mutable.zig +14 -2
- package/src/tigerbeetle/src/lsm/test.zig +5 -4
- package/src/tigerbeetle/src/lsm/tree.zig +1 -2
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +85 -115
- package/src/tigerbeetle/src/message_bus.zig +4 -4
- package/src/tigerbeetle/src/message_pool.zig +7 -10
- package/src/tigerbeetle/src/ring_buffer.zig +22 -12
- package/src/tigerbeetle/src/simulator.zig +366 -239
- package/src/tigerbeetle/src/state_machine/auditor.zig +5 -5
- package/src/tigerbeetle/src/state_machine/workload.zig +3 -3
- package/src/tigerbeetle/src/state_machine.zig +190 -178
- package/src/tigerbeetle/src/{util.zig → stdx.zig} +2 -0
- package/src/tigerbeetle/src/storage.zig +13 -6
- package/src/tigerbeetle/src/{test → testing/cluster}/message_bus.zig +3 -3
- package/src/tigerbeetle/src/{test → testing/cluster}/network.zig +46 -22
- package/src/tigerbeetle/src/testing/cluster/state_checker.zig +169 -0
- package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +202 -0
- package/src/tigerbeetle/src/testing/cluster.zig +443 -0
- package/src/tigerbeetle/src/{test → testing}/fuzz.zig +0 -0
- package/src/tigerbeetle/src/testing/hash_log.zig +66 -0
- package/src/tigerbeetle/src/{test → testing}/id.zig +0 -0
- package/src/tigerbeetle/src/testing/packet_simulator.zig +365 -0
- package/src/tigerbeetle/src/{test → testing}/priority_queue.zig +1 -1
- package/src/tigerbeetle/src/testing/reply_sequence.zig +139 -0
- package/src/tigerbeetle/src/{test → testing}/state_machine.zig +3 -1
- package/src/tigerbeetle/src/testing/storage.zig +757 -0
- package/src/tigerbeetle/src/{test → testing}/table.zig +21 -0
- package/src/tigerbeetle/src/{test → testing}/time.zig +0 -0
- package/src/tigerbeetle/src/tigerbeetle.zig +2 -0
- package/src/tigerbeetle/src/tracer.zig +3 -3
- package/src/tigerbeetle/src/unit_tests.zig +4 -4
- package/src/tigerbeetle/src/vopr.zig +2 -2
- package/src/tigerbeetle/src/vsr/client.zig +5 -2
- package/src/tigerbeetle/src/vsr/clock.zig +93 -53
- package/src/tigerbeetle/src/vsr/journal.zig +109 -98
- package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +2 -2
- package/src/tigerbeetle/src/vsr/replica.zig +1983 -1430
- package/src/tigerbeetle/src/vsr/replica_format.zig +13 -13
- package/src/tigerbeetle/src/vsr/superblock.zig +240 -142
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -7
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +1 -1
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +49 -14
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +38 -19
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +48 -48
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +51 -51
- package/src/tigerbeetle/src/vsr.zig +99 -33
- package/src/tigerbeetle/src/demo.zig +0 -132
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +0 -35
- package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +0 -7
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +0 -37
- package/src/tigerbeetle/src/demo_04_create_pending_transfers.zig +0 -61
- package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +0 -37
- package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +0 -24
- package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +0 -7
- package/src/tigerbeetle/src/test/cluster.zig +0 -352
- package/src/tigerbeetle/src/test/conductor.zig +0 -366
- package/src/tigerbeetle/src/test/packet_simulator.zig +0 -398
- package/src/tigerbeetle/src/test/state_checker.zig +0 -169
- package/src/tigerbeetle/src/test/storage.zig +0 -864
- package/src/tigerbeetle/src/test/storage_checker.zig +0 -204
|
@@ -7,7 +7,7 @@ const math = std.math;
|
|
|
7
7
|
const constants = @import("../constants.zig");
|
|
8
8
|
|
|
9
9
|
const Message = @import("../message_pool.zig").MessagePool.Message;
|
|
10
|
-
const
|
|
10
|
+
const stdx = @import("../stdx.zig");
|
|
11
11
|
const vsr = @import("../vsr.zig");
|
|
12
12
|
const Header = vsr.Header;
|
|
13
13
|
const IOPS = @import("../iops.zig").IOPS;
|
|
@@ -65,7 +65,7 @@ comptime {
|
|
|
65
65
|
const Slot = struct { index: usize };
|
|
66
66
|
|
|
67
67
|
/// An inclusive, non-empty range of slots.
|
|
68
|
-
const SlotRange = struct {
|
|
68
|
+
pub const SlotRange = struct {
|
|
69
69
|
head: Slot,
|
|
70
70
|
tail: Slot,
|
|
71
71
|
|
|
@@ -76,7 +76,7 @@ const SlotRange = struct {
|
|
|
76
76
|
/// * `head < tail` → ` head··tail `
|
|
77
77
|
/// * `head > tail` → `··tail head··` (The range wraps around).
|
|
78
78
|
/// * `head = tail` → panic (Caller must handle this case separately).
|
|
79
|
-
fn contains(range: *const SlotRange, slot: Slot) bool {
|
|
79
|
+
pub fn contains(range: *const SlotRange, slot: Slot) bool {
|
|
80
80
|
// To avoid confusion, the empty range must be checked separately by the caller.
|
|
81
81
|
assert(range.head.index != range.tail.index);
|
|
82
82
|
|
|
@@ -103,7 +103,7 @@ comptime {
|
|
|
103
103
|
assert(slot_count >= headers_per_sector);
|
|
104
104
|
// The length of the prepare pipeline is the upper bound on how many ops can be
|
|
105
105
|
// reordered during a view change. See `recover_prepares_callback()` for more detail.
|
|
106
|
-
assert(slot_count > constants.
|
|
106
|
+
assert(slot_count > constants.pipeline_prepare_queue_max);
|
|
107
107
|
|
|
108
108
|
assert(headers_size > 0);
|
|
109
109
|
assert(headers_size % constants.sector_size == 0);
|
|
@@ -182,7 +182,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
182
182
|
}
|
|
183
183
|
};
|
|
184
184
|
|
|
185
|
-
const HeaderChunks = std.StaticBitSet(
|
|
185
|
+
const HeaderChunks = std.StaticBitSet(stdx.div_ceil(slot_count, headers_per_message));
|
|
186
186
|
|
|
187
187
|
storage: *Storage,
|
|
188
188
|
replica: u8,
|
|
@@ -204,9 +204,9 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
204
204
|
/// 2. The write of prepare 7 finishes (before prepare 6).
|
|
205
205
|
/// 3. Op 7 continues on to write the redundant headers.
|
|
206
206
|
/// Because prepare 6 is not yet written, header 6 is written as reserved.
|
|
207
|
-
/// 4. If at this point the replica crashes & restarts, slot 6 is in case `@
|
|
207
|
+
/// 4. If at this point the replica crashes & restarts, slot 6 is in case `@I`
|
|
208
208
|
/// (decision=nil) which can be locally repaired.
|
|
209
|
-
/// In contrast, if op 6's prepare header was written in step 3, it would be case `@
|
|
209
|
+
/// In contrast, if op 6's prepare header was written in step 3, it would be case `@H`,
|
|
210
210
|
/// which requires remote repair.
|
|
211
211
|
///
|
|
212
212
|
/// During recovery, store the redundant (unvalidated) headers.
|
|
@@ -358,39 +358,6 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
358
358
|
}
|
|
359
359
|
}
|
|
360
360
|
|
|
361
|
-
/// Returns whether this is a fresh database WAL; no prepares (except the root) have ever
|
|
362
|
-
/// been written. This determines whether a replica can transition immediately to normal
|
|
363
|
-
/// status, or if it needs to run recovery protocol.
|
|
364
|
-
///
|
|
365
|
-
/// Called by the replica immediately after WAL recovery completes, but before the replica
|
|
366
|
-
/// issues any I/O from handling messages.
|
|
367
|
-
pub fn is_empty(journal: *const Journal) bool {
|
|
368
|
-
assert(journal.status == .recovered);
|
|
369
|
-
assert(journal.writes.executing() == 0);
|
|
370
|
-
|
|
371
|
-
if (!journal.headers[0].valid_checksum()) return false;
|
|
372
|
-
if (journal.headers[0].operation != .root) return false;
|
|
373
|
-
|
|
374
|
-
const replica = @fieldParentPtr(Replica, "journal", journal);
|
|
375
|
-
assert(journal.headers[0].checksum == Header.root_prepare(replica.cluster).checksum);
|
|
376
|
-
assert(journal.headers[0].checksum == journal.prepare_checksums[0]);
|
|
377
|
-
assert(journal.prepare_inhabited[0]);
|
|
378
|
-
|
|
379
|
-
// If any message is faulty, we must fall back to VSR recovery protocol (i.e. treat
|
|
380
|
-
// this as a non-empty WAL) since that message may have been a prepare.
|
|
381
|
-
if (journal.faulty.count > 0) return false;
|
|
382
|
-
|
|
383
|
-
for (journal.headers[1..]) |*header| {
|
|
384
|
-
if (header.command == .prepare) return false;
|
|
385
|
-
}
|
|
386
|
-
|
|
387
|
-
for (journal.prepare_inhabited[1..]) |inhabited| {
|
|
388
|
-
if (inhabited) return false;
|
|
389
|
-
}
|
|
390
|
-
|
|
391
|
-
return true;
|
|
392
|
-
}
|
|
393
|
-
|
|
394
361
|
pub fn slot_for_op(_: *const Journal, op: u64) Slot {
|
|
395
362
|
return Slot{ .index = op % slot_count };
|
|
396
363
|
}
|
|
@@ -418,7 +385,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
418
385
|
|
|
419
386
|
pub fn slot_with_header(journal: *const Journal, header: *const Header) ?Slot {
|
|
420
387
|
assert(header.command == .prepare);
|
|
421
|
-
return journal.
|
|
388
|
+
return journal.slot_with_op_and_checksum(header.op, header.checksum);
|
|
422
389
|
}
|
|
423
390
|
|
|
424
391
|
/// Returns any existing header at the location indicated by header.op.
|
|
@@ -598,9 +565,12 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
598
565
|
/// Finds the latest break in headers between `op_min` and `op_max` (both inclusive).
|
|
599
566
|
/// A break is a missing header or a header not connected to the next header by hash chain.
|
|
600
567
|
/// On finding the highest break, extends the range downwards to cover as much as possible.
|
|
601
|
-
///
|
|
602
|
-
///
|
|
568
|
+
///
|
|
569
|
+
/// We expect that `op_max` (`replica.op`) must exist.
|
|
570
|
+
/// `op_min` may exist or not.
|
|
571
|
+
///
|
|
603
572
|
/// A range will never include `op_max` because this must be up to date as the latest op.
|
|
573
|
+
/// A range may include `op_min`.
|
|
604
574
|
/// We must therefore first resolve any op uncertainty so that we can trust `op_max` here.
|
|
605
575
|
///
|
|
606
576
|
/// For example: If ops 3, 9 and 10 are missing, returns: `{ .op_min = 9, .op_max = 10 }`.
|
|
@@ -612,6 +582,8 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
612
582
|
op_min: u64,
|
|
613
583
|
op_max: u64,
|
|
614
584
|
) ?HeaderRange {
|
|
585
|
+
assert(journal.status == .recovered);
|
|
586
|
+
assert(journal.header_with_op(op_max) != null);
|
|
615
587
|
assert(op_max >= op_min);
|
|
616
588
|
assert(op_max - op_min + 1 <= slot_count);
|
|
617
589
|
var range: ?HeaderRange = null;
|
|
@@ -644,7 +616,6 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
644
616
|
} else if (a.checksum == b.parent) {
|
|
645
617
|
// A is connected to B, but B is disconnected, add A to range:
|
|
646
618
|
assert(a.view <= b.view);
|
|
647
|
-
assert(a.op > op_min);
|
|
648
619
|
r.op_min = a.op;
|
|
649
620
|
} else if (a.view < b.view) {
|
|
650
621
|
// A is not connected to B, and A is older than B, add A to range:
|
|
@@ -661,7 +632,6 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
661
632
|
assert(a.view <= b.view);
|
|
662
633
|
} else if (a.view != b.view) {
|
|
663
634
|
// A is not connected to B, open range:
|
|
664
|
-
assert(a.op > op_min);
|
|
665
635
|
assert(b.op <= op_max);
|
|
666
636
|
range = .{ .op_min = a.op, .op_max = a.op };
|
|
667
637
|
} else {
|
|
@@ -680,7 +650,6 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
680
650
|
}
|
|
681
651
|
}
|
|
682
652
|
} else {
|
|
683
|
-
assert(op > op_min);
|
|
684
653
|
assert(op < op_max);
|
|
685
654
|
|
|
686
655
|
// A does not exist, or A has an older (or newer if reordered) op number:
|
|
@@ -699,8 +668,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
699
668
|
}
|
|
700
669
|
|
|
701
670
|
if (range) |r| {
|
|
702
|
-
|
|
703
|
-
assert(r.op_min > op_min);
|
|
671
|
+
assert(r.op_min >= op_min);
|
|
704
672
|
// We can never repair op_max (replica.op) since that is the latest op:
|
|
705
673
|
// We can assume this because any existing view jump barrier must first be resolved.
|
|
706
674
|
assert(r.op_max < op_max);
|
|
@@ -1019,7 +987,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1019
987
|
// that are invalid or corrupt). As the prepares are recovered, these will be replaced
|
|
1020
988
|
// or removed as necessary.
|
|
1021
989
|
const chunk_headers = std.mem.bytesAsSlice(Header, chunk_buffer);
|
|
1022
|
-
|
|
990
|
+
stdx.copy_disjoint(
|
|
1023
991
|
.exact,
|
|
1024
992
|
Header,
|
|
1025
993
|
journal.headers_redundant[chunk_index * headers_per_message ..][0..chunk_headers.len],
|
|
@@ -1176,17 +1144,17 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1176
1144
|
///
|
|
1177
1145
|
/// Recovery decision table:
|
|
1178
1146
|
///
|
|
1179
|
-
/// label @A @B @C @D @E @F @G @H @I @J @K @L @M
|
|
1180
|
-
/// header valid 0 1 1 0 0 0 1 1 1 1 1 1 1
|
|
1181
|
-
/// header reserved _ 1 0 _ _ _ 1
|
|
1182
|
-
/// prepare valid 0 0 0 1 1 1 1 1 1 1 1 1 1
|
|
1183
|
-
/// prepare reserved _ _ _ 1 0 0 0
|
|
1184
|
-
/// prepare.op is maximum _ _ _ _ 0 1
|
|
1185
|
-
/// match checksum _ _ _ _ _ _ _ _
|
|
1186
|
-
/// match op _ _ _ _ _ _ _ _
|
|
1187
|
-
/// match view _ _ _ _ _ _ _ _
|
|
1188
|
-
/// decision (replicas>1) vsr vsr vsr vsr vsr fix
|
|
1189
|
-
/// decision (replicas=1) fix fix
|
|
1147
|
+
/// label @A @B @C @D @E @F @G @H @I @J @K @L @M
|
|
1148
|
+
/// header valid 0 1 1 0 0 0 1 1 1 1 1 1 1
|
|
1149
|
+
/// header reserved _ 1 0 _ _ _ 1 0 1 0 0 0 0
|
|
1150
|
+
/// prepare valid 0 0 0 1 1 1 1 1 1 1 1 1 1
|
|
1151
|
+
/// prepare reserved _ _ _ 1 0 0 0 1 1 0 0 0 0
|
|
1152
|
+
/// prepare.op is maximum _ _ _ _ 0 1 1 _ _ _ _ _ _
|
|
1153
|
+
/// match checksum _ _ _ _ _ _ _ _ !1 0 0 0 1
|
|
1154
|
+
/// match op _ _ _ _ _ _ _ _ !1 < > 1 !1
|
|
1155
|
+
/// match view _ _ _ _ _ _ _ _ !1 _ _ !0 !1
|
|
1156
|
+
/// decision (replicas>1) vsr vsr vsr vsr vsr fix fix vsr nil fix vsr vsr eql
|
|
1157
|
+
/// decision (replicas=1) fix fix
|
|
1190
1158
|
///
|
|
1191
1159
|
/// Legend:
|
|
1192
1160
|
///
|
|
@@ -1209,6 +1177,8 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1209
1177
|
/// 4. has command=reserved or command=prepare
|
|
1210
1178
|
fn recover_slots(journal: *Journal) void {
|
|
1211
1179
|
const replica = @fieldParentPtr(Replica, "journal", journal);
|
|
1180
|
+
const log_view = replica.superblock.working.vsr_state.log_view;
|
|
1181
|
+
const view_change_headers = replica.superblock.working.vsr_headers();
|
|
1212
1182
|
|
|
1213
1183
|
assert(journal.status == .recovering);
|
|
1214
1184
|
assert(journal.reads.executing() == 0);
|
|
@@ -1216,8 +1186,32 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1216
1186
|
assert(journal.dirty.count == slot_count);
|
|
1217
1187
|
assert(journal.faulty.count == slot_count);
|
|
1218
1188
|
|
|
1189
|
+
// Discard headers which we are certain do not belong in the current log_view.
|
|
1190
|
+
// - This ensures that we don't accidentally set our new head op to be a message
|
|
1191
|
+
// which was truncated but not yet overwritten.
|
|
1192
|
+
// - This is also necessary to ensure that generated DVC's headers are complete.
|
|
1193
|
+
//
|
|
1194
|
+
// It is essential that this is performed before we compute the op_max so that the
|
|
1195
|
+
// recovery cases apply correctly.
|
|
1196
|
+
for ([_][]align(constants.sector_size) Header{
|
|
1197
|
+
journal.headers_redundant,
|
|
1198
|
+
journal.headers,
|
|
1199
|
+
}) |headers| {
|
|
1200
|
+
for (headers) |*header_untrusted, index| {
|
|
1201
|
+
const slot = Slot{ .index = index };
|
|
1202
|
+
if (header_ok(replica.cluster, slot, header_untrusted)) |header| {
|
|
1203
|
+
var view_range = view_change_headers.view_for_op(header.op, log_view);
|
|
1204
|
+
view_range.max = std.math.min(view_range.max, log_view);
|
|
1205
|
+
|
|
1206
|
+
if (header.command == .prepare and !view_range.contains(header.view)) {
|
|
1207
|
+
header_untrusted.* = Header.reserved(replica.cluster, index);
|
|
1208
|
+
}
|
|
1209
|
+
}
|
|
1210
|
+
}
|
|
1211
|
+
}
|
|
1212
|
+
|
|
1219
1213
|
const prepare_op_max = std.math.max(
|
|
1220
|
-
replica.op_checkpoint,
|
|
1214
|
+
replica.op_checkpoint(),
|
|
1221
1215
|
op_maximum_headers_untrusted(replica.cluster, journal.headers),
|
|
1222
1216
|
);
|
|
1223
1217
|
|
|
@@ -1232,7 +1226,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1232
1226
|
|
|
1233
1227
|
// `prepare_checksums` improves the availability of `request_prepare` by being more
|
|
1234
1228
|
// flexible than `headers` regarding the prepares it references. It may hold a
|
|
1235
|
-
// prepare whose redundant header is broken, as long as the prepare
|
|
1229
|
+
// prepare whose redundant header is broken, as long as the prepare itself is valid.
|
|
1236
1230
|
if (prepare != null and prepare.?.command == .prepare) {
|
|
1237
1231
|
assert(!journal.prepare_inhabited[index]);
|
|
1238
1232
|
journal.prepare_inhabited[index] = true;
|
|
@@ -1255,7 +1249,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1255
1249
|
for (cases) |case, index| journal.recover_slot(Slot{ .index = index }, case);
|
|
1256
1250
|
assert(cases.len == slot_count);
|
|
1257
1251
|
|
|
1258
|
-
|
|
1252
|
+
stdx.copy_disjoint(.exact, Header, journal.headers_redundant, journal.headers);
|
|
1259
1253
|
|
|
1260
1254
|
log.debug("{}: recover_slots: dirty={} faulty={}", .{
|
|
1261
1255
|
journal.replica,
|
|
@@ -1285,7 +1279,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1285
1279
|
|
|
1286
1280
|
const op_max = op_maximum_headers_untrusted(replica.cluster, journal.headers_redundant);
|
|
1287
1281
|
if (op_max != op_maximum_headers_untrusted(replica.cluster, journal.headers)) return null;
|
|
1288
|
-
if (op_max < replica.op_checkpoint) return null;
|
|
1282
|
+
if (op_max < replica.op_checkpoint()) return null;
|
|
1289
1283
|
// We can't assume that the header at `op_max` is a prepare — an empty journal with a
|
|
1290
1284
|
// corrupt root prepare (op_max=0) will be repaired later.
|
|
1291
1285
|
|
|
@@ -1316,7 +1310,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1316
1310
|
// unless the prepare header was lost, in which case this slot may also not be torn.
|
|
1317
1311
|
}
|
|
1318
1312
|
|
|
1319
|
-
const checkpoint_index = journal.slot_for_op(replica.op_checkpoint).index;
|
|
1313
|
+
const checkpoint_index = journal.slot_for_op(replica.op_checkpoint()).index;
|
|
1320
1314
|
const known_range = SlotRange{
|
|
1321
1315
|
.head = Slot{ .index = checkpoint_index },
|
|
1322
1316
|
.tail = torn_slot,
|
|
@@ -1334,13 +1328,13 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1334
1328
|
// truncate).
|
|
1335
1329
|
//
|
|
1336
1330
|
// When the checkpoint and torn op are in the same slot, then we can only be certain
|
|
1337
|
-
// if there are no faults other than the torn op
|
|
1331
|
+
// if there are no faults other than the torn op itself.
|
|
1338
1332
|
for (cases) |case, index| {
|
|
1339
1333
|
// Do not use `faulty.bit()` because the decisions have not been processed yet.
|
|
1340
1334
|
if (case.decision(replica.replica_count) == .vsr) {
|
|
1341
1335
|
if (checkpoint_index == torn_slot.index) {
|
|
1342
|
-
assert(op_max >= replica.op_checkpoint);
|
|
1343
|
-
assert(torn_op > replica.op_checkpoint);
|
|
1336
|
+
assert(op_max >= replica.op_checkpoint());
|
|
1337
|
+
assert(torn_op > replica.op_checkpoint());
|
|
1344
1338
|
if (index != torn_slot.index) return null;
|
|
1345
1339
|
} else {
|
|
1346
1340
|
if (!known_range.contains(Slot{ .index = index })) return null;
|
|
@@ -1393,12 +1387,12 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1393
1387
|
journal.faulty.clear(slot);
|
|
1394
1388
|
assert(journal.dirty.bit(slot));
|
|
1395
1389
|
if (replica.replica_count == 1) {
|
|
1396
|
-
// @D, @E, @F, @G, @
|
|
1390
|
+
// @D, @E, @F, @G, @J
|
|
1397
1391
|
} else {
|
|
1398
1392
|
assert(prepare.?.command == .prepare);
|
|
1399
1393
|
assert(journal.prepare_inhabited[slot.index]);
|
|
1400
1394
|
assert(journal.prepare_checksums[slot.index] == prepare.?.checksum);
|
|
1401
|
-
// @F, @
|
|
1395
|
+
// @F, @G, @J
|
|
1402
1396
|
}
|
|
1403
1397
|
},
|
|
1404
1398
|
.vsr => {
|
|
@@ -1439,6 +1433,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1439
1433
|
|
|
1440
1434
|
/// Repair the redundant headers for slots with decision=fix, one sector at a time.
|
|
1441
1435
|
fn recover_fix(journal: *Journal) void {
|
|
1436
|
+
const replica = @fieldParentPtr(Replica, "journal", journal);
|
|
1442
1437
|
assert(journal.status == .recovering);
|
|
1443
1438
|
assert(journal.writes.executing() == 0);
|
|
1444
1439
|
assert(journal.dirty.count >= journal.faulty.count);
|
|
@@ -1448,6 +1443,15 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1448
1443
|
var dirty_iterator = journal.dirty.bits.iterator(.{ .kind = .set });
|
|
1449
1444
|
while (dirty_iterator.next()) |dirty_slot| {
|
|
1450
1445
|
if (journal.faulty.bit(Slot{ .index = dirty_slot })) continue;
|
|
1446
|
+
if (journal.prepare_inhabited[dirty_slot]) {
|
|
1447
|
+
assert(journal.prepare_checksums[dirty_slot] ==
|
|
1448
|
+
journal.headers[dirty_slot].checksum);
|
|
1449
|
+
assert(journal.prepare_checksums[dirty_slot] ==
|
|
1450
|
+
journal.headers_redundant[dirty_slot].checksum);
|
|
1451
|
+
} else {
|
|
1452
|
+
// Case @D for R=1.
|
|
1453
|
+
assert(replica.replica_count == 1);
|
|
1454
|
+
}
|
|
1451
1455
|
|
|
1452
1456
|
const dirty_slot_sector = @divFloor(dirty_slot, headers_per_sector);
|
|
1453
1457
|
if (fix_sector) |fix_sector_| {
|
|
@@ -1538,7 +1542,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1538
1542
|
|
|
1539
1543
|
for (journal.headers) |*header, index| {
|
|
1540
1544
|
// We must remove the header regardless of whether it is a prepare or reserved,
|
|
1541
|
-
// since a reserved header may have been marked faulty for case @
|
|
1545
|
+
// since a reserved header may have been marked faulty for case @H, and
|
|
1542
1546
|
// since the caller expects the WAL to be truncated, with clean slots.
|
|
1543
1547
|
if (header.op >= op_min) {
|
|
1544
1548
|
// TODO Explore scenarios where the data on disk may resurface after a crash.
|
|
@@ -1616,11 +1620,12 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1616
1620
|
assert(message.header.size >= @sizeOf(Header));
|
|
1617
1621
|
assert(message.header.size <= message.buffer.len);
|
|
1618
1622
|
assert(journal.has(message.header));
|
|
1623
|
+
assert(!journal.writing(message.header.op, message.header.checksum));
|
|
1619
1624
|
assert(replica.replica_count != 1 or journal.writes.executing() == 0);
|
|
1620
1625
|
|
|
1621
1626
|
// The underlying header memory must be owned by the buffer and not by journal.headers:
|
|
1622
1627
|
// Otherwise, concurrent writes may modify the memory of the pointer while we write.
|
|
1623
|
-
assert(@ptrToInt(message.header) == @ptrToInt(message.buffer
|
|
1628
|
+
assert(@ptrToInt(message.header) == @ptrToInt(message.buffer));
|
|
1624
1629
|
|
|
1625
1630
|
const slot = journal.slot_with_header(message.header).?;
|
|
1626
1631
|
|
|
@@ -1657,12 +1662,10 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1657
1662
|
const buffer = message.buffer[0..vsr.sector_ceil(message.header.size)];
|
|
1658
1663
|
const offset = Ring.prepares.offset(slot);
|
|
1659
1664
|
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
assert(sum_of_sector_padding_bytes == 0);
|
|
1665
|
-
}
|
|
1665
|
+
// Assert that any sector padding has already been zeroed:
|
|
1666
|
+
var sum_of_sector_padding_bytes: u8 = 0;
|
|
1667
|
+
for (buffer[message.header.size..]) |byte| sum_of_sector_padding_bytes |= byte;
|
|
1668
|
+
assert(sum_of_sector_padding_bytes == 0);
|
|
1666
1669
|
|
|
1667
1670
|
journal.prepare_inhabited[slot.index] = false;
|
|
1668
1671
|
journal.prepare_checksums[slot.index] = 0;
|
|
@@ -1765,9 +1768,17 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1765
1768
|
return;
|
|
1766
1769
|
}
|
|
1767
1770
|
|
|
1771
|
+
const slot = journal.slot_with_header(message.header).?;
|
|
1772
|
+
if (!journal.prepare_inhabited[slot.index] or
|
|
1773
|
+
journal.prepare_checksums[slot.index] != message.header.checksum)
|
|
1774
|
+
{
|
|
1775
|
+
journal.write_prepare_debug(message.header, "entry changed twice while writing headers");
|
|
1776
|
+
journal.write_prepare_release(write, null);
|
|
1777
|
+
return;
|
|
1778
|
+
}
|
|
1779
|
+
|
|
1768
1780
|
journal.write_prepare_debug(message.header, "complete, marking clean");
|
|
1769
1781
|
|
|
1770
|
-
const slot = journal.slot_with_header(message.header).?;
|
|
1771
1782
|
journal.dirty.clear(slot);
|
|
1772
1783
|
journal.faulty.clear(slot);
|
|
1773
1784
|
|
|
@@ -1955,7 +1966,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
1955
1966
|
// This ensures that faulty headers are still faulty when they are read back
|
|
1956
1967
|
// from disk during recovery. This prevents faulty entries from changing to
|
|
1957
1968
|
// reserved (and clean) after a crash and restart (e.g. accidentally converting
|
|
1958
|
-
// a case `@D` to a `@
|
|
1969
|
+
// a case `@D` to a `@I` after a restart).
|
|
1959
1970
|
sector_headers[i] = .{
|
|
1960
1971
|
.checksum = 0,
|
|
1961
1972
|
.cluster = replica.cluster,
|
|
@@ -2005,7 +2016,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
2005
2016
|
/// Case @B may be caused by crashing while writing the prepare (torn write).
|
|
2006
2017
|
///
|
|
2007
2018
|
/// @D:
|
|
2008
|
-
/// This is possibly a torn
|
|
2019
|
+
/// This is possibly a torn write to the redundant headers, so when replica_count=1 we must
|
|
2009
2020
|
/// repair this locally. The probability that this results in an incorrect recovery is:
|
|
2010
2021
|
/// P(crash during first WAL wrap)
|
|
2011
2022
|
/// × P(redundant header is corrupt)
|
|
@@ -2021,7 +2032,7 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
2021
2032
|
/// misdirected.
|
|
2022
2033
|
///
|
|
2023
2034
|
///
|
|
2024
|
-
/// @F and @
|
|
2035
|
+
/// @F and @G:
|
|
2025
2036
|
/// The replica is recovering from a crash after writing the prepare, but before writing the
|
|
2026
2037
|
/// redundant header.
|
|
2027
2038
|
///
|
|
@@ -2029,22 +2040,23 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
2029
2040
|
/// @G:
|
|
2030
2041
|
/// One of:
|
|
2031
2042
|
///
|
|
2043
|
+
/// * The prepare was written, but then truncated, so the redundant header was written as reserved.
|
|
2032
2044
|
/// * A misdirected read to a reserved header.
|
|
2033
2045
|
/// * The redundant header's write was lost or misdirected.
|
|
2034
2046
|
///
|
|
2035
|
-
///
|
|
2047
|
+
/// There is a risk of data loss in the case of 2 lost writes.
|
|
2036
2048
|
///
|
|
2037
2049
|
///
|
|
2038
|
-
/// @
|
|
2050
|
+
/// @H:
|
|
2039
2051
|
/// The redundant header is present & valid, but the corresponding prepare was a lost or misdirected
|
|
2040
2052
|
/// read or write.
|
|
2041
2053
|
///
|
|
2042
2054
|
///
|
|
2043
|
-
/// @
|
|
2055
|
+
/// @I:
|
|
2044
2056
|
/// This slot is legitimately reserved — this may be the first fill of the log.
|
|
2045
2057
|
///
|
|
2046
2058
|
///
|
|
2047
|
-
/// @
|
|
2059
|
+
/// @J and @K:
|
|
2048
2060
|
/// When the redundant header & prepare header are both valid but distinct ops, always pick the
|
|
2049
2061
|
/// higher op.
|
|
2050
2062
|
///
|
|
@@ -2056,21 +2068,21 @@ pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
|
2056
2068
|
/// The length of the prepare pipeline is the upper bound on how many ops can be reordered during a
|
|
2057
2069
|
/// view change.
|
|
2058
2070
|
///
|
|
2059
|
-
/// @
|
|
2071
|
+
/// @J:
|
|
2060
2072
|
/// When the higher op belongs to the prepare, repair locally.
|
|
2061
2073
|
/// The most likely cause for this case is that the log wrapped, but the redundant header write was
|
|
2062
2074
|
/// lost.
|
|
2063
2075
|
///
|
|
2064
|
-
/// @
|
|
2076
|
+
/// @K:
|
|
2065
2077
|
/// When the higher op belongs to the header, mark faulty.
|
|
2066
2078
|
///
|
|
2067
2079
|
///
|
|
2068
|
-
/// @
|
|
2080
|
+
/// @L:
|
|
2069
2081
|
/// The message was rewritten due to a view change.
|
|
2070
2082
|
/// A single-replica cluster doesn't ever change views.
|
|
2071
2083
|
///
|
|
2072
2084
|
///
|
|
2073
|
-
/// @
|
|
2085
|
+
/// @M:
|
|
2074
2086
|
/// The redundant header matches the message's header.
|
|
2075
2087
|
/// This is the usual case: both the prepare and header are correct and equivalent.
|
|
2076
2088
|
const recovery_cases = table: {
|
|
@@ -2102,14 +2114,13 @@ const recovery_cases = table: {
|
|
|
2102
2114
|
Case.init("@D", .vsr, .fix, .{ _0, __, _1, _1, __, __, __, __, __ }),
|
|
2103
2115
|
Case.init("@E", .vsr, .fix, .{ _0, __, _1, _0, _0, __, __, __, __ }),
|
|
2104
2116
|
Case.init("@F", .fix, .fix, .{ _0, __, _1, _0, _1, __, __, __, __ }),
|
|
2105
|
-
Case.init("@G", .
|
|
2106
|
-
Case.init("@H", .
|
|
2107
|
-
Case.init("@I", .
|
|
2108
|
-
Case.init("@J", .
|
|
2109
|
-
Case.init("@K", .
|
|
2110
|
-
Case.init("@L", .vsr, .vsr, .{ _1, _0, _1, _0, __, _0,
|
|
2111
|
-
Case.init("@M", .
|
|
2112
|
-
Case.init("@N", .eql, .eql, .{ _1, _0, _1, _0, __, _1, a1, a0, a1 }), // normal path: prepare
|
|
2117
|
+
Case.init("@G", .fix, .fix, .{ _1, _1, _1, _0, __, __, __, __, __ }),
|
|
2118
|
+
Case.init("@H", .vsr, .vsr, .{ _1, _0, _1, _1, __, __, __, __, __ }),
|
|
2119
|
+
Case.init("@I", .nil, .nil, .{ _1, _1, _1, _1, __, a1, a1, a0, a1 }), // normal path: reserved
|
|
2120
|
+
Case.init("@J", .fix, .fix, .{ _1, _0, _1, _0, __, _0, _0, _1, __ }), // header.op < prepare.op
|
|
2121
|
+
Case.init("@K", .vsr, .vsr, .{ _1, _0, _1, _0, __, _0, _0, _0, __ }), // header.op > prepare.op
|
|
2122
|
+
Case.init("@L", .vsr, .vsr, .{ _1, _0, _1, _0, __, _0, _1, a0, a0 }),
|
|
2123
|
+
Case.init("@M", .eql, .eql, .{ _1, _0, _1, _0, __, _1, a1, a0, a1 }), // normal path: prepare
|
|
2113
2124
|
};
|
|
2114
2125
|
};
|
|
2115
2126
|
|
|
@@ -4,10 +4,10 @@ const assert = std.debug.assert;
|
|
|
4
4
|
const log = std.log.scoped(.fuzz_journal_format);
|
|
5
5
|
|
|
6
6
|
const constants = @import("../constants.zig");
|
|
7
|
-
const
|
|
7
|
+
const stdx = @import("../stdx.zig");
|
|
8
8
|
const vsr = @import("../vsr.zig");
|
|
9
9
|
const journal = @import("./journal.zig");
|
|
10
|
-
const fuzz = @import("../
|
|
10
|
+
const fuzz = @import("../testing/fuzz.zig");
|
|
11
11
|
|
|
12
12
|
pub const tigerbeetle_config = @import("../config.zig").configs.test_min;
|
|
13
13
|
|