tigerbeetle-node 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +305 -103
- package/dist/index.d.ts +70 -67
- package/dist/index.js +70 -67
- package/dist/index.js.map +1 -1
- package/package.json +6 -6
- package/scripts/download_node_headers.sh +14 -7
- package/src/index.ts +11 -10
- package/src/node.zig +22 -20
- package/src/tigerbeetle/scripts/benchmark.bat +4 -3
- package/src/tigerbeetle/scripts/benchmark.sh +25 -10
- package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
- package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
- package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
- package/src/tigerbeetle/scripts/install.sh +20 -4
- package/src/tigerbeetle/scripts/install_zig.bat +5 -1
- package/src/tigerbeetle/scripts/install_zig.sh +32 -26
- package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
- package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
- package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
- package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
- package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +12 -3
- package/src/tigerbeetle/src/benchmark.zig +19 -9
- package/src/tigerbeetle/src/benchmark_array_search.zig +317 -0
- package/src/tigerbeetle/src/benchmarks/perf.zig +299 -0
- package/src/tigerbeetle/src/c/tb_client/context.zig +103 -0
- package/src/tigerbeetle/src/c/tb_client/packet.zig +80 -0
- package/src/tigerbeetle/src/c/tb_client/signal.zig +288 -0
- package/src/tigerbeetle/src/c/tb_client/thread.zig +328 -0
- package/src/tigerbeetle/src/c/tb_client.h +221 -0
- package/src/tigerbeetle/src/c/tb_client.zig +104 -0
- package/src/tigerbeetle/src/c/test.zig +1 -0
- package/src/tigerbeetle/src/cli.zig +143 -84
- package/src/tigerbeetle/src/config.zig +161 -20
- package/src/tigerbeetle/src/demo.zig +14 -8
- package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +2 -2
- package/src/tigerbeetle/src/ewah.zig +318 -0
- package/src/tigerbeetle/src/ewah_benchmark.zig +121 -0
- package/src/tigerbeetle/src/eytzinger_benchmark.zig +317 -0
- package/src/tigerbeetle/src/fifo.zig +17 -1
- package/src/tigerbeetle/src/io/darwin.zig +12 -10
- package/src/tigerbeetle/src/io/linux.zig +25 -9
- package/src/tigerbeetle/src/io/windows.zig +13 -9
- package/src/tigerbeetle/src/iops.zig +101 -0
- package/src/tigerbeetle/src/lsm/README.md +214 -0
- package/src/tigerbeetle/src/lsm/binary_search.zig +341 -0
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +125 -0
- package/src/tigerbeetle/src/lsm/compaction.zig +557 -0
- package/src/tigerbeetle/src/lsm/composite_key.zig +77 -0
- package/src/tigerbeetle/src/lsm/direction.zig +11 -0
- package/src/tigerbeetle/src/lsm/eytzinger.zig +587 -0
- package/src/tigerbeetle/src/lsm/forest.zig +204 -0
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +412 -0
- package/src/tigerbeetle/src/lsm/grid.zig +549 -0
- package/src/tigerbeetle/src/lsm/groove.zig +1002 -0
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +474 -0
- package/src/tigerbeetle/src/lsm/level_iterator.zig +315 -0
- package/src/tigerbeetle/src/lsm/manifest.zig +580 -0
- package/src/tigerbeetle/src/lsm/manifest_level.zig +925 -0
- package/src/tigerbeetle/src/lsm/manifest_log.zig +953 -0
- package/src/tigerbeetle/src/lsm/node_pool.zig +231 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +387 -0
- package/src/tigerbeetle/src/lsm/segmented_array.zig +1318 -0
- package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +894 -0
- package/src/tigerbeetle/src/lsm/table.zig +967 -0
- package/src/tigerbeetle/src/lsm/table_immutable.zig +203 -0
- package/src/tigerbeetle/src/lsm/table_iterator.zig +306 -0
- package/src/tigerbeetle/src/lsm/table_mutable.zig +174 -0
- package/src/tigerbeetle/src/lsm/test.zig +423 -0
- package/src/tigerbeetle/src/lsm/tree.zig +1090 -0
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +457 -0
- package/src/tigerbeetle/src/main.zig +141 -109
- package/src/tigerbeetle/src/message_bus.zig +49 -48
- package/src/tigerbeetle/src/message_pool.zig +22 -12
- package/src/tigerbeetle/src/ring_buffer.zig +126 -30
- package/src/tigerbeetle/src/simulator.zig +205 -140
- package/src/tigerbeetle/src/state_machine.zig +1268 -721
- package/src/tigerbeetle/src/static_allocator.zig +65 -0
- package/src/tigerbeetle/src/storage.zig +40 -14
- package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
- package/src/tigerbeetle/src/test/accounting/workload.zig +819 -0
- package/src/tigerbeetle/src/test/cluster.zig +104 -88
- package/src/tigerbeetle/src/test/conductor.zig +365 -0
- package/src/tigerbeetle/src/test/fuzz.zig +121 -0
- package/src/tigerbeetle/src/test/id.zig +89 -0
- package/src/tigerbeetle/src/test/message_bus.zig +15 -24
- package/src/tigerbeetle/src/test/network.zig +26 -17
- package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
- package/src/tigerbeetle/src/test/state_checker.zig +94 -68
- package/src/tigerbeetle/src/test/state_machine.zig +135 -69
- package/src/tigerbeetle/src/test/storage.zig +78 -28
- package/src/tigerbeetle/src/tigerbeetle.zig +19 -16
- package/src/tigerbeetle/src/unit_tests.zig +15 -0
- package/src/tigerbeetle/src/util.zig +51 -0
- package/src/tigerbeetle/src/vopr.zig +494 -0
- package/src/tigerbeetle/src/vopr_hub/README.md +58 -0
- package/src/tigerbeetle/src/vopr_hub/SETUP.md +199 -0
- package/src/tigerbeetle/src/vopr_hub/go.mod +3 -0
- package/src/tigerbeetle/src/vopr_hub/main.go +1022 -0
- package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +3 -0
- package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +403 -0
- package/src/tigerbeetle/src/vsr/client.zig +34 -7
- package/src/tigerbeetle/src/vsr/journal.zig +164 -174
- package/src/tigerbeetle/src/vsr/replica.zig +1602 -651
- package/src/tigerbeetle/src/vsr/superblock.zig +1761 -0
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +255 -0
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +644 -0
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +561 -0
- package/src/tigerbeetle/src/vsr.zig +118 -170
- package/src/tigerbeetle/scripts/vopr.bat +0 -48
- package/src/tigerbeetle/scripts/vopr.sh +0 -33
|
@@ -9,10 +9,11 @@ const config = @import("../config.zig");
|
|
|
9
9
|
const Message = @import("../message_pool.zig").MessagePool.Message;
|
|
10
10
|
const vsr = @import("../vsr.zig");
|
|
11
11
|
const Header = vsr.Header;
|
|
12
|
+
const IOPS = @import("../iops.zig").IOPS;
|
|
12
13
|
|
|
13
14
|
const log = std.log.scoped(.journal);
|
|
14
15
|
|
|
15
|
-
/// There are two contiguous circular buffers on disk in the journal storage zone.
|
|
16
|
+
/// There are two contiguous circular buffers on disk in the journal storage zone (`vsr.Zone.wal`).
|
|
16
17
|
///
|
|
17
18
|
/// In both rings, the `op` for each reserved header is set to the slot index.
|
|
18
19
|
/// This helps WAL recovery detect misdirected reads/writes.
|
|
@@ -215,7 +216,8 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
215
216
|
recovering: bool = false,
|
|
216
217
|
|
|
217
218
|
pub fn init(allocator: Allocator, storage: *Storage, replica: u8) !Self {
|
|
218
|
-
|
|
219
|
+
// TODO Fix this assertion:
|
|
220
|
+
// assert(write_ahead_log_zone_size <= storage.size);
|
|
219
221
|
|
|
220
222
|
var headers = try allocator.allocAdvanced(
|
|
221
223
|
Header,
|
|
@@ -324,11 +326,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
324
326
|
assert(!self.recovering);
|
|
325
327
|
assert(self.recovered);
|
|
326
328
|
assert(self.writes.executing() == 0);
|
|
327
|
-
assert(self.headers[0].valid_checksum());
|
|
328
329
|
|
|
329
|
-
|
|
330
|
+
if (!self.headers[0].valid_checksum()) return false;
|
|
330
331
|
if (self.headers[0].operation != .root) return false;
|
|
331
332
|
|
|
333
|
+
const replica = @fieldParentPtr(Replica, "journal", self);
|
|
332
334
|
assert(self.headers[0].checksum == Header.root_prepare(replica.cluster).checksum);
|
|
333
335
|
assert(self.headers[0].checksum == self.prepare_checksums[0]);
|
|
334
336
|
assert(self.prepare_inhabited[0]);
|
|
@@ -378,14 +380,15 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
378
380
|
return self.slot_with_op(header.op);
|
|
379
381
|
}
|
|
380
382
|
|
|
381
|
-
/// Returns any existing
|
|
382
|
-
///
|
|
383
|
-
pub fn
|
|
383
|
+
/// Returns any existing header at the location indicated by header.op.
|
|
384
|
+
/// The existing header may have an older or newer op number.
|
|
385
|
+
pub fn header_for_prepare(self: *const Self, header: *const Header) ?*const Header {
|
|
384
386
|
assert(header.command == .prepare);
|
|
385
387
|
return self.header_for_op(header.op);
|
|
386
388
|
}
|
|
387
389
|
|
|
388
390
|
/// We use `op` directly to index into the headers array and locate ops without a scan.
|
|
391
|
+
/// The existing header may have an older or newer op number.
|
|
389
392
|
pub fn header_for_op(self: *const Self, op: u64) ?*const Header {
|
|
390
393
|
// TODO Snapshots
|
|
391
394
|
const slot = self.slot_for_op(op);
|
|
@@ -508,7 +511,8 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
508
511
|
|
|
509
512
|
/// Copies latest headers between `op_min` and `op_max` (both inclusive) as fit in `dest`.
|
|
510
513
|
/// Reverses the order when copying so that latest headers are copied first, which protects
|
|
511
|
-
/// against the callsite slicing the buffer the wrong way and incorrectly
|
|
514
|
+
/// against the callsite slicing the buffer the wrong way and incorrectly, and which is
|
|
515
|
+
/// required by message handlers that use the hash chain for repairs.
|
|
512
516
|
/// Skips .reserved headers (gaps between headers).
|
|
513
517
|
/// Zeroes the `dest` buffer in case the copy would underflow and leave a buffer bleed.
|
|
514
518
|
/// Returns the number of headers actually copied.
|
|
@@ -668,6 +672,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
668
672
|
return range;
|
|
669
673
|
}
|
|
670
674
|
|
|
675
|
+
/// Read a prepare from disk. There must be a matching in-memory header.
|
|
671
676
|
pub fn read_prepare(
|
|
672
677
|
self: *Self,
|
|
673
678
|
callback: fn (replica: *Replica, prepare: ?*Message, destination_replica: ?u8) void,
|
|
@@ -685,40 +690,20 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
685
690
|
return;
|
|
686
691
|
}
|
|
687
692
|
|
|
688
|
-
|
|
689
|
-
// header memory may then change:
|
|
690
|
-
const exact = self.header_with_op_and_checksum(op, checksum) orelse {
|
|
693
|
+
const slot = self.slot_with_op_and_checksum(op, checksum) orelse {
|
|
691
694
|
self.read_prepare_log(op, checksum, "no entry exactly");
|
|
692
695
|
callback(replica, null, null);
|
|
693
696
|
return;
|
|
694
697
|
};
|
|
695
698
|
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
return;
|
|
703
|
-
}
|
|
704
|
-
|
|
705
|
-
if (self.dirty.bit(slot)) {
|
|
706
|
-
self.read_prepare_log(op, checksum, "dirty");
|
|
699
|
+
if (self.prepare_inhabited[slot.index] and
|
|
700
|
+
self.prepare_checksums[slot.index] == checksum)
|
|
701
|
+
{
|
|
702
|
+
self.read_prepare_with_op_and_checksum(callback, op, checksum, destination_replica);
|
|
703
|
+
} else {
|
|
704
|
+
self.read_prepare_log(op, checksum, "no matching prepare");
|
|
707
705
|
callback(replica, null, null);
|
|
708
|
-
return;
|
|
709
706
|
}
|
|
710
|
-
|
|
711
|
-
// Skip the disk read if the header is all we need:
|
|
712
|
-
if (exact.size == @sizeOf(Header)) {
|
|
713
|
-
const message = replica.message_bus.get_message();
|
|
714
|
-
defer replica.message_bus.unref(message);
|
|
715
|
-
|
|
716
|
-
message.header.* = exact.*;
|
|
717
|
-
callback(replica, message, destination_replica);
|
|
718
|
-
return;
|
|
719
|
-
}
|
|
720
|
-
|
|
721
|
-
self.read_prepare_with_op_and_checksum(callback, op, checksum, destination_replica);
|
|
722
707
|
}
|
|
723
708
|
|
|
724
709
|
/// Read a prepare from disk. There may or may not be an in-memory header.
|
|
@@ -738,6 +723,18 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
738
723
|
const message = replica.message_bus.get_message();
|
|
739
724
|
defer replica.message_bus.unref(message);
|
|
740
725
|
|
|
726
|
+
// If the header is in-memory, we can skip the read from the disk.
|
|
727
|
+
if (self.header_with_op_and_checksum(op, checksum)) |exact| {
|
|
728
|
+
if (exact.size == @sizeOf(Header)) {
|
|
729
|
+
message.header.* = exact.*;
|
|
730
|
+
// Normally the message's padding would have been zeroed by the MessageBus,
|
|
731
|
+
// but we are copying (only) a message header into a new buffer.
|
|
732
|
+
std.mem.set(u8, message.buffer[@sizeOf(Header)..config.sector_size], 0);
|
|
733
|
+
callback(replica, message, destination_replica);
|
|
734
|
+
return;
|
|
735
|
+
}
|
|
736
|
+
}
|
|
737
|
+
|
|
741
738
|
const read = self.reads.acquire() orelse {
|
|
742
739
|
self.read_prepare_log(op, checksum, "waiting for IOP");
|
|
743
740
|
callback(replica, null, null);
|
|
@@ -755,12 +752,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
755
752
|
};
|
|
756
753
|
|
|
757
754
|
const buffer: []u8 = message.buffer[0..config.message_size_max];
|
|
758
|
-
const offset =
|
|
759
|
-
|
|
760
|
-
log.debug(
|
|
761
|
-
"{}: read_sectors: offset={} len={}",
|
|
762
|
-
.{ replica.replica, offset, buffer.len },
|
|
763
|
-
);
|
|
755
|
+
const offset = offset_logical(.prepares, slot);
|
|
764
756
|
|
|
765
757
|
// Memory must not be owned by `self.headers` as these may be modified concurrently:
|
|
766
758
|
assert(@ptrToInt(buffer.ptr) < @ptrToInt(self.headers.ptr) or
|
|
@@ -771,6 +763,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
771
763
|
read_prepare_with_op_and_checksum_callback,
|
|
772
764
|
&read.completion,
|
|
773
765
|
buffer,
|
|
766
|
+
.wal,
|
|
774
767
|
offset,
|
|
775
768
|
);
|
|
776
769
|
}
|
|
@@ -818,6 +811,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
818
811
|
read.callback(replica, null, null);
|
|
819
812
|
return;
|
|
820
813
|
}
|
|
814
|
+
assert(read.message.header.invalid() == null);
|
|
821
815
|
|
|
822
816
|
if (read.message.header.cluster != replica.cluster) {
|
|
823
817
|
// This could be caused by a misdirected read or write.
|
|
@@ -940,7 +934,8 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
940
934
|
recover_headers_callback,
|
|
941
935
|
&read.completion,
|
|
942
936
|
buffer,
|
|
943
|
-
|
|
937
|
+
.wal,
|
|
938
|
+
offset,
|
|
944
939
|
);
|
|
945
940
|
}
|
|
946
941
|
|
|
@@ -987,7 +982,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
987
982
|
self.recover_headers(offset_next);
|
|
988
983
|
}
|
|
989
984
|
|
|
990
|
-
fn recover_headers_buffer(message: *Message, offset: u64) []u8 {
|
|
985
|
+
fn recover_headers_buffer(message: *Message, offset: u64) []align(@alignOf(Header)) u8 {
|
|
991
986
|
const max = std.math.min(message.buffer.len, headers_size - offset);
|
|
992
987
|
assert(max % config.sector_size == 0);
|
|
993
988
|
assert(max % @sizeOf(Header) == 0);
|
|
@@ -1034,7 +1029,8 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1034
1029
|
// We load the entire message to verify that it isn't torn or corrupt.
|
|
1035
1030
|
// We don't know the message's size, so use the entire buffer.
|
|
1036
1031
|
message.buffer[0..config.message_size_max],
|
|
1037
|
-
|
|
1032
|
+
.wal,
|
|
1033
|
+
offset_logical(.prepares, slot),
|
|
1038
1034
|
);
|
|
1039
1035
|
}
|
|
1040
1036
|
|
|
@@ -1450,6 +1446,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1450
1446
|
header.op,
|
|
1451
1447
|
header.checksum,
|
|
1452
1448
|
});
|
|
1449
|
+
|
|
1453
1450
|
const slot = self.slot_for_header(header);
|
|
1454
1451
|
|
|
1455
1452
|
if (self.has(header)) {
|
|
@@ -1513,7 +1510,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1513
1510
|
|
|
1514
1511
|
// Slice the message to the nearest sector, we don't want to write the whole buffer:
|
|
1515
1512
|
const buffer = message.buffer[0..vsr.sector_ceil(message.header.size)];
|
|
1516
|
-
const offset =
|
|
1513
|
+
const offset = offset_logical(.prepares, slot);
|
|
1517
1514
|
|
|
1518
1515
|
if (builtin.mode == .Debug) {
|
|
1519
1516
|
// Assert that any sector padding has already been zeroed:
|
|
@@ -1587,9 +1584,8 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1587
1584
|
.index = @divFloor(slot_of_message.index, headers_per_sector) * headers_per_sector,
|
|
1588
1585
|
};
|
|
1589
1586
|
|
|
1590
|
-
const offset =
|
|
1587
|
+
const offset = offset_logical(.headers, slot_of_message);
|
|
1591
1588
|
assert(offset % config.sector_size == 0);
|
|
1592
|
-
assert(offset == slot_first.index * @sizeOf(Header));
|
|
1593
1589
|
|
|
1594
1590
|
const buffer: []u8 = write.header_sector(self);
|
|
1595
1591
|
const buffer_headers = std.mem.bytesAsSlice(Header, buffer);
|
|
@@ -1739,48 +1735,28 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1739
1735
|
.prepares => {
|
|
1740
1736
|
const offset = config.message_size_max * slot.index;
|
|
1741
1737
|
assert(offset < prepares_size);
|
|
1742
|
-
return offset;
|
|
1738
|
+
return offset + config.journal_size_headers;
|
|
1743
1739
|
},
|
|
1744
1740
|
}
|
|
1745
1741
|
}
|
|
1746
1742
|
|
|
1747
|
-
fn offset_physical(ring: Ring, slot: Slot) u64 {
|
|
1748
|
-
return switch (ring) {
|
|
1749
|
-
.headers => offset_logical(.headers, slot),
|
|
1750
|
-
.prepares => headers_size + offset_logical(.prepares, slot),
|
|
1751
|
-
};
|
|
1752
|
-
}
|
|
1753
|
-
|
|
1754
1743
|
fn offset_logical_in_headers_for_message(self: *const Self, message: *Message) u64 {
|
|
1755
1744
|
return offset_logical(.headers, self.slot_for_header(message.header));
|
|
1756
1745
|
}
|
|
1757
1746
|
|
|
1758
|
-
|
|
1759
|
-
fn offset_physical_for_logical(ring: Ring, offset: u64) u64 {
|
|
1760
|
-
switch (ring) {
|
|
1761
|
-
.headers => {
|
|
1762
|
-
assert(offset < headers_size);
|
|
1763
|
-
return offset;
|
|
1764
|
-
},
|
|
1765
|
-
.prepares => {
|
|
1766
|
-
assert(offset < prepares_size);
|
|
1767
|
-
return headers_size + offset;
|
|
1768
|
-
},
|
|
1769
|
-
}
|
|
1770
|
-
}
|
|
1771
|
-
|
|
1747
|
+
// TODO Add a `Ring` argument, and make the offset relative to that.
|
|
1772
1748
|
fn write_sectors(
|
|
1773
1749
|
self: *Self,
|
|
1774
1750
|
callback: fn (write: *Self.Write) void,
|
|
1775
1751
|
write: *Self.Write,
|
|
1776
1752
|
buffer: []const u8,
|
|
1777
|
-
|
|
1753
|
+
offset_in_wal: u64,
|
|
1778
1754
|
) void {
|
|
1779
1755
|
write.range = .{
|
|
1780
1756
|
.callback = callback,
|
|
1781
1757
|
.completion = undefined,
|
|
1782
1758
|
.buffer = buffer,
|
|
1783
|
-
.offset =
|
|
1759
|
+
.offset = offset_in_wal,
|
|
1784
1760
|
.locked = false,
|
|
1785
1761
|
};
|
|
1786
1762
|
self.lock_sectors(write);
|
|
@@ -1816,6 +1792,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1816
1792
|
write_sectors_on_write,
|
|
1817
1793
|
&write.range.completion,
|
|
1818
1794
|
write.range.buffer,
|
|
1795
|
+
.wal,
|
|
1819
1796
|
write.range.offset,
|
|
1820
1797
|
);
|
|
1821
1798
|
// We rely on the Storage.write_sectors() implementation being always synchronous,
|
|
@@ -1856,10 +1833,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1856
1833
|
self.lock_sectors(@fieldParentPtr(Self.Write, "range", waiting));
|
|
1857
1834
|
}
|
|
1858
1835
|
|
|
1859
|
-
|
|
1860
|
-
const callback = range.callback;
|
|
1861
|
-
range.* = undefined;
|
|
1862
|
-
callback(write);
|
|
1836
|
+
range.callback(write);
|
|
1863
1837
|
}
|
|
1864
1838
|
|
|
1865
1839
|
pub fn writing(self: *Self, op: u64, checksum: u128) bool {
|
|
@@ -1921,105 +1895,6 @@ pub const BitSet = struct {
|
|
|
1921
1895
|
}
|
|
1922
1896
|
};
|
|
1923
1897
|
|
|
1924
|
-
/// Take a u6 to limit to 64 items max (2^6 = 64)
|
|
1925
|
-
pub fn IOPS(comptime T: type, comptime size: u6) type {
|
|
1926
|
-
const Map = std.StaticBitSet(size);
|
|
1927
|
-
return struct {
|
|
1928
|
-
const Self = @This();
|
|
1929
|
-
|
|
1930
|
-
items: [size]T = undefined,
|
|
1931
|
-
/// 1 bits are free items.
|
|
1932
|
-
free: Map = Map.initFull(),
|
|
1933
|
-
|
|
1934
|
-
pub fn acquire(self: *Self) ?*T {
|
|
1935
|
-
const i = self.free.findFirstSet() orelse return null;
|
|
1936
|
-
self.free.unset(i);
|
|
1937
|
-
return &self.items[i];
|
|
1938
|
-
}
|
|
1939
|
-
|
|
1940
|
-
pub fn release(self: *Self, item: *T) void {
|
|
1941
|
-
item.* = undefined;
|
|
1942
|
-
const i = (@ptrToInt(item) - @ptrToInt(&self.items)) / @sizeOf(T);
|
|
1943
|
-
assert(!self.free.isSet(i));
|
|
1944
|
-
self.free.set(i);
|
|
1945
|
-
}
|
|
1946
|
-
|
|
1947
|
-
/// Returns the count of IOPs available.
|
|
1948
|
-
pub fn available(self: *const Self) usize {
|
|
1949
|
-
return self.free.count();
|
|
1950
|
-
}
|
|
1951
|
-
|
|
1952
|
-
/// Returns the count of IOPs in use.
|
|
1953
|
-
pub fn executing(self: *const Self) usize {
|
|
1954
|
-
return size - self.available();
|
|
1955
|
-
}
|
|
1956
|
-
|
|
1957
|
-
pub const Iterator = struct {
|
|
1958
|
-
iops: *Self,
|
|
1959
|
-
bitset_iterator: Map.Iterator(.{ .kind = .unset }),
|
|
1960
|
-
|
|
1961
|
-
pub fn next(iterator: *@This()) ?*T {
|
|
1962
|
-
const i = iterator.bitset_iterator.next() orelse return null;
|
|
1963
|
-
return &iterator.iops.items[i];
|
|
1964
|
-
}
|
|
1965
|
-
};
|
|
1966
|
-
|
|
1967
|
-
pub fn iterate(self: *Self) Iterator {
|
|
1968
|
-
return .{
|
|
1969
|
-
.iops = self,
|
|
1970
|
-
.bitset_iterator = self.free.iterator(.{ .kind = .unset }),
|
|
1971
|
-
};
|
|
1972
|
-
}
|
|
1973
|
-
};
|
|
1974
|
-
}
|
|
1975
|
-
|
|
1976
|
-
test "IOPS" {
|
|
1977
|
-
const testing = std.testing;
|
|
1978
|
-
var iops = IOPS(u32, 4){};
|
|
1979
|
-
|
|
1980
|
-
try testing.expectEqual(@as(usize, 4), iops.available());
|
|
1981
|
-
try testing.expectEqual(@as(usize, 0), iops.executing());
|
|
1982
|
-
|
|
1983
|
-
var one = iops.acquire().?;
|
|
1984
|
-
|
|
1985
|
-
try testing.expectEqual(@as(usize, 3), iops.available());
|
|
1986
|
-
try testing.expectEqual(@as(usize, 1), iops.executing());
|
|
1987
|
-
|
|
1988
|
-
var two = iops.acquire().?;
|
|
1989
|
-
var three = iops.acquire().?;
|
|
1990
|
-
|
|
1991
|
-
try testing.expectEqual(@as(usize, 1), iops.available());
|
|
1992
|
-
try testing.expectEqual(@as(usize, 3), iops.executing());
|
|
1993
|
-
|
|
1994
|
-
var four = iops.acquire().?;
|
|
1995
|
-
try testing.expectEqual(@as(?*u32, null), iops.acquire());
|
|
1996
|
-
|
|
1997
|
-
try testing.expectEqual(@as(usize, 0), iops.available());
|
|
1998
|
-
try testing.expectEqual(@as(usize, 4), iops.executing());
|
|
1999
|
-
|
|
2000
|
-
iops.release(two);
|
|
2001
|
-
|
|
2002
|
-
try testing.expectEqual(@as(usize, 1), iops.available());
|
|
2003
|
-
try testing.expectEqual(@as(usize, 3), iops.executing());
|
|
2004
|
-
|
|
2005
|
-
// there is only one slot free, so we will get the same pointer back.
|
|
2006
|
-
try testing.expectEqual(@as(?*u32, two), iops.acquire());
|
|
2007
|
-
|
|
2008
|
-
iops.release(four);
|
|
2009
|
-
iops.release(two);
|
|
2010
|
-
iops.release(one);
|
|
2011
|
-
iops.release(three);
|
|
2012
|
-
|
|
2013
|
-
try testing.expectEqual(@as(usize, 4), iops.available());
|
|
2014
|
-
try testing.expectEqual(@as(usize, 0), iops.executing());
|
|
2015
|
-
|
|
2016
|
-
one = iops.acquire().?;
|
|
2017
|
-
two = iops.acquire().?;
|
|
2018
|
-
three = iops.acquire().?;
|
|
2019
|
-
four = iops.acquire().?;
|
|
2020
|
-
try testing.expectEqual(@as(?*u32, null), iops.acquire());
|
|
2021
|
-
}
|
|
2022
|
-
|
|
2023
1898
|
/// @B and @C:
|
|
2024
1899
|
/// This prepare header is corrupt.
|
|
2025
1900
|
/// We may have a valid redundant header, but need to recover the full message.
|
|
@@ -2291,3 +2166,118 @@ test "recovery_cases" {
|
|
|
2291
2166
|
if (case_match == null) @panic("no matching case");
|
|
2292
2167
|
}
|
|
2293
2168
|
}
|
|
2169
|
+
|
|
2170
|
+
/// Format part of a new WAL, writing to `target`.
|
|
2171
|
+
///
|
|
2172
|
+
/// `offset_logical` is relative to the beginning of the WAL.
|
|
2173
|
+
/// Returns the number of bytes written to `target`.
|
|
2174
|
+
pub fn format_journal(cluster: u32, offset_logical: u64, target: []u8) usize {
|
|
2175
|
+
assert(offset_logical <= config.journal_size_max);
|
|
2176
|
+
assert(offset_logical % config.sector_size == 0);
|
|
2177
|
+
assert(target.len > 0);
|
|
2178
|
+
assert(target.len % config.sector_size == 0);
|
|
2179
|
+
|
|
2180
|
+
const sector_max = @divExact(config.journal_size_max, config.sector_size);
|
|
2181
|
+
var sectors = std.mem.bytesAsSlice([config.sector_size]u8, target);
|
|
2182
|
+
for (sectors) |*sector_data, i| {
|
|
2183
|
+
const sector = @divExact(offset_logical, config.sector_size) + i;
|
|
2184
|
+
if (sector == sector_max) {
|
|
2185
|
+
if (i == 0) {
|
|
2186
|
+
assert(offset_logical == config.journal_size_max);
|
|
2187
|
+
}
|
|
2188
|
+
return i * config.sector_size;
|
|
2189
|
+
} else {
|
|
2190
|
+
format_journal_sector(cluster, sector, sector_data);
|
|
2191
|
+
}
|
|
2192
|
+
}
|
|
2193
|
+
return target.len;
|
|
2194
|
+
}
|
|
2195
|
+
|
|
2196
|
+
fn format_journal_sector(cluster: u32, sector: usize, sector_data: *[config.sector_size]u8) void {
|
|
2197
|
+
assert(sector < @divExact(config.journal_size_max, config.sector_size));
|
|
2198
|
+
|
|
2199
|
+
var sector_headers = std.mem.bytesAsSlice(Header, sector_data);
|
|
2200
|
+
|
|
2201
|
+
if (sector * headers_per_sector < slot_count) {
|
|
2202
|
+
for (sector_headers) |*header, i| {
|
|
2203
|
+
const slot = sector * headers_per_sector + i;
|
|
2204
|
+
if (sector == 0 and i == 0) {
|
|
2205
|
+
header.* = Header.root_prepare(cluster);
|
|
2206
|
+
assert(header.op == 0);
|
|
2207
|
+
assert(header.command == .prepare);
|
|
2208
|
+
assert(header.operation == .root);
|
|
2209
|
+
} else {
|
|
2210
|
+
header.* = Header.reserved(cluster, slot);
|
|
2211
|
+
}
|
|
2212
|
+
}
|
|
2213
|
+
return;
|
|
2214
|
+
}
|
|
2215
|
+
|
|
2216
|
+
const sectors_per_message = @divExact(config.message_size_max, config.sector_size);
|
|
2217
|
+
const sector_in_prepares = sector - @divExact(slot_count, headers_per_sector);
|
|
2218
|
+
const message_slot = @divFloor(sector_in_prepares, sectors_per_message);
|
|
2219
|
+
assert(message_slot < slot_count);
|
|
2220
|
+
|
|
2221
|
+
std.mem.set(u8, sector_data, 0);
|
|
2222
|
+
if (sector_in_prepares % sectors_per_message == 0) {
|
|
2223
|
+
// The header goes in the first sector of the message.
|
|
2224
|
+
if (message_slot == 0) {
|
|
2225
|
+
sector_headers[0] = Header.root_prepare(cluster);
|
|
2226
|
+
} else {
|
|
2227
|
+
sector_headers[0] = Header.reserved(cluster, message_slot);
|
|
2228
|
+
}
|
|
2229
|
+
}
|
|
2230
|
+
}
|
|
2231
|
+
|
|
2232
|
+
test "format_journal" {
|
|
2233
|
+
const cluster = 123;
|
|
2234
|
+
const write_sizes = [_]usize{
|
|
2235
|
+
config.sector_size,
|
|
2236
|
+
config.sector_size * 2,
|
|
2237
|
+
config.sector_size * 3,
|
|
2238
|
+
config.journal_size_max,
|
|
2239
|
+
};
|
|
2240
|
+
|
|
2241
|
+
for (write_sizes) |write_size_max| {
|
|
2242
|
+
const wal_data = try std.testing.allocator.alignedAlloc(u8, @alignOf(Header), config.journal_size_max);
|
|
2243
|
+
defer std.testing.allocator.free(wal_data);
|
|
2244
|
+
|
|
2245
|
+
const write_data = try std.testing.allocator.alloc(u8, write_size_max);
|
|
2246
|
+
defer std.testing.allocator.free(write_data);
|
|
2247
|
+
|
|
2248
|
+
const headers_ring = std.mem.bytesAsSlice(Header, wal_data[0..config.journal_size_headers]);
|
|
2249
|
+
const prepare_ring = std.mem.bytesAsSlice([config.message_size_max]u8, wal_data[config.journal_size_headers..]);
|
|
2250
|
+
try std.testing.expectEqual(@as(usize, config.journal_slot_count), headers_ring.len);
|
|
2251
|
+
try std.testing.expectEqual(@as(usize, config.journal_slot_count), prepare_ring.len);
|
|
2252
|
+
|
|
2253
|
+
var offset: u64 = 0;
|
|
2254
|
+
while (true) {
|
|
2255
|
+
const write_size = format_journal(cluster, offset, write_data);
|
|
2256
|
+
if (write_size == 0) break;
|
|
2257
|
+
std.mem.copy(u8, wal_data[offset..][0..write_size], write_data[0..write_size]);
|
|
2258
|
+
offset += write_size;
|
|
2259
|
+
}
|
|
2260
|
+
|
|
2261
|
+
for (headers_ring) |*header, slot| {
|
|
2262
|
+
try std.testing.expect(header.valid_checksum());
|
|
2263
|
+
try std.testing.expect(header.valid_checksum_body(&[0]u8{}));
|
|
2264
|
+
try std.testing.expectEqual(header.invalid(), null);
|
|
2265
|
+
try std.testing.expectEqual(header.cluster, cluster);
|
|
2266
|
+
try std.testing.expectEqual(header.op, slot);
|
|
2267
|
+
try std.testing.expectEqual(header.size, @sizeOf(Header));
|
|
2268
|
+
if (slot == 0) {
|
|
2269
|
+
try std.testing.expectEqual(header.command, .prepare);
|
|
2270
|
+
try std.testing.expectEqual(header.operation, .root);
|
|
2271
|
+
} else {
|
|
2272
|
+
try std.testing.expectEqual(header.command, .reserved);
|
|
2273
|
+
}
|
|
2274
|
+
|
|
2275
|
+
const prepare_bytes = prepare_ring[slot];
|
|
2276
|
+
const prepare_header = std.mem.bytesAsValue(Header, prepare_bytes[0..@sizeOf(Header)]);
|
|
2277
|
+
const prepare_body = prepare_bytes[@sizeOf(Header)..];
|
|
2278
|
+
|
|
2279
|
+
try std.testing.expectEqual(header.*, prepare_header.*);
|
|
2280
|
+
for (prepare_body) |byte| try std.testing.expectEqual(byte, 0);
|
|
2281
|
+
}
|
|
2282
|
+
}
|
|
2283
|
+
}
|