tigerbeetle-node 0.10.0 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +302 -101
- package/dist/index.d.ts +70 -72
- package/dist/index.js +70 -72
- package/dist/index.js.map +1 -1
- package/package.json +9 -8
- package/scripts/download_node_headers.sh +14 -7
- package/src/index.ts +6 -10
- package/src/node.zig +6 -3
- package/src/tigerbeetle/scripts/benchmark.sh +4 -4
- package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
- package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
- package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
- package/src/tigerbeetle/scripts/install.sh +19 -4
- package/src/tigerbeetle/scripts/install_zig.bat +5 -1
- package/src/tigerbeetle/scripts/install_zig.sh +24 -14
- package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
- package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
- package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
- package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
- package/src/tigerbeetle/scripts/validate_docs.sh +17 -0
- package/src/tigerbeetle/src/benchmark.zig +29 -13
- package/src/tigerbeetle/src/c/tb_client/context.zig +248 -47
- package/src/tigerbeetle/src/c/tb_client/echo_client.zig +108 -0
- package/src/tigerbeetle/src/c/tb_client/packet.zig +2 -2
- package/src/tigerbeetle/src/c/tb_client/signal.zig +2 -4
- package/src/tigerbeetle/src/c/tb_client/thread.zig +17 -257
- package/src/tigerbeetle/src/c/tb_client.h +118 -84
- package/src/tigerbeetle/src/c/tb_client.zig +88 -23
- package/src/tigerbeetle/src/c/tb_client_header_test.zig +135 -0
- package/src/tigerbeetle/src/c/test.zig +371 -1
- package/src/tigerbeetle/src/cli.zig +37 -7
- package/src/tigerbeetle/src/config.zig +58 -17
- package/src/tigerbeetle/src/demo.zig +5 -2
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +1 -1
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +13 -0
- package/src/tigerbeetle/src/ewah.zig +11 -33
- package/src/tigerbeetle/src/ewah_benchmark.zig +8 -9
- package/src/tigerbeetle/src/io/linux.zig +1 -1
- package/src/tigerbeetle/src/lsm/README.md +308 -0
- package/src/tigerbeetle/src/lsm/binary_search.zig +137 -10
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +43 -0
- package/src/tigerbeetle/src/lsm/compaction.zig +376 -397
- package/src/tigerbeetle/src/lsm/composite_key.zig +2 -0
- package/src/tigerbeetle/src/lsm/eytzinger.zig +1 -1
- package/src/tigerbeetle/src/{eytzinger_benchmark.zig → lsm/eytzinger_benchmark.zig} +34 -21
- package/src/tigerbeetle/src/lsm/forest.zig +21 -447
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +414 -0
- package/src/tigerbeetle/src/lsm/grid.zig +170 -76
- package/src/tigerbeetle/src/lsm/groove.zig +197 -133
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +40 -18
- package/src/tigerbeetle/src/lsm/level_iterator.zig +28 -9
- package/src/tigerbeetle/src/lsm/manifest.zig +93 -180
- package/src/tigerbeetle/src/lsm/manifest_level.zig +161 -454
- package/src/tigerbeetle/src/lsm/manifest_log.zig +243 -356
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +665 -0
- package/src/tigerbeetle/src/lsm/node_pool.zig +4 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +65 -76
- package/src/tigerbeetle/src/lsm/segmented_array.zig +580 -251
- package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +62 -12
- package/src/tigerbeetle/src/lsm/table.zig +115 -68
- package/src/tigerbeetle/src/lsm/table_immutable.zig +30 -23
- package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -17
- package/src/tigerbeetle/src/lsm/table_mutable.zig +63 -12
- package/src/tigerbeetle/src/lsm/test.zig +61 -56
- package/src/tigerbeetle/src/lsm/tree.zig +450 -407
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +461 -0
- package/src/tigerbeetle/src/main.zig +83 -8
- package/src/tigerbeetle/src/message_bus.zig +20 -9
- package/src/tigerbeetle/src/message_pool.zig +22 -19
- package/src/tigerbeetle/src/ring_buffer.zig +7 -3
- package/src/tigerbeetle/src/simulator.zig +179 -119
- package/src/tigerbeetle/src/state_machine.zig +381 -246
- package/src/tigerbeetle/src/static_allocator.zig +65 -0
- package/src/tigerbeetle/src/storage.zig +3 -7
- package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
- package/src/tigerbeetle/src/test/accounting/workload.zig +823 -0
- package/src/tigerbeetle/src/test/cluster.zig +33 -81
- package/src/tigerbeetle/src/test/conductor.zig +366 -0
- package/src/tigerbeetle/src/test/fuzz.zig +121 -0
- package/src/tigerbeetle/src/test/id.zig +89 -0
- package/src/tigerbeetle/src/test/network.zig +45 -19
- package/src/tigerbeetle/src/test/packet_simulator.zig +40 -29
- package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
- package/src/tigerbeetle/src/test/state_checker.zig +91 -69
- package/src/tigerbeetle/src/test/state_machine.zig +11 -35
- package/src/tigerbeetle/src/test/storage.zig +470 -106
- package/src/tigerbeetle/src/test/storage_checker.zig +204 -0
- package/src/tigerbeetle/src/tigerbeetle.zig +15 -16
- package/src/tigerbeetle/src/unit_tests.zig +13 -1
- package/src/tigerbeetle/src/util.zig +97 -11
- package/src/tigerbeetle/src/vopr.zig +495 -0
- package/src/tigerbeetle/src/vsr/client.zig +21 -3
- package/src/tigerbeetle/src/vsr/journal.zig +293 -212
- package/src/tigerbeetle/src/vsr/replica.zig +1086 -515
- package/src/tigerbeetle/src/vsr/superblock.zig +382 -637
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +14 -16
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +416 -153
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +332 -0
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +349 -0
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +62 -12
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +394 -0
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +312 -0
- package/src/tigerbeetle/src/vsr.zig +94 -60
- package/src/tigerbeetle/scripts/vopr.bat +0 -48
- package/src/tigerbeetle/scripts/vopr.sh +0 -33
- package/src/tigerbeetle/src/benchmark_array_search.zig +0 -317
- package/src/tigerbeetle/src/benchmarks/perf.zig +0 -299
|
@@ -10,11 +10,14 @@ const Transfer = tb.Transfer;
|
|
|
10
10
|
const CreateAccountsResult = tb.CreateAccountsResult;
|
|
11
11
|
const CreateTransfersResult = tb.CreateTransfersResult;
|
|
12
12
|
|
|
13
|
+
const util = @import("util.zig");
|
|
13
14
|
const IO = @import("io.zig").IO;
|
|
14
15
|
const Storage = @import("storage.zig").Storage;
|
|
15
16
|
const MessagePool = @import("message_pool.zig").MessagePool;
|
|
16
17
|
const MessageBus = @import("message_bus.zig").MessageBusClient;
|
|
17
|
-
const StateMachine = @import("state_machine.zig").StateMachineType(Storage
|
|
18
|
+
const StateMachine = @import("state_machine.zig").StateMachineType(Storage, .{
|
|
19
|
+
.message_body_size_max = config.message_body_size_max,
|
|
20
|
+
});
|
|
18
21
|
|
|
19
22
|
const vsr = @import("vsr.zig");
|
|
20
23
|
const Header = vsr.Header;
|
|
@@ -59,7 +62,7 @@ pub fn request(
|
|
|
59
62
|
defer client.unref(message);
|
|
60
63
|
|
|
61
64
|
const body = std.mem.asBytes(&batch);
|
|
62
|
-
|
|
65
|
+
util.copy_disjoint(.inexact, u8, message.buffer[@sizeOf(Header)..], body);
|
|
63
66
|
|
|
64
67
|
client.request(
|
|
65
68
|
0,
|
|
@@ -7,6 +7,19 @@ pub fn main() !void {
|
|
|
7
7
|
const transfers = [_]Transfer{
|
|
8
8
|
Transfer{
|
|
9
9
|
.id = 1,
|
|
10
|
+
.debit_account_id = 2,
|
|
11
|
+
.credit_account_id = 1,
|
|
12
|
+
.user_data = 0,
|
|
13
|
+
.reserved = 0,
|
|
14
|
+
.pending_id = 0,
|
|
15
|
+
.timeout = 0,
|
|
16
|
+
.ledger = 710, // Let's use the ISO-4217 Code Number for ZAR
|
|
17
|
+
.code = 1,
|
|
18
|
+
.flags = .{},
|
|
19
|
+
.amount = 10000, // Let's start with some liquidity in account 1.
|
|
20
|
+
},
|
|
21
|
+
Transfer{
|
|
22
|
+
.id = 2,
|
|
10
23
|
.debit_account_id = 1,
|
|
11
24
|
.credit_account_id = 2,
|
|
12
25
|
.user_data = 0,
|
|
@@ -2,7 +2,9 @@ const std = @import("std");
|
|
|
2
2
|
const assert = std.debug.assert;
|
|
3
3
|
const math = std.math;
|
|
4
4
|
const mem = std.mem;
|
|
5
|
-
const
|
|
5
|
+
const util = @import("util.zig");
|
|
6
|
+
const div_ceil = util.div_ceil;
|
|
7
|
+
const disjoint_slices = util.disjoint_slices;
|
|
6
8
|
|
|
7
9
|
/// Encode or decode a bitset using Daniel Lemire's EWAH codec.
|
|
8
10
|
/// ("Histogram-Aware Sorting for Enhanced Word-Aligned Compression in Bitmap Indexes")
|
|
@@ -58,10 +60,12 @@ pub fn ewah(comptime Word: type) type {
|
|
|
58
60
|
|
|
59
61
|
/// Decodes the compressed bitset in `source` into `target_words`.
|
|
60
62
|
/// Returns the number of *words* written to `target_words`.
|
|
63
|
+
// TODO Refactor to return an error when `source` is invalid,
|
|
64
|
+
// so that we can test invalid encodings.
|
|
61
65
|
pub fn decode(source: []align(@alignOf(Word)) const u8, target_words: []Word) usize {
|
|
62
66
|
assert(source.len % @sizeOf(Word) == 0);
|
|
63
67
|
assert(source.len >= @sizeOf(Marker));
|
|
64
|
-
assert(
|
|
68
|
+
assert(disjoint_slices(u8, Word, source, target_words));
|
|
65
69
|
|
|
66
70
|
const source_words = mem.bytesAsSlice(Word, source);
|
|
67
71
|
var source_index: usize = 0;
|
|
@@ -75,7 +79,8 @@ pub fn ewah(comptime Word: type) type {
|
|
|
75
79
|
if (marker.uniform_bit == 1) ~@as(Word, 0) else 0,
|
|
76
80
|
);
|
|
77
81
|
target_index += marker.uniform_word_count;
|
|
78
|
-
|
|
82
|
+
util.copy_disjoint(
|
|
83
|
+
.exact,
|
|
79
84
|
Word,
|
|
80
85
|
target_words[target_index..][0..marker.literal_word_count],
|
|
81
86
|
source_words[source_index..][0..marker.literal_word_count],
|
|
@@ -92,7 +97,7 @@ pub fn ewah(comptime Word: type) type {
|
|
|
92
97
|
pub fn encode(source_words: []const Word, target: []align(@alignOf(Word)) u8) usize {
|
|
93
98
|
assert(target.len >= @sizeOf(Marker));
|
|
94
99
|
assert(target.len == encode_size_max(source_words.len));
|
|
95
|
-
assert(
|
|
100
|
+
assert(disjoint_slices(Word, u8, source_words, target));
|
|
96
101
|
|
|
97
102
|
const target_words = mem.bytesAsSlice(Word, target);
|
|
98
103
|
std.mem.set(Word, target_words, 0);
|
|
@@ -126,7 +131,8 @@ pub fn ewah(comptime Word: type) type {
|
|
|
126
131
|
.literal_word_count = @intCast(MarkerLiteralCount, literal_word_count),
|
|
127
132
|
});
|
|
128
133
|
target_index += 1;
|
|
129
|
-
|
|
134
|
+
util.copy_disjoint(
|
|
135
|
+
.exact,
|
|
130
136
|
Word,
|
|
131
137
|
target_words[target_index..][0..literal_word_count],
|
|
132
138
|
source_words[source_index..][0..literal_word_count],
|
|
@@ -154,34 +160,6 @@ pub fn ewah(comptime Word: type) type {
|
|
|
154
160
|
};
|
|
155
161
|
}
|
|
156
162
|
|
|
157
|
-
fn is_disjoint(comptime A: type, comptime B: type, a: []const A, b: []const B) bool {
|
|
158
|
-
return @ptrToInt(a.ptr) + a.len * @sizeOf(A) <= @ptrToInt(b.ptr) or
|
|
159
|
-
@ptrToInt(b.ptr) + b.len * @sizeOf(B) <= @ptrToInt(a.ptr);
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
test "is_disjoint" {
|
|
163
|
-
const a = try std.testing.allocator.alignedAlloc(u8, @sizeOf(u32), 8 * @sizeOf(u32));
|
|
164
|
-
defer std.testing.allocator.free(a);
|
|
165
|
-
|
|
166
|
-
const b = try std.testing.allocator.alloc(u32, 8);
|
|
167
|
-
defer std.testing.allocator.free(b);
|
|
168
|
-
|
|
169
|
-
try std.testing.expectEqual(true, is_disjoint(u8, u32, a, b));
|
|
170
|
-
try std.testing.expectEqual(true, is_disjoint(u32, u8, b, a));
|
|
171
|
-
|
|
172
|
-
try std.testing.expectEqual(true, is_disjoint(u8, u8, a, a[0..0]));
|
|
173
|
-
try std.testing.expectEqual(true, is_disjoint(u32, u32, b, b[0..0]));
|
|
174
|
-
|
|
175
|
-
try std.testing.expectEqual(false, is_disjoint(u8, u8, a, a[0..1]));
|
|
176
|
-
try std.testing.expectEqual(false, is_disjoint(u8, u8, a, a[a.len - 1 .. a.len]));
|
|
177
|
-
|
|
178
|
-
try std.testing.expectEqual(false, is_disjoint(u32, u32, b, b[0..1]));
|
|
179
|
-
try std.testing.expectEqual(false, is_disjoint(u32, u32, b, b[b.len - 1 .. b.len]));
|
|
180
|
-
|
|
181
|
-
try std.testing.expectEqual(false, is_disjoint(u8, u32, a, std.mem.bytesAsSlice(u32, a)));
|
|
182
|
-
try std.testing.expectEqual(false, is_disjoint(u32, u8, b, std.mem.sliceAsBytes(b)));
|
|
183
|
-
}
|
|
184
|
-
|
|
185
163
|
test "ewah Word=u8 decode→encode→decode" {
|
|
186
164
|
try test_decode_with_word(u8);
|
|
187
165
|
|
|
@@ -35,15 +35,16 @@ pub fn main() !void {
|
|
|
35
35
|
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
|
|
36
36
|
defer arena.deinit();
|
|
37
37
|
|
|
38
|
+
const allocator = arena.allocator();
|
|
38
39
|
var i: usize = 0;
|
|
39
40
|
var bitsets: [samples][]usize = undefined;
|
|
40
41
|
var bitsets_encoded: [samples][]align(@alignOf(usize)) u8 = undefined;
|
|
41
42
|
var bitsets_decoded: [samples][]usize = undefined;
|
|
42
43
|
var bitset_lengths: [samples]usize = undefined;
|
|
43
44
|
while (i < samples) : (i += 1) {
|
|
44
|
-
bitsets[i] = try make_bitset(
|
|
45
|
-
bitsets_encoded[i] = try
|
|
46
|
-
bitsets_decoded[i] = try
|
|
45
|
+
bitsets[i] = try make_bitset(allocator, config);
|
|
46
|
+
bitsets_encoded[i] = try allocator.alignedAlloc(u8, @alignOf(usize), ewah.encode_size_max(bitsets[0].len));
|
|
47
|
+
bitsets_decoded[i] = try allocator.alloc(usize, config.words);
|
|
47
48
|
}
|
|
48
49
|
|
|
49
50
|
// Benchmark encoding.
|
|
@@ -96,16 +97,14 @@ pub fn main() !void {
|
|
|
96
97
|
}
|
|
97
98
|
}
|
|
98
99
|
|
|
99
|
-
fn make_bitset(allocator:
|
|
100
|
+
fn make_bitset(allocator: std.mem.Allocator, config: BitSetConfig) ![]usize {
|
|
100
101
|
var words = try allocator.alloc(usize, config.words);
|
|
101
102
|
var w: usize = 0;
|
|
102
|
-
var run: bool = true;
|
|
103
103
|
var literal: usize = 1;
|
|
104
104
|
while (w < words.len) : (w += 1) {
|
|
105
|
-
const
|
|
106
|
-
const
|
|
107
|
-
const
|
|
108
|
-
const run_bit = prng.random.boolean();
|
|
105
|
+
const run_length = prng.random().uintLessThan(usize, 2 * config.run_length_e);
|
|
106
|
+
const literals_length = prng.random().uintLessThan(usize, 2 * config.literals_length_e);
|
|
107
|
+
const run_bit = prng.random().boolean();
|
|
109
108
|
|
|
110
109
|
const run_end = std.math.min(w + run_length, words.len);
|
|
111
110
|
while (w < run_end) : (w += 1) {
|
|
@@ -26,7 +26,7 @@ pub const IO = struct {
|
|
|
26
26
|
const uts = std.os.uname();
|
|
27
27
|
const release = std.mem.sliceTo(&uts.release, 0);
|
|
28
28
|
const version = try std.builtin.Version.parse(release);
|
|
29
|
-
if (version.major
|
|
29
|
+
if (version.order(std.builtin.Version{ .major = 5, .minor = 5 }) == .lt) {
|
|
30
30
|
@panic("Linux kernel 5.5 or greater is required for io_uring OP_ACCEPT");
|
|
31
31
|
}
|
|
32
32
|
|
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
# Glossary
|
|
2
|
+
|
|
3
|
+
- _bar_/_measure_: `lsm_batch_multiple` beats; unit of incremental compaction.
|
|
4
|
+
- _beat_: `op % lsm_batch_multiple`; single step of an incremental compaction.
|
|
5
|
+
- _groove_: A collection of LSM trees, storing objects and their indices.
|
|
6
|
+
- _immutable table_: in-memory table; one per tree. Used to periodically flush the mutable table to
|
|
7
|
+
disk.
|
|
8
|
+
- _level_: Between `0` and `lsm_levels - 1` (usually `lsm_levels = 7`).
|
|
9
|
+
- _forest_: a collection of grooves.
|
|
10
|
+
- _manifest_: index of table and level metadata; one per tree.
|
|
11
|
+
- _mutable table_: in-memory table; one per tree. All tree updates are applied only to this table.
|
|
12
|
+
- _snapshot_: sequence number which selects the queryable partition of on-disk tables.
|
|
13
|
+
|
|
14
|
+
# Tree
|
|
15
|
+
## Tables
|
|
16
|
+
|
|
17
|
+
A tree is a hierarchy of in-memory and on-disk tables. There are three categories of tables:
|
|
18
|
+
|
|
19
|
+
- The [mutable table](table_mutable.zig) is an in-memory table.
|
|
20
|
+
- Each tree has a single mutable table.
|
|
21
|
+
- All tree updates, inserts, and removes are applied to the mutable table.
|
|
22
|
+
- The mutable table's size is allocated to accommodate a full bar of updates.
|
|
23
|
+
- The [immutable table](table_immutable.zig) is an in-memory table.
|
|
24
|
+
- Each tree has a single immutable table.
|
|
25
|
+
- The mutable table's contents are periodically moved to the immutable table,
|
|
26
|
+
where they are stored while being flushed to level `0`.
|
|
27
|
+
- Level `0` … level `config.lsm_levels - 1` each contain an exponentially increasing number of
|
|
28
|
+
immutable on-disk tables.
|
|
29
|
+
- Each tree has as many as `config.lsm_growth_factor ^ (level + 1)` tables per level.
|
|
30
|
+
(`config.lsm_growth_factor` is typically 8).
|
|
31
|
+
- Within a given level and snapshot, the tables' key ranges are [disjoint](manifest_level.zig).
|
|
32
|
+
|
|
33
|
+
## Compaction
|
|
34
|
+
|
|
35
|
+
Tree compaction runs to the sound of music!
|
|
36
|
+
|
|
37
|
+
Compacting LSM trees involves merging and moving tables into the next levels as needed.
|
|
38
|
+
To avoid write amplification stalls and bound latency, compaction is done incrementally.
|
|
39
|
+
|
|
40
|
+
A full compaction phase is denoted as a bar or measure, using terms from music notation.
|
|
41
|
+
Each bar consists of `lsm_batch_multiple` beats or "compaction ticks" of work.
|
|
42
|
+
A compaction tick executes asynchronously immediately after every commit, with
|
|
43
|
+
`beat = commit.op % lsm_batch_multiple`.
|
|
44
|
+
|
|
45
|
+
A bar is split in half according to the "first" beat and "middle" beat.
|
|
46
|
+
The first half of the bar compacts even levels while the latter compacts odd levels.
|
|
47
|
+
Mutable table changes are sorted and compacted into the immutable table.
|
|
48
|
+
The immutable table is compacted into level 0 during the odd level half of the bar.
|
|
49
|
+
|
|
50
|
+
At any given point, there are at most `⌈levels/2⌉` compactions running concurrently.
|
|
51
|
+
The source level is denoted as `level_a` and the target level as `level_b`.
|
|
52
|
+
The last level in the LSM tree has no target level so it is never a source level.
|
|
53
|
+
Each compaction compacts a [single table](#compaction-selection-policy) from `level_a` into all tables in
|
|
54
|
+
`level_b` which intersect the `level_a` table's key range.
|
|
55
|
+
|
|
56
|
+
Invariants:
|
|
57
|
+
* At the end of every beat, there is space in the mutable table for the next beat.
|
|
58
|
+
* The manifest log is compacted at the end of every beat.
|
|
59
|
+
* The compactions' output tables are not [visible](#snapshots-and-compaction) until the compaction has finished.
|
|
60
|
+
|
|
61
|
+
1. First half-bar, first beat ("first beat"):
|
|
62
|
+
* Assert no compactions are currently running.
|
|
63
|
+
* Allow the per-level table limits to overflow if needed (for example, if we may compact a table
|
|
64
|
+
from level `A` to level `B`, where level `B` is already full).
|
|
65
|
+
* Start compactions from even levels that have reached their table limit.
|
|
66
|
+
* Acquire reservations from the Free Set for all blocks (upper-bound) that will be written
|
|
67
|
+
during this half-bar.
|
|
68
|
+
|
|
69
|
+
2. First half-bar, last beat:
|
|
70
|
+
* Finish ticking any incomplete even-level compactions.
|
|
71
|
+
* Assert on callback completion that all compactions are complete.
|
|
72
|
+
* Release reservations from the Free Set.
|
|
73
|
+
|
|
74
|
+
3. Second half-bar, first beat ("middle beat"):
|
|
75
|
+
* Assert no compactions are currently running.
|
|
76
|
+
* Start compactions from odd levels that have reached their table limit.
|
|
77
|
+
* Compact the immutable table if it contains any sorted values (it might be empty).
|
|
78
|
+
* Acquire reservations from the Free Set for all blocks (upper-bound) that will be written
|
|
79
|
+
during this half-bar.
|
|
80
|
+
|
|
81
|
+
4. Second half-bar, last beat:
|
|
82
|
+
* Finish ticking any incomplete odd-level and immutable table compactions.
|
|
83
|
+
* Assert on callback completion that all compactions are complete.
|
|
84
|
+
* Assert on callback completion that no level's table count overflows.
|
|
85
|
+
* Flush, clear, and sort mutable table values into immutable table for next bar.
|
|
86
|
+
* Remove input tables that are invisible to all current and persisted snapshots.
|
|
87
|
+
* Release reservations from the Free Set.
|
|
88
|
+
|
|
89
|
+
### Compaction Selection Policy
|
|
90
|
+
|
|
91
|
+
Compaction targets the table from level `A` which overlaps the fewest tables of level `B`.
|
|
92
|
+
|
|
93
|
+
For example, in the following table (with `lsm_growth_factor=2`), each table is depicted as the range of keys it includes. The tables with uppercase letters would be chosen for compaction next.
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
Level 0 A─────────────H l───────────────────────────z
|
|
97
|
+
Level 1 a───────e L─M o───────s u───────y
|
|
98
|
+
Level 2 b───d e─────h i───k l───n o─p q───s u─v w─────z
|
|
99
|
+
(Keys) a b c d e f g h i j k l m n o p q r s t u v w x y z
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Links:
|
|
103
|
+
- [`Manifest.compaction_table`](manifest.zig)
|
|
104
|
+
- [Constructing and Analyzing the LSM Compaction Design Space](http://vldb.org/pvldb/vol14/p2216-sarkar.pdf) describes the tradeoffs of various data movement policies. TigerBeetle implements the "least overlapping with parent" policy.
|
|
105
|
+
- [Option of Compaction Priority](https://rocksdb.org/blog/2016/01/29/compaction_pri.html)
|
|
106
|
+
|
|
107
|
+
## Snapshots
|
|
108
|
+
|
|
109
|
+
Each table has a minimum and maximum integer snapshot (`snapshot_min` and `snapshot_max`).
|
|
110
|
+
|
|
111
|
+
Each query targets a particular snapshot. A table `T` is _visible_ to a snapshot `S` when
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
T.snapshot_min ≤ S ≤ T.snapshot_max
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
and is _invisible_ to the snapshot otherwise.
|
|
118
|
+
|
|
119
|
+
Compaction does not modify tables in place — it copies data. Snapshots control and distinguish
|
|
120
|
+
which copies are useful, and which can be deleted. Snapshots can also be persisted, enabling
|
|
121
|
+
queries against past states of the tree (unimplemented; future work).
|
|
122
|
+
|
|
123
|
+
### Snapshots and Compaction
|
|
124
|
+
|
|
125
|
+
Consider the half-bar compaction beginning at op=`X` (`12`), with `lsm_batch_multiple=M` (`8`).
|
|
126
|
+
Each half-bar contains `N=M/2` (`4`) beats. The next half-bar begins at `Y=X+N` (`16`).
|
|
127
|
+
|
|
128
|
+
During the half-bar compaction `X` (op=`X…Y-1`; `12…15`), each commit prefetches from the snapshot
|
|
129
|
+
[equal to its own op](#current-snapshot). As shown, they continue to query the old (input) tables.
|
|
130
|
+
|
|
131
|
+
During the half-bar compaction `X`:
|
|
132
|
+
- `snapshot_max` of each input table is truncated to `Y-1` (`15`).
|
|
133
|
+
- `snapshot_min` of each output table is initialized to `Y` (`16`).
|
|
134
|
+
|
|
135
|
+
```
|
|
136
|
+
0 4 8 12 16 20 24 (op, snapshot)
|
|
137
|
+
┼───┬───┼───┬───┼───┬───┼
|
|
138
|
+
####
|
|
139
|
+
····────────X────────···· (input tables, before compaction)
|
|
140
|
+
····──────────── (input tables, after compaction)
|
|
141
|
+
Y────···· (output tables, after compaction)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Beginning from the next op after the compaction (`Y`; `16`):
|
|
145
|
+
- The output tables of the above compaction `X` are visible.
|
|
146
|
+
- The input tables of the above compaction `X` are invisible.
|
|
147
|
+
- Therefore, a lookup reads from the output tables, but ignores the input tables.
|
|
148
|
+
- Callers must not query from the output tables of `X` before the compaction half-bar has finished
|
|
149
|
+
(i.e. before the end of beat `Y-1` (`15`)), since those tables are incomplete.
|
|
150
|
+
|
|
151
|
+
At this point the input tables can be removed if they are invisible to all persistent snapshots.
|
|
152
|
+
|
|
153
|
+
### Snapshot Queries
|
|
154
|
+
|
|
155
|
+
Each query targets a particular snapshot, either:
|
|
156
|
+
- the [current snapshot](#current-snapshot), or
|
|
157
|
+
- a [persisted snapshot](#persistent-snapshots).
|
|
158
|
+
|
|
159
|
+
#### Current Snapshot
|
|
160
|
+
|
|
161
|
+
Each tree tracks the highest snapshot safe to query from (`tree.lookup_snapshot_max`), to ensure that
|
|
162
|
+
an ongoing compaction's incomplete output tables are not visible. Queries targeting
|
|
163
|
+
`tree.lookup_snapshot_max` always read from the mutable and immutable tables — so each commit can
|
|
164
|
+
see all previous commits' updates.
|
|
165
|
+
|
|
166
|
+
During typical operation, the `lookup_snapshot_max` when prefetching op `S` is snapshot `S`.
|
|
167
|
+
The following chart depicts:
|
|
168
|
+
- `lookup_snapshot_max` (`$`)
|
|
169
|
+
- for each commit op (the left column)
|
|
170
|
+
- and a compaction that began at op `12` and completed at the end of op `15`.
|
|
171
|
+
|
|
172
|
+
```
|
|
173
|
+
op 0 4 8 12 16 20 24 (op, snapshot)
|
|
174
|
+
┼───┬───┼───┬───┼───┬───┼
|
|
175
|
+
12 ····────────$───
|
|
176
|
+
13 ····─────────$──
|
|
177
|
+
14 ····──────────$─
|
|
178
|
+
15 ····───────────$
|
|
179
|
+
16 $────····
|
|
180
|
+
17 ─$───····
|
|
181
|
+
18 ──$──····
|
|
182
|
+
19 ───$─····
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
However, commits in the first measure following recovery from a checkpoint prefetch from a higher
|
|
186
|
+
snapshot to avoid querying tables that were deleted at the checkpoint.
|
|
187
|
+
See [`lookup_snapshot_max_for_checkpoint()`](tree.zig) for more detail.
|
|
188
|
+
|
|
189
|
+
#### Persistent Snapshots
|
|
190
|
+
|
|
191
|
+
TODO(Persistent Snapshots): Expand this section.
|
|
192
|
+
|
|
193
|
+
### Snapshot Values
|
|
194
|
+
|
|
195
|
+
- The on-disk tables visible to a snapshot `B` do not contain the updates from the commit with op `B`.
|
|
196
|
+
- Rather, snapshot `B` is first visible to a prefetch from the commit with op `B`.
|
|
197
|
+
|
|
198
|
+
Consider the following diagram (`lsm_batch_multiple=8`):
|
|
199
|
+
|
|
200
|
+
```
|
|
201
|
+
0 4 8 12 16 20 24 28 (op, snapshot)
|
|
202
|
+
┼───┬───┼───┬───┼───┬───┼───┬
|
|
203
|
+
,,,,,,,,........
|
|
204
|
+
↑A ↑B ↑C
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
Compaction is driven by the commits of ops `B→C` (`16…23`). While these ops are being committed:
|
|
208
|
+
- Updates from ops `0→A` (`0…7`) are on-disk.
|
|
209
|
+
- Updates from ops `A→B` (`8…15`) are in the immutable table.
|
|
210
|
+
- These updates were moved to the immutable table from the mutable table at the end of op `B-1`
|
|
211
|
+
(`15`).
|
|
212
|
+
- These updates will exist in the immutable table until it is reset at the end of op `C-1` (`23`).
|
|
213
|
+
- Updates from ops `B→C` (`16…23`) are added to the mutable table (by the respective commit).
|
|
214
|
+
- `tree.lookup_snapshot_max` is `B` when committing op `B`.
|
|
215
|
+
- `tree.lookup_snapshot_max` is `x` when committing op `x` (for `x ∈ {16,17,…,23}`).
|
|
216
|
+
|
|
217
|
+
At the end of the last beat of the compaction bar (`23`):
|
|
218
|
+
- Updates from ops `0→B` (`0…15`) are on disk.
|
|
219
|
+
- Updates from ops `B→C` (`16…23`) are moved from the mutable table to the immutable table.
|
|
220
|
+
- `tree.lookup_snapshot_max` is `x` when committing op `x` (for `x ∈ {24,25,…}`).
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
## Manifest
|
|
224
|
+
|
|
225
|
+
The manifest is a tree's index of table locations and metadata.
|
|
226
|
+
(Not to be confused with the [SuperBlock Manifest](../vsr/README.md#manifest)).
|
|
227
|
+
|
|
228
|
+
Each manifest has two components:
|
|
229
|
+
- a single [`ManifestLog`](#manifest-log) shared by all levels, and
|
|
230
|
+
- one [`ManifestLevel`](#manifest-level) for each on-disk level.
|
|
231
|
+
|
|
232
|
+
### Manifest Log
|
|
233
|
+
|
|
234
|
+
The manifest log is an on-disk log of all updates to the tree's table index.
|
|
235
|
+
|
|
236
|
+
The manifest log tracks:
|
|
237
|
+
|
|
238
|
+
- tables created as compaction output
|
|
239
|
+
- tables updated as compaction input (modifying their `snapshot_max`)
|
|
240
|
+
- tables moved between levels by compaction
|
|
241
|
+
- tables deleted after compaction
|
|
242
|
+
|
|
243
|
+
Updates are accumulated in-memory before being flushed:
|
|
244
|
+
|
|
245
|
+
- incrementally during compaction, or
|
|
246
|
+
- in their entirety during checkpoint.
|
|
247
|
+
|
|
248
|
+
The manifest log is periodically compacted to remove older entries that have been superseded by
|
|
249
|
+
newer entries. For example, if a table is created and later deleted, manifest log compaction
|
|
250
|
+
will eventually remove any reference to the table from the log blocks.
|
|
251
|
+
|
|
252
|
+
All manifest log blocks are tracked in the superblock manifest.
|
|
253
|
+
|
|
254
|
+
### Manifest Level
|
|
255
|
+
|
|
256
|
+
A `ManifestLevel` is an in-memory collection of the table metadata for a single level of a tree.
|
|
257
|
+
|
|
258
|
+
For a given level and snapshot, there may be gaps in the key ranges of the visible tables,
|
|
259
|
+
but the key ranges are disjoint.
|
|
260
|
+
|
|
261
|
+
Manifest levels are queried for tables at a target snapshot and within a key range.
|
|
262
|
+
|
|
263
|
+
#### Example
|
|
264
|
+
|
|
265
|
+
Given the `ManifestLevel` tables (with values chosen for visualization, not realism):
|
|
266
|
+
|
|
267
|
+
label A B C D E F G H I J K L M
|
|
268
|
+
key_min 0 4 12 16 4 8 12 26 4 25 4 16 24
|
|
269
|
+
key_max 3 11 15 19 7 11 15 27 7 27 11 19 27
|
|
270
|
+
snapshot_min 1 1 1 1 3 3 3 3 5 5 7 7 7
|
|
271
|
+
snapshot_max 9 3 3 7 5 7 9 5 7 7 9 9 9
|
|
272
|
+
|
|
273
|
+
A level's tables can be visualized in 2D as a partitioned rectangle:
|
|
274
|
+
|
|
275
|
+
0 1 2
|
|
276
|
+
0 4 8 2 6 0 4 8
|
|
277
|
+
9┌───┬───────┬───┬───┬───┬───┐
|
|
278
|
+
│ │ K │ │ L │###│ M │
|
|
279
|
+
7│ ├───┬───┤ ├───┤###└┬──┤
|
|
280
|
+
│ │ I │ │ G │ │####│ J│
|
|
281
|
+
5│ A ├───┤ F │ │ │####└┬─┤
|
|
282
|
+
│ │ E │ │ │ D │#####│H│
|
|
283
|
+
3│ ├───┴───┼───┤ │#####└─┤
|
|
284
|
+
│ │ B │ C │ │#######│
|
|
285
|
+
1└───┴───────┴───┴───┴───────┘
|
|
286
|
+
|
|
287
|
+
Example iterations:
|
|
288
|
+
|
|
289
|
+
visibility snapshots direction key_min key_max tables
|
|
290
|
+
visible 2 ascending 0 28 A, B, C, D
|
|
291
|
+
visible 4 ascending 0 28 A, E, F, G, D, H
|
|
292
|
+
visible 6 descending 12 28 J, D, G
|
|
293
|
+
visible 8 ascending 0 28 A, K, G, L, M
|
|
294
|
+
invisible 2, 4, 6 ascending 0 28 K, L, M
|
|
295
|
+
|
|
296
|
+
Legend:
|
|
297
|
+
|
|
298
|
+
- `#` represents a gap — no tables cover these keys during the snapshot.
|
|
299
|
+
- The horizontal axis represents the key range.
|
|
300
|
+
- The vertical axis represents the snapshot range.
|
|
301
|
+
- Each rectangle is a table within the manifest level.
|
|
302
|
+
- The sides of each rectangle depict:
|
|
303
|
+
- left: `table.key_min` (the diagram is inclusive, and the `table.key_min` is inclusive)
|
|
304
|
+
- right: `table.key_max` (the diagram is EXCLUSIVE, but the `table.key_max` is INCLUSIVE)
|
|
305
|
+
- bottom: `table.snapshot_min` (inclusive)
|
|
306
|
+
- top: `table.snapshot_max` (inclusive)
|
|
307
|
+
- (Not depicted: tables may have `table.key_min == table.key_max`.)
|
|
308
|
+
- (Not depicted: the newest set of tables would have `table.snapshot_max == maxInt(u64)`.)
|