tigerbeetle-node 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/dist/index.d.ts +66 -61
- package/dist/index.js +66 -61
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/index.ts +5 -0
- package/src/node.zig +17 -18
- package/src/tigerbeetle/scripts/benchmark.bat +4 -3
- package/src/tigerbeetle/scripts/benchmark.sh +25 -10
- package/src/tigerbeetle/scripts/install.sh +2 -1
- package/src/tigerbeetle/scripts/install_zig.sh +14 -18
- package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +12 -3
- package/src/tigerbeetle/scripts/vopr.sh +5 -5
- package/src/tigerbeetle/src/benchmark.zig +17 -9
- package/src/tigerbeetle/src/benchmark_array_search.zig +317 -0
- package/src/tigerbeetle/src/benchmarks/perf.zig +299 -0
- package/src/tigerbeetle/src/c/tb_client/context.zig +103 -0
- package/src/tigerbeetle/src/c/tb_client/packet.zig +80 -0
- package/src/tigerbeetle/src/c/tb_client/signal.zig +288 -0
- package/src/tigerbeetle/src/c/tb_client/thread.zig +329 -0
- package/src/tigerbeetle/src/c/tb_client.h +201 -0
- package/src/tigerbeetle/src/c/tb_client.zig +101 -0
- package/src/tigerbeetle/src/c/test.zig +1 -0
- package/src/tigerbeetle/src/cli.zig +142 -83
- package/src/tigerbeetle/src/config.zig +119 -10
- package/src/tigerbeetle/src/demo.zig +12 -8
- package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +2 -2
- package/src/tigerbeetle/src/ewah.zig +318 -0
- package/src/tigerbeetle/src/ewah_benchmark.zig +121 -0
- package/src/tigerbeetle/src/eytzinger_benchmark.zig +317 -0
- package/src/tigerbeetle/src/fifo.zig +17 -1
- package/src/tigerbeetle/src/io/darwin.zig +12 -10
- package/src/tigerbeetle/src/io/linux.zig +25 -9
- package/src/tigerbeetle/src/io/windows.zig +13 -9
- package/src/tigerbeetle/src/iops.zig +101 -0
- package/src/tigerbeetle/src/lsm/binary_search.zig +214 -0
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +82 -0
- package/src/tigerbeetle/src/lsm/compaction.zig +603 -0
- package/src/tigerbeetle/src/lsm/composite_key.zig +75 -0
- package/src/tigerbeetle/src/lsm/direction.zig +11 -0
- package/src/tigerbeetle/src/lsm/eytzinger.zig +587 -0
- package/src/tigerbeetle/src/lsm/forest.zig +630 -0
- package/src/tigerbeetle/src/lsm/grid.zig +473 -0
- package/src/tigerbeetle/src/lsm/groove.zig +939 -0
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +452 -0
- package/src/tigerbeetle/src/lsm/level_iterator.zig +296 -0
- package/src/tigerbeetle/src/lsm/manifest.zig +680 -0
- package/src/tigerbeetle/src/lsm/manifest_level.zig +1169 -0
- package/src/tigerbeetle/src/lsm/manifest_log.zig +904 -0
- package/src/tigerbeetle/src/lsm/node_pool.zig +231 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +399 -0
- package/src/tigerbeetle/src/lsm/segmented_array.zig +998 -0
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +844 -0
- package/src/tigerbeetle/src/lsm/table.zig +932 -0
- package/src/tigerbeetle/src/lsm/table_immutable.zig +196 -0
- package/src/tigerbeetle/src/lsm/table_iterator.zig +295 -0
- package/src/tigerbeetle/src/lsm/table_mutable.zig +123 -0
- package/src/tigerbeetle/src/lsm/test.zig +429 -0
- package/src/tigerbeetle/src/lsm/tree.zig +1085 -0
- package/src/tigerbeetle/src/main.zig +119 -109
- package/src/tigerbeetle/src/message_bus.zig +49 -48
- package/src/tigerbeetle/src/message_pool.zig +15 -2
- package/src/tigerbeetle/src/ring_buffer.zig +126 -30
- package/src/tigerbeetle/src/simulator.zig +76 -44
- package/src/tigerbeetle/src/state_machine.zig +1022 -585
- package/src/tigerbeetle/src/storage.zig +46 -16
- package/src/tigerbeetle/src/test/cluster.zig +109 -63
- package/src/tigerbeetle/src/test/message_bus.zig +15 -24
- package/src/tigerbeetle/src/test/network.zig +26 -17
- package/src/tigerbeetle/src/test/state_checker.zig +7 -5
- package/src/tigerbeetle/src/test/state_machine.zig +159 -69
- package/src/tigerbeetle/src/test/storage.zig +57 -28
- package/src/tigerbeetle/src/tigerbeetle.zig +5 -0
- package/src/tigerbeetle/src/unit_tests.zig +8 -0
- package/src/tigerbeetle/src/util.zig +51 -0
- package/src/tigerbeetle/src/vsr/client.zig +21 -7
- package/src/tigerbeetle/src/vsr/journal.zig +154 -167
- package/src/tigerbeetle/src/vsr/replica.zig +744 -226
- package/src/tigerbeetle/src/vsr/superblock.zig +1743 -0
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +258 -0
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +644 -0
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +546 -0
- package/src/tigerbeetle/src/vsr.zig +43 -115
|
@@ -0,0 +1,1085 @@
|
|
|
1
|
+
const std = @import("std");
|
|
2
|
+
const builtin = @import("builtin");
|
|
3
|
+
const assert = std.debug.assert;
|
|
4
|
+
const math = std.math;
|
|
5
|
+
const mem = std.mem;
|
|
6
|
+
const os = std.os;
|
|
7
|
+
|
|
8
|
+
const log = std.log.scoped(.tree);
|
|
9
|
+
|
|
10
|
+
const config = @import("../config.zig");
|
|
11
|
+
const div_ceil = @import("../util.zig").div_ceil;
|
|
12
|
+
const eytzinger = @import("eytzinger.zig").eytzinger;
|
|
13
|
+
const vsr = @import("../vsr.zig");
|
|
14
|
+
const binary_search = @import("binary_search.zig");
|
|
15
|
+
const bloom_filter = @import("bloom_filter.zig");
|
|
16
|
+
|
|
17
|
+
const CompositeKey = @import("composite_key.zig").CompositeKey;
|
|
18
|
+
const NodePool = @import("node_pool.zig").NodePool(config.lsm_manifest_node_size, 16);
|
|
19
|
+
const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
|
|
20
|
+
const SuperBlockType = vsr.SuperBlockType;
|
|
21
|
+
|
|
22
|
+
/// We reserve maxInt(u64) to indicate that a table has not been deleted.
|
|
23
|
+
/// Tables that have not been deleted have snapshot_max of maxInt(u64).
|
|
24
|
+
/// Since we ensure and assert that a query snapshot never exactly matches
|
|
25
|
+
/// the snapshot_min/snapshot_max of a table, we must use maxInt(u64) - 1
|
|
26
|
+
/// to query all non-deleted tables.
|
|
27
|
+
pub const snapshot_latest: u64 = math.maxInt(u64) - 1;
|
|
28
|
+
|
|
29
|
+
// StateMachine:
|
|
30
|
+
//
|
|
31
|
+
// /// state machine will pass this on to all object stores
|
|
32
|
+
// /// Read I/O only
|
|
33
|
+
// pub fn read(batch, callback) void
|
|
34
|
+
//
|
|
35
|
+
// /// write the ops in batch to the memtable/objcache, previously called commit()
|
|
36
|
+
// pub fn write(batch) void
|
|
37
|
+
//
|
|
38
|
+
// /// Flush in-memory state to disk, perform merges, etc
|
|
39
|
+
// /// Only function that triggers Write I/O in LSMs, as well as some Read
|
|
40
|
+
// /// Make as incremental as possible, don't block the main thread, avoid high latency/spikes
|
|
41
|
+
// pub fn flush(callback) void
|
|
42
|
+
//
|
|
43
|
+
// /// Write manifest info for all object stores into buffer
|
|
44
|
+
// pub fn encode_superblock(buffer) void
|
|
45
|
+
//
|
|
46
|
+
// /// Restore all in-memory state from the superblock data
|
|
47
|
+
// pub fn decode_superblock(buffer) void
|
|
48
|
+
//
|
|
49
|
+
|
|
50
|
+
pub const table_count_max = table_count_max_for_tree(config.lsm_growth_factor, config.lsm_levels);
|
|
51
|
+
|
|
52
|
+
pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name: []const u8) type {
|
|
53
|
+
const Key = Table.Key;
|
|
54
|
+
const Value = Table.Value;
|
|
55
|
+
const compare_keys = Table.compare_keys;
|
|
56
|
+
const tombstone = Table.tombstone;
|
|
57
|
+
const tombstone_from_key = Table.tombstone_from_key;
|
|
58
|
+
|
|
59
|
+
const tree_hash = blk: {
|
|
60
|
+
// The Blake3 hash does a lot of work at comptime.
|
|
61
|
+
@setEvalBranchQuota(tree_name.len * 1024);
|
|
62
|
+
|
|
63
|
+
var hash: u256 = undefined;
|
|
64
|
+
std.crypto.hash.Blake3.hash(tree_name, std.mem.asBytes(&hash), .{});
|
|
65
|
+
break :blk @truncate(u128, hash);
|
|
66
|
+
};
|
|
67
|
+
|
|
68
|
+
return struct {
|
|
69
|
+
const Tree = @This();
|
|
70
|
+
|
|
71
|
+
// Expose the Table & hash for the Groove.
|
|
72
|
+
pub const TableType = Table;
|
|
73
|
+
pub const name = tree_name;
|
|
74
|
+
pub const hash = tree_hash;
|
|
75
|
+
|
|
76
|
+
const Grid = @import("grid.zig").GridType(Storage);
|
|
77
|
+
const Manifest = @import("manifest.zig").ManifestType(Table, Storage);
|
|
78
|
+
const TableMutable = @import("table_mutable.zig").TableMutableType(Table);
|
|
79
|
+
const TableImmutable = @import("table_immutable.zig").TableImmutableType(Table);
|
|
80
|
+
|
|
81
|
+
const CompactionType = @import("compaction.zig").CompactionType;
|
|
82
|
+
const TableIteratorType = @import("table_iterator.zig").TableIteratorType;
|
|
83
|
+
const TableImmutableIteratorType = @import("table_immutable.zig").TableImmutableIteratorType;
|
|
84
|
+
|
|
85
|
+
pub const ValueCache = std.HashMapUnmanaged(Value, void, Table.HashMapContextValue, 70);
|
|
86
|
+
|
|
87
|
+
const CompactionTable = CompactionType(Table, Storage, TableIteratorType);
|
|
88
|
+
const CompactionTableImmutable = CompactionType(Table, Storage, TableImmutableIteratorType);
|
|
89
|
+
|
|
90
|
+
grid: *Grid,
|
|
91
|
+
options: Options,
|
|
92
|
+
|
|
93
|
+
/// TODO(ifreund) Replace this with SetAssociativeCache:
|
|
94
|
+
/// A set associative cache of values shared by trees with the same key/value sizes.
|
|
95
|
+
/// This is used to accelerate point lookups and is not used for range queries.
|
|
96
|
+
/// Secondary index trees used only for range queries can therefore set this to null.
|
|
97
|
+
/// The value type will be []u8 and this will be shared by trees with the same value size.
|
|
98
|
+
value_cache: ?*ValueCache,
|
|
99
|
+
|
|
100
|
+
table_mutable: TableMutable,
|
|
101
|
+
table_immutable: TableImmutable,
|
|
102
|
+
|
|
103
|
+
manifest: Manifest,
|
|
104
|
+
|
|
105
|
+
compaction_table_immutable: CompactionTableImmutable,
|
|
106
|
+
|
|
107
|
+
/// The number of Compaction instances is divided by two as, at any given compaction tick,
|
|
108
|
+
/// we're only compacting either even or odd levels but never both.
|
|
109
|
+
/// Uses divFloor as the last level, even with odd lsm_levels, doesn't compact to anything.
|
|
110
|
+
/// (e.g. floor(5/2) = 2 for levels 0->1, 2->3 when even and immut->0, 1->2, 3->4 when odd).
|
|
111
|
+
/// This means, that for odd lsm_levels, the last CompactionTable is unused.
|
|
112
|
+
compaction_table: [@divFloor(config.lsm_levels, 2)]CompactionTable,
|
|
113
|
+
|
|
114
|
+
compaction_op: u64,
|
|
115
|
+
compaction_io_pending: usize,
|
|
116
|
+
compaction_callback: ?fn (*Tree) void,
|
|
117
|
+
|
|
118
|
+
checkpoint_callback: ?fn (*Tree) void,
|
|
119
|
+
open_callback: ?fn (*Tree) void,
|
|
120
|
+
|
|
121
|
+
/// Runtime configuration for a Tree, supplied to `init`.
pub const Options = struct {
    /// The maximum number of keys that may be committed per batch.
    /// Used to size both the mutable and immutable tables (see `init`).
    commit_count_max: u32,
};
|
|
125
|
+
|
|
126
|
+
/// Initialize a Tree: allocates the mutable/immutable tables, the manifest,
/// and one Compaction instance per compacting level pair.
/// On any failure, everything allocated so far is freed via the errdefer chain.
/// The returned Tree does not own `node_pool`, `grid`, or `value_cache`.
pub fn init(
    allocator: mem.Allocator,
    node_pool: *NodePool,
    grid: *Grid,
    value_cache: ?*ValueCache,
    options: Options,
) !Tree {
    var table_mutable = try TableMutable.init(allocator, options.commit_count_max);
    errdefer table_mutable.deinit(allocator);

    var table_immutable = try TableImmutable.init(allocator, options.commit_count_max);
    errdefer table_immutable.deinit(allocator);

    // The manifest is keyed by this tree's comptime-derived hash of tree_name.
    var manifest = try Manifest.init(allocator, node_pool, grid, tree_hash);
    errdefer manifest.deinit(allocator);

    var compaction_table_immutable = try CompactionTableImmutable.init(allocator);
    errdefer compaction_table_immutable.deinit(allocator);

    var compaction_table: [@divFloor(config.lsm_levels, 2)]CompactionTable = undefined;
    for (compaction_table) |*compaction, i| {
        // If this iteration's init fails, deinit only the instances already
        // initialized in earlier iterations ([0..i]).
        errdefer for (compaction_table[0..i]) |*c| c.deinit(allocator);
        compaction.* = try CompactionTable.init(allocator);
    }
    errdefer for (compaction_table) |*c| c.deinit(allocator);

    return Tree{
        .grid = grid,
        .options = options,
        .value_cache = value_cache,
        .table_mutable = table_mutable,
        .table_immutable = table_immutable,
        .manifest = manifest,
        .compaction_table_immutable = compaction_table_immutable,
        .compaction_table = compaction_table,
        .compaction_op = 0,
        .compaction_io_pending = 0,
        .compaction_callback = null,
        .checkpoint_callback = null,
        .open_callback = null,
    };
}
|
|
168
|
+
|
|
169
|
+
/// Free all memory owned by the tree. Inverse of `init`.
/// Does not free the grid, node pool, or value cache (not owned by the tree).
pub fn deinit(tree: *Tree, allocator: mem.Allocator) void {
    tree.compaction_table_immutable.deinit(allocator);
    for (tree.compaction_table) |*compaction| compaction.deinit(allocator);

    // TODO Consider whether we should release blocks acquired from Grid.block_free_set.
    tree.table_mutable.deinit(allocator);
    tree.table_immutable.deinit(allocator);
    tree.manifest.deinit(allocator);
}
|
|
178
|
+
|
|
179
|
+
/// Get a cached value/tombstone for the given key.
/// Returns null if no value/tombstone for the given key is cached.
/// The mutable table is consulted first, then the shared value cache.
/// NOTE(review): `tree.value_cache.?` is dereferenced unconditionally here —
/// this must only be called on trees constructed with a non-null value cache.
pub fn get_cached(tree: *const Tree, key: Key) ?*const Value {
    const value = tree.table_mutable.get(key) orelse
        tree.value_cache.?.getKeyPtr(tombstone_from_key(key));

    return value;
}
|
|
187
|
+
|
|
188
|
+
/// Add a value to the mutable in-memory table.
/// The value reaches disk later, via the compaction cycle.
pub fn put(tree: *Tree, value: *const Value) void {
    tree.table_mutable.put(value);
}
|
|
191
|
+
|
|
192
|
+
/// Record a removal in the mutable in-memory table.
pub fn remove(tree: *Tree, value: *const Value) void {
    tree.table_mutable.remove(value);
}
|
|
195
|
+
|
|
196
|
+
/// Asynchronously look up the value for `key` as of `snapshot`.
/// In-memory tables (and, for the latest snapshot, the value cache) are
/// checked synchronously; otherwise candidate tables are collected from the
/// manifest and index/filter/data blocks are read from the grid.
/// `callback` is invoked exactly once, possibly before this function returns.
/// `context` must remain valid until the callback fires.
pub fn lookup(
    tree: *Tree,
    callback: fn (*LookupContext, ?*const Value) void,
    context: *LookupContext,
    snapshot: u64,
    key: Key,
) void {
    assert(snapshot <= snapshot_latest);
    if (snapshot == snapshot_latest) {
        // The mutable table is converted to an immutable table when a snapshot is created.
        // This means that a snapshot will never be able to see the mutable table.
        // This simplifies the mutable table and eliminates compaction for duplicate puts.
        // The value cache is only used for the latest snapshot for simplicity.
        // Earlier snapshots will still be able to utilize the block cache.
        if (tree.table_mutable.get(key) orelse
            tree.value_cache.?.getKeyPtr(tombstone_from_key(key))) |value|
        {
            callback(context, unwrap_tombstone(value));
            return;
        }
    }

    // Check the immutable table only if it holds values and was created
    // before `snapshot` (snapshot_min strictly precedes the query snapshot).
    if (!tree.table_immutable.free and tree.table_immutable.snapshot_min < snapshot) {
        if (tree.table_immutable.get(key)) |value| {
            callback(context, unwrap_tombstone(value));
            return;
        }
    }

    // Collect, from the manifest, the candidate tables (at most one per level)
    // whose key range covers `key` and which are visible at `snapshot`.
    var index_block_count: u8 = 0;
    var index_block_addresses: [config.lsm_levels]u64 = undefined;
    var index_block_checksums: [config.lsm_levels]u128 = undefined;
    {
        var it = tree.manifest.lookup(snapshot, key);
        while (it.next()) |table| : (index_block_count += 1) {
            assert(table.visible(snapshot));
            assert(compare_keys(table.key_min, key) != .gt);
            assert(compare_keys(table.key_max, key) != .lt);

            index_block_addresses[index_block_count] = table.address;
            index_block_checksums[index_block_count] = table.checksum;
        }
    }

    if (index_block_count == 0) {
        callback(context, null);
        return;
    }

    // Hash the key to the fingerprint only once and reuse for all bloom filter checks.
    const fingerprint = bloom_filter.Fingerprint.create(mem.asBytes(&key));

    context.* = .{
        .tree = tree,
        .completion = undefined,

        .key = key,
        .fingerprint = fingerprint,

        .index_block_count = index_block_count,
        .index_block_addresses = index_block_addresses,
        .index_block_checksums = index_block_checksums,

        .callback = callback,
    };

    // Begin the asynchronous read chain: index block -> filter block -> data block.
    context.read_index_block();
}
|
|
264
|
+
|
|
265
|
+
/// Per-lookup state threaded through the asynchronous grid reads.
/// For each candidate level: read index block -> read filter block ->
/// (maybe) read data block; advance to the next level on a miss.
pub const LookupContext = struct {
    const Read = Grid.Read;
    const BlockPtrConst = Grid.BlockPtrConst;

    tree: *Tree,
    // Grid read completion; @fieldParentPtr recovers the context in callbacks.
    completion: Read,

    key: Key,
    // Computed once in lookup() and reused for every level's filter check.
    fingerprint: bloom_filter.Fingerprint,

    /// This value is an index into the index_block_addresses/checksums arrays.
    index_block: u8 = 0,
    index_block_count: u8,
    index_block_addresses: [config.lsm_levels]u64,
    index_block_checksums: [config.lsm_levels]u128,

    // Address/checksum of the data block resolved from the current level's
    // index block; null until read_index_block_callback sets it.
    data_block: ?struct {
        address: u64,
        checksum: u128,
    } = null,

    callback: fn (*Tree.LookupContext, ?*const Value) void,

    /// Invoke the user callback exactly once with the final result.
    /// The context is set to undefined BEFORE the callback runs: the callback
    /// receives a pointer to invalidated state and must not read it.
    fn finish(context: *LookupContext, value: ?*const Value) void {
        const callback = context.callback;
        context.* = undefined;
        callback(context, value);
    }

    /// Issue the asynchronous read for the current level's index block.
    fn read_index_block(context: *LookupContext) void {
        assert(context.data_block == null);
        assert(context.index_block < context.index_block_count);
        assert(context.index_block_count > 0);
        assert(context.index_block_count <= config.lsm_levels);

        context.tree.grid.read_block(
            read_index_block_callback,
            &context.completion,
            context.index_block_addresses[context.index_block],
            context.index_block_checksums[context.index_block],
        );
    }

    /// Resolve the data/filter block addresses for the key from the index
    /// block, then read the filter block.
    fn read_index_block_callback(completion: *Read, index_block: BlockPtrConst) void {
        const context = @fieldParentPtr(LookupContext, "completion", completion);
        assert(context.data_block == null);
        assert(context.index_block < context.index_block_count);
        assert(context.index_block_count > 0);
        assert(context.index_block_count <= config.lsm_levels);

        const blocks = Table.index_blocks_for_key(index_block, context.key);

        context.data_block = .{
            .address = blocks.data_block_address,
            .checksum = blocks.data_block_checksum,
        };

        context.tree.grid.read_block(
            read_filter_block_callback,
            completion,
            blocks.filter_block_address,
            blocks.filter_block_checksum,
        );
    }

    /// Consult the bloom filter: read the data block only if the key may be
    /// present; otherwise skip straight to the next level.
    fn read_filter_block_callback(completion: *Read, filter_block: BlockPtrConst) void {
        const context = @fieldParentPtr(LookupContext, "completion", completion);
        assert(context.data_block != null);
        assert(context.index_block < context.index_block_count);
        assert(context.index_block_count > 0);
        assert(context.index_block_count <= config.lsm_levels);

        const filter_bytes = Table.filter_block_filter_const(filter_block);
        if (bloom_filter.may_contain(context.fingerprint, filter_bytes)) {
            context.tree.grid.read_block(
                read_data_block_callback,
                completion,
                context.data_block.?.address,
                context.data_block.?.checksum,
            );
        } else {
            // The key is not present in this table, check the next level.
            context.advance_to_next_level();
        }
    }

    /// Search the data block for the key; finish on a hit (tombstones are
    /// unwrapped to null), otherwise try the next level.
    fn read_data_block_callback(completion: *Read, data_block: BlockPtrConst) void {
        const context = @fieldParentPtr(LookupContext, "completion", completion);
        assert(context.data_block != null);
        assert(context.index_block < context.index_block_count);
        assert(context.index_block_count > 0);
        assert(context.index_block_count <= config.lsm_levels);

        if (Table.data_block_search(data_block, context.key)) |value| {
            context.finish(unwrap_tombstone(value));
        } else {
            // The key is not present in this table, check the next level.
            context.advance_to_next_level();
        }
    }

    /// Move on to the next candidate level, or finish with null when all
    /// candidate tables have been exhausted.
    fn advance_to_next_level(context: *LookupContext) void {
        assert(context.data_block != null);
        assert(context.index_block < context.index_block_count);
        assert(context.index_block_count > 0);
        assert(context.index_block_count <= config.lsm_levels);

        context.index_block += 1;
        if (context.index_block == context.index_block_count) {
            context.finish(null);
            return;
        }
        assert(context.index_block < context.index_block_count);

        context.data_block = null;
        context.read_index_block();
    }
};
|
|
383
|
+
|
|
384
|
+
/// Returns null if the value is null or a tombstone, otherwise returns the value.
/// We use tombstone values internally, but expose them as null to the user.
/// This distinction enables us to cache a null result as a tombstone in our hash maps.
inline fn unwrap_tombstone(value: ?*const Value) ?*const Value {
    const unwrapped = value orelse return null;
    if (tombstone(unwrapped)) return null;
    return unwrapped;
}
|
|
390
|
+
|
|
391
|
+
/// Asynchronously open the tree by opening its manifest.
/// `callback` fires (via manifest_open_callback) once the manifest is open.
/// Only one open may be in flight at a time (asserted).
pub fn open(tree: *Tree, callback: fn (*Tree) void) void {
    assert(tree.open_callback == null);
    tree.open_callback = callback;

    tree.manifest.open(manifest_open_callback);
}
|
|
397
|
+
|
|
398
|
+
/// Completion for Manifest.open: recovers the owning Tree from the embedded
/// manifest field, then clears and invokes the stored open_callback.
fn manifest_open_callback(manifest: *Manifest) void {
    const tree = @fieldParentPtr(Tree, "manifest", manifest);
    assert(tree.open_callback != null);

    // Clear before invoking so a subsequent open() passes the null assert.
    const callback = tree.open_callback.?;
    tree.open_callback = null;
    callback(tree);
}
|
|
406
|
+
|
|
407
|
+
// Tree compaction runs to the sound of music!
|
|
408
|
+
//
|
|
409
|
+
// Compacting LSM trees involves merging and moving tables into the next levels as needed.
|
|
410
|
+
// To avoid write amplification stalls and bound latency, compaction is done incrementally.
|
|
411
|
+
//
|
|
412
|
+
// A full compaction phase is denoted as a bar or measure, using terms from music notation.
|
|
413
|
+
// Each measure consists of `lsm_batch_multiple` beats or "compaction ticks" of work.
|
|
414
|
+
// A compaction beat is started asynchronously with `compact_io` which takes a callback.
|
|
415
|
+
// After `compact_io` is called, `compact_cpu` should be called to enable pipelining.
|
|
416
|
+
// The compaction beat completes when the `compact_io` callback is invoked.
|
|
417
|
+
//
|
|
418
|
+
// A measure is split in half according to the "first" down beat and "middle" down beat.
|
|
419
|
+
// The first half of the measure compacts even levels while the latter compacts odd levels.
|
|
420
|
+
// Mutable table changes are sorted and compacted into the immutable table.
|
|
421
|
+
// The immutable table is compacted into level 0 during the odd level half of the measure.
|
|
422
|
+
//
|
|
423
|
+
// At any given point, there's only levels/2 max compactions happening concurrently.
|
|
424
|
+
// The source level is denoted as `level_a` with the target level being `level_b`.
|
|
425
|
+
// The last level in the LSM tree has no target level so it's not compaction-from.
|
|
426
|
+
//
|
|
427
|
+
// Assuming a measure/`lsm_batch_multiple` of 4, the invariants can be described as follows:
|
|
428
|
+
// * assert: at the end of every beat, there's space in mutable table for the next beat.
|
|
429
|
+
// * manifest info for the compacted tables is updated during the compaction.
|
|
430
|
+
// * manifest is compacted at the end of every beat.
|
|
431
|
+
//
|
|
432
|
+
// - (first) down beat of the measure:
|
|
433
|
+
// * assert: no compactions are currently running.
|
|
434
|
+
// * compact immutable table if contains any sorted values (could be empty).
|
|
435
|
+
// * allow level visible table counts to overflow if needed.
|
|
436
|
+
// * start even level compactions if there's any tables to compact.
|
|
437
|
+
//
|
|
438
|
+
// - (second) up beat of the measure:
|
|
439
|
+
// * finish ticking running even-level compactions.
|
|
440
|
+
// * assert: on callback completion, all compactions should be completed.
|
|
441
|
+
//
|
|
442
|
+
// - (third) down beat of the measure:
|
|
443
|
+
// * assert: no compactions are currently running.
|
|
444
|
+
// * start odd level and immutable table compactions.
|
|
445
|
+
//
|
|
446
|
+
// - (fourth) last beat of the measure:
|
|
447
|
+
// * finish ticking running odd-level and immutable table compactions.
|
|
448
|
+
// * assert: on callback completion, all compactions should be completed.
|
|
449
|
+
// * assert: on callback completion, all level visible table counts shouldn't overflow.
|
|
450
|
+
// * flush, clear, and sort mutable table values into immutable table for next measure.
|
|
451
|
+
|
|
452
|
+
const half_measure_beat_count = @divExact(config.lsm_batch_multiple, 2);
|
|
453
|
+
|
|
454
|
+
/// One table compaction job: merge tables from level_a into level_b
/// (level_b == level_a + 1) using the given Compaction instance.
const CompactionTableContext = struct {
    compaction: *CompactionTable,
    level_a: u8,
    level_b: u8,
};
|
|
459
|
+
|
|
460
|
+
/// Iterates the (level_a, level_b) pairs due in the current half-measure:
/// even source levels (0->1, 2->3, ...) during the first half,
/// odd source levels (1->2, 3->4, ...) during the second.
const CompactionTableIterator = struct {
    tree: *Tree,
    index: u8 = 0, // Index into tree.compaction_table.

    fn next(it: *CompactionTableIterator) ?CompactionTableContext {
        // Only valid while a compaction beat is in progress.
        assert(it.tree.compaction_callback != null);

        const compaction_beat = it.tree.compaction_op % config.lsm_batch_multiple;
        const even_levels = compaction_beat < half_measure_beat_count;
        // Odd half-measure shifts the source level by one.
        const level_a = (it.index * 2) + @boolToInt(!even_levels);
        const level_b = level_a + 1;

        // The last level has no next level to compact into.
        if (level_a >= config.lsm_levels - 1) return null;
        assert(level_b < config.lsm_levels);

        defer it.index += 1;
        return CompactionTableContext{
            .compaction = &it.tree.compaction_table[it.index],
            .level_a = level_a,
            .level_b = level_b,
        };
    }
};
|
|
483
|
+
|
|
484
|
+
/// Since concurrent compactions into and out of a level may contend for the same range:
///
/// 1. compact level 0 to 1, level 2 to 3, level 4 to 5 etc., and then
/// 2. compact the immutable table to level 0, level 1 to 2, level 3 to 4 etc.
///
/// This order (even levels, then odd levels) is significant, since it reduces the number of
/// level 0 tables that overlap with the immutable table, reducing write amplification.
///
/// We therefore take the measure, during which all compactions run, and divide by two,
/// running the compactions from even levels in the first half measure, and then the odd.
///
/// Compactions start on the down beat of a half measure, using 0-based beats.
/// For example, if there are 4 beats in a measure, start on beat 0 or beat 2.
pub fn compact(tree: *Tree, callback: fn (*Tree) void, op: u64) void {
    // Register the beat (op + callback), then drive the I/O and CPU work.
    tree.compact_start(callback, op);
    tree.compact_drive();
}
|
|
501
|
+
|
|
502
|
+
/// Drive one beat of compaction: issue the I/O, then overlap the CPU work.
fn compact_drive(tree: *Tree) void {
    tree.compact_io();
    // tree.manifest.manifest_log.superblock.storage.tick();
    tree.compact_cpu();
}
|
|
507
|
+
|
|
508
|
+
/// Record this beat's op and callback, then start whichever compactions are
/// due: on the down beat of each half-measure, the level compactions for that
/// half, plus (odd half only) the immutable-table compaction into level 0.
fn compact_start(tree: *Tree, callback: fn (*Tree) void, op: u64) void {
    // The previous beat must have fully completed.
    assert(tree.compaction_io_pending == 0);
    assert(tree.compaction_callback == null);

    // Ops must advance monotonically across beats (op 0 is the exception).
    if (op > 0) assert(op > tree.compaction_op);
    tree.compaction_op = op;
    tree.compaction_callback = callback;

    const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
    // New compactions begin only on the first beat of a half-measure.
    const start = (compaction_beat == 0) or
        (compaction_beat == half_measure_beat_count);

    // The target snapshot of a compaction is actually the previous batch minus one.
    //
    // At the start of the current batch, mutable table inserts from the previous batch
    // would be in the immutable table. This means the current batch compaction will
    // actually be flushing to disk (levels) mutable table updates from the previous batch.
    //
    // -1 as the ops are zero based so the "last" op from previous batch is reflected.
    // (-| is saturating subtraction, so op 0 yields snapshot 0 rather than wrapping.)
    const snapshot = std.mem.alignBackward(op, config.lsm_batch_multiple) -| 1;
    assert(snapshot != snapshot_latest);

    log.debug(tree_name ++ ": compact_start: op={d} snapshot={d} beat={d}/{d}", .{
        op,
        snapshot,
        compaction_beat + 1,
        config.lsm_batch_multiple,
    });

    // Try to start compacting the immutable table.
    // It only compacts during the odd (second) half of the measure.
    const even_levels = compaction_beat < half_measure_beat_count;
    if (even_levels) {
        assert(tree.compaction_table_immutable.status == .idle);
    } else {
        if (start) tree.compact_io_start_table_immutable(snapshot);
    }

    // Try to start compacting the other levels.
    var it = CompactionTableIterator{ .tree = tree };
    while (it.next()) |context| {
        if (start) tree.compact_io_start_table(snapshot, context);
    }
}
|
|
551
|
+
|
|
552
|
+
/// Start compacting the immutable table into level 0.
/// No-op when the immutable table holds no values.
/// Only called on the down beat of the odd (second) half-measure (asserted).
fn compact_io_start_table_immutable(tree: *Tree, snapshot: u64) void {
    const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
    assert(compaction_beat == half_measure_beat_count);

    // Do not start compaction if the immutable table does not require compaction.
    if (tree.table_immutable.free) return;

    const values_count = tree.table_immutable.values.len;
    assert(values_count > 0);

    // Find the level-0 tables overlapping the immutable table's key range.
    const level_b: u8 = 0;
    const range = tree.manifest.compaction_range(
        level_b,
        tree.table_immutable.key_min(),
        tree.table_immutable.key_max(),
    );

    // The returned range must fully enclose the immutable table's key range.
    assert(range.table_count >= 1);
    assert(compare_keys(range.key_min, tree.table_immutable.key_min()) != .gt);
    assert(compare_keys(range.key_max, tree.table_immutable.key_max()) != .lt);

    log.debug(tree_name ++
        ": compacting immutable table to level 0 " ++
        "(values.len={d} snapshot_min={d} compaction.snapshot={d} table_count={d})", .{
        tree.table_immutable.values.len,
        tree.table_immutable.snapshot_min,
        snapshot,
        range.table_count,
    });

    tree.compaction_table_immutable.start(
        tree.grid,
        &tree.manifest,
        level_b,
        range,
        snapshot,
        .{ .table = &tree.table_immutable },
    );
}
|
|
591
|
+
|
|
592
|
+
/// Start one level-A -> level-B table compaction for the given context.
/// A no-op when the manifest reports that level A has no table requiring
/// compaction.
fn compact_io_start_table(tree: *Tree, snapshot: u64, context: CompactionTableContext) void {
    // Compactions always move data exactly one level down.
    assert(context.level_a < config.lsm_levels);
    assert(context.level_b < config.lsm_levels);
    assert(context.level_a + 1 == context.level_b);

    // Do not start compaction if level A does not require compaction.
    const table_range = tree.manifest.compaction_table(context.level_a) orelse return;
    const table = table_range.table;
    const range = table_range.range;

    // The range in level B must fully cover the level-A table's key span.
    assert(range.table_count >= 1);
    assert(compare_keys(table.key_min, table.key_max) != .gt);
    assert(compare_keys(range.key_min, table.key_min) != .gt);
    assert(compare_keys(range.key_max, table.key_max) != .lt);

    log.debug(tree_name ++ ": compacting {d} tables from level {d} to level {d}", .{
        range.table_count,
        context.level_a,
        context.level_b,
    });

    // Kick off the level compaction; the iterator context reads the source
    // table's blocks from the grid by address/checksum.
    context.compaction.start(
        tree.grid,
        &tree.manifest,
        context.level_b,
        table_range.range,
        snapshot,
        .{
            .grid = tree.grid,
            .address = table.address,
            .checksum = table.checksum,
        },
    );
}
|
|
626
|
+
|
|
627
|
+
/// Queue the io portion of all in-flight compactions for the current beat,
/// then add one sentinel io_pending that compact_cpu() resolves so that
/// compact_done() fires even when nothing was selected for compaction.
fn compact_io(tree: *Tree) void {
    assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
    assert(tree.compaction_callback != null);

    // Try to tick the io portion of the immutable table compaction.
    // It only runs during the second half-measure; during the first half
    // (even levels) it must be idle.
    const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
    const even_levels = compaction_beat < half_measure_beat_count;
    if (even_levels) {
        assert(tree.compaction_table_immutable.status == .idle);
    } else {
        if (tree.compaction_table_immutable.status == .compacting) {
            tree.compact_io_tick(&tree.compaction_table_immutable);
        }
    }

    // Try to tick the io portion of each running level compaction.
    var it = CompactionTableIterator{ .tree = tree };
    while (it.next()) |context| {
        if (context.compaction.status == .compacting) {
            assert(context.compaction.level_b == context.level_b);
            tree.compact_io_tick(context.compaction);
        }
    }

    // Always start one io_pending that is resolved in compact_cpu()
    // to handle the case of no level or immutable table being selected for compaction.
    tree.compaction_io_pending += 1;
    assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
}
|
|
656
|
+
|
|
657
|
+
/// Queue the io work for a single compaction (immutable-table or level),
/// accounting for it in compaction_io_pending. The matching decrement
/// happens in the tick_io completion callbacks via compact_io_tick_done().
fn compact_io_tick(tree: *Tree, compaction: anytype) void {
    tree.compaction_io_pending += 1;

    // The parity of the target level must match the current half-measure:
    // level_b is odd during the first half (even source levels) and even
    // during the second half.
    const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
    const even_levels = compaction_beat < half_measure_beat_count;
    assert(compaction.level_b < config.lsm_levels);
    assert(compaction.level_b % 2 == @boolToInt(even_levels));

    // Dispatch on the comptime type of the compaction to pick the callback
    // that can recover the Tree from the compaction pointer.
    if (@TypeOf(compaction.*) == CompactionTableImmutable) {
        assert(compaction.level_b == 0);
        compaction.tick_io(Tree.compact_io_tick_callback_table_immutable);
        log.debug(tree_name ++ ": queued compaction for immutable table to level 0", .{});
    } else {
        compaction.tick_io(Tree.compact_io_tick_callback_table);
        log.debug(tree_name ++ ": queued compaction for level {d} to level {d}", .{
            compaction.level_b - 1,
            compaction.level_b,
        });
    }
}
|
|
677
|
+
|
|
678
|
+
/// tick_io completion for the immutable-table compaction: recover the Tree
/// via @fieldParentPtr and resolve one pending io.
fn compact_io_tick_callback_table_immutable(compaction: *CompactionTableImmutable) void {
    assert(compaction.status == .compacting or compaction.status == .done);
    assert(compaction.level_b < config.lsm_levels);
    assert(compaction.level_b == 0);

    const tree = @fieldParentPtr(Tree, "compaction_table_immutable", compaction);
    // The immutable table only compacts during the second half-measure.
    const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
    assert(compaction_beat >= half_measure_beat_count);

    log.debug(tree_name ++ ": compact_io complete for immutable table to level 0", .{});
    tree.compact_io_tick_done();
}
|
|
690
|
+
|
|
691
|
+
/// tick_io completion for a level compaction: recover the Tree from the
/// compaction element and resolve one pending io.
fn compact_io_tick_callback_table(compaction: *CompactionTable) void {
    assert(compaction.status == .compacting or compaction.status == .done);
    assert(compaction.level_b < config.lsm_levels);
    assert(compaction.level_b > 0);

    // `compaction` is one element of tree.compaction_table. Its index within
    // the array is derived from level_b as (level_b - 1) / 2 — NOTE(review):
    // this assumes the array's elements are assigned level_b values matching
    // that mapping; confirm against where compaction_table is initialized.
    const table_offset = @divFloor(compaction.level_b - 1, 2);
    const table_ptr = @ptrCast([*]CompactionTable, compaction) - table_offset;

    // Rebuild a pointer to the whole fixed-size array so @fieldParentPtr can
    // recover the enclosing Tree from its `compaction_table` field.
    const table_size = @divFloor(config.lsm_levels, 2);
    const table: *[table_size]CompactionTable = table_ptr[0..table_size];

    const tree = @fieldParentPtr(Tree, "compaction_table", table);
    log.debug(tree_name ++ ": compact_io complete for level {d} to level {d}", .{
        compaction.level_b - 1,
        compaction.level_b,
    });

    tree.compact_io_tick_done();
}
|
|
710
|
+
|
|
711
|
+
/// Resolve one unit of outstanding compaction io. When the pending count
/// drains to zero the beat is finished and compact_done() runs. Invoked
/// asynchronously by the tick_io completion callbacks, or synchronously at
/// the end of compact_cpu().
fn compact_io_tick_done(tree: *Tree) void {
    assert(tree.compaction_callback != null);
    assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);

    tree.compaction_io_pending -= 1;
    if (tree.compaction_io_pending > 0) return;

    // All compact_io_tick()'s for this beat have completed.
    tree.compact_done();
}
|
|
720
|
+
|
|
721
|
+
/// Drive the cpu portion of all in-flight compactions for the current beat,
/// then resolve the sentinel io_pending that compact_io() added (which may
/// trigger compact_done()).
fn compact_cpu(tree: *Tree) void {
    assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
    assert(tree.compaction_callback != null);

    // Try to tick the cpu portion of the immutable table compaction:
    // it only runs during the second half-measure; during the first half
    // (even levels) it must be idle.
    const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
    const even_levels = compaction_beat < half_measure_beat_count;
    if (even_levels) {
        assert(tree.compaction_table_immutable.status == .idle);
    } else {
        if (tree.compaction_table_immutable.status == .compacting) {
            tree.compaction_table_immutable.tick_cpu();
        }
    }

    // Try to tick the cpu portion of the level compactions:
    var it = CompactionTableIterator{ .tree = tree };
    while (it.next()) |context| {
        if (context.compaction.status == .compacting) {
            assert(context.compaction.level_b == context.level_b);
            context.compaction.tick_cpu();
        }
    }

    // Resolve the io_pending added by compact_io(). This may trigger compact_done().
    tree.compact_io_tick_done();
}
|
|
748
|
+
|
|
749
|
+
/// Runs once all io for the beat has drained (compaction_io_pending == 0).
/// Retires compactions that finished, re-drives any still running at a
/// half/full-measure boundary, removes invisible tables, rolls the mutable
/// table into the immutable table at measure end, and finally compacts the
/// manifest before handing control back to the compact_io() caller.
fn compact_done(tree: *Tree) void {
    assert(tree.compaction_io_pending == 0);
    assert(tree.compaction_callback != null);

    var still_compacting = false;

    // Mark immutable compaction that reported done in their callback as "completed".
    // (The immutable-table compaction only runs during the second half-measure.)
    const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
    const even_levels = compaction_beat < half_measure_beat_count;
    if (even_levels) {
        assert(tree.compaction_table_immutable.status == .idle);
    } else {
        if (tree.compaction_table_immutable.status == .done) {
            // The immutable table's contents are now durable in level 0,
            // so its in-memory buffer can be released.
            tree.compaction_table_immutable.reset();
            tree.table_immutable.clear();
        } else if (tree.compaction_table_immutable.status == .compacting) {
            still_compacting = true;
        }
    }

    // Mark compactions that reported done in their callback as "completed" (done = null).
    var it = CompactionTableIterator{ .tree = tree };
    while (it.next()) |context| {
        if (context.compaction.status == .done) {
            assert(context.compaction.level_b == context.level_b);
            context.compaction.reset();
        } else if (context.compaction.status == .compacting) {
            still_compacting = true;
        }
    }

    // At the end of every beat, ensure mutable table can be flushed to immutable table.
    assert(tree.table_mutable.can_commit_batch(tree.options.commit_count_max));

    // At end of second/half measure:
    // - assert: even compactions from previous tick are finished.
    // - remove tables made invisible during compaction of even levels.
    if (compaction_beat == half_measure_beat_count - 1) {
        if (still_compacting) {
            // Compactions must finish within the half-measure: keep driving
            // them instead of completing the beat.
            log.debug(tree_name ++ ": compact_done: driving outstanding compactions", .{});
            return tree.compact_drive();
        }

        log.debug(tree_name ++ ": compact_done: compacted even levels", .{});

        it = CompactionTableIterator{ .tree = tree };
        while (it.next()) |context| {
            assert(context.compaction.status == .idle);
            tree.manifest.remove_invisible_tables(
                context.level_a,
                context.compaction.snapshot,
                context.compaction.range.key_min,
                context.compaction.range.key_max,
            );
        }
    }

    // At end of fourth/last measure:
    // - assert: immutable table and odd level compactions from previous tick are finished.
    // - remove tables made invisible during compaction of odd levels.
    // - assert: all visible levels haven't overflowed their max.
    // - convert mutable table to immutable tables for next measure.
    if (compaction_beat == config.lsm_batch_multiple - 1) {
        if (still_compacting) {
            log.debug(tree_name ++ ": compact_done: driving outstanding compactions", .{});
            return tree.compact_drive();
        }

        // TODO Make log message more accurate according to what was compacted.
        log.debug(tree_name ++ ": compact_done: compacted immutable table and odd levels", .{});

        assert(tree.compaction_table_immutable.status == .idle);
        it = CompactionTableIterator{ .tree = tree };
        while (it.next()) |context| {
            assert(context.compaction.status == .idle);
            tree.manifest.remove_invisible_tables(
                context.level_a,
                context.compaction.snapshot,
                context.compaction.range.key_min,
                context.compaction.range.key_max,
            );
        }

        tree.manifest.assert_level_table_counts();
        tree.compact_mutable_table_into_immutable();
    }

    // At the end of every beat, call manifest.compact before invoking the compact callback.
    tree.manifest.compact(compact_manifest_callback);
}
|
|
839
|
+
|
|
840
|
+
/// Completion for tree.manifest.compact(): fires after the manifest has
/// compacted at the end of the beat, then hands control back to whoever
/// started compact_io() via the stored compaction callback.
fn compact_manifest_callback(manifest: *Manifest) void {
    // Recover the owning tree from its embedded manifest field.
    const tree = @fieldParentPtr(Tree, "manifest", manifest);
    assert(tree.compaction_callback != null);
    assert(tree.compaction_io_pending == 0);

    // Clear the stored callback before invoking it, so the callback itself
    // is free to schedule the next round of compaction.
    const compact_io_callback = tree.compaction_callback.?;
    tree.compaction_callback = null;
    compact_io_callback(tree);
}
|
|
850
|
+
|
|
851
|
+
/// Move the mutable table's values into the (free) immutable table at the
/// end of a measure: sort them in place into the immutable table's backing
/// array and reset the immutable table around them. A no-op when the
/// mutable table is empty.
fn compact_mutable_table_into_immutable(tree: *Tree) void {
    // Ensure mutable table can be flushed into immutable table.
    if (tree.table_mutable.count() == 0) return;
    assert(tree.table_immutable.free);

    // Sort the mutable table values directly into the immutable table's array.
    const values_max = tree.table_immutable.values_max();
    const values = tree.table_mutable.sort_into_values_and_clear(values_max);
    // The sorted slice must alias the immutable table's own buffer (no copy).
    assert(values.ptr == values_max.ptr);

    // Take a manifest snapshot and setup the immutable table with the sorted values.
    const snapshot_min = tree.compaction_table_immutable.snapshot;
    tree.table_immutable.reset_with_sorted_values(snapshot_min, values);

    // The hand-off must leave the mutable table empty and the immutable
    // table occupied.
    assert(tree.table_mutable.count() == 0);
    assert(!tree.table_immutable.free);
}
|
|
868
|
+
|
|
869
|
+
/// Checkpoint the tree's manifest asynchronously, invoking `callback` when
/// done. Only actually checkpoints on the last beat of the compaction
/// measure; on any other beat the callback is invoked immediately.
pub fn checkpoint(tree: *Tree, callback: fn (*Tree) void) void {
    // Assert no outstanding compact_io() work.
    assert(tree.compaction_io_pending == 0);
    assert(tree.compaction_callback == null);

    // Avoid checkpointing if this is not the last beat in the compaction measure.
    const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
    const last_beat_in_measure = config.lsm_batch_multiple - 1;
    if (compaction_beat != last_beat_in_measure) return callback(tree);

    // Assert no outstanding compactions.
    assert(tree.compaction_table_immutable.status == .idle);
    for (tree.compaction_table) |*compaction| {
        assert(compaction.status == .idle);
    }

    // Assert all manifest levels haven't overflowed their table counts.
    tree.manifest.assert_level_table_counts();

    // Assert that we're checkpointing only after invisible tables have been removed.
    if (config.verify) {
        tree.manifest.assert_no_invisible_tables(tree.compaction_op);
    }

    // Start an asynchronous checkpoint on the manifest; the callback is
    // stashed and invoked by manifest_checkpoint_callback().
    assert(tree.checkpoint_callback == null);
    tree.checkpoint_callback = callback;
    tree.manifest.checkpoint(manifest_checkpoint_callback);
}
|
|
898
|
+
|
|
899
|
+
/// Completion for tree.manifest.checkpoint(): recovers the owning tree and
/// invokes the callback stashed by checkpoint().
fn manifest_checkpoint_callback(manifest: *Manifest) void {
    const tree = @fieldParentPtr(Tree, "manifest", manifest);
    assert(tree.checkpoint_callback != null);

    // Clear the stored callback before invoking it, so the callee may
    // immediately start another checkpoint.
    const checkpoint_done = tree.checkpoint_callback.?;
    tree.checkpoint_callback = null;
    checkpoint_done(tree);
}
|
|
907
|
+
|
|
908
|
+
/// Describes the key span a range query iterates over.
pub const RangeQuery = union(enum) {
    // A query bounded at both ends by explicit keys.
    bounded: struct {
        start: Key,
        end: Key,
    },
    // A query bounded only at its starting key, walking in the given order.
    open: struct {
        start: Key,
        order: enum {
            ascending,
            descending,
        },
    },
};
|
|
921
|
+
|
|
922
|
+
/// Iterator over the values selected by range_query().
/// NOTE(review): `next` is currently an unimplemented stub that discards
/// its callback — confirm before relying on range queries.
pub const RangeQueryIterator = struct {
    tree: *Tree,
    snapshot: u64,
    query: RangeQuery,

    pub fn next(callback: fn (result: ?Value) void) void {
        _ = callback;
    }
};
|
|
931
|
+
|
|
932
|
+
/// Construct an iterator for a range query at the given snapshot.
/// NOTE(review): unimplemented stub — all arguments are discarded and no
/// value is returned despite the non-void return type, so referencing this
/// function should fail semantic analysis; it appears to survive only
/// because Zig analyzes declarations lazily. Confirm before use.
pub fn range_query(
    tree: *Tree,
    /// The snapshot timestamp, if any
    snapshot: u64,
    query: RangeQuery,
) RangeQueryIterator {
    _ = tree;
    _ = snapshot;
    _ = query;
}
|
|
942
|
+
};
|
|
943
|
+
}
|
|
944
|
+
|
|
945
|
+
/// The total number of tables that can be supported by the tree across so many levels.
|
|
946
|
+
/// The total number of tables that can be supported by the tree across so
/// many levels: the sum of table_count_max_for_level() over levels
/// [0, levels_count).
pub fn table_count_max_for_tree(growth_factor: u32, levels_count: u32) u32 {
    // Limit excessive write amplification:
    assert(growth_factor >= 4);
    assert(growth_factor <= 16);
    // Limit excessive read amplification:
    assert(levels_count >= 2);
    assert(levels_count <= 10);
    assert(levels_count <= config.lsm_levels);

    var total: u32 = 0;
    var level_index: u32 = 0;
    while (level_index < levels_count) {
        total += table_count_max_for_level(growth_factor, level_index);
        level_index += 1;
    }
    return total;
}
|
|
960
|
+
|
|
961
|
+
/// The total number of tables that can be supported by the level alone.
|
|
962
|
+
/// The total number of tables that can be supported by the level alone.
/// Level `level` (zero-based) holds up to growth_factor^(level + 1) tables,
/// i.e. each level is `growth_factor` times larger than the level above it.
pub fn table_count_max_for_level(growth_factor: u32, level: u32) u32 {
    // `level` is unsigned, so a lower-bound assert would be vacuously true;
    // only the upper bound needs checking.
    assert(level < config.lsm_levels);

    return math.pow(u32, growth_factor, level + 1);
}
|
|
968
|
+
|
|
969
|
+
test "table_count_max_for_level/tree" {
    const expectEqual = std.testing.expectEqual;

    // Per-level capacity with growth factor 8 is 8^(level + 1): 8, 64, ... 8^7.
    try expectEqual(@as(u32, 8), table_count_max_for_level(8, 0));
    try expectEqual(@as(u32, 64), table_count_max_for_level(8, 1));
    try expectEqual(@as(u32, 512), table_count_max_for_level(8, 2));
    try expectEqual(@as(u32, 4096), table_count_max_for_level(8, 3));
    try expectEqual(@as(u32, 32768), table_count_max_for_level(8, 4));
    try expectEqual(@as(u32, 262144), table_count_max_for_level(8, 5));
    try expectEqual(@as(u32, 2097152), table_count_max_for_level(8, 6));

    // Tree capacity is the running sum of the levels: each expected value is
    // written as (sum of previous levels) + (new deepest level).
    try expectEqual(@as(u32, 8 + 64), table_count_max_for_tree(8, 2));
    try expectEqual(@as(u32, 72 + 512), table_count_max_for_tree(8, 3));
    try expectEqual(@as(u32, 584 + 4096), table_count_max_for_tree(8, 4));
    try expectEqual(@as(u32, 4680 + 32768), table_count_max_for_tree(8, 5));
    try expectEqual(@as(u32, 37448 + 262144), table_count_max_for_tree(8, 6));
    try expectEqual(@as(u32, 299592 + 2097152), table_count_max_for_tree(8, 7));
}
|
|
987
|
+
|
|
988
|
+
/// Ad-hoc smoke-test entry point: stands up the IO/Storage/Grid stack
/// against a temporary on-disk file, instantiates a Tree over a
/// CompositeKey(u128) table, and references the tree's public API so that
/// everything is semantically analyzed. Not a correctness test.
pub fn main() !void {
    const testing = std.testing;
    const allocator = testing.allocator;

    const IO = @import("../io.zig").IO;
    const Storage = @import("../storage.zig").Storage;
    const Grid = @import("grid.zig").GridType(Storage);

    const data_file_size_min = @import("../vsr/superblock.zig").data_file_size_min;

    // Back storage with a real file in the working directory; best-effort
    // cleanup on exit.
    const dir_fd = try IO.open_dir(".");
    const storage_fd = try IO.open_file(dir_fd, "test_tree", data_file_size_min, true);
    defer std.fs.cwd().deleteFile("test_tree") catch {};

    var io = try IO.init(128, 0);
    defer io.deinit();

    var storage = try Storage.init(&io, storage_fd);
    defer storage.deinit();

    const Key = CompositeKey(u128);
    const Table = @import("table.zig").TableType(
        Key,
        Key.Value,
        Key.compare_keys,
        Key.key_from_value,
        Key.sentinel_key,
        Key.tombstone,
        Key.tombstone_from_key,
    );

    const Tree = TreeType(Table, Storage, @typeName(Table) ++ "_test");

    // Check out our spreadsheet to see how we calculate node_count for a forest of trees.
    const node_count = 1024;
    var node_pool = try NodePool.init(allocator, node_count);
    defer node_pool.deinit(allocator);

    var value_cache = Tree.ValueCache{};
    try value_cache.ensureTotalCapacity(allocator, 10000);
    defer value_cache.deinit(allocator);

    // Derive the per-commit batch limit from the message size; 128 here is
    // presumably the value size in bytes — TODO confirm.
    const batch_size_max = config.message_size_max - @sizeOf(vsr.Header);
    const commit_count_max = @divFloor(batch_size_max, 128);

    var sort_buffer = try allocator.allocAdvanced(
        u8,
        16,
        // This must be the greatest commit_count_max and value_size across trees:
        commit_count_max * config.lsm_batch_multiple * 128,
        .exact,
    );
    defer allocator.free(sort_buffer);

    // TODO Initialize SuperBlock (left undefined for now — Grid.init only
    // takes its address here).
    var superblock: SuperBlockType(Storage) = undefined;

    var grid = try Grid.init(allocator, &superblock);
    defer grid.deinit(allocator);

    var tree = try Tree.init(
        allocator,
        &node_pool,
        &grid,
        &value_cache,
        .{
            .prefetch_count_max = commit_count_max * 2,
            .commit_count_max = commit_count_max,
        },
    );
    defer tree.deinit(allocator);

    testing.refAllDecls(@This());

    // TODO: more references
    _ = Table;
    _ = Table.Builder.data_block_finish;

    // TODO: more references
    _ = Tree.CompactionTable;

    // Reference the tree's public API so lazy analysis compiles it all.
    _ = tree.prefetch_enqueue;
    _ = tree.prefetch;
    _ = tree.prefetch_key;
    _ = tree.get;
    _ = tree.put;
    _ = tree.remove;
    _ = tree.lookup;
    _ = tree.compact_io;
    _ = tree.compact_cpu;

    _ = Tree.Manifest.LookupIterator.next;
    _ = tree.manifest;
    _ = tree.manifest.lookup;
    _ = tree.manifest.insert_tables;

    std.debug.print("table_count_max={}\n", .{table_count_max});
}
|