tigerbeetle-node 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/dist/index.d.ts +66 -61
- package/dist/index.js +66 -61
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/index.ts +5 -0
- package/src/node.zig +17 -18
- package/src/tigerbeetle/scripts/benchmark.bat +4 -3
- package/src/tigerbeetle/scripts/benchmark.sh +25 -10
- package/src/tigerbeetle/scripts/install.sh +2 -1
- package/src/tigerbeetle/scripts/install_zig.sh +14 -18
- package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +12 -3
- package/src/tigerbeetle/scripts/vopr.sh +5 -5
- package/src/tigerbeetle/src/benchmark.zig +17 -9
- package/src/tigerbeetle/src/benchmark_array_search.zig +317 -0
- package/src/tigerbeetle/src/benchmarks/perf.zig +299 -0
- package/src/tigerbeetle/src/c/tb_client/context.zig +103 -0
- package/src/tigerbeetle/src/c/tb_client/packet.zig +80 -0
- package/src/tigerbeetle/src/c/tb_client/signal.zig +288 -0
- package/src/tigerbeetle/src/c/tb_client/thread.zig +329 -0
- package/src/tigerbeetle/src/c/tb_client.h +201 -0
- package/src/tigerbeetle/src/c/tb_client.zig +101 -0
- package/src/tigerbeetle/src/c/test.zig +1 -0
- package/src/tigerbeetle/src/cli.zig +142 -83
- package/src/tigerbeetle/src/config.zig +119 -10
- package/src/tigerbeetle/src/demo.zig +12 -8
- package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +2 -2
- package/src/tigerbeetle/src/ewah.zig +318 -0
- package/src/tigerbeetle/src/ewah_benchmark.zig +121 -0
- package/src/tigerbeetle/src/eytzinger_benchmark.zig +317 -0
- package/src/tigerbeetle/src/fifo.zig +17 -1
- package/src/tigerbeetle/src/io/darwin.zig +12 -10
- package/src/tigerbeetle/src/io/linux.zig +25 -9
- package/src/tigerbeetle/src/io/windows.zig +13 -9
- package/src/tigerbeetle/src/iops.zig +101 -0
- package/src/tigerbeetle/src/lsm/binary_search.zig +214 -0
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +82 -0
- package/src/tigerbeetle/src/lsm/compaction.zig +603 -0
- package/src/tigerbeetle/src/lsm/composite_key.zig +75 -0
- package/src/tigerbeetle/src/lsm/direction.zig +11 -0
- package/src/tigerbeetle/src/lsm/eytzinger.zig +587 -0
- package/src/tigerbeetle/src/lsm/forest.zig +630 -0
- package/src/tigerbeetle/src/lsm/grid.zig +473 -0
- package/src/tigerbeetle/src/lsm/groove.zig +939 -0
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +452 -0
- package/src/tigerbeetle/src/lsm/level_iterator.zig +296 -0
- package/src/tigerbeetle/src/lsm/manifest.zig +680 -0
- package/src/tigerbeetle/src/lsm/manifest_level.zig +1169 -0
- package/src/tigerbeetle/src/lsm/manifest_log.zig +904 -0
- package/src/tigerbeetle/src/lsm/node_pool.zig +231 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +399 -0
- package/src/tigerbeetle/src/lsm/segmented_array.zig +998 -0
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +844 -0
- package/src/tigerbeetle/src/lsm/table.zig +932 -0
- package/src/tigerbeetle/src/lsm/table_immutable.zig +196 -0
- package/src/tigerbeetle/src/lsm/table_iterator.zig +295 -0
- package/src/tigerbeetle/src/lsm/table_mutable.zig +123 -0
- package/src/tigerbeetle/src/lsm/test.zig +429 -0
- package/src/tigerbeetle/src/lsm/tree.zig +1085 -0
- package/src/tigerbeetle/src/main.zig +119 -109
- package/src/tigerbeetle/src/message_bus.zig +49 -48
- package/src/tigerbeetle/src/message_pool.zig +15 -2
- package/src/tigerbeetle/src/ring_buffer.zig +126 -30
- package/src/tigerbeetle/src/simulator.zig +76 -44
- package/src/tigerbeetle/src/state_machine.zig +1022 -585
- package/src/tigerbeetle/src/storage.zig +46 -16
- package/src/tigerbeetle/src/test/cluster.zig +109 -63
- package/src/tigerbeetle/src/test/message_bus.zig +15 -24
- package/src/tigerbeetle/src/test/network.zig +26 -17
- package/src/tigerbeetle/src/test/state_checker.zig +7 -5
- package/src/tigerbeetle/src/test/state_machine.zig +159 -69
- package/src/tigerbeetle/src/test/storage.zig +57 -28
- package/src/tigerbeetle/src/tigerbeetle.zig +5 -0
- package/src/tigerbeetle/src/unit_tests.zig +8 -0
- package/src/tigerbeetle/src/util.zig +51 -0
- package/src/tigerbeetle/src/vsr/client.zig +21 -7
- package/src/tigerbeetle/src/vsr/journal.zig +154 -167
- package/src/tigerbeetle/src/vsr/replica.zig +744 -226
- package/src/tigerbeetle/src/vsr/superblock.zig +1743 -0
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +258 -0
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +644 -0
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +546 -0
- package/src/tigerbeetle/src/vsr.zig +43 -115
|
@@ -0,0 +1,680 @@
|
|
|
1
|
+
const std = @import("std");
|
|
2
|
+
const mem = std.mem;
|
|
3
|
+
const math = std.math;
|
|
4
|
+
const assert = std.debug.assert;
|
|
5
|
+
|
|
6
|
+
const config = @import("../config.zig");
|
|
7
|
+
const growth_factor = config.lsm_growth_factor;
|
|
8
|
+
|
|
9
|
+
const table_count_max = @import("tree.zig").table_count_max;
|
|
10
|
+
const table_count_max_for_level = @import("tree.zig").table_count_max_for_level;
|
|
11
|
+
const snapshot_latest = @import("tree.zig").snapshot_latest;
|
|
12
|
+
|
|
13
|
+
const Direction = @import("direction.zig").Direction;
|
|
14
|
+
const GridType = @import("grid.zig").GridType;
|
|
15
|
+
const ManifestLogType = @import("manifest_log.zig").ManifestLogType;
|
|
16
|
+
const ManifestLevelType = @import("manifest_level.zig").ManifestLevelType;
|
|
17
|
+
const NodePool = @import("node_pool.zig").NodePool(config.lsm_manifest_node_size, 16);
|
|
18
|
+
const SegmentedArray = @import("segmented_array.zig").SegmentedArray;
|
|
19
|
+
|
|
20
|
+
/// Returns the on-disk/in-memory record type describing one LSM table:
/// its location (address/checksum), key range, and snapshot visibility window.
/// Declared `extern struct` so the layout is well defined for the manifest log.
pub fn TableInfoType(comptime Table: type) type {
    const Key = Table.Key;
    const compare_keys = Table.compare_keys;

    return extern struct {
        const TableInfo = @This();

        checksum: u128,
        address: u64,
        // Reserved; always written as 0 here. NOTE(review): meaning of nonzero
        // flags is not visible in this file — confirm against the manifest log.
        flags: u64 = 0,

        /// The minimum snapshot that can see this table (with exclusive bounds).
        /// This value is set to the current snapshot tick on table creation.
        snapshot_min: u64,

        /// The maximum snapshot that can see this table (with exclusive bounds).
        /// This value is set to the current snapshot tick on table deletion.
        /// maxInt(u64) means "never deleted".
        snapshot_max: u64 = math.maxInt(u64),

        key_min: Key,
        key_max: Key,

        comptime {
            // Pin the extern layout: 48 bytes of fixed fields plus two keys.
            assert(@sizeOf(TableInfo) == 48 + Table.key_size * 2);
            assert(@alignOf(TableInfo) == 16);
        }

        /// True if `snapshot` falls strictly inside (snapshot_min, snapshot_max).
        /// Asserts the caller never queries with the exact boundary snapshots.
        pub fn visible(table: *const TableInfo, snapshot: u64) bool {
            assert(table.address != 0);
            assert(table.snapshot_min < table.snapshot_max);
            assert(snapshot <= snapshot_latest);

            assert(snapshot != table.snapshot_min);
            assert(snapshot != table.snapshot_max);

            return table.snapshot_min < snapshot and snapshot < table.snapshot_max;
        }

        /// True if the table is visible to neither `snapshot_latest` nor any of
        /// `snapshots` — i.e. it may be reclaimed.
        pub fn invisible(table: *const TableInfo, snapshots: []const u64) bool {
            // Return early and do not iterate all snapshots if the table was never deleted:
            if (table.visible(snapshot_latest)) return false;
            for (snapshots) |snapshot| if (table.visible(snapshot)) return false;
            // An invisible table must have been deleted (snapshot_max was set).
            assert(table.snapshot_max < math.maxInt(u64));
            return true;
        }

        /// Field-by-field equality, using compare_keys() for the key columns.
        pub fn equal(table: *const TableInfo, other: *const TableInfo) bool {
            // TODO Since the layout of TableInfo is well defined, a direct memcmp may be faster
            // here. However, it's not clear if we can make the assumption that compare_keys()
            // will return .eq exactly when the memory of the keys are equal.
            // Consider defining the API to allow this.
            return table.checksum == other.checksum and
                table.address == other.address and
                table.flags == other.flags and
                table.snapshot_min == other.snapshot_min and
                table.snapshot_max == other.snapshot_max and
                compare_keys(table.key_min, other.key_min) == .eq and
                compare_keys(table.key_max, other.key_max) == .eq;
        }
    };
}
|
|
81
|
+
|
|
82
|
+
/// Returns a fixed-capacity push/drain buffer of TableInfo records.
/// If `sort_direction` is non-null, push() asserts that tables arrive in that
/// key order; pass null to disable the ordering assertions.
pub fn TableInfoBufferType(comptime Table: type, comptime sort_direction: ?Direction) type {
    const TableInfo = TableInfoType(Table);
    const compare_keys = Table.compare_keys;

    return struct {
        array: []TableInfo,
        count: usize = 0,

        const TableInfoBuffer = @This();

        /// Allocates capacity for `count_max` tables. Caller frees via deinit().
        pub fn init(allocator: mem.Allocator, count_max: usize) !TableInfoBuffer {
            const array = try allocator.alloc(TableInfo, count_max);
            errdefer allocator.free(array);

            return TableInfoBuffer{ .array = array };
        }

        pub fn deinit(buffer: *TableInfoBuffer, allocator: mem.Allocator) void {
            allocator.free(buffer.array);
        }

        /// True when no further push() is possible until drain().
        pub fn full(buffer: *const TableInfoBuffer) bool {
            assert(buffer.count <= buffer.array.len);
            return buffer.count == buffer.array.len;
        }

        /// Asserts that tables are pushed in sort order.
        /// Copies `table` by value into the buffer; caller must check full() first.
        pub fn push(buffer: *TableInfoBuffer, table: *const TableInfo) void {
            assert(!buffer.full());

            if (sort_direction) |direction| {
                if (buffer.count > 0) {
                    const tail = &buffer.array[buffer.count - 1];
                    switch (direction) {
                        // Note the asymmetry: ascending compares key_min, descending key_max.
                        .ascending => assert(compare_keys(table.key_min, tail.key_min) != .lt),
                        .descending => assert(compare_keys(table.key_max, tail.key_max) != .gt),
                    }
                }
            }

            buffer.array[buffer.count] = table.*;
            buffer.count += 1;
        }

        /// Returns the pushed tables and resets the buffer to empty.
        /// The returned slice aliases the internal array and is only valid
        /// until the next push().
        pub fn drain(buffer: *TableInfoBuffer) []TableInfo {
            assert(buffer.count <= buffer.array.len);

            // `defer` runs after the return expression is computed, so the
            // returned slice still covers the drained elements.
            defer buffer.count = 0;
            // Slice on array.ptr instead of array to avoid
            // having stage1 give us an array.ptr=undefined when buffer.count=0.
            return buffer.array.ptr[0..buffer.count];
        }
    };
}
|
|
136
|
+
|
|
137
|
+
/// Returns the manifest type for an LSM tree: the in-memory index of which
/// tables live in which level, backed by an on-disk manifest log for
/// durability. All open/compact/checkpoint operations are asynchronous and
/// complete via the supplied Callback.
pub fn ManifestType(comptime Table: type, comptime Storage: type) type {
    const Key = Table.Key;
    const compare_keys = Table.compare_keys;

    return struct {
        const Manifest = @This();

        pub const TableInfo = TableInfoType(Table);
        // sort_direction=null: tables recovered from the log are not pushed in key order.
        const TableInfoBuffer = TableInfoBufferType(Table, null);

        const Grid = GridType(Storage);
        const Callback = fn (*Manifest) void;

        /// Levels beyond level 0 have tables with disjoint key ranges.
        /// Here, we use a structure with indexes over the segmented array for performance.
        const Level = ManifestLevelType(NodePool, Key, TableInfo, compare_keys, table_count_max);
        const KeyRange = Level.KeyRange;

        const ManifestLog = ManifestLogType(Storage, TableInfo);

        node_pool: *NodePool,

        levels: [config.lsm_levels]Level,

        // Per-level staging buffers used only during open() recovery.
        open_buffers: [config.lsm_levels]TableInfoBuffer,

        // TODO Set this at startup when reading in the manifest.
        // This should be the greatest TableInfo.snapshot_min/snapshot_max (if deleted) or
        // registered snapshot seen so far.
        snapshot_max: u64 = 1,

        manifest_log: ManifestLog,

        // Non-null exactly while the corresponding async operation is in flight.
        open_callback: ?Callback = null,
        compact_callback: ?Callback = null,
        checkpoint_callback: ?Callback = null,

        pub fn init(
            allocator: mem.Allocator,
            node_pool: *NodePool,
            grid: *Grid,
            tree_hash: u128,
        ) !Manifest {
            var levels: [config.lsm_levels]Level = undefined;
            for (levels) |*level, i| {
                // On failure of this iteration's init, free only the levels
                // initialized so far ([0..i]).
                errdefer for (levels[0..i]) |*l| l.deinit(allocator, node_pool);
                level.* = try Level.init(allocator);
            }
            errdefer for (levels) |*l| l.deinit(allocator, node_pool);

            var open_buffers: [config.lsm_levels]TableInfoBuffer = undefined;
            for (open_buffers) |*buffer, i| {
                errdefer for (open_buffers[0..i]) |*b| b.deinit(allocator);
                // NOTE(review): buffer capacity 32 is a magic number here —
                // rationale not visible in this file.
                buffer.* = try TableInfoBuffer.init(allocator, 32);
            }
            errdefer for (open_buffers) |*b| b.deinit(allocator);

            var manifest_log = try ManifestLog.init(allocator, grid, tree_hash);
            errdefer manifest_log.deinit(allocator);

            return Manifest{
                .node_pool = node_pool,
                .levels = levels,
                .manifest_log = manifest_log,
                .open_buffers = open_buffers,
            };
        }

        pub fn deinit(manifest: *Manifest, allocator: mem.Allocator) void {
            for (manifest.levels) |*l| l.deinit(allocator, manifest.node_pool);
            for (manifest.open_buffers) |*b| b.deinit(allocator);

            manifest.manifest_log.deinit(allocator);
        }

        /// Asynchronously recovers the manifest from the manifest log.
        /// Tables are reported via manifest_log_open_event and the recovery
        /// completes in manifest_log_open_callback, which invokes `callback`.
        pub fn open(manifest: *Manifest, callback: Callback) void {
            assert(manifest.open_callback == null);
            manifest.open_callback = callback;

            manifest.manifest_log.open(manifest_log_open_event, manifest_log_open_callback);
        }

        /// Called by the manifest log for each table record recovered during open().
        /// Batches tables per level in open_buffers before inserting into the Level.
        fn manifest_log_open_event(
            manifest_log: *ManifestLog,
            level: u7,
            table: *const TableInfo,
        ) void {
            // Recover the owning Manifest from its embedded manifest_log field.
            const manifest = @fieldParentPtr(Manifest, "manifest_log", manifest_log);
            assert(manifest.open_callback != null);

            assert(level < config.lsm_levels);
            const buffer = &manifest.open_buffers[level];

            // Make sure there's room in the open buffer to push the table to.
            if (buffer.full()) manifest.drain_open_buffer(buffer, level);

            buffer.push(table);
        }

        /// Completion of open(): flushes remaining buffered tables, then fires
        /// the user callback (clearing open_callback first so open() may be reused).
        fn manifest_log_open_callback(manifest_log: *ManifestLog) void {
            const manifest = @fieldParentPtr(Manifest, "manifest_log", manifest_log);
            assert(manifest.open_callback != null);

            // Insert all left-over pushed open tables into the ManifestLevels.
            for (manifest.open_buffers) |*buffer, level| {
                manifest.drain_open_buffer(buffer, @intCast(u7, level));
            }

            const callback = manifest.open_callback.?;
            manifest.open_callback = null;
            callback(manifest);
        }

        /// Moves all tables batched in `buffer` into the in-memory Level.
        fn drain_open_buffer(manifest: *Manifest, buffer: *TableInfoBuffer, level: u7) void {
            assert(level < config.lsm_levels);
            assert(buffer == &manifest.open_buffers[level]);

            const tables = buffer.drain();
            if (tables.len == 0) return;

            // open() reports table in reverse sorted order
            std.mem.reverse(TableInfo, tables);
            manifest.levels[level].insert_tables(manifest.node_pool, tables);
        }

        /// Inserts new tables into a level and records the inserts in the manifest log.
        pub fn insert_tables(
            manifest: *Manifest,
            level: u8,
            tables: []const TableInfo,
        ) void {
            assert(tables.len > 0);

            const manifest_level = &manifest.levels[level];
            manifest_level.insert_tables(manifest.node_pool, tables);

            // Appends insert changes to the manifest log
            for (tables) |*table| {
                const log_level = @intCast(u7, level);
                manifest.manifest_log.insert(log_level, table);
            }

            // TODO Verify that tables can be found exactly before returning.
        }

        /// Marks tables as deleted at `snapshot` (sets snapshot_max) and records
        /// the updated TableInfos in the manifest log.
        pub fn update_tables(
            manifest: *Manifest,
            level: u8,
            snapshot: u64,
            tables: []const TableInfo,
        ) void {
            assert(tables.len > 0);

            const manifest_level = &manifest.levels[level];
            manifest_level.set_snapshot_max(snapshot, tables);

            // Appends update changes to the manifest log
            // (the log has no separate "update" record; an insert supersedes).
            for (tables) |*table| {
                const log_level = @intCast(u7, level);
                manifest.manifest_log.insert(log_level, table);
            }
        }

        /// Moves the table at the address/checksum pair from one level to another.
        /// Unlike `update_tables`, this avoids leaving the same TableInfo with different snapshots
        /// in both levels by removing it from level_a before inserting to level_b.
        pub fn move_table(
            manifest: *Manifest,
            level_a: u8,
            level_b: u8,
            snapshot: u64,
            address: u64,
            checksum: u128,
        ) void {
            assert(level_a < config.lsm_levels);
            assert(level_b < config.lsm_levels);
            // Moves only go one level down.
            assert(level_a + 1 == level_b);

            const table_info: TableInfo = blk: {
                _ = address;
                _ = checksum;
                break :blk @panic("TODO(Joran): lookup using address/checksum");
            };

            const tables = [_]TableInfo{table_info};
            // Order matters: remove from level_a before inserting into level_b
            // (see the doc comment above).
            manifest.levels[level_a].remove_tables(manifest.node_pool, &.{snapshot}, &tables);
            manifest.levels[level_b].insert_tables(manifest.node_pool, &tables);

            // Appends move changes to the manifest log. (A move is only recorded as an insert).
            for (tables) |*table| {
                const log_level = @intCast(u7, level_b);
                manifest.manifest_log.insert(log_level, table);
            }
        }

        /// Removes every table in [key_min, key_max] of `level` that is invisible
        /// to both `snapshot` and snapshot_latest, logging each removal.
        /// Removals are batched (up to 64 at a time) while iterating.
        pub fn remove_invisible_tables(
            manifest: *Manifest,
            level: u8,
            snapshot: u64,
            key_min: Key,
            key_max: Key,
        ) void {
            assert(level < config.lsm_levels);
            assert(compare_keys(key_min, key_max) != .gt);

            const snapshots = [_]u64{snapshot};
            const manifest_level = &manifest.levels[level];

            var count: u32 = 0;
            var tables: [64]TableInfo = undefined;
            var it = manifest_level.iterator(
                .invisible,
                &snapshots,
                .ascending,
                KeyRange{ .key_min = key_min, .key_max = key_max },
            );

            while (it.next()) |table| {
                assert(table.invisible(&snapshots));
                assert(compare_keys(key_min, table.key_min) != .gt);
                assert(compare_keys(key_max, table.key_max) != .lt);

                // Append remove changes to the manifest log.
                const log_level = @intCast(u7, level);
                manifest.manifest_log.remove(log_level, table);

                // Flush the previous batch before buffering this table.
                // NOTE(review): this mutates the level mid-iteration — presumably
                // Level.Iterator tolerates removal of already-yielded tables;
                // confirm against manifest_level.zig.
                if (count > 0) {
                    manifest_level.remove_tables(manifest.node_pool, &snapshots, tables[0..count]);
                    count = 0;
                }

                assert(count < tables.len);
                tables[count] = table.*;
                count += 1;
            }

            // Flush the final batch.
            if (count > 0) {
                manifest_level.remove_tables(manifest.node_pool, &snapshots, tables[0..count]);
            }
        }

        /// Returns an iterator over the (at most one per level) tables that may
        /// contain `key` as visible to `snapshot`, from level 0 downwards.
        pub fn lookup(manifest: *Manifest, snapshot: u64, key: Key) LookupIterator {
            return .{
                .manifest = manifest,
                .snapshot = snapshot,
                .key = key,
            };
        }

        pub const LookupIterator = struct {
            manifest: *Manifest,
            snapshot: u64,
            key: Key,
            level: u8 = 0,
            inner: ?Level.Iterator = null,
            // snapshot_min of the last yielded table; asserts that shallower
            // levels always hold strictly newer data for this key.
            precedence: ?u64 = null,

            pub fn next(it: *LookupIterator) ?*const TableInfo {
                while (it.level < config.lsm_levels) : (it.level += 1) {
                    const level = &it.manifest.levels[it.level];

                    // Point query: key_min == key_max == key.
                    var inner = level.iterator(
                        .visible,
                        @as(*const [1]u64, &it.snapshot),
                        .ascending,
                        KeyRange{ .key_min = it.key, .key_max = it.key },
                    );

                    if (inner.next()) |table| {
                        if (it.precedence) |p| assert(p > table.snapshot_min);
                        it.precedence = table.snapshot_min;

                        assert(table.visible(it.snapshot));
                        assert(compare_keys(it.key, table.key_min) != .lt);
                        assert(compare_keys(it.key, table.key_max) != .gt);
                        // At most one visible table per level can contain the key.
                        assert(inner.next() == null);

                        // Advance past this level before yielding, since the
                        // while-continue expression won't run after `return`.
                        it.level += 1;
                        return table;
                    }
                }

                assert(it.level == config.lsm_levels);
                return null;
            }
        };

        /// Debug check: no level exceeds its visible-table budget.
        pub fn assert_level_table_counts(manifest: *const Manifest) void {
            for (manifest.levels) |*manifest_level, index| {
                const level = @intCast(u8, index);
                const table_count_visible_max = table_count_max_for_level(growth_factor, level);
                assert(manifest_level.table_count_visible <= table_count_visible_max);
            }
        }

        /// Debug check: no level holds a table invisible to both `snapshot`
        /// and snapshot_latest (i.e. garbage that should have been removed).
        pub fn assert_no_invisible_tables(manifest: *const Manifest, snapshot: u64) void {
            for (manifest.levels) |*manifest_level| {
                var it = manifest_level.iterator(
                    .invisible,
                    @as(*const [1]u64, &snapshot),
                    .ascending,
                    null,
                );
                assert(it.next() == null);
            }
        }

        /// Returns the next table in the range, after `key_exclusive` if provided.
        pub fn next_table(
            manifest: *const Manifest,
            level: u8,
            snapshot: u64,
            key_min: Key,
            key_max: Key,
            key_exclusive: ?Key,
            direction: Direction,
        ) ?*const TableInfo {
            assert(level < config.lsm_levels);
            assert(compare_keys(key_min, key_max) != .gt);

            const snapshots = [_]u64{snapshot};

            // No exclusive bound: simply the first table in the range.
            if (key_exclusive == null) {
                return manifest.levels[level].iterator(
                    .visible,
                    &snapshots,
                    direction,
                    KeyRange{ .key_min = key_min, .key_max = key_max },
                ).next();
            }

            assert(compare_keys(key_exclusive.?, key_min) != .lt);
            assert(compare_keys(key_exclusive.?, key_max) != .gt);

            // Narrow the scan to start at key_exclusive in the scan direction;
            // tables containing key_exclusive itself are filtered below.
            const key_min_exclusive = if (direction == .ascending) key_exclusive.? else key_min;
            const key_max_exclusive = if (direction == .descending) key_exclusive.? else key_max;
            assert(compare_keys(key_min_exclusive, key_max_exclusive) != .gt);

            var it = manifest.levels[level].iterator(
                .visible,
                &snapshots,
                direction,
                KeyRange{ .key_min = key_min_exclusive, .key_max = key_max_exclusive },
            );

            while (it.next()) |table| {
                assert(table.visible(snapshot));
                assert(compare_keys(table.key_min, table.key_max) != .gt);
                assert(compare_keys(table.key_max, key_min_exclusive) != .lt);
                assert(compare_keys(table.key_min, key_max_exclusive) != .gt);

                // Keep only tables strictly beyond key_exclusive.
                const next = switch (direction) {
                    .ascending => compare_keys(table.key_min, key_exclusive.?) == .gt,
                    .descending => compare_keys(table.key_max, key_exclusive.?) == .lt,
                };
                if (next) return table;
            }

            return null;
        }

        /// Returns the most optimal table for compaction from a level that is due for compaction.
        /// Returns null if the level is not due for compaction (table_count_visible < count_max).
        /// "Optimal" = the visible table whose compaction range into level_a+1
        /// overlaps the fewest tables.
        pub fn compaction_table(manifest: *const Manifest, level_a: u8) ?CompactionTableRange {
            assert(level_a < config.lsm_levels - 1); // The last level is not compacted into another.

            const table_count_visible_max = table_count_max_for_level(growth_factor, level_a);
            assert(table_count_visible_max > 0);

            const manifest_level: *const Level = &manifest.levels[level_a];
            if (manifest_level.table_count_visible < table_count_visible_max) return null;
            // If even levels are compacted ahead of odd levels, then odd levels may burst.
            assert(manifest_level.table_count_visible <= table_count_visible_max + 1);

            var optimal: ?CompactionTableRange = null;

            const snapshots = [1]u64{snapshot_latest};
            var iterations: usize = 0;
            var it = manifest.levels[level_a].iterator(
                .visible,
                &snapshots,
                .ascending,
                null, // All visible tables in the level therefore no KeyRange filter.
            );

            while (it.next()) |table| {
                iterations += 1;

                const range = manifest.compaction_range(level_a + 1, table.key_min, table.key_max);
                if (optimal == null or range.table_count < optimal.?.range.table_count) {
                    optimal = .{
                        .table = table,
                        .range = range,
                    };
                }
                // If the table can be moved directly between levels then that is already optimal.
                if (optimal.?.range.table_count == 1) break;
            }
            // The level is due for compaction, so at least one visible table exists.
            assert(iterations > 0);
            assert(iterations == manifest_level.table_count_visible or
                optimal.?.range.table_count == 1);

            return optimal.?;
        }

        pub const CompactionTableRange = struct {
            table: *const TableInfo,
            range: CompactionRange,
        };

        pub const CompactionRange = struct {
            /// The total number of tables in the compaction across both levels, always at least 1.
            table_count: usize,
            /// The minimum key across both levels.
            key_min: Key,
            /// The maximum key across both levels.
            key_max: Key,
        };

        /// Returns the smallest visible range across level A and B that overlaps key_min/max.
        ///
        /// For example, for a table in level 2, count how many tables overlap in level 3, and
        /// determine the span of their entire key range, which may be broader or narrower.
        ///
        /// The range.table_count includes the input table from level A represented by key_min/max.
        /// Thus range.table_count=1 means that the table may be moved directly between levels.
        ///
        /// The range keys are guaranteed to encompass all the relevant level A and level B tables:
        ///   range.key_min = min(a.key_min, b.key_min)
        ///   range.key_max = max(a.key_max, b.key_max)
        ///
        /// This last invariant is critical to ensuring that tombstones are dropped correctly.
        pub fn compaction_range(
            manifest: *const Manifest,
            level_b: u8,
            key_min: Key,
            key_max: Key,
        ) CompactionRange {
            assert(level_b < config.lsm_levels);
            assert(compare_keys(key_min, key_max) != .gt);

            // Start with the level A table alone (table_count = 1).
            var range = CompactionRange{
                .table_count = 1,
                .key_min = key_min,
                .key_max = key_max,
            };

            const snapshots = [_]u64{snapshot_latest};
            var it = manifest.levels[level_b].iterator(
                .visible,
                &snapshots,
                .ascending,
                KeyRange{ .key_min = range.key_min, .key_max = range.key_max },
            );

            while (it.next()) |table| : (range.table_count += 1) {
                assert(table.visible(snapshot_latest));
                assert(compare_keys(table.key_min, table.key_max) != .gt);
                assert(compare_keys(table.key_max, range.key_min) != .lt);
                assert(compare_keys(table.key_min, range.key_max) != .gt);

                // The first iterated table.key_min/max may overlap range.key_min/max entirely.
                if (compare_keys(table.key_min, range.key_min) == .lt) {
                    range.key_min = table.key_min;
                }

                // Thereafter, iterated tables may/may not extend the range in ascending order.
                if (compare_keys(table.key_max, range.key_max) == .gt) {
                    range.key_max = table.key_max;
                }
            }

            assert(range.table_count > 0);
            assert(compare_keys(range.key_min, range.key_max) != .gt);
            // The result must cover the input range (tombstone-dropping invariant above).
            assert(compare_keys(range.key_min, key_min) != .gt);
            assert(compare_keys(range.key_max, key_max) != .lt);

            return range;
        }

        /// If no subsequent levels have any overlap, then tombstones must be dropped.
        pub fn compaction_must_drop_tombstones(
            manifest: *const Manifest,
            level_b: u8,
            range: CompactionRange,
        ) bool {
            assert(level_b < config.lsm_levels);
            assert(range.table_count > 0);
            assert(compare_keys(range.key_min, range.key_max) != .gt);

            var level_c: u8 = level_b + 1;
            while (level_c < config.lsm_levels) : (level_c += 1) {
                const snapshots = [_]u64{snapshot_latest};

                var it = manifest.levels[level_c].iterator(
                    .visible,
                    &snapshots,
                    .ascending,
                    KeyRange{ .key_min = range.key_min, .key_max = range.key_max },
                );
                if (it.next() != null) {
                    // If the range is being compacted into the last level then this is unreachable,
                    // as the last level has no subsequent levels and must always drop tombstones.
                    assert(level_b != config.lsm_levels - 1);
                    return false;
                }
            }

            assert(level_c == config.lsm_levels);
            return true;
        }

        /// Asynchronously compacts the manifest log; `callback` fires on completion.
        pub fn compact(manifest: *Manifest, callback: Callback) void {
            assert(manifest.compact_callback == null);
            manifest.compact_callback = callback;

            manifest.manifest_log.compact(manifest_log_compact_callback);
        }

        fn manifest_log_compact_callback(manifest_log: *ManifestLog) void {
            const manifest = @fieldParentPtr(Manifest, "manifest_log", manifest_log);
            assert(manifest.compact_callback != null);

            // Clear before invoking so the callback may start another compact().
            const callback = manifest.compact_callback.?;
            manifest.compact_callback = null;
            callback(manifest);
        }

        /// Asynchronously checkpoints the manifest log; `callback` fires on completion.
        pub fn checkpoint(manifest: *Manifest, callback: Callback) void {
            assert(manifest.checkpoint_callback == null);
            manifest.checkpoint_callback = callback;

            manifest.manifest_log.checkpoint(manifest_log_checkpoint_callback);
        }

        fn manifest_log_checkpoint_callback(manifest_log: *ManifestLog) void {
            const manifest = @fieldParentPtr(Manifest, "manifest_log", manifest_log);
            assert(manifest.checkpoint_callback != null);

            const callback = manifest.checkpoint_callback.?;
            manifest.checkpoint_callback = null;
            callback(manifest);
        }
    };
}
|