tigerbeetle-node 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +302 -101
- package/dist/index.d.ts +70 -72
- package/dist/index.js +70 -72
- package/dist/index.js.map +1 -1
- package/package.json +6 -6
- package/scripts/download_node_headers.sh +14 -7
- package/src/index.ts +6 -10
- package/src/node.zig +6 -3
- package/src/tigerbeetle/scripts/benchmark.sh +4 -4
- package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
- package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
- package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
- package/src/tigerbeetle/scripts/install.sh +19 -4
- package/src/tigerbeetle/scripts/install_zig.bat +5 -1
- package/src/tigerbeetle/scripts/install_zig.sh +24 -14
- package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
- package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
- package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
- package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
- package/src/tigerbeetle/src/benchmark.zig +4 -2
- package/src/tigerbeetle/src/benchmark_array_search.zig +3 -3
- package/src/tigerbeetle/src/c/tb_client/thread.zig +8 -9
- package/src/tigerbeetle/src/c/tb_client.h +100 -80
- package/src/tigerbeetle/src/c/tb_client.zig +4 -1
- package/src/tigerbeetle/src/cli.zig +1 -1
- package/src/tigerbeetle/src/config.zig +48 -16
- package/src/tigerbeetle/src/demo.zig +3 -1
- package/src/tigerbeetle/src/eytzinger_benchmark.zig +3 -3
- package/src/tigerbeetle/src/io/linux.zig +1 -1
- package/src/tigerbeetle/src/lsm/README.md +214 -0
- package/src/tigerbeetle/src/lsm/binary_search.zig +137 -10
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +43 -0
- package/src/tigerbeetle/src/lsm/compaction.zig +352 -398
- package/src/tigerbeetle/src/lsm/composite_key.zig +2 -0
- package/src/tigerbeetle/src/lsm/eytzinger.zig +1 -1
- package/src/tigerbeetle/src/lsm/forest.zig +21 -447
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +412 -0
- package/src/tigerbeetle/src/lsm/grid.zig +145 -69
- package/src/tigerbeetle/src/lsm/groove.zig +196 -133
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +40 -18
- package/src/tigerbeetle/src/lsm/level_iterator.zig +28 -9
- package/src/tigerbeetle/src/lsm/manifest.zig +81 -181
- package/src/tigerbeetle/src/lsm/manifest_level.zig +210 -454
- package/src/tigerbeetle/src/lsm/manifest_log.zig +77 -28
- package/src/tigerbeetle/src/lsm/posted_groove.zig +64 -76
- package/src/tigerbeetle/src/lsm/segmented_array.zig +561 -241
- package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +62 -12
- package/src/tigerbeetle/src/lsm/table.zig +83 -48
- package/src/tigerbeetle/src/lsm/table_immutable.zig +30 -23
- package/src/tigerbeetle/src/lsm/table_iterator.zig +25 -14
- package/src/tigerbeetle/src/lsm/table_mutable.zig +63 -12
- package/src/tigerbeetle/src/lsm/test.zig +49 -55
- package/src/tigerbeetle/src/lsm/tree.zig +407 -402
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +457 -0
- package/src/tigerbeetle/src/main.zig +28 -6
- package/src/tigerbeetle/src/message_bus.zig +2 -2
- package/src/tigerbeetle/src/message_pool.zig +14 -17
- package/src/tigerbeetle/src/simulator.zig +145 -112
- package/src/tigerbeetle/src/state_machine.zig +338 -228
- package/src/tigerbeetle/src/static_allocator.zig +65 -0
- package/src/tigerbeetle/src/storage.zig +3 -7
- package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
- package/src/tigerbeetle/src/test/accounting/workload.zig +819 -0
- package/src/tigerbeetle/src/test/cluster.zig +18 -48
- package/src/tigerbeetle/src/test/conductor.zig +365 -0
- package/src/tigerbeetle/src/test/fuzz.zig +121 -0
- package/src/tigerbeetle/src/test/id.zig +89 -0
- package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
- package/src/tigerbeetle/src/test/state_checker.zig +93 -69
- package/src/tigerbeetle/src/test/state_machine.zig +11 -35
- package/src/tigerbeetle/src/test/storage.zig +29 -8
- package/src/tigerbeetle/src/tigerbeetle.zig +14 -16
- package/src/tigerbeetle/src/unit_tests.zig +7 -0
- package/src/tigerbeetle/src/vopr.zig +494 -0
- package/src/tigerbeetle/src/vopr_hub/README.md +58 -0
- package/src/tigerbeetle/src/vopr_hub/SETUP.md +199 -0
- package/src/tigerbeetle/src/vopr_hub/go.mod +3 -0
- package/src/tigerbeetle/src/vopr_hub/main.go +1022 -0
- package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +3 -0
- package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +403 -0
- package/src/tigerbeetle/src/vsr/client.zig +13 -0
- package/src/tigerbeetle/src/vsr/journal.zig +16 -13
- package/src/tigerbeetle/src/vsr/replica.zig +924 -491
- package/src/tigerbeetle/src/vsr/superblock.zig +55 -37
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -10
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +2 -2
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +18 -3
- package/src/tigerbeetle/src/vsr.zig +75 -55
- package/src/tigerbeetle/scripts/vopr.bat +0 -48
- package/src/tigerbeetle/scripts/vopr.sh +0 -33
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
//! An LSM tree.
|
|
1
2
|
const std = @import("std");
|
|
2
3
|
const builtin = @import("builtin");
|
|
3
4
|
const assert = std.debug.assert;
|
|
@@ -11,13 +12,11 @@ const config = @import("../config.zig");
|
|
|
11
12
|
const div_ceil = @import("../util.zig").div_ceil;
|
|
12
13
|
const eytzinger = @import("eytzinger.zig").eytzinger;
|
|
13
14
|
const vsr = @import("../vsr.zig");
|
|
14
|
-
const binary_search = @import("binary_search.zig");
|
|
15
15
|
const bloom_filter = @import("bloom_filter.zig");
|
|
16
16
|
|
|
17
17
|
const CompositeKey = @import("composite_key.zig").CompositeKey;
|
|
18
18
|
const NodePool = @import("node_pool.zig").NodePool(config.lsm_manifest_node_size, 16);
|
|
19
19
|
const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
|
|
20
|
-
const SuperBlockType = vsr.SuperBlockType;
|
|
21
20
|
|
|
22
21
|
/// We reserve maxInt(u64) to indicate that a table has not been deleted.
|
|
23
22
|
/// Tables that have not been deleted have snapshot_max of maxInt(u64).
|
|
@@ -26,6 +25,8 @@ const SuperBlockType = vsr.SuperBlockType;
|
|
|
26
25
|
/// to query all non-deleted tables.
|
|
27
26
|
pub const snapshot_latest: u64 = math.maxInt(u64) - 1;
|
|
28
27
|
|
|
28
|
+
const half_bar_beat_count = @divExact(config.lsm_batch_multiple, 2);
|
|
29
|
+
|
|
29
30
|
// StateMachine:
|
|
30
31
|
//
|
|
31
32
|
// /// state machine will pass this on to all object stores
|
|
@@ -35,7 +36,7 @@ pub const snapshot_latest: u64 = math.maxInt(u64) - 1;
|
|
|
35
36
|
// /// write the ops in batch to the memtable/objcache, previously called commit()
|
|
36
37
|
// pub fn write(batch) void
|
|
37
38
|
//
|
|
38
|
-
// /// Flush in memory state to disk,
|
|
39
|
+
// /// Flush in memory state to disk, perform merges, etc
|
|
39
40
|
// /// Only function that triggers Write I/O in LSMs, as well as some Read
|
|
40
41
|
// /// Make as incremental as possible, don't block the main thread, avoid high latency/spikes
|
|
41
42
|
// pub fn flush(callback) void
|
|
@@ -47,14 +48,14 @@ pub const snapshot_latest: u64 = math.maxInt(u64) - 1;
|
|
|
47
48
|
// pub fn decode_superblock(buffer) void
|
|
48
49
|
//
|
|
49
50
|
|
|
51
|
+
/// The maximum number of tables for a single tree.
|
|
50
52
|
pub const table_count_max = table_count_max_for_tree(config.lsm_growth_factor, config.lsm_levels);
|
|
51
53
|
|
|
52
|
-
pub fn TreeType(comptime
|
|
53
|
-
const Key =
|
|
54
|
-
const Value =
|
|
55
|
-
const compare_keys =
|
|
56
|
-
const tombstone =
|
|
57
|
-
const tombstone_from_key = Table.tombstone_from_key;
|
|
54
|
+
pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_name: []const u8) type {
|
|
55
|
+
const Key = TreeTable.Key;
|
|
56
|
+
const Value = TreeTable.Value;
|
|
57
|
+
const compare_keys = TreeTable.compare_keys;
|
|
58
|
+
const tombstone = TreeTable.tombstone;
|
|
58
59
|
|
|
59
60
|
const tree_hash = blk: {
|
|
60
61
|
// Blake3 hash does a lot at comptime..
|
|
@@ -69,34 +70,25 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
69
70
|
const Tree = @This();
|
|
70
71
|
|
|
71
72
|
// Expose the Table & hash for the Groove.
|
|
72
|
-
pub const
|
|
73
|
+
pub const Table = TreeTable;
|
|
73
74
|
pub const name = tree_name;
|
|
74
75
|
pub const hash = tree_hash;
|
|
75
76
|
|
|
76
77
|
const Grid = @import("grid.zig").GridType(Storage);
|
|
77
78
|
const Manifest = @import("manifest.zig").ManifestType(Table, Storage);
|
|
78
|
-
const TableMutable = @import("table_mutable.zig").TableMutableType(Table);
|
|
79
|
+
pub const TableMutable = @import("table_mutable.zig").TableMutableType(Table);
|
|
79
80
|
const TableImmutable = @import("table_immutable.zig").TableImmutableType(Table);
|
|
80
81
|
|
|
81
82
|
const CompactionType = @import("compaction.zig").CompactionType;
|
|
82
83
|
const TableIteratorType = @import("table_iterator.zig").TableIteratorType;
|
|
83
84
|
const TableImmutableIteratorType = @import("table_immutable.zig").TableImmutableIteratorType;
|
|
84
85
|
|
|
85
|
-
pub const ValueCache = std.HashMapUnmanaged(Value, void, Table.HashMapContextValue, 70);
|
|
86
|
-
|
|
87
86
|
const CompactionTable = CompactionType(Table, Storage, TableIteratorType);
|
|
88
87
|
const CompactionTableImmutable = CompactionType(Table, Storage, TableImmutableIteratorType);
|
|
89
88
|
|
|
90
89
|
grid: *Grid,
|
|
91
90
|
options: Options,
|
|
92
91
|
|
|
93
|
-
/// TODO(ifreund) Replace this with SetAssociativeCache:
|
|
94
|
-
/// A set associative cache of values shared by trees with the same key/value sizes.
|
|
95
|
-
/// This is used to accelerate point lookups and is not used for range queries.
|
|
96
|
-
/// Secondary index trees used only for range queries can therefore set this to null.
|
|
97
|
-
/// The value type will be []u8 and this will be shared by trees with the same value size.
|
|
98
|
-
value_cache: ?*ValueCache,
|
|
99
|
-
|
|
100
92
|
table_mutable: TableMutable,
|
|
101
93
|
table_immutable: TableImmutable,
|
|
102
94
|
|
|
@@ -111,7 +103,26 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
111
103
|
/// This means that for odd lsm_levels, the last CompactionTable is unused.
|
|
112
104
|
compaction_table: [@divFloor(config.lsm_levels, 2)]CompactionTable,
|
|
113
105
|
|
|
106
|
+
/// While a compaction is running, this is the op of the last compact().
|
|
107
|
+
/// While no compaction is running, this is the op of the last compact() to complete.
|
|
108
|
+
/// (When recovering from a checkpoint, compaction_op starts at op_checkpoint).
|
|
114
109
|
compaction_op: u64,
|
|
110
|
+
|
|
111
|
+
/// The maximum snapshot which is safe to prefetch from.
|
|
112
|
+
/// The minimum snapshot which can see the mutable table.
|
|
113
|
+
///
|
|
114
|
+
/// This field ensures that the tree never queries the output tables of a running
|
|
115
|
+
/// compaction; they are incomplete.
|
|
116
|
+
///
|
|
117
|
+
/// See lookup_snapshot_max_for_checkpoint().
|
|
118
|
+
///
|
|
119
|
+
/// Invariants:
|
|
120
|
+
/// * `lookup_snapshot_max = compaction_op` while any compaction beat is in progress.
|
|
121
|
+
/// * `lookup_snapshot_max = compaction_op + 1` after a compaction beat finishes.
|
|
122
|
+
/// * `lookup_snapshot_max ≥ op_checkpoint + 1 + lsm_batch_multiple`
|
|
123
|
+
/// when `op_checkpoint ≠ 0`.
|
|
124
|
+
lookup_snapshot_max: u64,
|
|
125
|
+
|
|
115
126
|
compaction_io_pending: usize,
|
|
116
127
|
compaction_callback: ?fn (*Tree) void,
|
|
117
128
|
|
|
@@ -120,22 +131,45 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
120
131
|
|
|
121
132
|
pub const Options = struct {
|
|
122
133
|
/// The maximum number of keys that may be committed per batch.
|
|
123
|
-
|
|
134
|
+
///
|
|
135
|
+
/// In general, the commit count max for a field depends on the field's object —
|
|
136
|
+
/// how many objects might be inserted/updated/removed by a batch:
|
|
137
|
+
/// (config.message_size_max - sizeOf(vsr.header))
|
|
138
|
+
/// For example, there are at most 8191 transfers in a batch.
|
|
139
|
+
/// So commit_entries_max=8191 for transfer objects and indexes.
|
|
140
|
+
///
|
|
141
|
+
/// However, if a transfer is ever mutated, then this will double commit_entries_max
|
|
142
|
+
/// since the old index might need to be removed, and the new index inserted.
|
|
143
|
+
///
|
|
144
|
+
/// A way to see this is by looking at the state machine. If a transfer is inserted,
|
|
145
|
+
/// how many accounts and transfer put/removes will be generated?
|
|
146
|
+
///
|
|
147
|
+
/// This also means looking at the state machine operation that will generate the
|
|
148
|
+
/// most put/removes in the worst case.
|
|
149
|
+
/// For example, create_accounts will put at most 8191 accounts.
|
|
150
|
+
/// However, create_transfers will put 2 accounts (8191 * 2) for every transfer, and
|
|
151
|
+
/// some of these accounts may exist, requiring a remove/put to update the index.
|
|
152
|
+
commit_entries_max: u32,
|
|
124
153
|
};
|
|
125
154
|
|
|
126
155
|
pub fn init(
|
|
127
156
|
allocator: mem.Allocator,
|
|
128
157
|
node_pool: *NodePool,
|
|
129
158
|
grid: *Grid,
|
|
130
|
-
|
|
159
|
+
values_cache: ?*TableMutable.ValuesCache,
|
|
131
160
|
options: Options,
|
|
132
161
|
) !Tree {
|
|
133
|
-
|
|
162
|
+
assert(options.commit_entries_max > 0);
|
|
163
|
+
assert(grid.superblock.opened);
|
|
164
|
+
|
|
165
|
+
var table_mutable = try TableMutable.init(allocator, values_cache, options.commit_entries_max);
|
|
134
166
|
errdefer table_mutable.deinit(allocator);
|
|
135
167
|
|
|
136
|
-
var table_immutable = try TableImmutable.init(allocator, options.
|
|
168
|
+
var table_immutable = try TableImmutable.init(allocator, options.commit_entries_max);
|
|
137
169
|
errdefer table_immutable.deinit(allocator);
|
|
138
170
|
|
|
171
|
+
assert(table_immutable.value_count_max == table_mutable.value_count_max);
|
|
172
|
+
|
|
139
173
|
var manifest = try Manifest.init(allocator, node_pool, grid, tree_hash);
|
|
140
174
|
errdefer manifest.deinit(allocator);
|
|
141
175
|
|
|
@@ -149,16 +183,21 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
149
183
|
}
|
|
150
184
|
errdefer for (compaction_table) |*c| c.deinit(allocator);
|
|
151
185
|
|
|
186
|
+
// Compaction is one bar ahead of superblock's commit_min.
|
|
187
|
+
const op_checkpoint = grid.superblock.working.vsr_state.commit_min;
|
|
188
|
+
const lookup_snapshot_max = lookup_snapshot_max_for_checkpoint(op_checkpoint);
|
|
189
|
+
const compaction_op = op_checkpoint;
|
|
190
|
+
|
|
152
191
|
return Tree{
|
|
153
192
|
.grid = grid,
|
|
154
193
|
.options = options,
|
|
155
|
-
.value_cache = value_cache,
|
|
156
194
|
.table_mutable = table_mutable,
|
|
157
195
|
.table_immutable = table_immutable,
|
|
158
196
|
.manifest = manifest,
|
|
159
197
|
.compaction_table_immutable = compaction_table_immutable,
|
|
160
198
|
.compaction_table = compaction_table,
|
|
161
|
-
.compaction_op =
|
|
199
|
+
.compaction_op = compaction_op,
|
|
200
|
+
.lookup_snapshot_max = lookup_snapshot_max,
|
|
162
201
|
.compaction_io_pending = 0,
|
|
163
202
|
.compaction_callback = null,
|
|
164
203
|
.checkpoint_callback = null,
|
|
@@ -176,15 +215,6 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
176
215
|
tree.manifest.deinit(allocator);
|
|
177
216
|
}
|
|
178
217
|
|
|
179
|
-
/// Get a cached value/tombstone for the given key.
|
|
180
|
-
/// Returns null if no value/tombstone for the given key is cached.
|
|
181
|
-
pub fn get_cached(tree: *const Tree, key: Key) ?*const Value {
|
|
182
|
-
const value = tree.table_mutable.get(key) orelse
|
|
183
|
-
tree.value_cache.?.getKeyPtr(tombstone_from_key(key));
|
|
184
|
-
|
|
185
|
-
return value;
|
|
186
|
-
}
|
|
187
|
-
|
|
188
218
|
pub fn put(tree: *Tree, value: *const Value) void {
|
|
189
219
|
tree.table_mutable.put(value);
|
|
190
220
|
}
|
|
@@ -193,33 +223,41 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
193
223
|
tree.table_mutable.remove(value);
|
|
194
224
|
}
|
|
195
225
|
|
|
196
|
-
|
|
226
|
+
/// Returns the value from the mutable or immutable table (possibly a tombstone),
|
|
227
|
+
/// if one is available for the specified snapshot.
|
|
228
|
+
pub fn lookup_from_memory(tree: *Tree, snapshot: u64, key: Key) ?*const Value {
|
|
229
|
+
assert(tree.lookup_snapshot_max >= snapshot);
|
|
230
|
+
|
|
231
|
+
if (tree.lookup_snapshot_max == snapshot) {
|
|
232
|
+
if (tree.table_mutable.get(key)) |value| return value;
|
|
233
|
+
} else {
|
|
234
|
+
// The mutable table is converted to an immutable table when a snapshot is created.
|
|
235
|
+
// This means that a past snapshot will never be able to see the mutable table.
|
|
236
|
+
// This simplifies the mutable table and eliminates compaction for duplicate puts.
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
if (!tree.table_immutable.free and tree.table_immutable.snapshot_min <= snapshot) {
|
|
240
|
+
if (tree.table_immutable.get(key)) |value| return value;
|
|
241
|
+
} else {
|
|
242
|
+
// If the immutable table is invisible, then the mutable table is also invisible.
|
|
243
|
+
assert(tree.table_immutable.free or snapshot != tree.lookup_snapshot_max);
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
return null;
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
/// Call this function only after checking `lookup_from_memory()`.
|
|
250
|
+
pub fn lookup_from_levels(
|
|
197
251
|
tree: *Tree,
|
|
198
252
|
callback: fn (*LookupContext, ?*const Value) void,
|
|
199
253
|
context: *LookupContext,
|
|
200
254
|
snapshot: u64,
|
|
201
255
|
key: Key,
|
|
202
256
|
) void {
|
|
203
|
-
assert(
|
|
204
|
-
if (
|
|
205
|
-
// The
|
|
206
|
-
|
|
207
|
-
// This simplifies the mutable table and eliminates compaction for duplicate puts.
|
|
208
|
-
// The value cache is only used for the latest snapshot for simplicity.
|
|
209
|
-
// Earlier snapshots will still be able to utilize the block cache.
|
|
210
|
-
if (tree.table_mutable.get(key) orelse
|
|
211
|
-
tree.value_cache.?.getKeyPtr(tombstone_from_key(key))) |value|
|
|
212
|
-
{
|
|
213
|
-
callback(context, unwrap_tombstone(value));
|
|
214
|
-
return;
|
|
215
|
-
}
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
if (!tree.table_immutable.free and tree.table_immutable.snapshot_min < snapshot) {
|
|
219
|
-
if (tree.table_immutable.get(key)) |value| {
|
|
220
|
-
callback(context, unwrap_tombstone(value));
|
|
221
|
-
return;
|
|
222
|
-
}
|
|
257
|
+
assert(tree.lookup_snapshot_max >= snapshot);
|
|
258
|
+
if (config.verify) {
|
|
259
|
+
// The caller is responsible for checking the mutable table.
|
|
260
|
+
assert(tree.lookup_from_memory(snapshot, key) == null);
|
|
223
261
|
}
|
|
224
262
|
|
|
225
263
|
var index_block_count: u8 = 0;
|
|
@@ -285,12 +323,6 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
285
323
|
|
|
286
324
|
callback: fn (*Tree.LookupContext, ?*const Value) void,
|
|
287
325
|
|
|
288
|
-
fn finish(context: *LookupContext, value: ?*const Value) void {
|
|
289
|
-
const callback = context.callback;
|
|
290
|
-
context.* = undefined;
|
|
291
|
-
callback(context, value);
|
|
292
|
-
}
|
|
293
|
-
|
|
294
326
|
fn read_index_block(context: *LookupContext) void {
|
|
295
327
|
assert(context.data_block == null);
|
|
296
328
|
assert(context.index_block < context.index_block_count);
|
|
@@ -302,6 +334,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
302
334
|
&context.completion,
|
|
303
335
|
context.index_block_addresses[context.index_block],
|
|
304
336
|
context.index_block_checksums[context.index_block],
|
|
337
|
+
.index,
|
|
305
338
|
);
|
|
306
339
|
}
|
|
307
340
|
|
|
@@ -324,6 +357,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
324
357
|
completion,
|
|
325
358
|
blocks.filter_block_address,
|
|
326
359
|
blocks.filter_block_checksum,
|
|
360
|
+
.filter,
|
|
327
361
|
);
|
|
328
362
|
}
|
|
329
363
|
|
|
@@ -341,6 +375,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
341
375
|
completion,
|
|
342
376
|
context.data_block.?.address,
|
|
343
377
|
context.data_block.?.checksum,
|
|
378
|
+
.data,
|
|
344
379
|
);
|
|
345
380
|
} else {
|
|
346
381
|
// The key is not present in this table, check the next level.
|
|
@@ -356,7 +391,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
356
391
|
assert(context.index_block_count <= config.lsm_levels);
|
|
357
392
|
|
|
358
393
|
if (Table.data_block_search(data_block, context.key)) |value| {
|
|
359
|
-
context.
|
|
394
|
+
context.callback(context, unwrap_tombstone(value));
|
|
360
395
|
} else {
|
|
361
396
|
// The key is not present in this table, check the next level.
|
|
362
397
|
context.advance_to_next_level();
|
|
@@ -371,7 +406,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
371
406
|
|
|
372
407
|
context.index_block += 1;
|
|
373
408
|
if (context.index_block == context.index_block_count) {
|
|
374
|
-
context.
|
|
409
|
+
context.callback(context, null);
|
|
375
410
|
return;
|
|
376
411
|
}
|
|
377
412
|
assert(context.index_block < context.index_block_count);
|
|
@@ -384,7 +419,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
384
419
|
/// Returns null if the value is null or a tombstone, otherwise returns the value.
|
|
385
420
|
/// We use tombstone values internally, but expose them as null to the user.
|
|
386
421
|
/// This distinction enables us to cache a null result as a tombstone in our hash maps.
|
|
387
|
-
inline fn unwrap_tombstone(value: ?*const Value) ?*const Value {
|
|
422
|
+
pub inline fn unwrap_tombstone(value: ?*const Value) ?*const Value {
|
|
388
423
|
return if (value == null or tombstone(value.?)) null else value.?;
|
|
389
424
|
}
|
|
390
425
|
|
|
@@ -404,53 +439,6 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
404
439
|
callback(tree);
|
|
405
440
|
}
|
|
406
441
|
|
|
407
|
-
// Tree compaction runs to the sound of music!
|
|
408
|
-
//
|
|
409
|
-
// Compacting LSM trees involves merging and moving tables into the next levels as needed.
|
|
410
|
-
// To avoid write amplification stalls and bound latency, compaction is done incrementally.
|
|
411
|
-
//
|
|
412
|
-
// A full compaction phase is denoted as a bar or measure, using terms from music notation.
|
|
413
|
-
// Each measure consists of `lsm_batch_multiple` beats or "compaction ticks" of work.
|
|
414
|
-
// A compaction beat is started asynchronously with `compact_io` which takes a callback.
|
|
415
|
-
// After `compact_io` is called, `compact_cpu` should be called to enable pipelining.
|
|
416
|
-
// The compaction beat completes when the `compact_io` callback is invoked.
|
|
417
|
-
//
|
|
418
|
-
// A measure is split in half according to the "first" down beat and "middle" down beat.
|
|
419
|
-
// The first half of the measure compacts even levels while the latter compacts odd levels.
|
|
420
|
-
// Mutable table changes are sorted and compacted into the immutable table.
|
|
421
|
-
// The immutable table is compacted into level 0 during the odd level half of the measure.
|
|
422
|
-
//
|
|
423
|
-
// At any given point, there's only levels/2 max compactions happening concurrently.
|
|
424
|
-
// The source level is denoted as `level_a` with the target level being `level_b`.
|
|
425
|
-
// The last level in the LSM tree has no target level, so it is never compacted from.
|
|
426
|
-
//
|
|
427
|
-
// Assuming a measure/`lsm_batch_multiple` of 4, the invariants can be described as follows:
|
|
428
|
-
// * assert: at the end of every beat, there's space in mutable table for the next beat.
|
|
429
|
-
// * manifest info for the compacted tables is updated during the compaction.
|
|
430
|
-
// * manifest is compacted at the end of every beat.
|
|
431
|
-
//
|
|
432
|
-
// - (first) down beat of the measure:
|
|
433
|
-
// * assert: no compactions are currently running.
|
|
434
|
-
// * compact immutable table if contains any sorted values (could be empty).
|
|
435
|
-
// * allow level visible table counts to overflow if needed.
|
|
436
|
-
// * start even level compactions if there's any tables to compact.
|
|
437
|
-
//
|
|
438
|
-
// - (second) up beat of the measure:
|
|
439
|
-
// * finish ticking running even-level compactions.
|
|
440
|
-
// * assert: on callback completion, all compactions should be completed.
|
|
441
|
-
//
|
|
442
|
-
// - (third) down beat of the measure:
|
|
443
|
-
// * assert: no compactions are currently running.
|
|
444
|
-
// * start odd level and immutable table compactions.
|
|
445
|
-
//
|
|
446
|
-
// - (fourth) last beat of the measure:
|
|
447
|
-
// * finish ticking running odd-level and immutable table compactions.
|
|
448
|
-
// * assert: on callback completion, all compactions should be completed.
|
|
449
|
-
// * assert: on callback completion, all level visible table counts shouldn't overflow.
|
|
450
|
-
// * flush, clear, and sort mutable table values into immutable table for next measure.
|
|
451
|
-
|
|
452
|
-
const half_measure_beat_count = @divExact(config.lsm_batch_multiple, 2);
|
|
453
|
-
|
|
454
442
|
const CompactionTableContext = struct {
|
|
455
443
|
compaction: *CompactionTable,
|
|
456
444
|
level_a: u8,
|
|
@@ -465,7 +453,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
465
453
|
assert(it.tree.compaction_callback != null);
|
|
466
454
|
|
|
467
455
|
const compaction_beat = it.tree.compaction_op % config.lsm_batch_multiple;
|
|
468
|
-
const even_levels = compaction_beat <
|
|
456
|
+
const even_levels = compaction_beat < half_bar_beat_count;
|
|
469
457
|
const level_a = (it.index * 2) + @boolToInt(!even_levels);
|
|
470
458
|
const level_b = level_a + 1;
|
|
471
459
|
|
|
@@ -489,77 +477,93 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
489
477
|
/// This order (even levels, then odd levels) is significant, since it reduces the number of
|
|
490
478
|
/// level 0 tables that overlap with the immutable table, reducing write amplification.
|
|
491
479
|
///
|
|
492
|
-
/// We therefore take the
|
|
493
|
-
/// running the compactions from even levels in the first half
|
|
480
|
+
/// We therefore take the bar, during which all compactions run, and divide by two,
|
|
481
|
+
/// running the compactions from even levels in the first half bar, and then the odd.
|
|
494
482
|
///
|
|
495
|
-
/// Compactions start on the down beat of a half
|
|
496
|
-
/// For example, if there are 4 beats in a
|
|
483
|
+
/// Compactions start on the down beat of a half bar, using 0-based beats.
|
|
484
|
+
/// For example, if there are 4 beats in a bar, start on beat 0 or beat 2.
|
|
497
485
|
pub fn compact(tree: *Tree, callback: fn (*Tree) void, op: u64) void {
|
|
498
|
-
tree.
|
|
499
|
-
|
|
500
|
-
|
|
486
|
+
assert(tree.compaction_callback == null);
|
|
487
|
+
assert(op != 0);
|
|
488
|
+
assert(op == tree.compaction_op + 1);
|
|
489
|
+
assert(op > tree.grid.superblock.working.vsr_state.commit_min);
|
|
501
490
|
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
491
|
+
tree.compaction_op = op;
|
|
492
|
+
|
|
493
|
+
if (tree.grid.superblock.working.vsr_state.op_compacted(op)) {
|
|
494
|
+
// We recovered from a checkpoint, and must avoid replaying one bar of
|
|
495
|
+
// compactions that were applied before the checkpoint. Repeating these ops'
|
|
496
|
+
// compactions would actually perform different compactions than before,
|
|
497
|
+
// causing the storage state of the replica to diverge from the cluster.
|
|
498
|
+
// See also: lookup_snapshot_max_for_checkpoint().
|
|
499
|
+
|
|
500
|
+
if (tree.compaction_op + 1 == tree.lookup_snapshot_max) {
|
|
501
|
+
// This is the last op of the skipped compaction bar.
|
|
502
|
+
// Prepare the immutable table for the next bar — since this state is
|
|
503
|
+
// in-memory, it cannot be skipped.
|
|
504
|
+
tree.compact_mutable_table_into_immutable();
|
|
505
|
+
}
|
|
506
|
+
|
|
507
|
+
// TODO Defer this callback until tick() to avoid stack growth.
|
|
508
|
+
callback(tree);
|
|
509
|
+
return;
|
|
510
|
+
}
|
|
511
|
+
assert(op == tree.lookup_snapshot_max);
|
|
512
|
+
|
|
513
|
+
tree.compact_start(callback);
|
|
514
|
+
tree.compact_drive();
|
|
506
515
|
}
|
|
507
516
|
|
|
508
|
-
fn compact_start(tree: *Tree, callback: fn (*Tree) void
|
|
517
|
+
fn compact_start(tree: *Tree, callback: fn (*Tree) void) void {
|
|
509
518
|
assert(tree.compaction_io_pending == 0);
|
|
510
519
|
assert(tree.compaction_callback == null);
|
|
511
|
-
|
|
512
|
-
if (op > 0) assert(op > tree.compaction_op);
|
|
513
|
-
tree.compaction_op = op;
|
|
520
|
+
|
|
514
521
|
tree.compaction_callback = callback;
|
|
515
522
|
|
|
516
523
|
const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
|
|
517
524
|
const start = (compaction_beat == 0) or
|
|
518
|
-
(compaction_beat ==
|
|
525
|
+
(compaction_beat == half_bar_beat_count);
|
|
519
526
|
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
// would be in the immutable table. This means the current batch compaction will
|
|
524
|
-
// actually be flushing to disk (levels) mutable table updates from the previous batch.
|
|
525
|
-
//
|
|
526
|
-
// -1 as the ops are zero based so the "last" op from previous batch is reflected.
|
|
527
|
-
const snapshot = std.mem.alignBackward(op, config.lsm_batch_multiple) -| 1;
|
|
528
|
-
assert(snapshot != snapshot_latest);
|
|
527
|
+
const op_min = compaction_op_min(tree.compaction_op);
|
|
528
|
+
assert(op_min < snapshot_latest);
|
|
529
|
+
assert(op_min % half_bar_beat_count == 0);
|
|
529
530
|
|
|
530
|
-
log.debug(tree_name ++ ": compact_start: op={d}
|
|
531
|
-
|
|
532
|
-
|
|
531
|
+
log.debug(tree_name ++ ": compact_start: op={d} op_min={d} beat={d}/{d}", .{
|
|
532
|
+
tree.compaction_op,
|
|
533
|
+
op_min,
|
|
533
534
|
compaction_beat + 1,
|
|
534
535
|
config.lsm_batch_multiple,
|
|
535
536
|
});
|
|
536
537
|
|
|
537
538
|
// Try to start compacting the immutable table.
|
|
538
|
-
const even_levels = compaction_beat <
|
|
539
|
+
const even_levels = compaction_beat < half_bar_beat_count;
|
|
539
540
|
if (even_levels) {
|
|
540
541
|
assert(tree.compaction_table_immutable.status == .idle);
|
|
541
542
|
} else {
|
|
542
|
-
if (start) tree.
|
|
543
|
+
if (start) tree.compact_start_table_immutable(op_min);
|
|
543
544
|
}
|
|
544
545
|
|
|
545
546
|
// Try to start compacting the other levels.
|
|
546
547
|
var it = CompactionTableIterator{ .tree = tree };
|
|
547
548
|
while (it.next()) |context| {
|
|
548
|
-
if (start) tree.
|
|
549
|
+
if (start) tree.compact_start_table(op_min, context);
|
|
549
550
|
}
|
|
550
551
|
}
|
|
551
552
|
|
|
552
|
-
fn
|
|
553
|
+
fn compact_start_table_immutable(tree: *Tree, op_min: u64) void {
|
|
553
554
|
const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
|
|
554
|
-
assert(compaction_beat ==
|
|
555
|
+
assert(compaction_beat == half_bar_beat_count);
|
|
555
556
|
|
|
556
557
|
// Do not start compaction if the immutable table does not require compaction.
|
|
557
558
|
if (tree.table_immutable.free) return;
|
|
558
559
|
|
|
560
|
+
assert(tree.table_immutable.snapshot_min % half_bar_beat_count == 0);
|
|
561
|
+
|
|
559
562
|
const values_count = tree.table_immutable.values.len;
|
|
560
563
|
assert(values_count > 0);
|
|
561
564
|
|
|
562
565
|
const level_b: u8 = 0;
|
|
566
|
+
const table_a: ?*const Manifest.TableInfo = null;
|
|
563
567
|
const range = tree.manifest.compaction_range(
|
|
564
568
|
level_b,
|
|
565
569
|
tree.table_immutable.key_min(),
|
|
@@ -572,24 +576,25 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
572
576
|
|
|
573
577
|
log.debug(tree_name ++
|
|
574
578
|
": compacting immutable table to level 0 " ++
|
|
575
|
-
"(values.len={d} snapshot_min={d} compaction.
|
|
579
|
+
"(values.len={d} snapshot_min={d} compaction.op_min={d} table_count={d})", .{
|
|
576
580
|
tree.table_immutable.values.len,
|
|
577
581
|
tree.table_immutable.snapshot_min,
|
|
578
|
-
|
|
582
|
+
op_min,
|
|
579
583
|
range.table_count,
|
|
580
584
|
});
|
|
581
585
|
|
|
582
586
|
tree.compaction_table_immutable.start(
|
|
583
587
|
tree.grid,
|
|
584
588
|
&tree.manifest,
|
|
585
|
-
|
|
589
|
+
op_min,
|
|
586
590
|
range,
|
|
587
|
-
|
|
591
|
+
table_a,
|
|
592
|
+
level_b,
|
|
588
593
|
.{ .table = &tree.table_immutable },
|
|
589
594
|
);
|
|
590
595
|
}
|
|
591
596
|
|
|
592
|
-
fn
|
|
597
|
+
fn compact_start_table(tree: *Tree, op_min: u64, context: CompactionTableContext) void {
|
|
593
598
|
assert(context.level_a < config.lsm_levels);
|
|
594
599
|
assert(context.level_b < config.lsm_levels);
|
|
595
600
|
assert(context.level_a + 1 == context.level_b);
|
|
@@ -613,9 +618,10 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
613
618
|
context.compaction.start(
|
|
614
619
|
tree.grid,
|
|
615
620
|
&tree.manifest,
|
|
616
|
-
|
|
621
|
+
op_min,
|
|
617
622
|
table_range.range,
|
|
618
|
-
|
|
623
|
+
table_range.table,
|
|
624
|
+
context.level_b,
|
|
619
625
|
.{
|
|
620
626
|
.grid = tree.grid,
|
|
621
627
|
.address = table.address,
|
|
@@ -624,72 +630,70 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
624
630
|
);
|
|
625
631
|
}
|
|
626
632
|
|
|
627
|
-
fn
|
|
633
|
+
fn compact_drive(tree: *Tree) void {
|
|
628
634
|
assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
|
|
629
635
|
assert(tree.compaction_callback != null);
|
|
630
636
|
|
|
631
|
-
//
|
|
637
|
+
// Always start one fake io_pending that is resolved right after
|
|
638
|
+
// to handle the case where this compaction tick triggers no IO.
|
|
639
|
+
// (For example, ticking the immutable table, or level B is already done).
|
|
640
|
+
tree.compaction_io_pending += 1;
|
|
641
|
+
defer tree.compact_tick_done();
|
|
642
|
+
|
|
643
|
+
// Try to tick the immutable table compaction:
|
|
632
644
|
const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
|
|
633
|
-
const even_levels = compaction_beat <
|
|
645
|
+
const even_levels = compaction_beat < half_bar_beat_count;
|
|
634
646
|
if (even_levels) {
|
|
635
647
|
assert(tree.compaction_table_immutable.status == .idle);
|
|
636
648
|
} else {
|
|
637
|
-
|
|
638
|
-
tree.compact_io_tick(&tree.compaction_table_immutable);
|
|
639
|
-
}
|
|
649
|
+
tree.compact_tick(&tree.compaction_table_immutable);
|
|
640
650
|
}
|
|
641
651
|
|
|
642
|
-
// Try to tick the
|
|
652
|
+
// Try to tick the compaction for each level:
|
|
643
653
|
var it = CompactionTableIterator{ .tree = tree };
|
|
644
654
|
while (it.next()) |context| {
|
|
645
|
-
|
|
646
|
-
assert(context.compaction.level_b == context.level_b);
|
|
647
|
-
tree.compact_io_tick(context.compaction);
|
|
648
|
-
}
|
|
655
|
+
tree.compact_tick(context.compaction);
|
|
649
656
|
}
|
|
650
|
-
|
|
651
|
-
// Always start one io_pending that is resolved in compact_cpu()
|
|
652
|
-
// to handle the case of no level or immutable table being selected for compaction
|
|
653
|
-
tree.compaction_io_pending += 1;
|
|
654
|
-
assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
|
|
655
657
|
}
|
|
656
658
|
|
|
657
|
-
fn
|
|
659
|
+
fn compact_tick(tree: *Tree, compaction: anytype) void {
|
|
660
|
+
if (compaction.status != .processing) return;
|
|
658
661
|
tree.compaction_io_pending += 1;
|
|
659
662
|
|
|
660
663
|
const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
|
|
661
|
-
const even_levels = compaction_beat <
|
|
664
|
+
const even_levels = compaction_beat < half_bar_beat_count;
|
|
662
665
|
assert(compaction.level_b < config.lsm_levels);
|
|
663
666
|
assert(compaction.level_b % 2 == @boolToInt(even_levels));
|
|
664
667
|
|
|
665
668
|
if (@TypeOf(compaction.*) == CompactionTableImmutable) {
|
|
666
669
|
assert(compaction.level_b == 0);
|
|
667
|
-
|
|
668
|
-
|
|
670
|
+
log.debug(tree_name ++ ": compact_tick() for immutable table to level 0", .{});
|
|
671
|
+
compaction.compact_tick(Tree.compact_tick_callback_table_immutable);
|
|
669
672
|
} else {
|
|
670
|
-
compaction
|
|
671
|
-
log.debug(tree_name ++ ":
|
|
673
|
+
assert(@TypeOf(compaction.*) == CompactionTable);
|
|
674
|
+
log.debug(tree_name ++ ": compact_tick() for level {d} to level {d}", .{
|
|
672
675
|
compaction.level_b - 1,
|
|
673
676
|
compaction.level_b,
|
|
674
677
|
});
|
|
678
|
+
compaction.compact_tick(Tree.compact_tick_callback_table);
|
|
675
679
|
}
|
|
676
680
|
}
|
|
677
681
|
|
|
678
|
-
fn
|
|
679
|
-
assert(compaction.status == .
|
|
682
|
+
fn compact_tick_callback_table_immutable(compaction: *CompactionTableImmutable) void {
|
|
683
|
+
assert(compaction.status == .processing or compaction.status == .done);
|
|
680
684
|
assert(compaction.level_b < config.lsm_levels);
|
|
681
685
|
assert(compaction.level_b == 0);
|
|
682
686
|
|
|
683
687
|
const tree = @fieldParentPtr(Tree, "compaction_table_immutable", compaction);
|
|
684
688
|
const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
|
|
685
|
-
assert(compaction_beat >=
|
|
689
|
+
assert(compaction_beat >= half_bar_beat_count);
|
|
686
690
|
|
|
687
|
-
log.debug(tree_name ++ ":
|
|
688
|
-
tree.
|
|
691
|
+
log.debug(tree_name ++ ": compact_tick() complete for immutable table to level 0", .{});
|
|
692
|
+
tree.compact_tick_done();
|
|
689
693
|
}
|
|
690
694
|
|
|
691
|
-
fn
|
|
692
|
-
assert(compaction.status == .
|
|
695
|
+
fn compact_tick_callback_table(compaction: *CompactionTable) void {
|
|
696
|
+
assert(compaction.status == .processing or compaction.status == .done);
|
|
693
697
|
assert(compaction.level_b < config.lsm_levels);
|
|
694
698
|
assert(compaction.level_b > 0);
|
|
695
699
|
|
|
@@ -699,182 +703,201 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
699
703
|
const table_size = @divFloor(config.lsm_levels, 2);
|
|
700
704
|
const table: *[table_size]CompactionTable = table_ptr[0..table_size];
|
|
701
705
|
|
|
702
|
-
|
|
703
|
-
log.debug(tree_name ++ ": compact_io complete for level {d} to level {d}", .{
|
|
706
|
+
log.debug(tree_name ++ ": compact_tick() complete for level {d} to level {d}", .{
|
|
704
707
|
compaction.level_b - 1,
|
|
705
708
|
compaction.level_b,
|
|
706
709
|
});
|
|
707
710
|
|
|
708
|
-
tree
|
|
711
|
+
const tree = @fieldParentPtr(Tree, "compaction_table", table);
|
|
712
|
+
tree.compact_tick_done();
|
|
709
713
|
}
|
|
710
714
|
|
|
711
|
-
fn
|
|
715
|
+
fn compact_tick_done(tree: *Tree) void {
|
|
712
716
|
assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
|
|
713
717
|
assert(tree.compaction_callback != null);
|
|
714
718
|
|
|
715
|
-
// compact_done() is called after all
|
|
716
|
-
// This function can be triggered asynchronously or by compact_cpu() below.
|
|
719
|
+
// compact_done() is called after all compact_tick()'s complete.
|
|
717
720
|
tree.compaction_io_pending -= 1;
|
|
718
721
|
if (tree.compaction_io_pending == 0) tree.compact_done();
|
|
719
722
|
}
|
|
720
723
|
|
|
721
|
-
|
|
722
|
-
|
|
724
|
+
/// Called at the end of each compaction tick.
|
|
725
|
+
fn compact_done(tree: *Tree) void {
|
|
726
|
+
assert(tree.compaction_io_pending == 0);
|
|
723
727
|
assert(tree.compaction_callback != null);
|
|
728
|
+
assert(tree.compaction_op == tree.lookup_snapshot_max);
|
|
724
729
|
|
|
725
|
-
// Try to tick the cpu portion of the immutable table compaction:
|
|
726
730
|
const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
|
|
727
|
-
const even_levels = compaction_beat <
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
731
|
+
const even_levels = compaction_beat < half_bar_beat_count;
|
|
732
|
+
const compacted_levels_even = compaction_beat == half_bar_beat_count - 1;
|
|
733
|
+
const compacted_levels_odd = compaction_beat == config.lsm_batch_multiple - 1;
|
|
734
|
+
if (!compacted_levels_even and !compacted_levels_odd) {
|
|
735
|
+
// TODO(Deterministic Beats): Remove this when compact_done() is called exactly
|
|
736
|
+
// once when the beat finishes.
|
|
737
|
+
tree.lookup_snapshot_max = tree.compaction_op + 1;
|
|
738
|
+
|
|
739
|
+
tree.compact_finish();
|
|
740
|
+
return;
|
|
734
741
|
}
|
|
735
742
|
|
|
736
|
-
//
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
743
|
+
// At the end of the second and fourth beat:
|
|
744
|
+
// 1. Tick the Compactions until all have completed.
|
|
745
|
+
// 2. Remove invisible tables from the manifest.
|
|
746
|
+
// 3. Compact the manifest.
|
|
747
|
+
// Then at the end of the fourth beat, freeze the mutable table.
|
|
748
|
+
assert(compacted_levels_even or compacted_levels_odd);
|
|
749
|
+
assert(compacted_levels_even != compacted_levels_odd);
|
|
750
|
+
|
|
751
|
+
const still_compacting = blk: {
|
|
752
|
+
if (even_levels) {
|
|
753
|
+
assert(tree.compaction_table_immutable.status == .idle);
|
|
754
|
+
} else {
|
|
755
|
+
if (tree.compaction_table_immutable.status == .processing) break :blk true;
|
|
742
756
|
}
|
|
743
|
-
}
|
|
744
757
|
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
assert(tree.compaction_callback != null);
|
|
758
|
+
var it = CompactionTableIterator{ .tree = tree };
|
|
759
|
+
while (it.next()) |context| {
|
|
760
|
+
if (context.compaction.status == .processing) break :blk true;
|
|
761
|
+
}
|
|
762
|
+
break :blk false;
|
|
763
|
+
};
|
|
752
764
|
|
|
753
|
-
|
|
765
|
+
if (still_compacting) {
|
|
766
|
+
// We are at the end of a half-bar, but the compactions have not finished.
|
|
767
|
+
// We keep ticking them until they finish.
|
|
768
|
+
log.debug(tree_name ++ ": compact_done: driving outstanding compactions", .{});
|
|
769
|
+
tree.compact_drive();
|
|
770
|
+
return;
|
|
771
|
+
}
|
|
754
772
|
|
|
755
|
-
//
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
773
|
+
// TODO(Deterministic Beats): Move this to the top of the function when compact_done()
|
|
774
|
+
// is called exactly once when the beat finishes.
|
|
775
|
+
tree.lookup_snapshot_max = tree.compaction_op + 1;
|
|
776
|
+
|
|
777
|
+
// All compactions have finished for the current half-bar.
|
|
778
|
+
// We couldn't remove the (invisible) input tables until now because prefetch()
|
|
779
|
+
// needs a complete set of tables for lookups to avoid missing data.
|
|
780
|
+
|
|
781
|
+
// Reset the immutable table Compaction.
|
|
782
|
+
// Also clear any tables made invisible by the compaction.
|
|
783
|
+
if (!even_levels) {
|
|
784
|
+
switch (tree.compaction_table_immutable.status) {
|
|
785
|
+
// The compaction wasn't started for this half bar.
|
|
786
|
+
.idle => assert(tree.table_immutable.free),
|
|
787
|
+
.processing => unreachable,
|
|
788
|
+
.done => {
|
|
789
|
+
tree.compaction_table_immutable.reset();
|
|
790
|
+
tree.table_immutable.clear();
|
|
791
|
+
tree.manifest.remove_invisible_tables(
|
|
792
|
+
tree.compaction_table_immutable.level_b,
|
|
793
|
+
tree.lookup_snapshot_max,
|
|
794
|
+
tree.compaction_table_immutable.range.key_min,
|
|
795
|
+
tree.compaction_table_immutable.range.key_max,
|
|
796
|
+
);
|
|
797
|
+
},
|
|
766
798
|
}
|
|
767
799
|
}
|
|
768
800
|
|
|
769
|
-
//
|
|
801
|
+
// Reset all the other Compactions.
|
|
802
|
+
// Also clear any tables made invisible by the compactions.
|
|
770
803
|
var it = CompactionTableIterator{ .tree = tree };
|
|
771
804
|
while (it.next()) |context| {
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
805
|
+
switch (context.compaction.status) {
|
|
806
|
+
.idle => {}, // The compaction wasn't started for this half bar.
|
|
807
|
+
.processing => unreachable,
|
|
808
|
+
.done => {
|
|
809
|
+
context.compaction.reset();
|
|
810
|
+
tree.manifest.remove_invisible_tables(
|
|
811
|
+
context.compaction.level_b,
|
|
812
|
+
tree.lookup_snapshot_max,
|
|
813
|
+
context.compaction.range.key_min,
|
|
814
|
+
context.compaction.range.key_max,
|
|
815
|
+
);
|
|
816
|
+
if (context.compaction.level_b > 0) {
|
|
817
|
+
tree.manifest.remove_invisible_tables(
|
|
818
|
+
context.compaction.level_b - 1,
|
|
819
|
+
tree.lookup_snapshot_max,
|
|
820
|
+
context.compaction.range.key_min,
|
|
821
|
+
context.compaction.range.key_max,
|
|
822
|
+
);
|
|
823
|
+
}
|
|
824
|
+
},
|
|
777
825
|
}
|
|
778
826
|
}
|
|
779
827
|
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
// - assert: even compactions from previous tick are finished.
|
|
785
|
-
// - remove tables made invisible during compaction of even levels.
|
|
786
|
-
if (compaction_beat == half_measure_beat_count - 1) {
|
|
787
|
-
if (still_compacting) {
|
|
788
|
-
log.debug(tree_name ++ ": compact_done: driving outstanding compactions", .{});
|
|
789
|
-
return tree.compact_drive();
|
|
790
|
-
}
|
|
791
|
-
|
|
792
|
-
log.debug(tree_name ++ ": compact_done: compacted even levels", .{});
|
|
793
|
-
|
|
794
|
-
it = CompactionTableIterator{ .tree = tree };
|
|
795
|
-
while (it.next()) |context| {
|
|
796
|
-
assert(context.compaction.status == .idle);
|
|
797
|
-
tree.manifest.remove_invisible_tables(
|
|
798
|
-
context.level_a,
|
|
799
|
-
context.compaction.snapshot,
|
|
800
|
-
context.compaction.range.key_min,
|
|
801
|
-
context.compaction.range.key_max,
|
|
802
|
-
);
|
|
803
|
-
}
|
|
828
|
+
assert(tree.compaction_table_immutable.status == .idle);
|
|
829
|
+
it = CompactionTableIterator{ .tree = tree };
|
|
830
|
+
while (it.next()) |context| {
|
|
831
|
+
assert(context.compaction.status == .idle);
|
|
804
832
|
}
|
|
805
833
|
|
|
806
|
-
// At end of fourth/last
|
|
807
|
-
// -
|
|
808
|
-
// -
|
|
809
|
-
|
|
810
|
-
// - convert mutable table to immutable tables for next measure.
|
|
811
|
-
if (compaction_beat == config.lsm_batch_multiple - 1) {
|
|
812
|
-
if (still_compacting) {
|
|
813
|
-
log.debug(tree_name ++ ": compact_done: driving outstanding compactions", .{});
|
|
814
|
-
return tree.compact_drive();
|
|
815
|
-
}
|
|
816
|
-
|
|
817
|
-
// TODO Make log message more accurate according to what was compacted.
|
|
818
|
-
log.debug(tree_name ++ ": compact_done: compacted immutable table and odd levels", .{});
|
|
819
|
-
|
|
820
|
-
assert(tree.compaction_table_immutable.status == .idle);
|
|
821
|
-
it = CompactionTableIterator{ .tree = tree };
|
|
822
|
-
while (it.next()) |context| {
|
|
823
|
-
assert(context.compaction.status == .idle);
|
|
824
|
-
tree.manifest.remove_invisible_tables(
|
|
825
|
-
context.level_a,
|
|
826
|
-
context.compaction.snapshot,
|
|
827
|
-
context.compaction.range.key_min,
|
|
828
|
-
context.compaction.range.key_max,
|
|
829
|
-
);
|
|
830
|
-
}
|
|
831
|
-
|
|
834
|
+
// At the end of the fourth/last beat:
|
|
835
|
+
// - Assert all visible tables haven't overflowed their max per level.
|
|
836
|
+
// - Convert mutable table to immutable table for next bar.
|
|
837
|
+
if (compacted_levels_odd) {
|
|
832
838
|
tree.manifest.assert_level_table_counts();
|
|
833
839
|
tree.compact_mutable_table_into_immutable();
|
|
834
840
|
}
|
|
835
841
|
|
|
836
|
-
// At the end of
|
|
842
|
+
// At the end of the second/fourth beat:
|
|
843
|
+
// - Compact the manifest before invoking the compact() callback.
|
|
837
844
|
tree.manifest.compact(compact_manifest_callback);
|
|
838
845
|
}
|
|
839
846
|
|
|
840
|
-
|
|
841
|
-
const tree = @fieldParentPtr(Tree, "manifest", manifest);
|
|
842
|
-
assert(tree.compaction_io_pending == 0);
|
|
843
|
-
assert(tree.compaction_callback != null);
|
|
844
|
-
|
|
845
|
-
// Invoke the compact_io() callback after the manifest compacts at the end of the beat.
|
|
846
|
-
const callback = tree.compaction_callback.?;
|
|
847
|
-
tree.compaction_callback = null;
|
|
848
|
-
callback(tree);
|
|
849
|
-
}
|
|
850
|
-
|
|
847
|
+
/// Called after the last beat of a full compaction bar.
|
|
851
848
|
fn compact_mutable_table_into_immutable(tree: *Tree) void {
|
|
852
|
-
// Ensure mutable table can be flushed into immutable table.
|
|
853
|
-
if (tree.table_mutable.count() == 0) return;
|
|
854
849
|
assert(tree.table_immutable.free);
|
|
850
|
+
assert((tree.compaction_op + 1) % config.lsm_batch_multiple == 0);
|
|
851
|
+
assert(tree.compaction_op + 1 == tree.lookup_snapshot_max);
|
|
852
|
+
|
|
853
|
+
if (tree.table_mutable.count() == 0) return;
|
|
855
854
|
|
|
856
855
|
// Sort the mutable table values directly into the immutable table's array.
|
|
857
856
|
const values_max = tree.table_immutable.values_max();
|
|
858
857
|
const values = tree.table_mutable.sort_into_values_and_clear(values_max);
|
|
859
858
|
assert(values.ptr == values_max.ptr);
|
|
860
859
|
|
|
861
|
-
//
|
|
862
|
-
|
|
863
|
-
|
|
860
|
+
// The immutable table must be visible to the next bar — setting its snapshot_min to
|
|
861
|
+
// lookup_snapshot_max guarantees this.
|
|
862
|
+
//
|
|
863
|
+
// In addition, the immutable table is conceptually an output table of this compaction
|
|
864
|
+
// bar, and now its snapshot_min matches the snapshot_min of the Compactions' output
|
|
865
|
+
// tables.
|
|
866
|
+
tree.table_immutable.reset_with_sorted_values(tree.lookup_snapshot_max, values);
|
|
864
867
|
|
|
865
868
|
assert(tree.table_mutable.count() == 0);
|
|
866
869
|
assert(!tree.table_immutable.free);
|
|
867
870
|
}
|
|
868
871
|
|
|
872
|
+
fn compact_manifest_callback(manifest: *Manifest) void {
|
|
873
|
+
const tree = @fieldParentPtr(Tree, "manifest", manifest);
|
|
874
|
+
assert(tree.compaction_io_pending == 0);
|
|
875
|
+
assert(tree.compaction_callback != null);
|
|
876
|
+
tree.compact_finish();
|
|
877
|
+
}
|
|
878
|
+
|
|
879
|
+
/// Called at the end of each compaction beat.
|
|
880
|
+
fn compact_finish(tree: *Tree) void {
|
|
881
|
+
assert(tree.compaction_io_pending == 0);
|
|
882
|
+
assert(tree.table_mutable.can_commit_batch(tree.options.commit_entries_max));
|
|
883
|
+
|
|
884
|
+
// Invoke the compact() callback after the manifest compacts at the end of the beat.
|
|
885
|
+
const callback = tree.compaction_callback.?;
|
|
886
|
+
tree.compaction_callback = null;
|
|
887
|
+
callback(tree);
|
|
888
|
+
}
|
|
889
|
+
|
|
869
890
|
pub fn checkpoint(tree: *Tree, callback: fn (*Tree) void) void {
|
|
870
|
-
// Assert no outstanding
|
|
891
|
+
// Assert no outstanding compact_tick() work.
|
|
871
892
|
assert(tree.compaction_io_pending == 0);
|
|
872
893
|
assert(tree.compaction_callback == null);
|
|
894
|
+
assert(tree.compaction_op > 0);
|
|
895
|
+
assert(tree.compaction_op + 1 == tree.lookup_snapshot_max);
|
|
873
896
|
|
|
874
|
-
//
|
|
897
|
+
// Assert that this is the last beat in the compaction bar.
|
|
875
898
|
const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
|
|
876
|
-
const
|
|
877
|
-
|
|
899
|
+
const last_beat_in_bar = config.lsm_batch_multiple - 1;
|
|
900
|
+
assert(last_beat_in_bar == compaction_beat);
|
|
878
901
|
|
|
879
902
|
// Assert no outstanding compactions.
|
|
880
903
|
assert(tree.compaction_table_immutable.status == .idle);
|
|
@@ -887,7 +910,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
887
910
|
|
|
888
911
|
// Assert that we're checkpointing only after invisible tables have been removed.
|
|
889
912
|
if (config.verify) {
|
|
890
|
-
tree.manifest.assert_no_invisible_tables(tree.compaction_op);
|
|
913
|
+
tree.manifest.assert_no_invisible_tables(compaction_op_min(tree.compaction_op));
|
|
891
914
|
}
|
|
892
915
|
|
|
893
916
|
// Start an asynchronous checkpoint on the manifest.
|
|
@@ -932,7 +955,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
932
955
|
pub fn range_query(
|
|
933
956
|
tree: *Tree,
|
|
934
957
|
/// The snapshot timestamp, if any
|
|
935
|
-
snapshot: u64,
|
|
958
|
+
snapshot: ?u64,
|
|
936
959
|
query: RangeQuery,
|
|
937
960
|
) RangeQueryIterator {
|
|
938
961
|
_ = tree;
|
|
@@ -942,6 +965,87 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
|
|
|
942
965
|
};
|
|
943
966
|
}
|
|
944
967
|
|
|
968
|
+
/// Returns the first op of the compaction (Compaction.op_min) for a given op/beat.
|
|
969
|
+
///
|
|
970
|
+
/// After this compaction finishes:
|
|
971
|
+
/// - `op_min + half_bar_beat_count - 1` will be the input tables' snapshot_max.
|
|
972
|
+
/// - `op_min + half_bar_beat_count` will be the output tables' snapshot_min.
|
|
973
|
+
///
|
|
974
|
+
/// Each half-bar has a separate op_min (for deriving the output snapshot_min) instead of each full
|
|
975
|
+
/// bar because this allows the output tables of the first half-bar's compaction to be prefetched
|
|
976
|
+
/// against earlier — hopefully while they are still warm in the cache from being written.
|
|
977
|
+
pub fn compaction_op_min(op: u64) u64 {
|
|
978
|
+
return op - op % half_bar_beat_count;
|
|
979
|
+
}
|
|
980
|
+
|
|
981
|
+
/// These charts depict the commit/compact ops and `lookup_snapshot_max` over a series of
|
|
982
|
+
/// commits and compactions (with lsm_batch_multiple=8).
|
|
983
|
+
///
|
|
984
|
+
/// Legend:
|
|
985
|
+
///
|
|
986
|
+
/// ┼ full bar (first half-bar start)
|
|
987
|
+
/// ┬ half bar (second half-bar start)
|
|
988
|
+
/// $ lookup_snapshot_max (prefetch reads from the current snapshot)
|
|
989
|
+
/// This is incremented at the end of each compact().
|
|
990
|
+
/// . op is in mutable table (in memory)
|
|
991
|
+
/// , op is in immutable table (in memory)
|
|
992
|
+
/// # op is on disk
|
|
993
|
+
/// ✓ checkpoint() may follow compact()
|
|
994
|
+
///
|
|
995
|
+
/// 0 2 4 6 8 0 2 4 6
|
|
996
|
+
/// ┼───┬───┼───┬───┼
|
|
997
|
+
/// .$ ╷ ╷ init(superblock.commit_min=0)⎤ Compaction is effectively a noop for the
|
|
998
|
+
/// .$ ╷ ╷ commit;compact( 1) start/end ⎥ first bar because there are no tables on
|
|
999
|
+
/// ..$ ╷ ╷ commit;compact( 2) start/end ⎥ disk yet, and no immutable table to
|
|
1000
|
+
/// ...$ ╷ ╷ commit;compact( 3) start/end ⎥ flush.
|
|
1001
|
+
/// ....$ ╷ ╷ commit;compact( 4) start/end ⎥
|
|
1002
|
+
/// .....$ ╷ ╷ commit;compact( 5) start/end ⎥ This applies:
|
|
1003
|
+
/// ......$ ╷ ╷ commit;compact( 6) start/end ⎥ - when the LSM is starting on a freshly
|
|
1004
|
+
/// .......$╷ ╷ commit;compact( 7) start ⎤⎥ formatted data file, and also
|
|
1005
|
+
/// ,,,,,,,,$ ╷ ✓ compact( 7) end⎦⎦ - when the LSM is recovering from a crash
|
|
1006
|
+
/// ,,,,,,,,$ ╷ commit;compact( 8) start/end (see below).
|
|
1007
|
+
/// ,,,,,,,,.$ ╷ commit;compact( 9) start/end
|
|
1008
|
+
/// ,,,,,,,,..$ ╷ commit;compact(10) start/end
|
|
1009
|
+
/// ,,,,,,,,...$ ╷ commit;compact(11) start/end
|
|
1010
|
+
/// ,,,,,,,,....$ ╷ commit;compact(12) start/end
|
|
1011
|
+
/// ,,,,,,,,.....$ ╷ commit;compact(13) start/end
|
|
1012
|
+
/// ,,,,,,,,......$ ╷ commit;compact(14) start/end
|
|
1013
|
+
/// ,,,,,,,,.......$╷ commit;compact(15) start ⎤
|
|
1014
|
+
/// ########,,,,,,,,$ ✓ compact(15) end⎦
|
|
1015
|
+
/// ########,,,,,,,,$ commit;compact(16) start/end
|
|
1016
|
+
/// ┼───┬───┼───┬───┼
|
|
1017
|
+
/// 0 2 4 6 8 0 2 4 6
|
|
1018
|
+
/// ┼───┬───┼───┬───┼ Recover with a checkpoint taken at op 15.
|
|
1019
|
+
/// ######## $ init(superblock.commit_min=7) At op 15, ops 8…15 are in memory, so they
|
|
1020
|
+
/// ########. $ commit ( 8) start/end ⎤ were dropped by the crash.
|
|
1021
|
+
/// ########.. $ commit ( 9) start/end ⎥
|
|
1022
|
+
/// ########... $ commit (10) start/end ⎥ But compaction is not run for ops 8…15
|
|
1023
|
+
/// ########.... $ commit (11) start/end ⎥ because it was already performed
|
|
1024
|
+
/// ########..... $ commit (12) start/end ⎥ before the checkpoint.
|
|
1025
|
+
/// ########...... $ commit (13) start/end ⎥
|
|
1026
|
+
/// ########....... $ commit (14) start/end ⎥ We can begin to compact again at op 16,
|
|
1027
|
+
/// ########........$ commit (15) start ⎤⎥ because those compactions (if previously
|
|
1028
|
+
/// ########,,,,,,,,$ ✓ (15) end⎦⎦ performed) are not included in the
|
|
1029
|
+
/// ########,,,,,,,,$ commit;compact(16) start/end checkpoint.
|
|
1030
|
+
/// ┼───┬───┼───┬───┼
|
|
1031
|
+
/// 0 2 4 6 8 0 2 4 6
|
|
1032
|
+
///
|
|
1033
|
+
/// Notice how in the checkpoint recovery example above, we are careful not to `compact(op)` twice
|
|
1034
|
+
/// for any op (even if we crash/recover), since that could lead to differences between replicas'
|
|
1035
|
+
/// storage. The last bar of `commit()`s is always only in memory, so it is safe to repeat.
|
|
1036
|
+
///
|
|
1037
|
+
/// Additionally, while skipping compactions during recovery, we use a `lookup_snapshot_max`
|
|
1038
|
+
/// different than the original compactions — the old tables may have been removed during the
|
|
1039
|
+
/// checkpoint.
|
|
1040
|
+
fn lookup_snapshot_max_for_checkpoint(op_checkpoint: u64) u64 {
|
|
1041
|
+
if (op_checkpoint == 0) {
|
|
1042
|
+
// Start from 1 because we never commit op 0.
|
|
1043
|
+
return 1;
|
|
1044
|
+
} else {
|
|
1045
|
+
return op_checkpoint + config.lsm_batch_multiple + 1;
|
|
1046
|
+
}
|
|
1047
|
+
}
|
|
1048
|
+
|
|
945
1049
|
/// The total number of tables that can be supported by the tree across so many levels.
|
|
946
1050
|
pub fn table_count_max_for_tree(growth_factor: u32, levels_count: u32) u32 {
|
|
947
1051
|
assert(growth_factor >= 4);
|
|
@@ -984,102 +1088,3 @@ test "table_count_max_for_level/tree" {
|
|
|
984
1088
|
try expectEqual(@as(u32, 37448 + 262144), table_count_max_for_tree(8, 6));
|
|
985
1089
|
try expectEqual(@as(u32, 299592 + 2097152), table_count_max_for_tree(8, 7));
|
|
986
1090
|
}
|
|
987
|
-
|
|
988
|
-
pub fn main() !void {
|
|
989
|
-
const testing = std.testing;
|
|
990
|
-
const allocator = testing.allocator;
|
|
991
|
-
|
|
992
|
-
const IO = @import("../io.zig").IO;
|
|
993
|
-
const Storage = @import("../storage.zig").Storage;
|
|
994
|
-
const Grid = @import("grid.zig").GridType(Storage);
|
|
995
|
-
|
|
996
|
-
const data_file_size_min = @import("../vsr/superblock.zig").data_file_size_min;
|
|
997
|
-
|
|
998
|
-
const dir_fd = try IO.open_dir(".");
|
|
999
|
-
const storage_fd = try IO.open_file(dir_fd, "test_tree", data_file_size_min, true);
|
|
1000
|
-
defer std.fs.cwd().deleteFile("test_tree") catch {};
|
|
1001
|
-
|
|
1002
|
-
var io = try IO.init(128, 0);
|
|
1003
|
-
defer io.deinit();
|
|
1004
|
-
|
|
1005
|
-
var storage = try Storage.init(&io, storage_fd);
|
|
1006
|
-
defer storage.deinit();
|
|
1007
|
-
|
|
1008
|
-
const Key = CompositeKey(u128);
|
|
1009
|
-
const Table = @import("table.zig").TableType(
|
|
1010
|
-
Key,
|
|
1011
|
-
Key.Value,
|
|
1012
|
-
Key.compare_keys,
|
|
1013
|
-
Key.key_from_value,
|
|
1014
|
-
Key.sentinel_key,
|
|
1015
|
-
Key.tombstone,
|
|
1016
|
-
Key.tombstone_from_key,
|
|
1017
|
-
);
|
|
1018
|
-
|
|
1019
|
-
const Tree = TreeType(Table, Storage, @typeName(Table) ++ "_test");
|
|
1020
|
-
|
|
1021
|
-
// Check out our spreadsheet to see how we calculate node_count for a forest of trees.
|
|
1022
|
-
const node_count = 1024;
|
|
1023
|
-
var node_pool = try NodePool.init(allocator, node_count);
|
|
1024
|
-
defer node_pool.deinit(allocator);
|
|
1025
|
-
|
|
1026
|
-
var value_cache = Tree.ValueCache{};
|
|
1027
|
-
try value_cache.ensureTotalCapacity(allocator, 10000);
|
|
1028
|
-
defer value_cache.deinit(allocator);
|
|
1029
|
-
|
|
1030
|
-
const batch_size_max = config.message_size_max - @sizeOf(vsr.Header);
|
|
1031
|
-
const commit_count_max = @divFloor(batch_size_max, 128);
|
|
1032
|
-
|
|
1033
|
-
var sort_buffer = try allocator.allocAdvanced(
|
|
1034
|
-
u8,
|
|
1035
|
-
16,
|
|
1036
|
-
// This must be the greatest commit_count_max and value_size across trees:
|
|
1037
|
-
commit_count_max * config.lsm_batch_multiple * 128,
|
|
1038
|
-
.exact,
|
|
1039
|
-
);
|
|
1040
|
-
defer allocator.free(sort_buffer);
|
|
1041
|
-
|
|
1042
|
-
// TODO Initialize SuperBlock:
|
|
1043
|
-
var superblock: SuperBlockType(Storage) = undefined;
|
|
1044
|
-
|
|
1045
|
-
var grid = try Grid.init(allocator, &superblock);
|
|
1046
|
-
defer grid.deinit(allocator);
|
|
1047
|
-
|
|
1048
|
-
var tree = try Tree.init(
|
|
1049
|
-
allocator,
|
|
1050
|
-
&node_pool,
|
|
1051
|
-
&grid,
|
|
1052
|
-
&value_cache,
|
|
1053
|
-
.{
|
|
1054
|
-
.prefetch_count_max = commit_count_max * 2,
|
|
1055
|
-
.commit_count_max = commit_count_max,
|
|
1056
|
-
},
|
|
1057
|
-
);
|
|
1058
|
-
defer tree.deinit(allocator);
|
|
1059
|
-
|
|
1060
|
-
testing.refAllDecls(@This());
|
|
1061
|
-
|
|
1062
|
-
// TODO: more references
|
|
1063
|
-
_ = Table;
|
|
1064
|
-
_ = Table.Builder.data_block_finish;
|
|
1065
|
-
|
|
1066
|
-
// TODO: more references
|
|
1067
|
-
_ = Tree.CompactionTable;
|
|
1068
|
-
|
|
1069
|
-
_ = tree.prefetch_enqueue;
|
|
1070
|
-
_ = tree.prefetch;
|
|
1071
|
-
_ = tree.prefetch_key;
|
|
1072
|
-
_ = tree.get;
|
|
1073
|
-
_ = tree.put;
|
|
1074
|
-
_ = tree.remove;
|
|
1075
|
-
_ = tree.lookup;
|
|
1076
|
-
_ = tree.compact_io;
|
|
1077
|
-
_ = tree.compact_cpu;
|
|
1078
|
-
|
|
1079
|
-
_ = Tree.Manifest.LookupIterator.next;
|
|
1080
|
-
_ = tree.manifest;
|
|
1081
|
-
_ = tree.manifest.lookup;
|
|
1082
|
-
_ = tree.manifest.insert_tables;
|
|
1083
|
-
|
|
1084
|
-
std.debug.print("table_count_max={}\n", .{table_count_max});
|
|
1085
|
-
}
|