tigerbeetle-node 0.11.13 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/aarch64-linux-gnu/client.node +0 -0
- package/dist/bin/aarch64-linux-musl/client.node +0 -0
- package/dist/bin/aarch64-macos/client.node +0 -0
- package/dist/bin/x86_64-linux-gnu/client.node +0 -0
- package/dist/bin/x86_64-linux-musl/client.node +0 -0
- package/dist/bin/x86_64-macos/client.node +0 -0
- package/dist/index.js +33 -1
- package/dist/index.js.map +1 -1
- package/package-lock.json +66 -0
- package/package.json +6 -16
- package/src/index.ts +56 -1
- package/src/node.zig +9 -9
- package/dist/.client.node.sha256 +0 -1
- package/scripts/build_lib.sh +0 -61
- package/scripts/download_node_headers.sh +0 -32
- package/src/tigerbeetle/scripts/benchmark.bat +0 -55
- package/src/tigerbeetle/scripts/benchmark.sh +0 -66
- package/src/tigerbeetle/scripts/confirm_image.sh +0 -44
- package/src/tigerbeetle/scripts/fail_on_diff.sh +0 -9
- package/src/tigerbeetle/scripts/fuzz_loop.sh +0 -15
- package/src/tigerbeetle/scripts/fuzz_loop_hash_log.sh +0 -12
- package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +0 -7
- package/src/tigerbeetle/scripts/install.bat +0 -7
- package/src/tigerbeetle/scripts/install.sh +0 -21
- package/src/tigerbeetle/scripts/install_zig.bat +0 -113
- package/src/tigerbeetle/scripts/install_zig.sh +0 -90
- package/src/tigerbeetle/scripts/lint.zig +0 -199
- package/src/tigerbeetle/scripts/pre-commit.sh +0 -9
- package/src/tigerbeetle/scripts/scripts/benchmark.bat +0 -55
- package/src/tigerbeetle/scripts/scripts/benchmark.sh +0 -66
- package/src/tigerbeetle/scripts/scripts/confirm_image.sh +0 -44
- package/src/tigerbeetle/scripts/scripts/fail_on_diff.sh +0 -9
- package/src/tigerbeetle/scripts/scripts/fuzz_loop.sh +0 -15
- package/src/tigerbeetle/scripts/scripts/fuzz_loop_hash_log.sh +0 -12
- package/src/tigerbeetle/scripts/scripts/fuzz_unique_errors.sh +0 -7
- package/src/tigerbeetle/scripts/scripts/install.bat +0 -7
- package/src/tigerbeetle/scripts/scripts/install.sh +0 -21
- package/src/tigerbeetle/scripts/scripts/install_zig.bat +0 -113
- package/src/tigerbeetle/scripts/scripts/install_zig.sh +0 -90
- package/src/tigerbeetle/scripts/scripts/lint.zig +0 -199
- package/src/tigerbeetle/scripts/scripts/pre-commit.sh +0 -9
- package/src/tigerbeetle/scripts/scripts/shellcheck.sh +0 -5
- package/src/tigerbeetle/scripts/scripts/tests_on_alpine.sh +0 -10
- package/src/tigerbeetle/scripts/scripts/tests_on_ubuntu.sh +0 -14
- package/src/tigerbeetle/scripts/scripts/upgrade_ubuntu_kernel.sh +0 -48
- package/src/tigerbeetle/scripts/scripts/validate_docs.sh +0 -23
- package/src/tigerbeetle/scripts/scripts/vr_state_enumerate +0 -46
- package/src/tigerbeetle/scripts/shellcheck.sh +0 -5
- package/src/tigerbeetle/scripts/tests_on_alpine.sh +0 -10
- package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +0 -14
- package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -48
- package/src/tigerbeetle/scripts/validate_docs.sh +0 -23
- package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
- package/src/tigerbeetle/src/benchmark.zig +0 -336
- package/src/tigerbeetle/src/config.zig +0 -233
- package/src/tigerbeetle/src/constants.zig +0 -428
- package/src/tigerbeetle/src/ewah.zig +0 -286
- package/src/tigerbeetle/src/ewah_benchmark.zig +0 -120
- package/src/tigerbeetle/src/ewah_fuzz.zig +0 -130
- package/src/tigerbeetle/src/fifo.zig +0 -120
- package/src/tigerbeetle/src/io/benchmark.zig +0 -213
- package/src/tigerbeetle/src/io/darwin.zig +0 -814
- package/src/tigerbeetle/src/io/linux.zig +0 -1071
- package/src/tigerbeetle/src/io/test.zig +0 -643
- package/src/tigerbeetle/src/io/windows.zig +0 -1183
- package/src/tigerbeetle/src/io.zig +0 -34
- package/src/tigerbeetle/src/iops.zig +0 -107
- package/src/tigerbeetle/src/lsm/README.md +0 -308
- package/src/tigerbeetle/src/lsm/binary_search.zig +0 -341
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +0 -125
- package/src/tigerbeetle/src/lsm/compaction.zig +0 -603
- package/src/tigerbeetle/src/lsm/composite_key.zig +0 -77
- package/src/tigerbeetle/src/lsm/direction.zig +0 -11
- package/src/tigerbeetle/src/lsm/eytzinger.zig +0 -587
- package/src/tigerbeetle/src/lsm/eytzinger_benchmark.zig +0 -330
- package/src/tigerbeetle/src/lsm/forest.zig +0 -205
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +0 -450
- package/src/tigerbeetle/src/lsm/grid.zig +0 -573
- package/src/tigerbeetle/src/lsm/groove.zig +0 -1036
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +0 -474
- package/src/tigerbeetle/src/lsm/level_iterator.zig +0 -332
- package/src/tigerbeetle/src/lsm/manifest.zig +0 -617
- package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -878
- package/src/tigerbeetle/src/lsm/manifest_log.zig +0 -789
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +0 -691
- package/src/tigerbeetle/src/lsm/merge_iterator.zig +0 -106
- package/src/tigerbeetle/src/lsm/node_pool.zig +0 -235
- package/src/tigerbeetle/src/lsm/posted_groove.zig +0 -381
- package/src/tigerbeetle/src/lsm/segmented_array.zig +0 -1329
- package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +0 -148
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +0 -9
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +0 -850
- package/src/tigerbeetle/src/lsm/table.zig +0 -1009
- package/src/tigerbeetle/src/lsm/table_immutable.zig +0 -192
- package/src/tigerbeetle/src/lsm/table_iterator.zig +0 -340
- package/src/tigerbeetle/src/lsm/table_mutable.zig +0 -203
- package/src/tigerbeetle/src/lsm/test.zig +0 -439
- package/src/tigerbeetle/src/lsm/tree.zig +0 -1169
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +0 -479
- package/src/tigerbeetle/src/message_bus.zig +0 -1013
- package/src/tigerbeetle/src/message_pool.zig +0 -156
- package/src/tigerbeetle/src/ring_buffer.zig +0 -399
- package/src/tigerbeetle/src/simulator.zig +0 -580
- package/src/tigerbeetle/src/state_machine/auditor.zig +0 -578
- package/src/tigerbeetle/src/state_machine/workload.zig +0 -883
- package/src/tigerbeetle/src/state_machine.zig +0 -2099
- package/src/tigerbeetle/src/static_allocator.zig +0 -65
- package/src/tigerbeetle/src/stdx.zig +0 -171
- package/src/tigerbeetle/src/storage.zig +0 -393
- package/src/tigerbeetle/src/testing/cluster/message_bus.zig +0 -82
- package/src/tigerbeetle/src/testing/cluster/network.zig +0 -237
- package/src/tigerbeetle/src/testing/cluster/state_checker.zig +0 -169
- package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +0 -202
- package/src/tigerbeetle/src/testing/cluster.zig +0 -444
- package/src/tigerbeetle/src/testing/fuzz.zig +0 -140
- package/src/tigerbeetle/src/testing/hash_log.zig +0 -66
- package/src/tigerbeetle/src/testing/id.zig +0 -99
- package/src/tigerbeetle/src/testing/packet_simulator.zig +0 -374
- package/src/tigerbeetle/src/testing/priority_queue.zig +0 -645
- package/src/tigerbeetle/src/testing/reply_sequence.zig +0 -139
- package/src/tigerbeetle/src/testing/state_machine.zig +0 -250
- package/src/tigerbeetle/src/testing/storage.zig +0 -757
- package/src/tigerbeetle/src/testing/table.zig +0 -247
- package/src/tigerbeetle/src/testing/time.zig +0 -84
- package/src/tigerbeetle/src/tigerbeetle.zig +0 -227
- package/src/tigerbeetle/src/time.zig +0 -112
- package/src/tigerbeetle/src/tracer.zig +0 -529
- package/src/tigerbeetle/src/unit_tests.zig +0 -40
- package/src/tigerbeetle/src/vopr.zig +0 -495
- package/src/tigerbeetle/src/vsr/README.md +0 -209
- package/src/tigerbeetle/src/vsr/client.zig +0 -544
- package/src/tigerbeetle/src/vsr/clock.zig +0 -855
- package/src/tigerbeetle/src/vsr/journal.zig +0 -2415
- package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +0 -111
- package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
- package/src/tigerbeetle/src/vsr/replica.zig +0 -6616
- package/src/tigerbeetle/src/vsr/replica_format.zig +0 -219
- package/src/tigerbeetle/src/vsr/superblock.zig +0 -1631
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +0 -256
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +0 -929
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +0 -334
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +0 -390
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +0 -615
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +0 -394
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +0 -314
- package/src/tigerbeetle/src/vsr.zig +0 -1425
|
@@ -1,1169 +0,0 @@
|
|
|
1
|
-
//! An LSM tree.
|
|
2
|
-
const std = @import("std");
|
|
3
|
-
const builtin = @import("builtin");
|
|
4
|
-
const assert = std.debug.assert;
|
|
5
|
-
const math = std.math;
|
|
6
|
-
const mem = std.mem;
|
|
7
|
-
const os = std.os;
|
|
8
|
-
|
|
9
|
-
const log = std.log.scoped(.tree);
|
|
10
|
-
const tracer = @import("../tracer.zig");
|
|
11
|
-
|
|
12
|
-
const constants = @import("../constants.zig");
|
|
13
|
-
const div_ceil = @import("../stdx.zig").div_ceil;
|
|
14
|
-
const eytzinger = @import("eytzinger.zig").eytzinger;
|
|
15
|
-
const vsr = @import("../vsr.zig");
|
|
16
|
-
const bloom_filter = @import("bloom_filter.zig");
|
|
17
|
-
|
|
18
|
-
const CompositeKey = @import("composite_key.zig").CompositeKey;
|
|
19
|
-
const NodePool = @import("node_pool.zig").NodePool(constants.lsm_manifest_node_size, 16);
|
|
20
|
-
const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
|
|
21
|
-
|
|
22
|
-
/// We reserve maxInt(u64) to indicate that a table has not been deleted.
|
|
23
|
-
/// Tables that have not been deleted have snapshot_max of maxInt(u64).
|
|
24
|
-
/// Since we ensure and assert that a query snapshot never exactly matches
|
|
25
|
-
/// the snapshot_min/snapshot_max of a table, we must use maxInt(u64) - 1
|
|
26
|
-
/// to query all non-deleted tables.
|
|
27
|
-
pub const snapshot_latest: u64 = math.maxInt(u64) - 1;
|
|
28
|
-
|
|
29
|
-
const half_bar_beat_count = @divExact(constants.lsm_batch_multiple, 2);
|
|
30
|
-
|
|
31
|
-
// StateMachine:
|
|
32
|
-
//
|
|
33
|
-
// /// state machine will pass this on to all object stores
|
|
34
|
-
// /// Read I/O only
|
|
35
|
-
// pub fn read(batch, callback) void
|
|
36
|
-
//
|
|
37
|
-
// /// write the ops in batch to the memtable/objcache, previously called commit()
|
|
38
|
-
// pub fn write(batch) void
|
|
39
|
-
//
|
|
40
|
-
// /// Flush in memory state to disk, perform merges, etc
|
|
41
|
-
// /// Only function that triggers Write I/O in LSMs, as well as some Read
|
|
42
|
-
// /// Make as incremental as possible, don't block the main thread, avoid high latency/spikes
|
|
43
|
-
// pub fn flush(callback) void
|
|
44
|
-
//
|
|
45
|
-
// /// Write manifest info for all object stores into buffer
|
|
46
|
-
// pub fn encode_superblock(buffer) void
|
|
47
|
-
//
|
|
48
|
-
// /// Restore all in-memory state from the superblock data
|
|
49
|
-
// pub fn decode_superblock(buffer) void
|
|
50
|
-
//
|
|
51
|
-
|
|
52
|
-
/// The maximum number of tables for a single tree.
|
|
53
|
-
pub const table_count_max = table_count_max_for_tree(
|
|
54
|
-
constants.lsm_growth_factor,
|
|
55
|
-
constants.lsm_levels,
|
|
56
|
-
);
|
|
57
|
-
|
|
58
|
-
/// The upper-bound count of input tables to a single tree's compaction.
|
|
59
|
-
///
|
|
60
|
-
/// - +1 from level A.
|
|
61
|
-
/// - +lsm_growth_factor from level B. The A-input table cannot overlap with an extra B-input table
|
|
62
|
-
/// because input table selection is least-overlap. If the input table overlaps on one or both
|
|
63
|
-
/// edges, there must be another table with less overlap to select.
|
|
64
|
-
pub const compaction_tables_input_max = 1 + constants.lsm_growth_factor;
|
|
65
|
-
|
|
66
|
-
/// The upper-bound count of output tables from a single tree's compaction.
|
|
67
|
-
/// In the "worst" case, no keys are overwritten/merged, and no tombstones are dropped.
|
|
68
|
-
pub const compaction_tables_output_max = compaction_tables_input_max;
|
|
69
|
-
|
|
70
|
-
/// The maximum number of concurrent compactions (per tree).
|
|
71
|
-
pub const compactions_max = div_ceil(constants.lsm_levels, 2);
|
|
72
|
-
|
|
73
|
-
pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_name: [:0]const u8) type {
|
|
74
|
-
const Key = TreeTable.Key;
|
|
75
|
-
const Value = TreeTable.Value;
|
|
76
|
-
const compare_keys = TreeTable.compare_keys;
|
|
77
|
-
const tombstone = TreeTable.tombstone;
|
|
78
|
-
|
|
79
|
-
const tree_hash = blk: {
|
|
80
|
-
// The Blake3 hash does a lot of work at comptime, so raise the eval branch quota.
|
|
81
|
-
@setEvalBranchQuota(tree_name.len * 1024);
|
|
82
|
-
|
|
83
|
-
var hash: u256 = undefined;
|
|
84
|
-
std.crypto.hash.Blake3.hash(tree_name, std.mem.asBytes(&hash), .{});
|
|
85
|
-
break :blk @truncate(u128, hash);
|
|
86
|
-
};
|
|
87
|
-
|
|
88
|
-
return struct {
|
|
89
|
-
const Tree = @This();
|
|
90
|
-
|
|
91
|
-
// Expose the Table & hash for the Groove.
|
|
92
|
-
pub const Table = TreeTable;
|
|
93
|
-
pub const name = tree_name;
|
|
94
|
-
pub const hash = tree_hash;
|
|
95
|
-
|
|
96
|
-
const Grid = @import("grid.zig").GridType(Storage);
|
|
97
|
-
const Manifest = @import("manifest.zig").ManifestType(Table, Storage);
|
|
98
|
-
pub const TableMutable = @import("table_mutable.zig").TableMutableType(Table);
|
|
99
|
-
const TableImmutable = @import("table_immutable.zig").TableImmutableType(Table);
|
|
100
|
-
|
|
101
|
-
const CompactionType = @import("compaction.zig").CompactionType;
|
|
102
|
-
const TableIteratorType = @import("table_iterator.zig").TableIteratorType;
|
|
103
|
-
const TableImmutableIteratorType = @import("table_immutable.zig").TableImmutableIteratorType;
|
|
104
|
-
|
|
105
|
-
const CompactionTable = CompactionType(Table, Storage, TableIteratorType);
|
|
106
|
-
const CompactionTableImmutable = CompactionType(Table, Storage, TableImmutableIteratorType);
|
|
107
|
-
|
|
108
|
-
grid: *Grid,
|
|
109
|
-
options: Options,
|
|
110
|
-
|
|
111
|
-
table_mutable: TableMutable,
|
|
112
|
-
table_immutable: TableImmutable,
|
|
113
|
-
values_cache: ?*TableMutable.ValuesCache,
|
|
114
|
-
|
|
115
|
-
manifest: Manifest,
|
|
116
|
-
|
|
117
|
-
compaction_table_immutable: CompactionTableImmutable,
|
|
118
|
-
|
|
119
|
-
/// The number of Compaction instances is divided by two as, at any given compaction tick,
|
|
120
|
-
/// we're only compacting either even or odd levels but never both.
|
|
121
|
-
/// Uses divFloor as the last level, even with odd lsm_levels, doesn't compact to anything.
|
|
122
|
-
/// (e.g. floor(5/2) = 2 for levels 0->1, 2->3 when even and immut->0, 1->2, 3->4 when odd).
|
|
123
|
-
/// This means, that for odd lsm_levels, the last CompactionTable is unused.
|
|
124
|
-
compaction_table: [@divFloor(constants.lsm_levels, 2)]CompactionTable,
|
|
125
|
-
|
|
126
|
-
/// While a compaction is running, this is the op of the last compact().
|
|
127
|
-
/// While no compaction is running, this is the op of the last compact() to complete.
|
|
128
|
-
/// (When recovering from a checkpoint, compaction_op starts at op_checkpoint).
|
|
129
|
-
compaction_op: u64,
|
|
130
|
-
|
|
131
|
-
/// The maximum snapshot which is safe to prefetch from.
|
|
132
|
-
/// The minimum snapshot which can see the mutable table.
|
|
133
|
-
///
|
|
134
|
-
/// This field ensures that the tree never queries the output tables of a running
|
|
135
|
-
/// compaction; they are incomplete.
|
|
136
|
-
///
|
|
137
|
-
/// See lookup_snapshot_max_for_checkpoint().
|
|
138
|
-
///
|
|
139
|
-
/// Invariants:
|
|
140
|
-
/// * `lookup_snapshot_max = compaction_op` while any compaction beat is in progress.
|
|
141
|
-
/// * `lookup_snapshot_max = compaction_op + 1` after a compaction beat finishes.
|
|
142
|
-
/// * `lookup_snapshot_max ≥ op_checkpoint + 1 + lsm_batch_multiple`
|
|
143
|
-
/// when `op_checkpoint ≠ 0`.
|
|
144
|
-
lookup_snapshot_max: u64,
|
|
145
|
-
|
|
146
|
-
compaction_io_pending: usize,
|
|
147
|
-
compaction_callback: ?fn (*Tree) void,
|
|
148
|
-
|
|
149
|
-
checkpoint_callback: ?fn (*Tree) void,
|
|
150
|
-
open_callback: ?fn (*Tree) void,
|
|
151
|
-
|
|
152
|
-
tracer_slot: ?tracer.SpanStart = null,
|
|
153
|
-
|
|
154
|
-
pub const Options = struct {
|
|
155
|
-
/// The number of objects to cache in the set-associative value cache.
|
|
156
|
-
cache_entries_max: u32 = 0,
|
|
157
|
-
};
|
|
158
|
-
|
|
159
|
-
pub fn init(
|
|
160
|
-
allocator: mem.Allocator,
|
|
161
|
-
node_pool: *NodePool,
|
|
162
|
-
grid: *Grid,
|
|
163
|
-
options: Options,
|
|
164
|
-
) !Tree {
|
|
165
|
-
assert(grid.superblock.opened);
|
|
166
|
-
|
|
167
|
-
var values_cache: ?*TableMutable.ValuesCache = null;
|
|
168
|
-
|
|
169
|
-
if (options.cache_entries_max > 0) {
|
|
170
|
-
// Cache is heap-allocated to pass a pointer into the mutable table.
|
|
171
|
-
values_cache = try allocator.create(TableMutable.ValuesCache);
|
|
172
|
-
}
|
|
173
|
-
errdefer if (values_cache) |c| allocator.destroy(c);
|
|
174
|
-
|
|
175
|
-
if (options.cache_entries_max > 0) {
|
|
176
|
-
values_cache.?.* = try TableMutable.ValuesCache.init(
|
|
177
|
-
allocator,
|
|
178
|
-
options.cache_entries_max,
|
|
179
|
-
);
|
|
180
|
-
}
|
|
181
|
-
errdefer if (values_cache) |c| c.deinit(allocator);
|
|
182
|
-
|
|
183
|
-
var table_mutable = try TableMutable.init(allocator, values_cache);
|
|
184
|
-
errdefer table_mutable.deinit(allocator);
|
|
185
|
-
|
|
186
|
-
var table_immutable = try TableImmutable.init(allocator);
|
|
187
|
-
errdefer table_immutable.deinit(allocator);
|
|
188
|
-
|
|
189
|
-
var manifest = try Manifest.init(allocator, node_pool, grid, tree_hash);
|
|
190
|
-
errdefer manifest.deinit(allocator);
|
|
191
|
-
|
|
192
|
-
var compaction_table_immutable = try CompactionTableImmutable.init(
|
|
193
|
-
allocator,
|
|
194
|
-
std.fmt.comptimePrint("{s}(immutable->0)", .{tree_name}),
|
|
195
|
-
);
|
|
196
|
-
errdefer compaction_table_immutable.deinit(allocator);
|
|
197
|
-
|
|
198
|
-
var compaction_table: [@divFloor(constants.lsm_levels, 2)]CompactionTable = undefined;
|
|
199
|
-
{
|
|
200
|
-
comptime var i: usize = 0;
|
|
201
|
-
inline while (i < compaction_table.len) : (i += 1) {
|
|
202
|
-
errdefer for (compaction_table[0..i]) |*c| c.deinit(allocator);
|
|
203
|
-
const compaction_name = std.fmt.comptimePrint("{s}({}->{}/{}->{})", .{
|
|
204
|
-
tree_name,
|
|
205
|
-
2 * i,
|
|
206
|
-
2 * i + 1,
|
|
207
|
-
2 * i + 1,
|
|
208
|
-
2 * i + 2,
|
|
209
|
-
});
|
|
210
|
-
compaction_table[i] = try CompactionTable.init(allocator, compaction_name);
|
|
211
|
-
}
|
|
212
|
-
}
|
|
213
|
-
errdefer for (compaction_table) |*c| c.deinit(allocator);
|
|
214
|
-
|
|
215
|
-
// Compaction is one bar ahead of superblock's commit_min.
|
|
216
|
-
const op_checkpoint = grid.superblock.working.vsr_state.commit_min;
|
|
217
|
-
const lookup_snapshot_max = lookup_snapshot_max_for_checkpoint(op_checkpoint);
|
|
218
|
-
const compaction_op = op_checkpoint;
|
|
219
|
-
|
|
220
|
-
return Tree{
|
|
221
|
-
.grid = grid,
|
|
222
|
-
.options = options,
|
|
223
|
-
.table_mutable = table_mutable,
|
|
224
|
-
.table_immutable = table_immutable,
|
|
225
|
-
.values_cache = values_cache,
|
|
226
|
-
.manifest = manifest,
|
|
227
|
-
.compaction_table_immutable = compaction_table_immutable,
|
|
228
|
-
.compaction_table = compaction_table,
|
|
229
|
-
.compaction_op = compaction_op,
|
|
230
|
-
.lookup_snapshot_max = lookup_snapshot_max,
|
|
231
|
-
.compaction_io_pending = 0,
|
|
232
|
-
.compaction_callback = null,
|
|
233
|
-
.checkpoint_callback = null,
|
|
234
|
-
.open_callback = null,
|
|
235
|
-
};
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
pub fn deinit(tree: *Tree, allocator: mem.Allocator) void {
|
|
239
|
-
assert(tree.tracer_slot == null);
|
|
240
|
-
|
|
241
|
-
tree.compaction_table_immutable.deinit(allocator);
|
|
242
|
-
for (tree.compaction_table) |*compaction| compaction.deinit(allocator);
|
|
243
|
-
|
|
244
|
-
// TODO Consider whether we should release blocks acquired from Grid.block_free_set.
|
|
245
|
-
tree.table_mutable.deinit(allocator);
|
|
246
|
-
tree.table_immutable.deinit(allocator);
|
|
247
|
-
tree.manifest.deinit(allocator);
|
|
248
|
-
|
|
249
|
-
if (tree.values_cache) |cache| {
|
|
250
|
-
cache.deinit(allocator);
|
|
251
|
-
allocator.destroy(cache);
|
|
252
|
-
}
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
pub fn put(tree: *Tree, value: *const Value) void {
|
|
256
|
-
tree.table_mutable.put(value);
|
|
257
|
-
}
|
|
258
|
-
|
|
259
|
-
pub fn remove(tree: *Tree, value: *const Value) void {
|
|
260
|
-
tree.table_mutable.remove(value);
|
|
261
|
-
}
|
|
262
|
-
|
|
263
|
-
/// Returns the value from the mutable or immutable table (possibly a tombstone),
|
|
264
|
-
/// if one is available for the specified snapshot.
|
|
265
|
-
pub fn lookup_from_memory(tree: *Tree, snapshot: u64, key: Key) ?*const Value {
|
|
266
|
-
assert(tree.lookup_snapshot_max >= snapshot);
|
|
267
|
-
|
|
268
|
-
if (tree.lookup_snapshot_max == snapshot) {
|
|
269
|
-
if (tree.table_mutable.get(key)) |value| return value;
|
|
270
|
-
} else {
|
|
271
|
-
// The mutable table is converted to an immutable table when a snapshot is created.
|
|
272
|
-
// This means that a past snapshot will never be able to see the mutable table.
|
|
273
|
-
// This simplifies the mutable table and eliminates compaction for duplicate puts.
|
|
274
|
-
}
|
|
275
|
-
|
|
276
|
-
if (!tree.table_immutable.free and tree.table_immutable.snapshot_min <= snapshot) {
|
|
277
|
-
if (tree.table_immutable.get(key)) |value| return value;
|
|
278
|
-
} else {
|
|
279
|
-
// If the immutable table is invisible, then the mutable table is also invisible.
|
|
280
|
-
assert(tree.table_immutable.free or snapshot != tree.lookup_snapshot_max);
|
|
281
|
-
}
|
|
282
|
-
|
|
283
|
-
return null;
|
|
284
|
-
}
|
|
285
|
-
|
|
286
|
-
/// Call this function only after checking `lookup_from_memory()`.
|
|
287
|
-
pub fn lookup_from_levels(
|
|
288
|
-
tree: *Tree,
|
|
289
|
-
callback: fn (*LookupContext, ?*const Value) void,
|
|
290
|
-
context: *LookupContext,
|
|
291
|
-
snapshot: u64,
|
|
292
|
-
key: Key,
|
|
293
|
-
) void {
|
|
294
|
-
assert(tree.lookup_snapshot_max >= snapshot);
|
|
295
|
-
if (constants.verify) {
|
|
296
|
-
// The caller is responsible for checking the mutable table.
|
|
297
|
-
assert(tree.lookup_from_memory(snapshot, key) == null);
|
|
298
|
-
}
|
|
299
|
-
|
|
300
|
-
var index_block_count: u8 = 0;
|
|
301
|
-
var index_block_addresses: [constants.lsm_levels]u64 = undefined;
|
|
302
|
-
var index_block_checksums: [constants.lsm_levels]u128 = undefined;
|
|
303
|
-
{
|
|
304
|
-
var it = tree.manifest.lookup(snapshot, key);
|
|
305
|
-
while (it.next()) |table| : (index_block_count += 1) {
|
|
306
|
-
assert(table.visible(snapshot));
|
|
307
|
-
assert(compare_keys(table.key_min, key) != .gt);
|
|
308
|
-
assert(compare_keys(table.key_max, key) != .lt);
|
|
309
|
-
|
|
310
|
-
index_block_addresses[index_block_count] = table.address;
|
|
311
|
-
index_block_checksums[index_block_count] = table.checksum;
|
|
312
|
-
}
|
|
313
|
-
}
|
|
314
|
-
|
|
315
|
-
if (index_block_count == 0) {
|
|
316
|
-
callback(context, null);
|
|
317
|
-
return;
|
|
318
|
-
}
|
|
319
|
-
|
|
320
|
-
// Hash the key to the fingerprint only once and reuse for all bloom filter checks.
|
|
321
|
-
const fingerprint = bloom_filter.Fingerprint.create(mem.asBytes(&key));
|
|
322
|
-
|
|
323
|
-
context.* = .{
|
|
324
|
-
.tree = tree,
|
|
325
|
-
.completion = undefined,
|
|
326
|
-
|
|
327
|
-
.key = key,
|
|
328
|
-
.fingerprint = fingerprint,
|
|
329
|
-
|
|
330
|
-
.index_block_count = index_block_count,
|
|
331
|
-
.index_block_addresses = index_block_addresses,
|
|
332
|
-
.index_block_checksums = index_block_checksums,
|
|
333
|
-
|
|
334
|
-
.callback = callback,
|
|
335
|
-
};
|
|
336
|
-
|
|
337
|
-
context.read_index_block();
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
pub const LookupContext = struct {
|
|
341
|
-
const Read = Grid.Read;
|
|
342
|
-
const BlockPtrConst = Grid.BlockPtrConst;
|
|
343
|
-
|
|
344
|
-
tree: *Tree,
|
|
345
|
-
completion: Read,
|
|
346
|
-
|
|
347
|
-
key: Key,
|
|
348
|
-
fingerprint: bloom_filter.Fingerprint,
|
|
349
|
-
|
|
350
|
-
/// This value is an index into the index_block_addresses/checksums arrays.
|
|
351
|
-
index_block: u8 = 0,
|
|
352
|
-
index_block_count: u8,
|
|
353
|
-
index_block_addresses: [constants.lsm_levels]u64,
|
|
354
|
-
index_block_checksums: [constants.lsm_levels]u128,
|
|
355
|
-
|
|
356
|
-
data_block: ?struct {
|
|
357
|
-
address: u64,
|
|
358
|
-
checksum: u128,
|
|
359
|
-
} = null,
|
|
360
|
-
|
|
361
|
-
callback: fn (*Tree.LookupContext, ?*const Value) void,
|
|
362
|
-
|
|
363
|
-
fn read_index_block(context: *LookupContext) void {
|
|
364
|
-
assert(context.data_block == null);
|
|
365
|
-
assert(context.index_block < context.index_block_count);
|
|
366
|
-
assert(context.index_block_count > 0);
|
|
367
|
-
assert(context.index_block_count <= constants.lsm_levels);
|
|
368
|
-
|
|
369
|
-
context.tree.grid.read_block(
|
|
370
|
-
read_index_block_callback,
|
|
371
|
-
&context.completion,
|
|
372
|
-
context.index_block_addresses[context.index_block],
|
|
373
|
-
context.index_block_checksums[context.index_block],
|
|
374
|
-
.index,
|
|
375
|
-
);
|
|
376
|
-
}
|
|
377
|
-
|
|
378
|
-
fn read_index_block_callback(completion: *Read, index_block: BlockPtrConst) void {
|
|
379
|
-
const context = @fieldParentPtr(LookupContext, "completion", completion);
|
|
380
|
-
assert(context.data_block == null);
|
|
381
|
-
assert(context.index_block < context.index_block_count);
|
|
382
|
-
assert(context.index_block_count > 0);
|
|
383
|
-
assert(context.index_block_count <= constants.lsm_levels);
|
|
384
|
-
|
|
385
|
-
const blocks = Table.index_blocks_for_key(index_block, context.key);
|
|
386
|
-
|
|
387
|
-
context.data_block = .{
|
|
388
|
-
.address = blocks.data_block_address,
|
|
389
|
-
.checksum = blocks.data_block_checksum,
|
|
390
|
-
};
|
|
391
|
-
|
|
392
|
-
context.tree.grid.read_block(
|
|
393
|
-
read_filter_block_callback,
|
|
394
|
-
completion,
|
|
395
|
-
blocks.filter_block_address,
|
|
396
|
-
blocks.filter_block_checksum,
|
|
397
|
-
.filter,
|
|
398
|
-
);
|
|
399
|
-
}
|
|
400
|
-
|
|
401
|
-
fn read_filter_block_callback(completion: *Read, filter_block: BlockPtrConst) void {
|
|
402
|
-
const context = @fieldParentPtr(LookupContext, "completion", completion);
|
|
403
|
-
assert(context.data_block != null);
|
|
404
|
-
assert(context.index_block < context.index_block_count);
|
|
405
|
-
assert(context.index_block_count > 0);
|
|
406
|
-
assert(context.index_block_count <= constants.lsm_levels);
|
|
407
|
-
|
|
408
|
-
const filter_bytes = Table.filter_block_filter_const(filter_block);
|
|
409
|
-
if (bloom_filter.may_contain(context.fingerprint, filter_bytes)) {
|
|
410
|
-
context.tree.grid.read_block(
|
|
411
|
-
read_data_block_callback,
|
|
412
|
-
completion,
|
|
413
|
-
context.data_block.?.address,
|
|
414
|
-
context.data_block.?.checksum,
|
|
415
|
-
.data,
|
|
416
|
-
);
|
|
417
|
-
} else {
|
|
418
|
-
// The key is not present in this table, check the next level.
|
|
419
|
-
context.advance_to_next_level();
|
|
420
|
-
}
|
|
421
|
-
}
|
|
422
|
-
|
|
423
|
-
fn read_data_block_callback(completion: *Read, data_block: BlockPtrConst) void {
|
|
424
|
-
const context = @fieldParentPtr(LookupContext, "completion", completion);
|
|
425
|
-
assert(context.data_block != null);
|
|
426
|
-
assert(context.index_block < context.index_block_count);
|
|
427
|
-
assert(context.index_block_count > 0);
|
|
428
|
-
assert(context.index_block_count <= constants.lsm_levels);
|
|
429
|
-
|
|
430
|
-
if (Table.data_block_search(data_block, context.key)) |value| {
|
|
431
|
-
context.callback(context, unwrap_tombstone(value));
|
|
432
|
-
} else {
|
|
433
|
-
// The key is not present in this table, check the next level.
|
|
434
|
-
context.advance_to_next_level();
|
|
435
|
-
}
|
|
436
|
-
}
|
|
437
|
-
|
|
438
|
-
fn advance_to_next_level(context: *LookupContext) void {
|
|
439
|
-
assert(context.data_block != null);
|
|
440
|
-
assert(context.index_block < context.index_block_count);
|
|
441
|
-
assert(context.index_block_count > 0);
|
|
442
|
-
assert(context.index_block_count <= constants.lsm_levels);
|
|
443
|
-
|
|
444
|
-
context.index_block += 1;
|
|
445
|
-
if (context.index_block == context.index_block_count) {
|
|
446
|
-
context.callback(context, null);
|
|
447
|
-
return;
|
|
448
|
-
}
|
|
449
|
-
assert(context.index_block < context.index_block_count);
|
|
450
|
-
|
|
451
|
-
context.data_block = null;
|
|
452
|
-
context.read_index_block();
|
|
453
|
-
}
|
|
454
|
-
};
|
|
455
|
-
|
|
456
|
-
/// Returns null if the value is null or a tombstone, otherwise returns the value.
|
|
457
|
-
/// We use tombstone values internally, but expose them as null to the user.
|
|
458
|
-
/// This distinction enables us to cache a null result as a tombstone in our hash maps.
|
|
459
|
-
pub inline fn unwrap_tombstone(value: ?*const Value) ?*const Value {
|
|
460
|
-
return if (value == null or tombstone(value.?)) null else value.?;
|
|
461
|
-
}
|
|
462
|
-
|
|
463
|
-
pub fn open(tree: *Tree, callback: fn (*Tree) void) void {
|
|
464
|
-
assert(tree.open_callback == null);
|
|
465
|
-
tree.open_callback = callback;
|
|
466
|
-
|
|
467
|
-
tree.manifest.open(manifest_open_callback);
|
|
468
|
-
}
|
|
469
|
-
|
|
470
|
-
fn manifest_open_callback(manifest: *Manifest) void {
|
|
471
|
-
const tree = @fieldParentPtr(Tree, "manifest", manifest);
|
|
472
|
-
assert(tree.open_callback != null);
|
|
473
|
-
|
|
474
|
-
const callback = tree.open_callback.?;
|
|
475
|
-
tree.open_callback = null;
|
|
476
|
-
callback(tree);
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
const CompactionTableContext = struct {
|
|
480
|
-
compaction: *CompactionTable,
|
|
481
|
-
level_a: u8,
|
|
482
|
-
level_b: u8,
|
|
483
|
-
};
|
|
484
|
-
|
|
485
|
-
const CompactionTableIterator = struct {
|
|
486
|
-
tree: *Tree,
|
|
487
|
-
index: u8 = 0,
|
|
488
|
-
|
|
489
|
-
fn next(it: *CompactionTableIterator) ?CompactionTableContext {
|
|
490
|
-
assert(it.tree.compaction_callback != null);
|
|
491
|
-
|
|
492
|
-
const compaction_beat = it.tree.compaction_op % constants.lsm_batch_multiple;
|
|
493
|
-
const even_levels = compaction_beat < half_bar_beat_count;
|
|
494
|
-
const level_a = (it.index * 2) + @boolToInt(!even_levels);
|
|
495
|
-
const level_b = level_a + 1;
|
|
496
|
-
|
|
497
|
-
if (level_a >= constants.lsm_levels - 1) return null;
|
|
498
|
-
assert(level_b < constants.lsm_levels);
|
|
499
|
-
|
|
500
|
-
defer it.index += 1;
|
|
501
|
-
return CompactionTableContext{
|
|
502
|
-
.compaction = &it.tree.compaction_table[it.index],
|
|
503
|
-
.level_a = level_a,
|
|
504
|
-
.level_b = level_b,
|
|
505
|
-
};
|
|
506
|
-
}
|
|
507
|
-
};
|
|
508
|
-
|
|
509
|
-
/// Runs one compaction beat for `op` and invokes `callback` once the beat has finished.
///
/// Since concurrent compactions into and out of a level may contend for the same range:
///
/// 1. compact level 0 to 1, level 2 to 3, level 4 to 5 etc., and then
/// 2. compact the immutable table to level 0, level 1 to 2, level 3 to 4 etc.
///
/// This order (even levels, then odd levels) is significant, since it reduces the number of
/// level 0 tables that overlap with the immutable table, reducing write amplification.
///
/// The bar, during which all compactions run, is therefore divided in two: the compactions
/// from even levels run in the first half bar, and those from odd levels in the second.
///
/// Compactions start on the down beat of a half bar, using 0-based beats.
/// For example, if there are 4 beats in a bar, start on beat 0 or beat 2.
pub fn compact(tree: *Tree, callback: fn (*Tree) void, op: u64) void {
    assert(tree.compaction_callback == null);
    assert(op != 0);
    assert(op == tree.compaction_op + 1);
    assert(op > tree.grid.superblock.working.vsr_state.commit_min);

    tree.compaction_op = op;

    const in_first_bar = op < constants.lsm_batch_multiple;
    if (in_first_bar) {
        // There is nothing to compact for the first measure.
        // The main compaction code path is skipped for the first compaction bar entirely
        // because it is a special case — its first beat is 1, not 0.
        const snapshot_next = op + 1;
        tree.lookup_snapshot_max = snapshot_next;
        if (snapshot_next == constants.lsm_batch_multiple) {
            tree.compact_mutable_table_into_immutable();
        }

        callback(tree);
        return;
    }

    const already_compacted = tree.grid.superblock.working.vsr_state.op_compacted(op);
    if (already_compacted) {
        // We recovered from a checkpoint, and must avoid replaying one bar of
        // compactions that were applied before the checkpoint. Repeating these ops'
        // compactions would actually perform different compactions than before,
        // causing the storage state of the replica to diverge from the cluster.
        // See also: lookup_snapshot_max_for_checkpoint().

        if (op + 1 == tree.lookup_snapshot_max) {
            // This is the last op of the skipped compaction bar.
            // Prepare the immutable table for the next bar — since this state is
            // in-memory, it cannot be skipped.
            tree.compact_mutable_table_into_immutable();
        }

        // TODO Defer this callback until tick() to avoid stack growth.
        callback(tree);
        return;
    }
    assert(op == tree.lookup_snapshot_max);

    tree.compact_start(callback);
    tree.compact_drive();
}
|
|
567
|
-
|
|
568
|
-
/// Begins a compaction beat: records `callback`, and on the first beat of a half-bar
/// reserves manifest space and starts the compactions that belong to that half-bar.
fn compact_start(tree: *Tree, callback: fn (*Tree) void) void {
    assert(tree.compaction_io_pending == 0);
    assert(tree.compaction_callback == null);

    if (constants.verify) tree.manifest.verify(tree.compaction_op);

    tracer.start(
        &tree.tracer_slot,
        .{ .tree = .{ .tree_name = tree_name } },
        .tree_compaction_beat,
        @src(),
    );

    tree.compaction_callback = callback;

    const beat = tree.compaction_op % constants.lsm_batch_multiple;
    const first_half = beat < half_bar_beat_count;
    // Compactions are only started on the first beat of either half-bar.
    const half_bar_begins = (beat == 0) or (beat == half_bar_beat_count);

    const op_min = compaction_op_min(tree.compaction_op);
    assert(op_min < snapshot_latest);
    assert(op_min % half_bar_beat_count == 0);

    log.debug(tree_name ++ ": compact_start: op={d} op_min={d} beat={d}/{d}", .{
        tree.compaction_op,
        op_min,
        beat + 1,
        constants.lsm_batch_multiple,
    });

    if (half_bar_begins) tree.manifest.reserve();

    // Try to start compacting the immutable table (second half-bar only).
    if (first_half) {
        assert(tree.compaction_table_immutable.status == .idle);
    } else if (half_bar_begins) {
        tree.compact_start_table_immutable(op_min);
    }

    // Try to start compacting the other levels.
    var slots = CompactionTableIterator{ .tree = tree };
    while (slots.next()) |context| {
        if (half_bar_begins) tree.compact_start_table(op_min, context);
    }
}
|
|
616
|
-
|
|
617
|
-
/// Starts compacting the immutable table into level 0, unless the immutable table is free.
/// Must be called on the first beat of the second half-bar.
fn compact_start_table_immutable(tree: *Tree, op_min: u64) void {
    assert(tree.compaction_op % constants.lsm_batch_multiple == half_bar_beat_count);

    const immutable = &tree.table_immutable;

    // Do not start compaction if the immutable table does not require compaction.
    if (immutable.free) return;

    assert(immutable.snapshot_min % half_bar_beat_count == 0);
    assert(immutable.values.len > 0);

    // The immutable table has no on-disk input table, and always targets level 0.
    const level_b: u8 = 0;
    const table_a: ?*const Manifest.TableInfo = null;
    const range = tree.manifest.compaction_range(
        level_b,
        immutable.key_min(),
        immutable.key_max(),
    );

    // The chosen range must cover the immutable table's own key span.
    assert(range.table_count >= 1);
    assert(range.table_count <= compaction_tables_input_max);
    assert(compare_keys(range.key_min, immutable.key_min()) != .gt);
    assert(compare_keys(range.key_max, immutable.key_max()) != .lt);

    log.debug(tree_name ++
        ": compacting immutable table to level 0 " ++
        "(values.len={d} snapshot_min={d} compaction.op_min={d} table_count={d})", .{
        immutable.values.len,
        immutable.snapshot_min,
        op_min,
        range.table_count,
    });

    tree.compaction_table_immutable.start(
        tree.grid,
        &tree.manifest,
        op_min,
        range,
        table_a,
        level_b,
        .{ .table = immutable },
    );
}
|
|
661
|
-
|
|
662
|
-
/// Starts the compaction from `context.level_a` into `context.level_b`, if the manifest
/// selects a table from level A. Must be called on the first beat of a half-bar.
fn compact_start_table(tree: *Tree, op_min: u64, context: CompactionTableContext) void {
    assert(tree.compaction_op % half_bar_beat_count == 0);

    assert(context.level_a < constants.lsm_levels);
    assert(context.level_b < constants.lsm_levels);
    assert(context.level_a + 1 == context.level_b);

    // Do not start compaction if level A does not require compaction.
    const selected = tree.manifest.compaction_table(context.level_a) orelse return;
    const table_a = selected.table;
    const range = selected.range;

    // The compaction range must cover the selected table's own key span.
    assert(range.table_count >= 1);
    assert(range.table_count <= compaction_tables_input_max);
    assert(compare_keys(table_a.key_min, table_a.key_max) != .gt);
    assert(compare_keys(range.key_min, table_a.key_min) != .gt);
    assert(compare_keys(range.key_max, table_a.key_max) != .lt);

    log.debug(tree_name ++ ": compacting {d} tables from level {d} to level {d}", .{
        range.table_count,
        context.level_a,
        context.level_b,
    });

    context.compaction.start(
        tree.grid,
        &tree.manifest,
        op_min,
        range,
        table_a,
        context.level_b,
        .{
            .grid = tree.grid,
            .address = table_a.address,
            .checksum = table_a.checksum,
        },
    );
}
|
|
700
|
-
|
|
701
|
-
/// Ticks every compaction that is active in the current half-bar.
fn compact_drive(tree: *Tree) void {
    assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
    assert(tree.compaction_callback != null);

    // Always start one fake io_pending, resolved at the end of this function,
    // to handle the case where this compaction tick triggers no IO.
    // (For example, ticking the immutable table, or level B is already done).
    tree.compaction_io_pending += 1;

    // Try to tick the immutable table compaction:
    const beat = tree.compaction_op % constants.lsm_batch_multiple;
    if (beat < half_bar_beat_count) {
        assert(tree.compaction_table_immutable.status == .idle);
    } else {
        tree.compact_tick(&tree.compaction_table_immutable);
    }

    // Try to tick the compaction for each level:
    var slots = CompactionTableIterator{ .tree = tree };
    while (slots.next()) |context| {
        tree.compact_tick(context.compaction);
    }

    // Resolve the fake io_pending taken above.
    tree.compact_tick_done();
}
|
|
726
|
-
|
|
727
|
-
/// Ticks a single compaction if it is still processing, counting it as pending IO.
/// `compaction` is a pointer to either a `CompactionTableImmutable` or a `CompactionTable`.
fn compact_tick(tree: *Tree, compaction: anytype) void {
    if (compaction.status != .processing) return;
    tree.compaction_io_pending += 1;

    const beat = tree.compaction_op % constants.lsm_batch_multiple;
    const first_half = beat < half_bar_beat_count;
    assert(compaction.level_b < constants.lsm_levels);
    // Odd target levels are compacted in the first half-bar, even ones in the second.
    assert(compaction.level_b % 2 == @boolToInt(first_half));

    const Compaction = @TypeOf(compaction.*);
    if (Compaction == CompactionTableImmutable) {
        assert(compaction.level_b == 0);
        log.debug(tree_name ++ ": compact_tick() for immutable table to level 0", .{});
        compaction.compact_tick(Tree.compact_tick_callback_table_immutable);
    } else {
        assert(Compaction == CompactionTable);
        log.debug(tree_name ++ ": compact_tick() for level {d} to level {d}", .{
            compaction.level_b - 1,
            compaction.level_b,
        });
        compaction.compact_tick(Tree.compact_tick_callback_table);
    }
}
|
|
749
|
-
|
|
750
|
-
/// Completion callback for a tick of the immutable-table compaction.
fn compact_tick_callback_table_immutable(compaction: *CompactionTableImmutable) void {
    assert(compaction.status == .processing or compaction.status == .done);
    assert(compaction.level_b < constants.lsm_levels);
    assert(compaction.level_b == 0);

    // The compaction is embedded in the tree, so the tree is recovered from its address.
    const tree = @fieldParentPtr(Tree, "compaction_table_immutable", compaction);
    // The immutable table is only compacted during the second half-bar.
    assert(tree.compaction_op % constants.lsm_batch_multiple >= half_bar_beat_count);

    log.debug(tree_name ++ ": compact_tick() complete for immutable table to level 0", .{});
    tree.compact_tick_done();
}
|
|
762
|
-
|
|
763
|
-
/// Completion callback for a tick of a level-to-level compaction.
fn compact_tick_callback_table(compaction: *CompactionTable) void {
    assert(compaction.status == .processing or compaction.status == .done);
    assert(compaction.level_b < constants.lsm_levels);
    assert(compaction.level_b > 0);

    log.debug(tree_name ++ ": compact_tick() complete for level {d} to level {d}", .{
        compaction.level_b - 1,
        compaction.level_b,
    });

    // `compaction` is one element of `tree.compaction_table`. Step back by its slot index
    // to reach element 0, view that as the whole array, and recover the tree from it.
    const slot_index = @divFloor(compaction.level_b - 1, 2);
    const first_slot = @ptrCast([*]CompactionTable, compaction) - slot_index;

    const slot_count = @divFloor(constants.lsm_levels, 2);
    const slots: *[slot_count]CompactionTable = first_slot[0..slot_count];

    const tree = @fieldParentPtr(Tree, "compaction_table", slots);
    tree.compact_tick_done();
}
|
|
782
|
-
|
|
783
|
-
/// Resolves one pending compaction IO; the last one to resolve triggers `compact_done()`.
fn compact_tick_done(tree: *Tree) void {
    assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
    assert(tree.compaction_callback != null);

    tree.compaction_io_pending -= 1;
    if (tree.compaction_io_pending != 0) return;

    // compact_done() is called after all compact_tick()'s complete.
    tree.compact_done();
}
|
|
791
|
-
|
|
792
|
-
/// Called at the end of each compaction tick, once no compaction IO is pending.
///
/// On a beat that does not end a half-bar, this advances `lookup_snapshot_max` and finishes
/// the beat. On the last beat of a half-bar, it keeps driving the compactions until none is
/// processing, then resets them, removes the tables they made invisible, and compacts the
/// manifest before the beat finishes.
fn compact_done(tree: *Tree) void {
    assert(tree.compaction_io_pending == 0);
    assert(tree.compaction_callback != null);
    assert(tree.compaction_op == tree.lookup_snapshot_max);

    const beat = tree.compaction_op % constants.lsm_batch_multiple;
    const first_half = beat < half_bar_beat_count;
    const ends_first_half = beat == half_bar_beat_count - 1;
    const ends_second_half = beat == constants.lsm_batch_multiple - 1;

    if (!(ends_first_half or ends_second_half)) {
        // TODO(Deterministic Beats): Remove this when compact_done() is called exactly
        // once when the beat finishes.
        tree.lookup_snapshot_max = tree.compaction_op + 1;

        tree.compact_finish();
        return;
    }

    // At the end of each half-bar:
    // 1. Tick the Compactions until all have completed.
    // 2. Remove invisible tables from the manifest.
    // 3. Compact the manifest.
    // Then at the end of the last beat of the bar, freeze the mutable table.
    assert(ends_first_half or ends_second_half);
    assert(ends_first_half != ends_second_half);

    const immutable_compaction = &tree.compaction_table_immutable;

    const still_compacting = busy: {
        if (first_half) {
            assert(immutable_compaction.status == .idle);
        } else if (immutable_compaction.status == .processing) {
            break :busy true;
        }

        var pending_slots = CompactionTableIterator{ .tree = tree };
        while (pending_slots.next()) |context| {
            if (context.compaction.status == .processing) break :busy true;
        }
        break :busy false;
    };

    if (still_compacting) {
        // We are at the end of a half-bar, but the compactions have not finished.
        // We keep ticking them until they finish.
        log.debug(tree_name ++ ": compact_done: driving outstanding compactions", .{});
        tree.compact_drive();
        return;
    }

    // TODO(Deterministic Beats): Move this to the top of the function when compact_done()
    // is called exactly once when the beat finishes.
    tree.lookup_snapshot_max = tree.compaction_op + 1;

    // All compactions have finished for the current half-bar.
    // We couldn't remove the (invisible) input tables until now because prefetch()
    // needs a complete set of tables for lookups to avoid missing data.

    // Reset the immutable table Compaction.
    // Also clear any tables made invisible by the compaction.
    if (!first_half) {
        switch (immutable_compaction.status) {
            // The compaction wasn't started for this half bar.
            .idle => assert(tree.table_immutable.free),
            .processing => unreachable,
            .done => {
                immutable_compaction.reset();
                tree.table_immutable.clear();
                tree.manifest.remove_invisible_tables(
                    immutable_compaction.level_b,
                    tree.lookup_snapshot_max,
                    immutable_compaction.range.key_min,
                    immutable_compaction.range.key_max,
                );
            },
        }
    }

    // Reset all the other Compactions.
    // Also clear any tables made invisible by the compactions.
    var finished_slots = CompactionTableIterator{ .tree = tree };
    while (finished_slots.next()) |context| {
        const compaction = context.compaction;
        switch (compaction.status) {
            .idle => {}, // The compaction wasn't started for this half bar.
            .processing => unreachable,
            .done => {
                compaction.reset();
                tree.manifest.remove_invisible_tables(
                    compaction.level_b,
                    tree.lookup_snapshot_max,
                    compaction.range.key_min,
                    compaction.range.key_max,
                );
                if (compaction.level_b > 0) {
                    tree.manifest.remove_invisible_tables(
                        compaction.level_b - 1,
                        tree.lookup_snapshot_max,
                        compaction.range.key_min,
                        compaction.range.key_max,
                    );
                }
            },
        }
    }

    // Every compaction of this half-bar must now be idle.
    assert(immutable_compaction.status == .idle);
    var idle_slots = CompactionTableIterator{ .tree = tree };
    while (idle_slots.next()) |context| {
        assert(context.compaction.status == .idle);
    }

    // At the end of the last beat of the bar:
    // - Assert all visible tables haven't overflowed their max per level.
    // - Convert mutable table to immutable table for next bar.
    if (ends_second_half) {
        tree.manifest.assert_level_table_counts();
        tree.compact_mutable_table_into_immutable();
    }

    // At the end of either half-bar:
    // - Compact the manifest before invoking the compact() callback.
    tree.manifest.compact(compact_manifest_callback);
}
|
|
914
|
-
|
|
915
|
-
/// Called after the last beat of a full compaction bar.
/// Moves the mutable table's values, sorted, into the (free) immutable table.
/// Does nothing when the mutable table is empty.
fn compact_mutable_table_into_immutable(tree: *Tree) void {
    assert(tree.table_immutable.free);
    assert((tree.compaction_op + 1) % constants.lsm_batch_multiple == 0);
    assert(tree.compaction_op + 1 == tree.lookup_snapshot_max);

    if (tree.table_mutable.count() == 0) return;

    // Sort the mutable table values directly into the immutable table's array.
    const buffer = tree.table_immutable.values_max();
    const sorted = tree.table_mutable.sort_into_values_and_clear(buffer);
    assert(sorted.ptr == buffer.ptr);

    // The immutable table must be visible to the next bar — setting its snapshot_min to
    // lookup_snapshot_max guarantees this.
    //
    // In addition, the immutable table is conceptually an output table of this compaction
    // bar, and now its snapshot_min matches the snapshot_min of the Compactions' output
    // tables.
    tree.table_immutable.reset_with_sorted_values(tree.lookup_snapshot_max, sorted);

    assert(tree.table_mutable.count() == 0);
    assert(!tree.table_immutable.free);
}
|
|
939
|
-
|
|
940
|
-
/// Completion callback for `manifest.compact()`; finishes the compaction beat.
fn compact_manifest_callback(manifest: *Manifest) void {
    // The manifest is embedded in the tree, so the tree is recovered from its address.
    const owner = @fieldParentPtr(Tree, "manifest", manifest);
    assert(owner.compaction_io_pending == 0);
    assert(owner.compaction_callback != null);

    owner.compact_finish();
}
|
|
946
|
-
|
|
947
|
-
/// Called at the end of each compaction beat.
/// Ends the beat's trace span and hands control back to the caller of `compact()`.
fn compact_finish(tree: *Tree) void {
    assert(tree.compaction_io_pending == 0);

    tracer.end(
        &tree.tracer_slot,
        .{ .tree = .{ .tree_name = tree_name } },
        .tree_compaction_beat,
    );

    if (constants.verify) tree.manifest.verify(tree.compaction_op);

    // Invoke the compact() callback after the manifest compacts at the end of the beat.
    // The field is cleared before the call so that the callback sees no callback pending.
    const on_beat_done = tree.compaction_callback.?;
    tree.compaction_callback = null;
    on_beat_done(tree);
}
|
|
966
|
-
|
|
967
|
-
/// Starts an asynchronous checkpoint of the tree's manifest; `callback` is invoked when the
/// manifest checkpoint completes. May only be called after the last beat of a bar, with no
/// compaction work outstanding.
pub fn checkpoint(tree: *Tree, callback: fn (*Tree) void) void {
    // Assert no outstanding compact_tick() work.
    assert(tree.compaction_io_pending == 0);
    assert(tree.compaction_callback == null);
    assert(tree.compaction_op > 0);
    assert(tree.compaction_op + 1 == tree.lookup_snapshot_max);
    // Don't re-run the checkpoint we recovered from.
    assert(!tree.grid.superblock.working.vsr_state.op_compacted(tree.compaction_op));

    // Assert that this is the last beat in the compaction bar.
    const beat = tree.compaction_op % constants.lsm_batch_multiple;
    assert(constants.lsm_batch_multiple - 1 == beat);

    // Assert no outstanding compactions.
    assert(tree.compaction_table_immutable.status == .idle);
    var slot: usize = 0;
    while (slot < tree.compaction_table.len) : (slot += 1) {
        assert(tree.compaction_table[slot].status == .idle);
    }

    // Assert all manifest levels haven't overflowed their table counts.
    tree.manifest.assert_level_table_counts();

    // Assert that we're checkpointing only after invisible tables have been removed.
    if (constants.verify) {
        tree.manifest.assert_no_invisible_tables(compaction_op_min(tree.compaction_op));
    }

    // Start an asynchronous checkpoint on the manifest.
    assert(tree.checkpoint_callback == null);
    tree.checkpoint_callback = callback;
    tree.manifest.checkpoint(manifest_checkpoint_callback);
}
|
|
1000
|
-
|
|
1001
|
-
/// Completion callback for `manifest.checkpoint()`; invokes the pending checkpoint callback.
fn manifest_checkpoint_callback(manifest: *Manifest) void {
    // The manifest is embedded in the tree, so the tree is recovered from its address.
    const owner = @fieldParentPtr(Tree, "manifest", manifest);
    assert(owner.checkpoint_callback != null);

    // Clear the field before the call so that the callback sees no checkpoint pending.
    const on_checkpoint_done = owner.checkpoint_callback.?;
    owner.checkpoint_callback = null;
    on_checkpoint_done(owner);
}
|
|
1009
|
-
|
|
1010
|
-
/// Describes the key range of a range query.
pub const RangeQuery = union(enum) {
    /// A range with both a start key and an end key.
    bounded: struct {
        start: Key,
        end: Key,
    },
    /// A range with only a start key, scanned in the given order.
    open: struct {
        start: Key,
        order: enum {
            ascending,
            descending,
        },
    },
};
|
|
1023
|
-
|
|
1024
|
-
/// The state of a range query over a tree as of a given snapshot.
pub const RangeQueryIterator = struct {
    tree: *Tree,
    snapshot: u64,
    query: RangeQuery,

    /// NOTE(review): placeholder — takes no iterator receiver, never invokes `callback`,
    /// and produces no results.
    pub fn next(callback: fn (result: ?Value) void) void {
        _ = callback;
    }
};
|
|
1033
|
-
|
|
1034
|
-
/// Creates an iterator for `query` over this tree as of `snapshot`.
///
/// NOTE(review): range queries are not implemented yet — `RangeQueryIterator.next()` is
/// still a placeholder, so the returned iterator yields nothing.
pub fn range_query(
    tree: *Tree,
    /// The snapshot timestamp, if any
    snapshot: ?u64,
    query: RangeQuery,
) RangeQueryIterator {
    // Previously the parameters were discarded and nothing was returned, which cannot
    // compile for a non-void return type once this function is referenced.
    return RangeQueryIterator{
        .tree = tree,
        // NOTE(review): assumes a missing snapshot means the latest snapshot — confirm.
        .snapshot = snapshot orelse snapshot_latest,
        .query = query,
    };
}
|
|
1044
|
-
};
|
|
1045
|
-
}
|
|
1046
|
-
|
|
1047
|
-
/// Returns the first op of the compaction (Compaction.op_min) for a given op/beat, i.e.
/// `op` rounded down to a multiple of `half_bar_beat_count`.
///
/// After this compaction finishes:
/// - `op_min + half_bar_beat_count - 1` will be the input tables' snapshot_max.
/// - `op_min + half_bar_beat_count` will be the output tables' snapshot_min.
///
/// Each half-bar has a separate op_min (for deriving the output snapshot_min) instead of each
/// full bar because this allows the output tables of the first half-bar's compaction to be
/// prefetched against earlier — hopefully while they are still warm in the cache from being
/// written.
pub fn compaction_op_min(op: u64) u64 {
    const beats_into_half_bar = op % half_bar_beat_count;
    return op - beats_into_half_bar;
}
|
|
1059
|
-
|
|
1060
|
-
/// These charts depict the commit/compact ops and `lookup_snapshot_max` over a series of
|
|
1061
|
-
/// commits and compactions (with lsm_batch_multiple=8).
|
|
1062
|
-
///
|
|
1063
|
-
/// Legend:
|
|
1064
|
-
///
|
|
1065
|
-
/// ┼ full bar (first half-bar start)
|
|
1066
|
-
/// ┬ half bar (second half-bar start)
|
|
1067
|
-
/// $ lookup_snapshot_max (prefetch reads from the current snapshot)
|
|
1068
|
-
/// This is incremented at the end of each compact().
|
|
1069
|
-
/// . op is in mutable table (in memory)
|
|
1070
|
-
/// , op is in immutable table (in memory)
|
|
1071
|
-
/// # op is on disk
|
|
1072
|
-
/// ✓ checkpoint() may follow compact()
|
|
1073
|
-
///
|
|
1074
|
-
/// 0 2 4 6 8 0 2 4 6
|
|
1075
|
-
/// ┼───┬───┼───┬───┼
|
|
1076
|
-
/// .$ ╷ ╷ init(superblock.commit_min=0)⎤ Compaction is effectively a noop for the
|
|
1077
|
-
/// .$ ╷ ╷ commit;compact( 1) start/end ⎥ first bar because there are no tables on
|
|
1078
|
-
/// ..$ ╷ ╷ commit;compact( 2) start/end ⎥ disk yet, and no immutable table to
|
|
1079
|
-
/// ...$ ╷ ╷ commit;compact( 3) start/end ⎥ flush.
|
|
1080
|
-
/// ....$ ╷ ╷ commit;compact( 4) start/end ⎥
|
|
1081
|
-
/// .....$ ╷ ╷ commit;compact( 5) start/end ⎥ This applies:
|
|
1082
|
-
/// ......$ ╷ ╷ commit;compact( 6) start/end ⎥ - when the LSM is starting on a freshly
|
|
1083
|
-
/// .......$╷ ╷ commit;compact( 7) start ⎤⎥ formatted data file, and also
|
|
1084
|
-
/// ,,,,,,,,$ ╷ ✓ compact( 7) end⎦⎦ - when the LSM is recovering from a crash
|
|
1085
|
-
/// ,,,,,,,,$ ╷ commit;compact( 8) start/end (see below).
|
|
1086
|
-
/// ,,,,,,,,.$ ╷ commit;compact( 9) start/end
|
|
1087
|
-
/// ,,,,,,,,..$ ╷ commit;compact(10) start/end
|
|
1088
|
-
/// ,,,,,,,,...$ ╷ commit;compact(11) start/end
|
|
1089
|
-
/// ,,,,,,,,....$ ╷ commit;compact(12) start/end
|
|
1090
|
-
/// ,,,,,,,,.....$ ╷ commit;compact(13) start/end
|
|
1091
|
-
/// ,,,,,,,,......$ ╷ commit;compact(14) start/end
|
|
1092
|
-
/// ,,,,,,,,.......$╷ commit;compact(15) start ⎤
|
|
1093
|
-
/// ########,,,,,,,,$ ✓ compact(15) end⎦
|
|
1094
|
-
/// ########,,,,,,,,$ commit;compact(16) start/end
|
|
1095
|
-
/// ┼───┬───┼───┬───┼
|
|
1096
|
-
/// 0 2 4 6 8 0 2 4 6
|
|
1097
|
-
/// ┼───┬───┼───┬───┼ Recover with a checkpoint taken at op 15.
|
|
1098
|
-
/// ######## $ init(superblock.commit_min=7) At op 15, ops 8…15 are in memory, so they
|
|
1099
|
-
/// ########. $ commit ( 8) start/end ⎤ were dropped by the crash.
|
|
1100
|
-
/// ########.. $ commit ( 9) start/end ⎥
|
|
1101
|
-
/// ########... $ commit (10) start/end ⎥ But compaction is not run for ops 8…15
|
|
1102
|
-
/// ########.... $ commit (11) start/end ⎥ because it was already performed
|
|
1103
|
-
/// ########..... $ commit (12) start/end ⎥ before the checkpoint.
|
|
1104
|
-
/// ########...... $ commit (13) start/end ⎥
|
|
1105
|
-
/// ########....... $ commit (14) start/end ⎥ We can begin to compact again at op 16,
|
|
1106
|
-
/// ########........$ commit (15) start ⎤⎥ because those compactions (if previously
|
|
1107
|
-
/// ########,,,,,,,,$ ✓ (15) end⎦⎦ performed) are not included in the
|
|
1108
|
-
/// ########,,,,,,,,$ commit;compact(16) start/end checkpoint.
|
|
1109
|
-
/// ┼───┬───┼───┬───┼
|
|
1110
|
-
/// 0 2 4 6 8 0 2 4 6
|
|
1111
|
-
///
|
|
1112
|
-
/// Notice how in the checkpoint recovery example above, we are careful not to `compact(op)` twice
|
|
1113
|
-
/// for any op (even if we crash/recover), since that could lead to differences between replicas'
|
|
1114
|
-
/// storage. The last bar of `commit()`s is always only in memory, so it is safe to repeat.
|
|
1115
|
-
///
|
|
1116
|
-
/// Additionally, while skipping compactions during recovery, we use a `lookup_snapshot_max`
|
|
1117
|
-
/// different than the original compactions — the old tables may have been removed during the
|
|
1118
|
-
/// checkpoint.
|
|
1119
|
-
fn lookup_snapshot_max_for_checkpoint(op_checkpoint: u64) u64 {
    // Start from 1 when there is no checkpoint yet, because we never commit op 0.
    // Otherwise the snapshot lies one full bar plus one op past the checkpoint.
    return if (op_checkpoint == 0)
        1
    else
        op_checkpoint + constants.lsm_batch_multiple + 1;
}
|
|
1127
|
-
|
|
1128
|
-
/// The total number of tables that can be supported by the tree across so many levels:
/// the sum of `table_count_max_for_level()` over levels `0..levels_count`.
pub fn table_count_max_for_tree(growth_factor: u32, levels_count: u32) u32 {
    assert(growth_factor >= 4);
    assert(growth_factor <= 16); // Limit excessive write amplification.
    assert(levels_count >= 2);
    assert(levels_count <= 10); // Limit excessive read amplification.
    assert(levels_count <= constants.lsm_levels);

    var total: u32 = 0;
    var level: u32 = 0;
    while (level < levels_count) {
        total += table_count_max_for_level(growth_factor, level);
        level += 1;
    }
    return total;
}
|
|
1143
|
-
|
|
1144
|
-
/// The total number of tables that can be supported by the level alone:
/// `growth_factor` raised to the power `level + 1` (level 0 holds `growth_factor` tables).
pub fn table_count_max_for_level(growth_factor: u32, level: u32) u32 {
    // `level` is unsigned, so only the upper bound needs checking; the former
    // `assert(level >= 0)` could never fail.
    assert(level < constants.lsm_levels);

    return math.pow(u32, growth_factor, level + 1);
}
|
|
1151
|
-
|
|
1152
|
-
test "table_count_max_for_level/tree" {
    const expectEqual = std.testing.expectEqual;

    const growth_factor: u32 = 8;
    // Expected capacity of levels 0 through 6 with a growth factor of 8.
    const per_level = [_]u32{ 8, 64, 512, 4096, 32768, 262144, 2097152 };

    var running_total: u32 = 0;
    for (per_level) |expected, level| {
        try expectEqual(
            expected,
            table_count_max_for_level(growth_factor, @intCast(u32, level)),
        );

        // A tree has at least two levels, so tree totals are checked from the second
        // level onwards: 72, 584, 4680, 37448, 299592, 2396744.
        running_total += expected;
        if (level >= 1) {
            try expectEqual(
                running_total,
                table_count_max_for_tree(growth_factor, @intCast(u32, level + 1)),
            );
        }
    }
}
|