tigerbeetle-node 0.11.13 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. package/dist/bin/aarch64-linux-gnu/client.node +0 -0
  2. package/dist/bin/aarch64-linux-musl/client.node +0 -0
  3. package/dist/bin/aarch64-macos/client.node +0 -0
  4. package/dist/bin/x86_64-linux-gnu/client.node +0 -0
  5. package/dist/bin/x86_64-linux-musl/client.node +0 -0
  6. package/dist/bin/x86_64-macos/client.node +0 -0
  7. package/dist/index.js +33 -1
  8. package/dist/index.js.map +1 -1
  9. package/package-lock.json +66 -0
  10. package/package.json +6 -16
  11. package/src/index.ts +56 -1
  12. package/src/node.zig +9 -9
  13. package/dist/.client.node.sha256 +0 -1
  14. package/scripts/build_lib.sh +0 -61
  15. package/scripts/download_node_headers.sh +0 -32
  16. package/src/tigerbeetle/scripts/benchmark.bat +0 -55
  17. package/src/tigerbeetle/scripts/benchmark.sh +0 -66
  18. package/src/tigerbeetle/scripts/confirm_image.sh +0 -44
  19. package/src/tigerbeetle/scripts/fail_on_diff.sh +0 -9
  20. package/src/tigerbeetle/scripts/fuzz_loop.sh +0 -15
  21. package/src/tigerbeetle/scripts/fuzz_loop_hash_log.sh +0 -12
  22. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +0 -7
  23. package/src/tigerbeetle/scripts/install.bat +0 -7
  24. package/src/tigerbeetle/scripts/install.sh +0 -21
  25. package/src/tigerbeetle/scripts/install_zig.bat +0 -113
  26. package/src/tigerbeetle/scripts/install_zig.sh +0 -90
  27. package/src/tigerbeetle/scripts/lint.zig +0 -199
  28. package/src/tigerbeetle/scripts/pre-commit.sh +0 -9
  29. package/src/tigerbeetle/scripts/scripts/benchmark.bat +0 -55
  30. package/src/tigerbeetle/scripts/scripts/benchmark.sh +0 -66
  31. package/src/tigerbeetle/scripts/scripts/confirm_image.sh +0 -44
  32. package/src/tigerbeetle/scripts/scripts/fail_on_diff.sh +0 -9
  33. package/src/tigerbeetle/scripts/scripts/fuzz_loop.sh +0 -15
  34. package/src/tigerbeetle/scripts/scripts/fuzz_loop_hash_log.sh +0 -12
  35. package/src/tigerbeetle/scripts/scripts/fuzz_unique_errors.sh +0 -7
  36. package/src/tigerbeetle/scripts/scripts/install.bat +0 -7
  37. package/src/tigerbeetle/scripts/scripts/install.sh +0 -21
  38. package/src/tigerbeetle/scripts/scripts/install_zig.bat +0 -113
  39. package/src/tigerbeetle/scripts/scripts/install_zig.sh +0 -90
  40. package/src/tigerbeetle/scripts/scripts/lint.zig +0 -199
  41. package/src/tigerbeetle/scripts/scripts/pre-commit.sh +0 -9
  42. package/src/tigerbeetle/scripts/scripts/shellcheck.sh +0 -5
  43. package/src/tigerbeetle/scripts/scripts/tests_on_alpine.sh +0 -10
  44. package/src/tigerbeetle/scripts/scripts/tests_on_ubuntu.sh +0 -14
  45. package/src/tigerbeetle/scripts/scripts/upgrade_ubuntu_kernel.sh +0 -48
  46. package/src/tigerbeetle/scripts/scripts/validate_docs.sh +0 -23
  47. package/src/tigerbeetle/scripts/scripts/vr_state_enumerate +0 -46
  48. package/src/tigerbeetle/scripts/shellcheck.sh +0 -5
  49. package/src/tigerbeetle/scripts/tests_on_alpine.sh +0 -10
  50. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +0 -14
  51. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -48
  52. package/src/tigerbeetle/scripts/validate_docs.sh +0 -23
  53. package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
  54. package/src/tigerbeetle/src/benchmark.zig +0 -336
  55. package/src/tigerbeetle/src/config.zig +0 -233
  56. package/src/tigerbeetle/src/constants.zig +0 -428
  57. package/src/tigerbeetle/src/ewah.zig +0 -286
  58. package/src/tigerbeetle/src/ewah_benchmark.zig +0 -120
  59. package/src/tigerbeetle/src/ewah_fuzz.zig +0 -130
  60. package/src/tigerbeetle/src/fifo.zig +0 -120
  61. package/src/tigerbeetle/src/io/benchmark.zig +0 -213
  62. package/src/tigerbeetle/src/io/darwin.zig +0 -814
  63. package/src/tigerbeetle/src/io/linux.zig +0 -1071
  64. package/src/tigerbeetle/src/io/test.zig +0 -643
  65. package/src/tigerbeetle/src/io/windows.zig +0 -1183
  66. package/src/tigerbeetle/src/io.zig +0 -34
  67. package/src/tigerbeetle/src/iops.zig +0 -107
  68. package/src/tigerbeetle/src/lsm/README.md +0 -308
  69. package/src/tigerbeetle/src/lsm/binary_search.zig +0 -341
  70. package/src/tigerbeetle/src/lsm/bloom_filter.zig +0 -125
  71. package/src/tigerbeetle/src/lsm/compaction.zig +0 -603
  72. package/src/tigerbeetle/src/lsm/composite_key.zig +0 -77
  73. package/src/tigerbeetle/src/lsm/direction.zig +0 -11
  74. package/src/tigerbeetle/src/lsm/eytzinger.zig +0 -587
  75. package/src/tigerbeetle/src/lsm/eytzinger_benchmark.zig +0 -330
  76. package/src/tigerbeetle/src/lsm/forest.zig +0 -205
  77. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +0 -450
  78. package/src/tigerbeetle/src/lsm/grid.zig +0 -573
  79. package/src/tigerbeetle/src/lsm/groove.zig +0 -1036
  80. package/src/tigerbeetle/src/lsm/k_way_merge.zig +0 -474
  81. package/src/tigerbeetle/src/lsm/level_iterator.zig +0 -332
  82. package/src/tigerbeetle/src/lsm/manifest.zig +0 -617
  83. package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -878
  84. package/src/tigerbeetle/src/lsm/manifest_log.zig +0 -789
  85. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +0 -691
  86. package/src/tigerbeetle/src/lsm/merge_iterator.zig +0 -106
  87. package/src/tigerbeetle/src/lsm/node_pool.zig +0 -235
  88. package/src/tigerbeetle/src/lsm/posted_groove.zig +0 -381
  89. package/src/tigerbeetle/src/lsm/segmented_array.zig +0 -1329
  90. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +0 -148
  91. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +0 -9
  92. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +0 -850
  93. package/src/tigerbeetle/src/lsm/table.zig +0 -1009
  94. package/src/tigerbeetle/src/lsm/table_immutable.zig +0 -192
  95. package/src/tigerbeetle/src/lsm/table_iterator.zig +0 -340
  96. package/src/tigerbeetle/src/lsm/table_mutable.zig +0 -203
  97. package/src/tigerbeetle/src/lsm/test.zig +0 -439
  98. package/src/tigerbeetle/src/lsm/tree.zig +0 -1169
  99. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +0 -479
  100. package/src/tigerbeetle/src/message_bus.zig +0 -1013
  101. package/src/tigerbeetle/src/message_pool.zig +0 -156
  102. package/src/tigerbeetle/src/ring_buffer.zig +0 -399
  103. package/src/tigerbeetle/src/simulator.zig +0 -580
  104. package/src/tigerbeetle/src/state_machine/auditor.zig +0 -578
  105. package/src/tigerbeetle/src/state_machine/workload.zig +0 -883
  106. package/src/tigerbeetle/src/state_machine.zig +0 -2099
  107. package/src/tigerbeetle/src/static_allocator.zig +0 -65
  108. package/src/tigerbeetle/src/stdx.zig +0 -171
  109. package/src/tigerbeetle/src/storage.zig +0 -393
  110. package/src/tigerbeetle/src/testing/cluster/message_bus.zig +0 -82
  111. package/src/tigerbeetle/src/testing/cluster/network.zig +0 -237
  112. package/src/tigerbeetle/src/testing/cluster/state_checker.zig +0 -169
  113. package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +0 -202
  114. package/src/tigerbeetle/src/testing/cluster.zig +0 -444
  115. package/src/tigerbeetle/src/testing/fuzz.zig +0 -140
  116. package/src/tigerbeetle/src/testing/hash_log.zig +0 -66
  117. package/src/tigerbeetle/src/testing/id.zig +0 -99
  118. package/src/tigerbeetle/src/testing/packet_simulator.zig +0 -374
  119. package/src/tigerbeetle/src/testing/priority_queue.zig +0 -645
  120. package/src/tigerbeetle/src/testing/reply_sequence.zig +0 -139
  121. package/src/tigerbeetle/src/testing/state_machine.zig +0 -250
  122. package/src/tigerbeetle/src/testing/storage.zig +0 -757
  123. package/src/tigerbeetle/src/testing/table.zig +0 -247
  124. package/src/tigerbeetle/src/testing/time.zig +0 -84
  125. package/src/tigerbeetle/src/tigerbeetle.zig +0 -227
  126. package/src/tigerbeetle/src/time.zig +0 -112
  127. package/src/tigerbeetle/src/tracer.zig +0 -529
  128. package/src/tigerbeetle/src/unit_tests.zig +0 -40
  129. package/src/tigerbeetle/src/vopr.zig +0 -495
  130. package/src/tigerbeetle/src/vsr/README.md +0 -209
  131. package/src/tigerbeetle/src/vsr/client.zig +0 -544
  132. package/src/tigerbeetle/src/vsr/clock.zig +0 -855
  133. package/src/tigerbeetle/src/vsr/journal.zig +0 -2415
  134. package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +0 -111
  135. package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
  136. package/src/tigerbeetle/src/vsr/replica.zig +0 -6616
  137. package/src/tigerbeetle/src/vsr/replica_format.zig +0 -219
  138. package/src/tigerbeetle/src/vsr/superblock.zig +0 -1631
  139. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +0 -256
  140. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +0 -929
  141. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +0 -334
  142. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +0 -390
  143. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +0 -615
  144. package/src/tigerbeetle/src/vsr/superblock_quorums.zig +0 -394
  145. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +0 -314
  146. package/src/tigerbeetle/src/vsr.zig +0 -1425
@@ -1,1169 +0,0 @@
1
- //! An LSM tree.
2
- const std = @import("std");
3
- const builtin = @import("builtin");
4
- const assert = std.debug.assert;
5
- const math = std.math;
6
- const mem = std.mem;
7
- const os = std.os;
8
-
9
- const log = std.log.scoped(.tree);
10
- const tracer = @import("../tracer.zig");
11
-
12
- const constants = @import("../constants.zig");
13
- const div_ceil = @import("../stdx.zig").div_ceil;
14
- const eytzinger = @import("eytzinger.zig").eytzinger;
15
- const vsr = @import("../vsr.zig");
16
- const bloom_filter = @import("bloom_filter.zig");
17
-
18
- const CompositeKey = @import("composite_key.zig").CompositeKey;
19
- const NodePool = @import("node_pool.zig").NodePool(constants.lsm_manifest_node_size, 16);
20
- const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
21
-
22
- /// We reserve maxInt(u64) to indicate that a table has not been deleted.
23
- /// Tables that have not been deleted have snapshot_max of maxInt(u64).
24
- /// Since we ensure and assert that a query snapshot never exactly matches
25
- /// the snapshot_min/snapshot_max of a table, we must use maxInt(u64) - 1
26
- /// to query all non-deleted tables.
27
- pub const snapshot_latest: u64 = math.maxInt(u64) - 1;
28
-
29
- const half_bar_beat_count = @divExact(constants.lsm_batch_multiple, 2);
30
-
31
- // StateMachine:
32
- //
33
- // /// state machine will pass this on to all object stores
34
- // /// Read I/O only
35
- // pub fn read(batch, callback) void
36
- //
37
- // /// write the ops in batch to the memtable/objcache, previously called commit()
38
- // pub fn write(batch) void
39
- //
40
- // /// Flush in memory state to disk, perform merges, etc
41
- // /// Only function that triggers Write I/O in LSMs, as well as some Read
42
- // /// Make as incremental as possible, don't block the main thread, avoid high latency/spikes
43
- // pub fn flush(callback) void
44
- //
45
- // /// Write manifest info for all object stores into buffer
46
- // pub fn encode_superblock(buffer) void
47
- //
48
- // /// Restore all in-memory state from the superblock data
49
- // pub fn decode_superblock(buffer) void
50
- //
51
-
52
- /// The maximum number of tables for a single tree.
53
- pub const table_count_max = table_count_max_for_tree(
54
- constants.lsm_growth_factor,
55
- constants.lsm_levels,
56
- );
57
-
58
- /// The upper-bound count of input tables to a single tree's compaction.
59
- ///
60
- /// - +1 from level A.
61
- /// - +lsm_growth_factor from level B. The A-input table cannot overlap with an extra B-input table
62
- /// because input table selection is least-overlap. If the input table overlaps on one or both
63
- /// edges, there must be another table with less overlap to select.
64
- pub const compaction_tables_input_max = 1 + constants.lsm_growth_factor;
65
-
66
- /// The upper-bound count of output tables from a single tree's compaction.
67
- /// In the "worst" case, no keys are overwritten/merged, and no tombstones are dropped.
68
- pub const compaction_tables_output_max = compaction_tables_input_max;
69
-
70
- /// The maximum number of concurrent compactions (per tree).
71
- pub const compactions_max = div_ceil(constants.lsm_levels, 2);
72
-
73
- pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_name: [:0]const u8) type {
74
- const Key = TreeTable.Key;
75
- const Value = TreeTable.Value;
76
- const compare_keys = TreeTable.compare_keys;
77
- const tombstone = TreeTable.tombstone;
78
-
79
- const tree_hash = blk: {
80
- // Blake3 hash does alot at comptime..
81
- @setEvalBranchQuota(tree_name.len * 1024);
82
-
83
- var hash: u256 = undefined;
84
- std.crypto.hash.Blake3.hash(tree_name, std.mem.asBytes(&hash), .{});
85
- break :blk @truncate(u128, hash);
86
- };
87
-
88
- return struct {
89
- const Tree = @This();
90
-
91
- // Expose the Table & hash for the Groove.
92
- pub const Table = TreeTable;
93
- pub const name = tree_name;
94
- pub const hash = tree_hash;
95
-
96
- const Grid = @import("grid.zig").GridType(Storage);
97
- const Manifest = @import("manifest.zig").ManifestType(Table, Storage);
98
- pub const TableMutable = @import("table_mutable.zig").TableMutableType(Table);
99
- const TableImmutable = @import("table_immutable.zig").TableImmutableType(Table);
100
-
101
- const CompactionType = @import("compaction.zig").CompactionType;
102
- const TableIteratorType = @import("table_iterator.zig").TableIteratorType;
103
- const TableImmutableIteratorType = @import("table_immutable.zig").TableImmutableIteratorType;
104
-
105
- const CompactionTable = CompactionType(Table, Storage, TableIteratorType);
106
- const CompactionTableImmutable = CompactionType(Table, Storage, TableImmutableIteratorType);
107
-
108
- grid: *Grid,
109
- options: Options,
110
-
111
- table_mutable: TableMutable,
112
- table_immutable: TableImmutable,
113
- values_cache: ?*TableMutable.ValuesCache,
114
-
115
- manifest: Manifest,
116
-
117
- compaction_table_immutable: CompactionTableImmutable,
118
-
119
- /// The number of Compaction instances is divided by two as, at any given compaction tick,
120
- /// we're only compacting either even or odd levels but never both.
121
- /// Uses divFloor as the last level, even with odd lsm_levels, doesn't compact to anything.
122
- /// (e.g. floor(5/2) = 2 for levels 0->1, 2->3 when even and immut->0, 1->2, 3->4 when odd).
123
- /// This means, that for odd lsm_levels, the last CompactionTable is unused.
124
- compaction_table: [@divFloor(constants.lsm_levels, 2)]CompactionTable,
125
-
126
- /// While a compaction is running, this is the op of the last compact().
127
- /// While no compaction is running, this is the op of the last compact() to complete.
128
- /// (When recovering from a checkpoint, compaction_op starts at op_checkpoint).
129
- compaction_op: u64,
130
-
131
- /// The maximum snapshot which is safe to prefetch from.
132
- /// The minimum snapshot which can see the mutable table.
133
- ///
134
- /// This field ensures that the tree never queries the output tables of a running
135
- /// compaction; they are incomplete.
136
- ///
137
- /// See lookup_snapshot_max_for_checkpoint().
138
- ///
139
- /// Invariants:
140
- /// * `lookup_snapshot_max = compaction_op` while any compaction beat is in progress.
141
- /// * `lookup_snapshot_max = compaction_op + 1` after a compaction beat finishes.
142
- /// * `lookup_snapshot_max ≥ op_checkpoint + 1 + lsm_batch_multiple`
143
- /// when `op_checkpoint ≠ 0`.
144
- lookup_snapshot_max: u64,
145
-
146
- compaction_io_pending: usize,
147
- compaction_callback: ?fn (*Tree) void,
148
-
149
- checkpoint_callback: ?fn (*Tree) void,
150
- open_callback: ?fn (*Tree) void,
151
-
152
- tracer_slot: ?tracer.SpanStart = null,
153
-
154
- pub const Options = struct {
155
- /// The number of objects to cache in the set-associative value cache.
156
- cache_entries_max: u32 = 0,
157
- };
158
-
159
- pub fn init(
160
- allocator: mem.Allocator,
161
- node_pool: *NodePool,
162
- grid: *Grid,
163
- options: Options,
164
- ) !Tree {
165
- assert(grid.superblock.opened);
166
-
167
- var values_cache: ?*TableMutable.ValuesCache = null;
168
-
169
- if (options.cache_entries_max > 0) {
170
- // Cache is heap-allocated to pass a pointer into the mutable table.
171
- values_cache = try allocator.create(TableMutable.ValuesCache);
172
- }
173
- errdefer if (values_cache) |c| allocator.destroy(c);
174
-
175
- if (options.cache_entries_max > 0) {
176
- values_cache.?.* = try TableMutable.ValuesCache.init(
177
- allocator,
178
- options.cache_entries_max,
179
- );
180
- }
181
- errdefer if (values_cache) |c| c.deinit(allocator);
182
-
183
- var table_mutable = try TableMutable.init(allocator, values_cache);
184
- errdefer table_mutable.deinit(allocator);
185
-
186
- var table_immutable = try TableImmutable.init(allocator);
187
- errdefer table_immutable.deinit(allocator);
188
-
189
- var manifest = try Manifest.init(allocator, node_pool, grid, tree_hash);
190
- errdefer manifest.deinit(allocator);
191
-
192
- var compaction_table_immutable = try CompactionTableImmutable.init(
193
- allocator,
194
- std.fmt.comptimePrint("{s}(immutable->0)", .{tree_name}),
195
- );
196
- errdefer compaction_table_immutable.deinit(allocator);
197
-
198
- var compaction_table: [@divFloor(constants.lsm_levels, 2)]CompactionTable = undefined;
199
- {
200
- comptime var i: usize = 0;
201
- inline while (i < compaction_table.len) : (i += 1) {
202
- errdefer for (compaction_table[0..i]) |*c| c.deinit(allocator);
203
- const compaction_name = std.fmt.comptimePrint("{s}({}->{}/{}->{})", .{
204
- tree_name,
205
- 2 * i,
206
- 2 * i + 1,
207
- 2 * i + 1,
208
- 2 * i + 2,
209
- });
210
- compaction_table[i] = try CompactionTable.init(allocator, compaction_name);
211
- }
212
- }
213
- errdefer for (compaction_table) |*c| c.deinit(allocator);
214
-
215
- // Compaction is one bar ahead of superblock's commit_min.
216
- const op_checkpoint = grid.superblock.working.vsr_state.commit_min;
217
- const lookup_snapshot_max = lookup_snapshot_max_for_checkpoint(op_checkpoint);
218
- const compaction_op = op_checkpoint;
219
-
220
- return Tree{
221
- .grid = grid,
222
- .options = options,
223
- .table_mutable = table_mutable,
224
- .table_immutable = table_immutable,
225
- .values_cache = values_cache,
226
- .manifest = manifest,
227
- .compaction_table_immutable = compaction_table_immutable,
228
- .compaction_table = compaction_table,
229
- .compaction_op = compaction_op,
230
- .lookup_snapshot_max = lookup_snapshot_max,
231
- .compaction_io_pending = 0,
232
- .compaction_callback = null,
233
- .checkpoint_callback = null,
234
- .open_callback = null,
235
- };
236
- }
237
-
238
- pub fn deinit(tree: *Tree, allocator: mem.Allocator) void {
239
- assert(tree.tracer_slot == null);
240
-
241
- tree.compaction_table_immutable.deinit(allocator);
242
- for (tree.compaction_table) |*compaction| compaction.deinit(allocator);
243
-
244
- // TODO Consider whether we should release blocks acquired from Grid.block_free_set.
245
- tree.table_mutable.deinit(allocator);
246
- tree.table_immutable.deinit(allocator);
247
- tree.manifest.deinit(allocator);
248
-
249
- if (tree.values_cache) |cache| {
250
- cache.deinit(allocator);
251
- allocator.destroy(cache);
252
- }
253
- }
254
-
255
- pub fn put(tree: *Tree, value: *const Value) void {
256
- tree.table_mutable.put(value);
257
- }
258
-
259
- pub fn remove(tree: *Tree, value: *const Value) void {
260
- tree.table_mutable.remove(value);
261
- }
262
-
263
- /// Returns the value from the mutable or immutable table (possibly a tombstone),
264
- /// if one is available for the specified snapshot.
265
- pub fn lookup_from_memory(tree: *Tree, snapshot: u64, key: Key) ?*const Value {
266
- assert(tree.lookup_snapshot_max >= snapshot);
267
-
268
- if (tree.lookup_snapshot_max == snapshot) {
269
- if (tree.table_mutable.get(key)) |value| return value;
270
- } else {
271
- // The mutable table is converted to an immutable table when a snapshot is created.
272
- // This means that a past snapshot will never be able to see the mutable table.
273
- // This simplifies the mutable table and eliminates compaction for duplicate puts.
274
- }
275
-
276
- if (!tree.table_immutable.free and tree.table_immutable.snapshot_min <= snapshot) {
277
- if (tree.table_immutable.get(key)) |value| return value;
278
- } else {
279
- // If the immutable table is invisible, then the mutable table is also invisible.
280
- assert(tree.table_immutable.free or snapshot != tree.lookup_snapshot_max);
281
- }
282
-
283
- return null;
284
- }
285
-
286
- /// Call this function only after checking `lookup_from_memory()`.
287
- pub fn lookup_from_levels(
288
- tree: *Tree,
289
- callback: fn (*LookupContext, ?*const Value) void,
290
- context: *LookupContext,
291
- snapshot: u64,
292
- key: Key,
293
- ) void {
294
- assert(tree.lookup_snapshot_max >= snapshot);
295
- if (constants.verify) {
296
- // The caller is responsible for checking the mutable table.
297
- assert(tree.lookup_from_memory(snapshot, key) == null);
298
- }
299
-
300
- var index_block_count: u8 = 0;
301
- var index_block_addresses: [constants.lsm_levels]u64 = undefined;
302
- var index_block_checksums: [constants.lsm_levels]u128 = undefined;
303
- {
304
- var it = tree.manifest.lookup(snapshot, key);
305
- while (it.next()) |table| : (index_block_count += 1) {
306
- assert(table.visible(snapshot));
307
- assert(compare_keys(table.key_min, key) != .gt);
308
- assert(compare_keys(table.key_max, key) != .lt);
309
-
310
- index_block_addresses[index_block_count] = table.address;
311
- index_block_checksums[index_block_count] = table.checksum;
312
- }
313
- }
314
-
315
- if (index_block_count == 0) {
316
- callback(context, null);
317
- return;
318
- }
319
-
320
- // Hash the key to the fingerprint only once and reuse for all bloom filter checks.
321
- const fingerprint = bloom_filter.Fingerprint.create(mem.asBytes(&key));
322
-
323
- context.* = .{
324
- .tree = tree,
325
- .completion = undefined,
326
-
327
- .key = key,
328
- .fingerprint = fingerprint,
329
-
330
- .index_block_count = index_block_count,
331
- .index_block_addresses = index_block_addresses,
332
- .index_block_checksums = index_block_checksums,
333
-
334
- .callback = callback,
335
- };
336
-
337
- context.read_index_block();
338
- }
339
-
340
- pub const LookupContext = struct {
341
- const Read = Grid.Read;
342
- const BlockPtrConst = Grid.BlockPtrConst;
343
-
344
- tree: *Tree,
345
- completion: Read,
346
-
347
- key: Key,
348
- fingerprint: bloom_filter.Fingerprint,
349
-
350
- /// This value is an index into the index_block_addresses/checksums arrays.
351
- index_block: u8 = 0,
352
- index_block_count: u8,
353
- index_block_addresses: [constants.lsm_levels]u64,
354
- index_block_checksums: [constants.lsm_levels]u128,
355
-
356
- data_block: ?struct {
357
- address: u64,
358
- checksum: u128,
359
- } = null,
360
-
361
- callback: fn (*Tree.LookupContext, ?*const Value) void,
362
-
363
- fn read_index_block(context: *LookupContext) void {
364
- assert(context.data_block == null);
365
- assert(context.index_block < context.index_block_count);
366
- assert(context.index_block_count > 0);
367
- assert(context.index_block_count <= constants.lsm_levels);
368
-
369
- context.tree.grid.read_block(
370
- read_index_block_callback,
371
- &context.completion,
372
- context.index_block_addresses[context.index_block],
373
- context.index_block_checksums[context.index_block],
374
- .index,
375
- );
376
- }
377
-
378
- fn read_index_block_callback(completion: *Read, index_block: BlockPtrConst) void {
379
- const context = @fieldParentPtr(LookupContext, "completion", completion);
380
- assert(context.data_block == null);
381
- assert(context.index_block < context.index_block_count);
382
- assert(context.index_block_count > 0);
383
- assert(context.index_block_count <= constants.lsm_levels);
384
-
385
- const blocks = Table.index_blocks_for_key(index_block, context.key);
386
-
387
- context.data_block = .{
388
- .address = blocks.data_block_address,
389
- .checksum = blocks.data_block_checksum,
390
- };
391
-
392
- context.tree.grid.read_block(
393
- read_filter_block_callback,
394
- completion,
395
- blocks.filter_block_address,
396
- blocks.filter_block_checksum,
397
- .filter,
398
- );
399
- }
400
-
401
- fn read_filter_block_callback(completion: *Read, filter_block: BlockPtrConst) void {
402
- const context = @fieldParentPtr(LookupContext, "completion", completion);
403
- assert(context.data_block != null);
404
- assert(context.index_block < context.index_block_count);
405
- assert(context.index_block_count > 0);
406
- assert(context.index_block_count <= constants.lsm_levels);
407
-
408
- const filter_bytes = Table.filter_block_filter_const(filter_block);
409
- if (bloom_filter.may_contain(context.fingerprint, filter_bytes)) {
410
- context.tree.grid.read_block(
411
- read_data_block_callback,
412
- completion,
413
- context.data_block.?.address,
414
- context.data_block.?.checksum,
415
- .data,
416
- );
417
- } else {
418
- // The key is not present in this table, check the next level.
419
- context.advance_to_next_level();
420
- }
421
- }
422
-
423
- fn read_data_block_callback(completion: *Read, data_block: BlockPtrConst) void {
424
- const context = @fieldParentPtr(LookupContext, "completion", completion);
425
- assert(context.data_block != null);
426
- assert(context.index_block < context.index_block_count);
427
- assert(context.index_block_count > 0);
428
- assert(context.index_block_count <= constants.lsm_levels);
429
-
430
- if (Table.data_block_search(data_block, context.key)) |value| {
431
- context.callback(context, unwrap_tombstone(value));
432
- } else {
433
- // The key is not present in this table, check the next level.
434
- context.advance_to_next_level();
435
- }
436
- }
437
-
438
- fn advance_to_next_level(context: *LookupContext) void {
439
- assert(context.data_block != null);
440
- assert(context.index_block < context.index_block_count);
441
- assert(context.index_block_count > 0);
442
- assert(context.index_block_count <= constants.lsm_levels);
443
-
444
- context.index_block += 1;
445
- if (context.index_block == context.index_block_count) {
446
- context.callback(context, null);
447
- return;
448
- }
449
- assert(context.index_block < context.index_block_count);
450
-
451
- context.data_block = null;
452
- context.read_index_block();
453
- }
454
- };
455
-
456
- /// Returns null if the value is null or a tombstone, otherwise returns the value.
457
- /// We use tombstone values internally, but expose them as null to the user.
458
- /// This distinction enables us to cache a null result as a tombstone in our hash maps.
459
- pub inline fn unwrap_tombstone(value: ?*const Value) ?*const Value {
460
- return if (value == null or tombstone(value.?)) null else value.?;
461
- }
462
-
463
- pub fn open(tree: *Tree, callback: fn (*Tree) void) void {
464
- assert(tree.open_callback == null);
465
- tree.open_callback = callback;
466
-
467
- tree.manifest.open(manifest_open_callback);
468
- }
469
-
470
- fn manifest_open_callback(manifest: *Manifest) void {
471
- const tree = @fieldParentPtr(Tree, "manifest", manifest);
472
- assert(tree.open_callback != null);
473
-
474
- const callback = tree.open_callback.?;
475
- tree.open_callback = null;
476
- callback(tree);
477
- }
478
-
479
- const CompactionTableContext = struct {
480
- compaction: *CompactionTable,
481
- level_a: u8,
482
- level_b: u8,
483
- };
484
-
485
- const CompactionTableIterator = struct {
486
- tree: *Tree,
487
- index: u8 = 0,
488
-
489
- fn next(it: *CompactionTableIterator) ?CompactionTableContext {
490
- assert(it.tree.compaction_callback != null);
491
-
492
- const compaction_beat = it.tree.compaction_op % constants.lsm_batch_multiple;
493
- const even_levels = compaction_beat < half_bar_beat_count;
494
- const level_a = (it.index * 2) + @boolToInt(!even_levels);
495
- const level_b = level_a + 1;
496
-
497
- if (level_a >= constants.lsm_levels - 1) return null;
498
- assert(level_b < constants.lsm_levels);
499
-
500
- defer it.index += 1;
501
- return CompactionTableContext{
502
- .compaction = &it.tree.compaction_table[it.index],
503
- .level_a = level_a,
504
- .level_b = level_b,
505
- };
506
- }
507
- };
508
-
509
- /// Since concurrent compactions into and out of a level may contend for the same range:
510
- ///
511
- /// 1. compact level 0 to 1, level 2 to 3, level 4 to 5 etc., and then
512
- /// 2. compact the immutable table to level 0, level 1 to 2, level 3 to 4 etc.
513
- ///
514
- /// This order (even levels, then odd levels) is significant, since it reduces the number of
515
- /// level 0 tables that overlap with the immutable table, reducing write amplification.
516
- ///
517
- /// We therefore take the bar, during which all compactions run, and divide by two,
518
- /// running the compactions from even levels in the first half bar, and then the odd.
519
- ///
520
- /// Compactions start on the down beat of a half bar, using 0-based beats.
521
- /// For example, if there are 4 beats in a bar, start on beat 0 or beat 2.
522
- pub fn compact(tree: *Tree, callback: fn (*Tree) void, op: u64) void {
523
- assert(tree.compaction_callback == null);
524
- assert(op != 0);
525
- assert(op == tree.compaction_op + 1);
526
- assert(op > tree.grid.superblock.working.vsr_state.commit_min);
527
-
528
- tree.compaction_op = op;
529
-
530
- if (op < constants.lsm_batch_multiple) {
531
- // There is nothing to compact for the first measure.
532
- // We skip the main compaction code path first compaction bar entirely because it
533
- // is a special case — its first beat is 1, not 0.
534
-
535
- tree.lookup_snapshot_max = op + 1;
536
- if (op + 1 == constants.lsm_batch_multiple) {
537
- tree.compact_mutable_table_into_immutable();
538
- }
539
-
540
- callback(tree);
541
- return;
542
- }
543
-
544
- if (tree.grid.superblock.working.vsr_state.op_compacted(op)) {
545
- // We recovered from a checkpoint, and must avoid replaying one bar of
546
- // compactions that were applied before the checkpoint. Repeating these ops'
547
- // compactions would actually perform different compactions than before,
548
- // causing the storage state of the replica to diverge from the cluster.
549
- // See also: lookup_snapshot_max_for_checkpoint().
550
-
551
- if (op + 1 == tree.lookup_snapshot_max) {
552
- // This is the last op of the skipped compaction bar.
553
- // Prepare the immutable table for the next bar — since this state is
554
- // in-memory, it cannot be skipped.
555
- tree.compact_mutable_table_into_immutable();
556
- }
557
-
558
- // TODO Defer this callback until tick() to avoid stack growth.
559
- callback(tree);
560
- return;
561
- }
562
- assert(op == tree.lookup_snapshot_max);
563
-
564
- tree.compact_start(callback);
565
- tree.compact_drive();
566
- }
567
-
568
- fn compact_start(tree: *Tree, callback: fn (*Tree) void) void {
569
- assert(tree.compaction_io_pending == 0);
570
- assert(tree.compaction_callback == null);
571
-
572
- if (constants.verify) {
573
- tree.manifest.verify(tree.compaction_op);
574
- }
575
-
576
- tracer.start(
577
- &tree.tracer_slot,
578
- .{ .tree = .{ .tree_name = tree_name } },
579
- .tree_compaction_beat,
580
- @src(),
581
- );
582
-
583
- tree.compaction_callback = callback;
584
-
585
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
586
- const start = (compaction_beat == 0) or
587
- (compaction_beat == half_bar_beat_count);
588
-
589
- const op_min = compaction_op_min(tree.compaction_op);
590
- assert(op_min < snapshot_latest);
591
- assert(op_min % half_bar_beat_count == 0);
592
-
593
- log.debug(tree_name ++ ": compact_start: op={d} op_min={d} beat={d}/{d}", .{
594
- tree.compaction_op,
595
- op_min,
596
- compaction_beat + 1,
597
- constants.lsm_batch_multiple,
598
- });
599
-
600
- if (start) tree.manifest.reserve();
601
-
602
- // Try to start compacting the immutable table.
603
- const even_levels = compaction_beat < half_bar_beat_count;
604
- if (even_levels) {
605
- assert(tree.compaction_table_immutable.status == .idle);
606
- } else {
607
- if (start) tree.compact_start_table_immutable(op_min);
608
- }
609
-
610
- // Try to start compacting the other levels.
611
- var it = CompactionTableIterator{ .tree = tree };
612
- while (it.next()) |context| {
613
- if (start) tree.compact_start_table(op_min, context);
614
- }
615
- }
616
-
617
- fn compact_start_table_immutable(tree: *Tree, op_min: u64) void {
618
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
619
- assert(compaction_beat == half_bar_beat_count);
620
-
621
- // Do not start compaction if the immutable table does not require compaction.
622
- if (tree.table_immutable.free) return;
623
-
624
- assert(tree.table_immutable.snapshot_min % half_bar_beat_count == 0);
625
-
626
- const values_count = tree.table_immutable.values.len;
627
- assert(values_count > 0);
628
-
629
- const level_b: u8 = 0;
630
- const table_a: ?*const Manifest.TableInfo = null;
631
- const range = tree.manifest.compaction_range(
632
- level_b,
633
- tree.table_immutable.key_min(),
634
- tree.table_immutable.key_max(),
635
- );
636
-
637
- assert(range.table_count >= 1);
638
- assert(range.table_count <= compaction_tables_input_max);
639
- assert(compare_keys(range.key_min, tree.table_immutable.key_min()) != .gt);
640
- assert(compare_keys(range.key_max, tree.table_immutable.key_max()) != .lt);
641
-
642
- log.debug(tree_name ++
643
- ": compacting immutable table to level 0 " ++
644
- "(values.len={d} snapshot_min={d} compaction.op_min={d} table_count={d})", .{
645
- tree.table_immutable.values.len,
646
- tree.table_immutable.snapshot_min,
647
- op_min,
648
- range.table_count,
649
- });
650
-
651
- tree.compaction_table_immutable.start(
652
- tree.grid,
653
- &tree.manifest,
654
- op_min,
655
- range,
656
- table_a,
657
- level_b,
658
- .{ .table = &tree.table_immutable },
659
- );
660
- }
661
-
662
- fn compact_start_table(tree: *Tree, op_min: u64, context: CompactionTableContext) void {
663
- const compaction_beat = tree.compaction_op % half_bar_beat_count;
664
- assert(compaction_beat == 0);
665
-
666
- assert(context.level_a < constants.lsm_levels);
667
- assert(context.level_b < constants.lsm_levels);
668
- assert(context.level_a + 1 == context.level_b);
669
-
670
- // Do not start compaction if level A does not require compaction.
671
- const table_range = tree.manifest.compaction_table(context.level_a) orelse return;
672
- const table = table_range.table;
673
-
674
- assert(table_range.range.table_count >= 1);
675
- assert(table_range.range.table_count <= compaction_tables_input_max);
676
- assert(compare_keys(table.key_min, table.key_max) != .gt);
677
- assert(compare_keys(table_range.range.key_min, table.key_min) != .gt);
678
- assert(compare_keys(table_range.range.key_max, table.key_max) != .lt);
679
-
680
- log.debug(tree_name ++ ": compacting {d} tables from level {d} to level {d}", .{
681
- table_range.range.table_count,
682
- context.level_a,
683
- context.level_b,
684
- });
685
-
686
- context.compaction.start(
687
- tree.grid,
688
- &tree.manifest,
689
- op_min,
690
- table_range.range,
691
- table_range.table,
692
- context.level_b,
693
- .{
694
- .grid = tree.grid,
695
- .address = table.address,
696
- .checksum = table.checksum,
697
- },
698
- );
699
- }
700
-
701
- fn compact_drive(tree: *Tree) void {
702
- assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
703
- assert(tree.compaction_callback != null);
704
-
705
- // Always start one fake io_pending that is resolved right after
706
- // to handle the case where this compaction tick triggers no IO.
707
- // (For example, ticking the immutable table, or level B is already done).
708
- tree.compaction_io_pending += 1;
709
- defer tree.compact_tick_done();
710
-
711
- // Try to tick the immutable table compaction:
712
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
713
- const even_levels = compaction_beat < half_bar_beat_count;
714
- if (even_levels) {
715
- assert(tree.compaction_table_immutable.status == .idle);
716
- } else {
717
- tree.compact_tick(&tree.compaction_table_immutable);
718
- }
719
-
720
- // Try to tick the compaction for each level:
721
- var it = CompactionTableIterator{ .tree = tree };
722
- while (it.next()) |context| {
723
- tree.compact_tick(context.compaction);
724
- }
725
- }
726
-
727
- fn compact_tick(tree: *Tree, compaction: anytype) void {
728
- if (compaction.status != .processing) return;
729
- tree.compaction_io_pending += 1;
730
-
731
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
732
- const even_levels = compaction_beat < half_bar_beat_count;
733
- assert(compaction.level_b < constants.lsm_levels);
734
- assert(compaction.level_b % 2 == @boolToInt(even_levels));
735
-
736
- if (@TypeOf(compaction.*) == CompactionTableImmutable) {
737
- assert(compaction.level_b == 0);
738
- log.debug(tree_name ++ ": compact_tick() for immutable table to level 0", .{});
739
- compaction.compact_tick(Tree.compact_tick_callback_table_immutable);
740
- } else {
741
- assert(@TypeOf(compaction.*) == CompactionTable);
742
- log.debug(tree_name ++ ": compact_tick() for level {d} to level {d}", .{
743
- compaction.level_b - 1,
744
- compaction.level_b,
745
- });
746
- compaction.compact_tick(Tree.compact_tick_callback_table);
747
- }
748
- }
749
-
750
- fn compact_tick_callback_table_immutable(compaction: *CompactionTableImmutable) void {
751
- assert(compaction.status == .processing or compaction.status == .done);
752
- assert(compaction.level_b < constants.lsm_levels);
753
- assert(compaction.level_b == 0);
754
-
755
- const tree = @fieldParentPtr(Tree, "compaction_table_immutable", compaction);
756
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
757
- assert(compaction_beat >= half_bar_beat_count);
758
-
759
- log.debug(tree_name ++ ": compact_tick() complete for immutable table to level 0", .{});
760
- tree.compact_tick_done();
761
- }
762
-
763
- fn compact_tick_callback_table(compaction: *CompactionTable) void {
764
- assert(compaction.status == .processing or compaction.status == .done);
765
- assert(compaction.level_b < constants.lsm_levels);
766
- assert(compaction.level_b > 0);
767
-
768
- const table_offset = @divFloor(compaction.level_b - 1, 2);
769
- const table_ptr = @ptrCast([*]CompactionTable, compaction) - table_offset;
770
-
771
- const table_size = @divFloor(constants.lsm_levels, 2);
772
- const table: *[table_size]CompactionTable = table_ptr[0..table_size];
773
-
774
- log.debug(tree_name ++ ": compact_tick() complete for level {d} to level {d}", .{
775
- compaction.level_b - 1,
776
- compaction.level_b,
777
- });
778
-
779
- const tree = @fieldParentPtr(Tree, "compaction_table", table);
780
- tree.compact_tick_done();
781
- }
782
-
783
- fn compact_tick_done(tree: *Tree) void {
784
- assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
785
- assert(tree.compaction_callback != null);
786
-
787
- // compact_done() is called after all compact_tick()'s complete.
788
- tree.compaction_io_pending -= 1;
789
- if (tree.compaction_io_pending == 0) tree.compact_done();
790
- }
791
-
792
- /// Called at the end of each compaction tick.
793
- fn compact_done(tree: *Tree) void {
794
- assert(tree.compaction_io_pending == 0);
795
- assert(tree.compaction_callback != null);
796
- assert(tree.compaction_op == tree.lookup_snapshot_max);
797
-
798
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
799
- const even_levels = compaction_beat < half_bar_beat_count;
800
- const compacted_levels_even = compaction_beat == half_bar_beat_count - 1;
801
- const compacted_levels_odd = compaction_beat == constants.lsm_batch_multiple - 1;
802
- if (!compacted_levels_even and !compacted_levels_odd) {
803
- // TODO(Deterministic Beats): Remove this when compact_done() is called exactly
804
- // once when the beat finishes.
805
- tree.lookup_snapshot_max = tree.compaction_op + 1;
806
-
807
- tree.compact_finish();
808
- return;
809
- }
810
-
811
- // At the end of the second and fourth beat:
812
- // 1. Tick the Compactions until all have completed.
813
- // 2. Remove invisible tables from the manifest.
814
- // 3. Compact the manifest.
815
- // Then at the end of the fourth beat, freeze the mutable table.
816
- assert(compacted_levels_even or compacted_levels_odd);
817
- assert(compacted_levels_even != compacted_levels_odd);
818
-
819
- const still_compacting = blk: {
820
- if (even_levels) {
821
- assert(tree.compaction_table_immutable.status == .idle);
822
- } else {
823
- if (tree.compaction_table_immutable.status == .processing) break :blk true;
824
- }
825
-
826
- var it = CompactionTableIterator{ .tree = tree };
827
- while (it.next()) |context| {
828
- if (context.compaction.status == .processing) break :blk true;
829
- }
830
- break :blk false;
831
- };
832
-
833
- if (still_compacting) {
834
- // We are at the end of a half-bar, but the compactions have not finished.
835
- // We keep ticking them until they finish.
836
- log.debug(tree_name ++ ": compact_done: driving outstanding compactions", .{});
837
- tree.compact_drive();
838
- return;
839
- }
840
-
841
- // TODO(Deterministic Beats): Move this to the top of the function when compact_done()
842
- // is called exactly once when the beat finishes.
843
- tree.lookup_snapshot_max = tree.compaction_op + 1;
844
-
845
- // All compactions have finished for the current half-bar.
846
- // We couldn't remove the (invisible) input tables until now because prefetch()
847
- // needs a complete set of tables for lookups to avoid missing data.
848
-
849
- // Reset the immutable table Compaction.
850
- // Also clear any tables made invisible by the compaction.
851
- if (!even_levels) {
852
- switch (tree.compaction_table_immutable.status) {
853
- // The compaction wasn't started for this half bar.
854
- .idle => assert(tree.table_immutable.free),
855
- .processing => unreachable,
856
- .done => {
857
- tree.compaction_table_immutable.reset();
858
- tree.table_immutable.clear();
859
- tree.manifest.remove_invisible_tables(
860
- tree.compaction_table_immutable.level_b,
861
- tree.lookup_snapshot_max,
862
- tree.compaction_table_immutable.range.key_min,
863
- tree.compaction_table_immutable.range.key_max,
864
- );
865
- },
866
- }
867
- }
868
-
869
- // Reset all the other Compactions.
870
- // Also clear any tables made invisible by the compactions.
871
- var it = CompactionTableIterator{ .tree = tree };
872
- while (it.next()) |context| {
873
- switch (context.compaction.status) {
874
- .idle => {}, // The compaction wasn't started for this half bar.
875
- .processing => unreachable,
876
- .done => {
877
- context.compaction.reset();
878
- tree.manifest.remove_invisible_tables(
879
- context.compaction.level_b,
880
- tree.lookup_snapshot_max,
881
- context.compaction.range.key_min,
882
- context.compaction.range.key_max,
883
- );
884
- if (context.compaction.level_b > 0) {
885
- tree.manifest.remove_invisible_tables(
886
- context.compaction.level_b - 1,
887
- tree.lookup_snapshot_max,
888
- context.compaction.range.key_min,
889
- context.compaction.range.key_max,
890
- );
891
- }
892
- },
893
- }
894
- }
895
-
896
- assert(tree.compaction_table_immutable.status == .idle);
897
- it = CompactionTableIterator{ .tree = tree };
898
- while (it.next()) |context| {
899
- assert(context.compaction.status == .idle);
900
- }
901
-
902
- // At the end of the fourth/last beat:
903
- // - Assert all visible tables haven't overflowed their max per level.
904
- // - Convert mutable table to immutable table for next bar.
905
- if (compacted_levels_odd) {
906
- tree.manifest.assert_level_table_counts();
907
- tree.compact_mutable_table_into_immutable();
908
- }
909
-
910
- // At the end of the second/fourth beat:
911
- // - Compact the manifest before invoking the compact() callback.
912
- tree.manifest.compact(compact_manifest_callback);
913
- }
914
-
915
- /// Called after the last beat of a full compaction bar.
916
- fn compact_mutable_table_into_immutable(tree: *Tree) void {
917
- assert(tree.table_immutable.free);
918
- assert((tree.compaction_op + 1) % constants.lsm_batch_multiple == 0);
919
- assert(tree.compaction_op + 1 == tree.lookup_snapshot_max);
920
-
921
- if (tree.table_mutable.count() == 0) return;
922
-
923
- // Sort the mutable table values directly into the immutable table's array.
924
- const values_max = tree.table_immutable.values_max();
925
- const values = tree.table_mutable.sort_into_values_and_clear(values_max);
926
- assert(values.ptr == values_max.ptr);
927
-
928
- // The immutable table must be visible to the next bar — setting its snapshot_min to
929
- // lookup_snapshot_max guarantees.
930
- //
931
- // In addition, the immutable table is conceptually an output table of this compaction
932
- // bar, and now its snapshot_min matches the snapshot_min of the Compactions' output
933
- // tables.
934
- tree.table_immutable.reset_with_sorted_values(tree.lookup_snapshot_max, values);
935
-
936
- assert(tree.table_mutable.count() == 0);
937
- assert(!tree.table_immutable.free);
938
- }
939
-
940
- fn compact_manifest_callback(manifest: *Manifest) void {
941
- const tree = @fieldParentPtr(Tree, "manifest", manifest);
942
- assert(tree.compaction_io_pending == 0);
943
- assert(tree.compaction_callback != null);
944
- tree.compact_finish();
945
- }
946
-
947
- /// Called at the end of each compaction beat.
948
- fn compact_finish(tree: *Tree) void {
949
- assert(tree.compaction_io_pending == 0);
950
-
951
- tracer.end(
952
- &tree.tracer_slot,
953
- .{ .tree = .{ .tree_name = tree_name } },
954
- .tree_compaction_beat,
955
- );
956
-
957
- if (constants.verify) {
958
- tree.manifest.verify(tree.compaction_op);
959
- }
960
-
961
- // Invoke the compact() callback after the manifest compacts at the end of the beat.
962
- const callback = tree.compaction_callback.?;
963
- tree.compaction_callback = null;
964
- callback(tree);
965
- }
966
-
967
- pub fn checkpoint(tree: *Tree, callback: fn (*Tree) void) void {
968
- // Assert no outstanding compact_tick() work.
969
- assert(tree.compaction_io_pending == 0);
970
- assert(tree.compaction_callback == null);
971
- assert(tree.compaction_op > 0);
972
- assert(tree.compaction_op + 1 == tree.lookup_snapshot_max);
973
- // Don't re-run the checkpoint we recovered from.
974
- assert(!tree.grid.superblock.working.vsr_state.op_compacted(tree.compaction_op));
975
-
976
- // Assert that this is the last beat in the compaction bar.
977
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
978
- const last_beat_in_bar = constants.lsm_batch_multiple - 1;
979
- assert(last_beat_in_bar == compaction_beat);
980
-
981
- // Assert no outstanding compactions.
982
- assert(tree.compaction_table_immutable.status == .idle);
983
- for (tree.compaction_table) |*compaction| {
984
- assert(compaction.status == .idle);
985
- }
986
-
987
- // Assert all manifest levels haven't overflowed their table counts.
988
- tree.manifest.assert_level_table_counts();
989
-
990
- // Assert that we're checkpointing only after invisible tables have been removed.
991
- if (constants.verify) {
992
- tree.manifest.assert_no_invisible_tables(compaction_op_min(tree.compaction_op));
993
- }
994
-
995
- // Start an asynchronous checkpoint on the manifest.
996
- assert(tree.checkpoint_callback == null);
997
- tree.checkpoint_callback = callback;
998
- tree.manifest.checkpoint(manifest_checkpoint_callback);
999
- }
1000
-
1001
- fn manifest_checkpoint_callback(manifest: *Manifest) void {
1002
- const tree = @fieldParentPtr(Tree, "manifest", manifest);
1003
- assert(tree.checkpoint_callback != null);
1004
-
1005
- const callback = tree.checkpoint_callback.?;
1006
- tree.checkpoint_callback = null;
1007
- callback(tree);
1008
- }
1009
-
1010
- pub const RangeQuery = union(enum) {
1011
- bounded: struct {
1012
- start: Key,
1013
- end: Key,
1014
- },
1015
- open: struct {
1016
- start: Key,
1017
- order: enum {
1018
- ascending,
1019
- descending,
1020
- },
1021
- },
1022
- };
1023
-
1024
- pub const RangeQueryIterator = struct {
1025
- tree: *Tree,
1026
- snapshot: u64,
1027
- query: RangeQuery,
1028
-
1029
- pub fn next(callback: fn (result: ?Value) void) void {
1030
- _ = callback;
1031
- }
1032
- };
1033
-
1034
- pub fn range_query(
1035
- tree: *Tree,
1036
- /// The snapshot timestamp, if any
1037
- snapshot: ?u64,
1038
- query: RangeQuery,
1039
- ) RangeQueryIterator {
1040
- _ = tree;
1041
- _ = snapshot;
1042
- _ = query;
1043
- }
1044
- };
1045
- }
1046
-
1047
- /// Returns the first op of the compaction (Compaction.op_min) for a given op/beat.
1048
- ///
1049
- /// After this compaction finishes:
1050
- /// - `op_min + half_bar_beat_count - 1` will be the input tables' snapshot_max.
1051
- /// - `op_min + half_bar_beat_count` will be the output tables' snapshot_min.
1052
- ///
1053
- /// Each half-bar has a separate op_min (for deriving the output snapshot_min) instead of each full
1054
- /// bar because this allows the output tables of the first half-bar's compaction to be prefetched
1055
- /// against earlier — hopefully while they are still warm in the cache from being written.
1056
- pub fn compaction_op_min(op: u64) u64 {
1057
- return op - op % half_bar_beat_count;
1058
- }
1059
-
1060
- /// These charts depict the commit/compact ops and `lookup_snapshot_max` over a series of
1061
- /// commits and compactions (with lsm_batch_multiple=8).
1062
- ///
1063
- /// Legend:
1064
- ///
1065
- /// ┼ full bar (first half-bar start)
1066
- /// ┬ half bar (second half-bar start)
1067
- /// $ lookup_snapshot_max (prefetch reads from the current snapshot)
1068
- /// This is incremented at the end of each compact().
1069
- /// . op is in mutable table (in memory)
1070
- /// , op is in immutable table (in memory)
1071
- /// # op is on disk
1072
- /// ✓ checkpoint() may follow compact()
1073
- ///
1074
- /// 0 2 4 6 8 0 2 4 6
1075
- /// ┼───┬───┼───┬───┼
1076
- /// .$ ╷ ╷ init(superblock.commit_min=0)⎤ Compaction is effectively a noop for the
1077
- /// .$ ╷ ╷ commit;compact( 1) start/end ⎥ first bar because there are no tables on
1078
- /// ..$ ╷ ╷ commit;compact( 2) start/end ⎥ disk yet, and no immutable table to
1079
- /// ...$ ╷ ╷ commit;compact( 3) start/end ⎥ flush.
1080
- /// ....$ ╷ ╷ commit;compact( 4) start/end ⎥
1081
- /// .....$ ╷ ╷ commit;compact( 5) start/end ⎥ This applies:
1082
- /// ......$ ╷ ╷ commit;compact( 6) start/end ⎥ - when the LSM is starting on a freshly
1083
- /// .......$╷ ╷ commit;compact( 7) start ⎤⎥ formatted data file, and also
1084
- /// ,,,,,,,,$ ╷ ✓ compact( 7) end⎦⎦ - when the LSM is recovering from a crash
1085
- /// ,,,,,,,,$ ╷ commit;compact( 8) start/end (see below).
1086
- /// ,,,,,,,,.$ ╷ commit;compact( 9) start/end
1087
- /// ,,,,,,,,..$ ╷ commit;compact(10) start/end
1088
- /// ,,,,,,,,...$ ╷ commit;compact(11) start/end
1089
- /// ,,,,,,,,....$ ╷ commit;compact(12) start/end
1090
- /// ,,,,,,,,.....$ ╷ commit;compact(13) start/end
1091
- /// ,,,,,,,,......$ ╷ commit;compact(14) start/end
1092
- /// ,,,,,,,,.......$╷ commit;compact(15) start ⎤
1093
- /// ########,,,,,,,,$ ✓ compact(15) end⎦
1094
- /// ########,,,,,,,,$ commit;compact(16) start/end
1095
- /// ┼───┬───┼───┬───┼
1096
- /// 0 2 4 6 8 0 2 4 6
1097
- /// ┼───┬───┼───┬───┼ Recover with a checkpoint taken at op 15.
1098
- /// ######## $ init(superblock.commit_min=7) At op 15, ops 8…15 are in memory, so they
1099
- /// ########. $ commit ( 8) start/end ⎤ were dropped by the crash.
1100
- /// ########.. $ commit ( 9) start/end ⎥
1101
- /// ########... $ commit (10) start/end ⎥ But compaction is not run for ops 8…15
1102
- /// ########.... $ commit (11) start/end ⎥ because it was already performed
1103
- /// ########..... $ commit (12) start/end ⎥ before the checkpoint.
1104
- /// ########...... $ commit (13) start/end ⎥
1105
- /// ########....... $ commit (14) start/end ⎥ We can begin to compact again at op 16,
1106
- /// ########........$ commit (15) start ⎤⎥ because those compactions (if previously
1107
- /// ########,,,,,,,,$ ✓ (15) end⎦⎦ performed) are not included in the
1108
- /// ########,,,,,,,,$ commit;compact(16) start/end checkpoint.
1109
- /// ┼───┬───┼───┬───┼
1110
- /// 0 2 4 6 8 0 2 4 6
1111
- ///
1112
- /// Notice how in the checkpoint recovery example above, we are careful not to `compact(op)` twice
1113
- /// for any op (even if we crash/recover), since that could lead to differences between replicas'
1114
- /// storage. The last bar of `commit()`s is always only in memory, so it is safe to repeat.
1115
- ///
1116
- /// Additionally, while skipping compactions during recovery, we use a `lookup_snapshot_max`
1117
- /// different than the original compactions — the old tables may have been removed during the
1118
- /// checkpoint.
1119
- fn lookup_snapshot_max_for_checkpoint(op_checkpoint: u64) u64 {
1120
- if (op_checkpoint == 0) {
1121
- // Start from 1 because we never commit op 0.
1122
- return 1;
1123
- } else {
1124
- return op_checkpoint + constants.lsm_batch_multiple + 1;
1125
- }
1126
- }
1127
-
1128
- /// The total number of tables that can be supported by the tree across so many levels.
1129
- pub fn table_count_max_for_tree(growth_factor: u32, levels_count: u32) u32 {
1130
- assert(growth_factor >= 4);
1131
- assert(growth_factor <= 16); // Limit excessive write amplification.
1132
- assert(levels_count >= 2);
1133
- assert(levels_count <= 10); // Limit excessive read amplification.
1134
- assert(levels_count <= constants.lsm_levels);
1135
-
1136
- var count: u32 = 0;
1137
- var level: u32 = 0;
1138
- while (level < levels_count) : (level += 1) {
1139
- count += table_count_max_for_level(growth_factor, level);
1140
- }
1141
- return count;
1142
- }
1143
-
1144
- /// The total number of tables that can be supported by the level alone.
1145
- pub fn table_count_max_for_level(growth_factor: u32, level: u32) u32 {
1146
- assert(level >= 0);
1147
- assert(level < constants.lsm_levels);
1148
-
1149
- return math.pow(u32, growth_factor, level + 1);
1150
- }
1151
-
1152
- test "table_count_max_for_level/tree" {
1153
- const expectEqual = std.testing.expectEqual;
1154
-
1155
- try expectEqual(@as(u32, 8), table_count_max_for_level(8, 0));
1156
- try expectEqual(@as(u32, 64), table_count_max_for_level(8, 1));
1157
- try expectEqual(@as(u32, 512), table_count_max_for_level(8, 2));
1158
- try expectEqual(@as(u32, 4096), table_count_max_for_level(8, 3));
1159
- try expectEqual(@as(u32, 32768), table_count_max_for_level(8, 4));
1160
- try expectEqual(@as(u32, 262144), table_count_max_for_level(8, 5));
1161
- try expectEqual(@as(u32, 2097152), table_count_max_for_level(8, 6));
1162
-
1163
- try expectEqual(@as(u32, 8 + 64), table_count_max_for_tree(8, 2));
1164
- try expectEqual(@as(u32, 72 + 512), table_count_max_for_tree(8, 3));
1165
- try expectEqual(@as(u32, 584 + 4096), table_count_max_for_tree(8, 4));
1166
- try expectEqual(@as(u32, 4680 + 32768), table_count_max_for_tree(8, 5));
1167
- try expectEqual(@as(u32, 37448 + 262144), table_count_max_for_tree(8, 6));
1168
- try expectEqual(@as(u32, 299592 + 2097152), table_count_max_for_tree(8, 7));
1169
- }