tigerbeetle-node 0.11.12 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. package/README.md +212 -196
  2. package/dist/bin/aarch64-linux-gnu/client.node +0 -0
  3. package/dist/bin/aarch64-linux-musl/client.node +0 -0
  4. package/dist/bin/aarch64-macos/client.node +0 -0
  5. package/dist/bin/x86_64-linux-gnu/client.node +0 -0
  6. package/dist/bin/x86_64-linux-musl/client.node +0 -0
  7. package/dist/bin/x86_64-macos/client.node +0 -0
  8. package/dist/index.js +33 -1
  9. package/dist/index.js.map +1 -1
  10. package/package-lock.json +66 -0
  11. package/package.json +8 -17
  12. package/src/index.ts +56 -1
  13. package/src/node.zig +10 -9
  14. package/dist/.client.node.sha256 +0 -1
  15. package/scripts/build_lib.sh +0 -61
  16. package/scripts/download_node_headers.sh +0 -32
  17. package/src/tigerbeetle/scripts/benchmark.bat +0 -48
  18. package/src/tigerbeetle/scripts/benchmark.sh +0 -66
  19. package/src/tigerbeetle/scripts/confirm_image.sh +0 -44
  20. package/src/tigerbeetle/scripts/fuzz_loop.sh +0 -15
  21. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +0 -7
  22. package/src/tigerbeetle/scripts/install.bat +0 -7
  23. package/src/tigerbeetle/scripts/install.sh +0 -21
  24. package/src/tigerbeetle/scripts/install_zig.bat +0 -113
  25. package/src/tigerbeetle/scripts/install_zig.sh +0 -90
  26. package/src/tigerbeetle/scripts/lint.zig +0 -199
  27. package/src/tigerbeetle/scripts/pre-commit.sh +0 -9
  28. package/src/tigerbeetle/scripts/scripts/benchmark.bat +0 -48
  29. package/src/tigerbeetle/scripts/scripts/benchmark.sh +0 -66
  30. package/src/tigerbeetle/scripts/scripts/confirm_image.sh +0 -44
  31. package/src/tigerbeetle/scripts/scripts/fuzz_loop.sh +0 -15
  32. package/src/tigerbeetle/scripts/scripts/fuzz_unique_errors.sh +0 -7
  33. package/src/tigerbeetle/scripts/scripts/install.bat +0 -7
  34. package/src/tigerbeetle/scripts/scripts/install.sh +0 -21
  35. package/src/tigerbeetle/scripts/scripts/install_zig.bat +0 -113
  36. package/src/tigerbeetle/scripts/scripts/install_zig.sh +0 -90
  37. package/src/tigerbeetle/scripts/scripts/lint.zig +0 -199
  38. package/src/tigerbeetle/scripts/scripts/pre-commit.sh +0 -9
  39. package/src/tigerbeetle/scripts/scripts/shellcheck.sh +0 -5
  40. package/src/tigerbeetle/scripts/scripts/tests_on_alpine.sh +0 -10
  41. package/src/tigerbeetle/scripts/scripts/tests_on_ubuntu.sh +0 -14
  42. package/src/tigerbeetle/scripts/scripts/upgrade_ubuntu_kernel.sh +0 -48
  43. package/src/tigerbeetle/scripts/scripts/validate_docs.sh +0 -23
  44. package/src/tigerbeetle/scripts/scripts/vr_state_enumerate +0 -46
  45. package/src/tigerbeetle/scripts/shellcheck.sh +0 -5
  46. package/src/tigerbeetle/scripts/tests_on_alpine.sh +0 -10
  47. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +0 -14
  48. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -48
  49. package/src/tigerbeetle/scripts/validate_docs.sh +0 -23
  50. package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
  51. package/src/tigerbeetle/src/benchmark.zig +0 -314
  52. package/src/tigerbeetle/src/config.zig +0 -234
  53. package/src/tigerbeetle/src/constants.zig +0 -436
  54. package/src/tigerbeetle/src/ewah.zig +0 -286
  55. package/src/tigerbeetle/src/ewah_benchmark.zig +0 -120
  56. package/src/tigerbeetle/src/ewah_fuzz.zig +0 -130
  57. package/src/tigerbeetle/src/fifo.zig +0 -120
  58. package/src/tigerbeetle/src/io/benchmark.zig +0 -213
  59. package/src/tigerbeetle/src/io/darwin.zig +0 -814
  60. package/src/tigerbeetle/src/io/linux.zig +0 -1062
  61. package/src/tigerbeetle/src/io/test.zig +0 -643
  62. package/src/tigerbeetle/src/io/windows.zig +0 -1183
  63. package/src/tigerbeetle/src/io.zig +0 -34
  64. package/src/tigerbeetle/src/iops.zig +0 -107
  65. package/src/tigerbeetle/src/lsm/README.md +0 -308
  66. package/src/tigerbeetle/src/lsm/binary_search.zig +0 -341
  67. package/src/tigerbeetle/src/lsm/bloom_filter.zig +0 -125
  68. package/src/tigerbeetle/src/lsm/compaction.zig +0 -603
  69. package/src/tigerbeetle/src/lsm/composite_key.zig +0 -77
  70. package/src/tigerbeetle/src/lsm/direction.zig +0 -11
  71. package/src/tigerbeetle/src/lsm/eytzinger.zig +0 -587
  72. package/src/tigerbeetle/src/lsm/eytzinger_benchmark.zig +0 -330
  73. package/src/tigerbeetle/src/lsm/forest.zig +0 -204
  74. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +0 -401
  75. package/src/tigerbeetle/src/lsm/grid.zig +0 -573
  76. package/src/tigerbeetle/src/lsm/groove.zig +0 -972
  77. package/src/tigerbeetle/src/lsm/k_way_merge.zig +0 -474
  78. package/src/tigerbeetle/src/lsm/level_iterator.zig +0 -332
  79. package/src/tigerbeetle/src/lsm/manifest.zig +0 -617
  80. package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -877
  81. package/src/tigerbeetle/src/lsm/manifest_log.zig +0 -789
  82. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +0 -691
  83. package/src/tigerbeetle/src/lsm/merge_iterator.zig +0 -106
  84. package/src/tigerbeetle/src/lsm/node_pool.zig +0 -235
  85. package/src/tigerbeetle/src/lsm/posted_groove.zig +0 -378
  86. package/src/tigerbeetle/src/lsm/segmented_array.zig +0 -1328
  87. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +0 -148
  88. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +0 -9
  89. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +0 -850
  90. package/src/tigerbeetle/src/lsm/table.zig +0 -1031
  91. package/src/tigerbeetle/src/lsm/table_immutable.zig +0 -203
  92. package/src/tigerbeetle/src/lsm/table_iterator.zig +0 -340
  93. package/src/tigerbeetle/src/lsm/table_mutable.zig +0 -220
  94. package/src/tigerbeetle/src/lsm/test.zig +0 -438
  95. package/src/tigerbeetle/src/lsm/tree.zig +0 -1193
  96. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +0 -474
  97. package/src/tigerbeetle/src/message_bus.zig +0 -1012
  98. package/src/tigerbeetle/src/message_pool.zig +0 -156
  99. package/src/tigerbeetle/src/ring_buffer.zig +0 -399
  100. package/src/tigerbeetle/src/simulator.zig +0 -569
  101. package/src/tigerbeetle/src/state_machine/auditor.zig +0 -577
  102. package/src/tigerbeetle/src/state_machine/workload.zig +0 -883
  103. package/src/tigerbeetle/src/state_machine.zig +0 -1881
  104. package/src/tigerbeetle/src/static_allocator.zig +0 -65
  105. package/src/tigerbeetle/src/stdx.zig +0 -162
  106. package/src/tigerbeetle/src/storage.zig +0 -393
  107. package/src/tigerbeetle/src/testing/cluster/message_bus.zig +0 -82
  108. package/src/tigerbeetle/src/testing/cluster/network.zig +0 -237
  109. package/src/tigerbeetle/src/testing/cluster/state_checker.zig +0 -169
  110. package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +0 -202
  111. package/src/tigerbeetle/src/testing/cluster.zig +0 -443
  112. package/src/tigerbeetle/src/testing/fuzz.zig +0 -140
  113. package/src/tigerbeetle/src/testing/hash_log.zig +0 -66
  114. package/src/tigerbeetle/src/testing/id.zig +0 -99
  115. package/src/tigerbeetle/src/testing/packet_simulator.zig +0 -364
  116. package/src/tigerbeetle/src/testing/priority_queue.zig +0 -645
  117. package/src/tigerbeetle/src/testing/reply_sequence.zig +0 -139
  118. package/src/tigerbeetle/src/testing/state_machine.zig +0 -249
  119. package/src/tigerbeetle/src/testing/storage.zig +0 -757
  120. package/src/tigerbeetle/src/testing/table.zig +0 -247
  121. package/src/tigerbeetle/src/testing/time.zig +0 -84
  122. package/src/tigerbeetle/src/tigerbeetle.zig +0 -227
  123. package/src/tigerbeetle/src/time.zig +0 -112
  124. package/src/tigerbeetle/src/tracer.zig +0 -529
  125. package/src/tigerbeetle/src/unit_tests.zig +0 -42
  126. package/src/tigerbeetle/src/vopr.zig +0 -495
  127. package/src/tigerbeetle/src/vsr/README.md +0 -209
  128. package/src/tigerbeetle/src/vsr/client.zig +0 -544
  129. package/src/tigerbeetle/src/vsr/clock.zig +0 -853
  130. package/src/tigerbeetle/src/vsr/journal.zig +0 -2413
  131. package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +0 -111
  132. package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
  133. package/src/tigerbeetle/src/vsr/replica.zig +0 -6381
  134. package/src/tigerbeetle/src/vsr/replica_format.zig +0 -219
  135. package/src/tigerbeetle/src/vsr/superblock.zig +0 -1631
  136. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +0 -256
  137. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +0 -929
  138. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +0 -334
  139. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +0 -390
  140. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +0 -615
  141. package/src/tigerbeetle/src/vsr/superblock_quorums.zig +0 -394
  142. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +0 -314
  143. package/src/tigerbeetle/src/vsr.zig +0 -1352
@@ -1,1193 +0,0 @@
1
- //! An LSM tree.
2
- const std = @import("std");
3
- const builtin = @import("builtin");
4
- const assert = std.debug.assert;
5
- const math = std.math;
6
- const mem = std.mem;
7
- const os = std.os;
8
-
9
- const log = std.log.scoped(.tree);
10
- const tracer = @import("../tracer.zig");
11
-
12
- const constants = @import("../constants.zig");
13
- const div_ceil = @import("../stdx.zig").div_ceil;
14
- const eytzinger = @import("eytzinger.zig").eytzinger;
15
- const vsr = @import("../vsr.zig");
16
- const bloom_filter = @import("bloom_filter.zig");
17
-
18
- const CompositeKey = @import("composite_key.zig").CompositeKey;
19
- const NodePool = @import("node_pool.zig").NodePool(constants.lsm_manifest_node_size, 16);
20
- const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
21
-
22
- /// We reserve maxInt(u64) to indicate that a table has not been deleted.
23
- /// Tables that have not been deleted have snapshot_max of maxInt(u64).
24
- /// Since we ensure and assert that a query snapshot never exactly matches
25
- /// the snapshot_min/snapshot_max of a table, we must use maxInt(u64) - 1
26
- /// to query all non-deleted tables.
27
- pub const snapshot_latest: u64 = math.maxInt(u64) - 1;
28
-
29
- const half_bar_beat_count = @divExact(constants.lsm_batch_multiple, 2);
30
-
31
- // StateMachine:
32
- //
33
- // /// state machine will pass this on to all object stores
34
- // /// Read I/O only
35
- // pub fn read(batch, callback) void
36
- //
37
- // /// write the ops in batch to the memtable/objcache, previously called commit()
38
- // pub fn write(batch) void
39
- //
40
- // /// Flush in memory state to disk, perform merges, etc
41
- // /// Only function that triggers Write I/O in LSMs, as well as some Read
42
- // /// Make as incremental as possible, don't block the main thread, avoid high latency/spikes
43
- // pub fn flush(callback) void
44
- //
45
- // /// Write manifest info for all object stores into buffer
46
- // pub fn encode_superblock(buffer) void
47
- //
48
- // /// Restore all in-memory state from the superblock data
49
- // pub fn decode_superblock(buffer) void
50
- //
51
-
52
- /// The maximum number of tables for a single tree.
53
- pub const table_count_max = table_count_max_for_tree(
54
- constants.lsm_growth_factor,
55
- constants.lsm_levels,
56
- );
57
-
58
- /// The upper-bound count of input tables to a single tree's compaction.
59
- ///
60
- /// - +1 from level A.
61
- /// - +lsm_growth_factor from level B. The A-input table cannot overlap with an extra B-input table
62
- /// because input table selection is least-overlap. If the input table overlaps on one or both
63
- /// edges, there must be another table with less overlap to select.
64
- pub const compaction_tables_input_max = 1 + constants.lsm_growth_factor;
65
-
66
- /// The upper-bound count of output tables from a single tree's compaction.
67
- /// In the "worst" case, no keys are overwritten/merged, and no tombstones are dropped.
68
- pub const compaction_tables_output_max = compaction_tables_input_max;
69
-
70
- /// The maximum number of concurrent compactions (per tree).
71
- pub const compactions_max = div_ceil(constants.lsm_levels, 2);
72
-
73
- pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_name: [:0]const u8) type {
74
- const Key = TreeTable.Key;
75
- const Value = TreeTable.Value;
76
- const compare_keys = TreeTable.compare_keys;
77
- const tombstone = TreeTable.tombstone;
78
-
79
- const tree_hash = blk: {
80
- // Blake3 hash does alot at comptime..
81
- @setEvalBranchQuota(tree_name.len * 1024);
82
-
83
- var hash: u256 = undefined;
84
- std.crypto.hash.Blake3.hash(tree_name, std.mem.asBytes(&hash), .{});
85
- break :blk @truncate(u128, hash);
86
- };
87
-
88
- return struct {
89
- const Tree = @This();
90
-
91
- // Expose the Table & hash for the Groove.
92
- pub const Table = TreeTable;
93
- pub const name = tree_name;
94
- pub const hash = tree_hash;
95
-
96
- const Grid = @import("grid.zig").GridType(Storage);
97
- const Manifest = @import("manifest.zig").ManifestType(Table, Storage);
98
- pub const TableMutable = @import("table_mutable.zig").TableMutableType(Table);
99
- const TableImmutable = @import("table_immutable.zig").TableImmutableType(Table);
100
-
101
- const CompactionType = @import("compaction.zig").CompactionType;
102
- const TableIteratorType = @import("table_iterator.zig").TableIteratorType;
103
- const TableImmutableIteratorType = @import("table_immutable.zig").TableImmutableIteratorType;
104
-
105
- const CompactionTable = CompactionType(Table, Storage, TableIteratorType);
106
- const CompactionTableImmutable = CompactionType(Table, Storage, TableImmutableIteratorType);
107
-
108
- grid: *Grid,
109
- options: Options,
110
-
111
- table_mutable: TableMutable,
112
- table_immutable: TableImmutable,
113
- values_cache: ?*TableMutable.ValuesCache,
114
-
115
- manifest: Manifest,
116
-
117
- compaction_table_immutable: CompactionTableImmutable,
118
-
119
- /// The number of Compaction instances is divided by two as, at any given compaction tick,
120
- /// we're only compacting either even or odd levels but never both.
121
- /// Uses divFloor as the last level, even with odd lsm_levels, doesn't compact to anything.
122
- /// (e.g. floor(5/2) = 2 for levels 0->1, 2->3 when even and immut->0, 1->2, 3->4 when odd).
123
- /// This means, that for odd lsm_levels, the last CompactionTable is unused.
124
- compaction_table: [@divFloor(constants.lsm_levels, 2)]CompactionTable,
125
-
126
- /// While a compaction is running, this is the op of the last compact().
127
- /// While no compaction is running, this is the op of the last compact() to complete.
128
- /// (When recovering from a checkpoint, compaction_op starts at op_checkpoint).
129
- compaction_op: u64,
130
-
131
- /// The maximum snapshot which is safe to prefetch from.
132
- /// The minimum snapshot which can see the mutable table.
133
- ///
134
- /// This field ensures that the tree never queries the output tables of a running
135
- /// compaction; they are incomplete.
136
- ///
137
- /// See lookup_snapshot_max_for_checkpoint().
138
- ///
139
- /// Invariants:
140
- /// * `lookup_snapshot_max = compaction_op` while any compaction beat is in progress.
141
- /// * `lookup_snapshot_max = compaction_op + 1` after a compaction beat finishes.
142
- /// * `lookup_snapshot_max ≥ op_checkpoint + 1 + lsm_batch_multiple`
143
- /// when `op_checkpoint ≠ 0`.
144
- lookup_snapshot_max: u64,
145
-
146
- compaction_io_pending: usize,
147
- compaction_callback: ?fn (*Tree) void,
148
-
149
- checkpoint_callback: ?fn (*Tree) void,
150
- open_callback: ?fn (*Tree) void,
151
-
152
- tracer_slot: ?tracer.SpanStart = null,
153
-
154
- pub const Options = struct {
155
- /// The maximum number of keys that may be committed per batch.
156
- ///
157
- /// In general, the commit count max for a field depends on the field's object —
158
- /// how many objects might be inserted/updated/removed by a batch:
159
- /// (constants.message_size_max - sizeOf(vsr.header))
160
- /// For example, there are at most 8191 transfers in a batch.
161
- /// So commit_entries_max=8191 for transfer objects and indexes.
162
- ///
163
- /// However, if a transfer is ever mutated, then this will double commit_entries_max
164
- /// since the old index might need to be removed, and the new index inserted.
165
- ///
166
- /// A way to see this is by looking at the state machine. If a transfer is inserted,
167
- /// how many accounts and transfer put/removes will be generated?
168
- ///
169
- /// This also means looking at the state machine operation that will generate the
170
- /// most put/removes in the worst case.
171
- /// For example, create_accounts will put at most 8191 accounts.
172
- /// However, create_transfers will put 2 accounts (8191 * 2) for every transfer, and
173
- /// some of these accounts may exist, requiring a remove/put to update the index.
174
- commit_entries_max: u32,
175
- /// The number of objects to cache in the set-associative value cache.
176
- cache_entries_max: u32 = 0,
177
- };
178
-
179
- pub fn init(
180
- allocator: mem.Allocator,
181
- node_pool: *NodePool,
182
- grid: *Grid,
183
- options: Options,
184
- ) !Tree {
185
- assert(options.commit_entries_max > 0);
186
- assert(grid.superblock.opened);
187
-
188
- var values_cache: ?*TableMutable.ValuesCache = null;
189
-
190
- if (options.cache_entries_max > 0) {
191
- // Cache is heap-allocated to pass a pointer into the mutable table.
192
- values_cache = try allocator.create(TableMutable.ValuesCache);
193
- }
194
- errdefer if (values_cache) |c| allocator.destroy(c);
195
-
196
- if (options.cache_entries_max > 0) {
197
- values_cache.?.* = try TableMutable.ValuesCache.init(
198
- allocator,
199
- options.cache_entries_max,
200
- );
201
- }
202
- errdefer if (values_cache) |c| c.deinit(allocator);
203
-
204
- var table_mutable = try TableMutable.init(allocator, values_cache, options.commit_entries_max);
205
- errdefer table_mutable.deinit(allocator);
206
-
207
- var table_immutable = try TableImmutable.init(allocator, options.commit_entries_max);
208
- errdefer table_immutable.deinit(allocator);
209
-
210
- assert(table_immutable.value_count_max == table_mutable.value_count_max);
211
-
212
- var manifest = try Manifest.init(allocator, node_pool, grid, tree_hash);
213
- errdefer manifest.deinit(allocator);
214
-
215
- var compaction_table_immutable = try CompactionTableImmutable.init(
216
- allocator,
217
- std.fmt.comptimePrint("{s}(immutable->0)", .{tree_name}),
218
- );
219
- errdefer compaction_table_immutable.deinit(allocator);
220
-
221
- var compaction_table: [@divFloor(constants.lsm_levels, 2)]CompactionTable = undefined;
222
- {
223
- comptime var i: usize = 0;
224
- inline while (i < compaction_table.len) : (i += 1) {
225
- errdefer for (compaction_table[0..i]) |*c| c.deinit(allocator);
226
- const compaction_name = std.fmt.comptimePrint("{s}({}->{}/{}->{})", .{
227
- tree_name,
228
- 2 * i,
229
- 2 * i + 1,
230
- 2 * i + 1,
231
- 2 * i + 2,
232
- });
233
- compaction_table[i] = try CompactionTable.init(allocator, compaction_name);
234
- }
235
- }
236
- errdefer for (compaction_table) |*c| c.deinit(allocator);
237
-
238
- // Compaction is one bar ahead of superblock's commit_min.
239
- const op_checkpoint = grid.superblock.working.vsr_state.commit_min;
240
- const lookup_snapshot_max = lookup_snapshot_max_for_checkpoint(op_checkpoint);
241
- const compaction_op = op_checkpoint;
242
-
243
- return Tree{
244
- .grid = grid,
245
- .options = options,
246
- .table_mutable = table_mutable,
247
- .table_immutable = table_immutable,
248
- .values_cache = values_cache,
249
- .manifest = manifest,
250
- .compaction_table_immutable = compaction_table_immutable,
251
- .compaction_table = compaction_table,
252
- .compaction_op = compaction_op,
253
- .lookup_snapshot_max = lookup_snapshot_max,
254
- .compaction_io_pending = 0,
255
- .compaction_callback = null,
256
- .checkpoint_callback = null,
257
- .open_callback = null,
258
- };
259
- }
260
-
261
- pub fn deinit(tree: *Tree, allocator: mem.Allocator) void {
262
- assert(tree.tracer_slot == null);
263
-
264
- tree.compaction_table_immutable.deinit(allocator);
265
- for (tree.compaction_table) |*compaction| compaction.deinit(allocator);
266
-
267
- // TODO Consider whether we should release blocks acquired from Grid.block_free_set.
268
- tree.table_mutable.deinit(allocator);
269
- tree.table_immutable.deinit(allocator);
270
- tree.manifest.deinit(allocator);
271
-
272
- if (tree.values_cache) |cache| {
273
- cache.deinit(allocator);
274
- allocator.destroy(cache);
275
- }
276
- }
277
-
278
- pub fn put(tree: *Tree, value: *const Value) void {
279
- tree.table_mutable.put(value);
280
- }
281
-
282
- pub fn remove(tree: *Tree, value: *const Value) void {
283
- tree.table_mutable.remove(value);
284
- }
285
-
286
- /// Returns the value from the mutable or immutable table (possibly a tombstone),
287
- /// if one is available for the specified snapshot.
288
- pub fn lookup_from_memory(tree: *Tree, snapshot: u64, key: Key) ?*const Value {
289
- assert(tree.lookup_snapshot_max >= snapshot);
290
-
291
- if (tree.lookup_snapshot_max == snapshot) {
292
- if (tree.table_mutable.get(key)) |value| return value;
293
- } else {
294
- // The mutable table is converted to an immutable table when a snapshot is created.
295
- // This means that a past snapshot will never be able to see the mutable table.
296
- // This simplifies the mutable table and eliminates compaction for duplicate puts.
297
- }
298
-
299
- if (!tree.table_immutable.free and tree.table_immutable.snapshot_min <= snapshot) {
300
- if (tree.table_immutable.get(key)) |value| return value;
301
- } else {
302
- // If the immutable table is invisible, then the mutable table is also invisible.
303
- assert(tree.table_immutable.free or snapshot != tree.lookup_snapshot_max);
304
- }
305
-
306
- return null;
307
- }
308
-
309
- /// Call this function only after checking `lookup_from_memory()`.
310
- pub fn lookup_from_levels(
311
- tree: *Tree,
312
- callback: fn (*LookupContext, ?*const Value) void,
313
- context: *LookupContext,
314
- snapshot: u64,
315
- key: Key,
316
- ) void {
317
- assert(tree.lookup_snapshot_max >= snapshot);
318
- if (constants.verify) {
319
- // The caller is responsible for checking the mutable table.
320
- assert(tree.lookup_from_memory(snapshot, key) == null);
321
- }
322
-
323
- var index_block_count: u8 = 0;
324
- var index_block_addresses: [constants.lsm_levels]u64 = undefined;
325
- var index_block_checksums: [constants.lsm_levels]u128 = undefined;
326
- {
327
- var it = tree.manifest.lookup(snapshot, key);
328
- while (it.next()) |table| : (index_block_count += 1) {
329
- assert(table.visible(snapshot));
330
- assert(compare_keys(table.key_min, key) != .gt);
331
- assert(compare_keys(table.key_max, key) != .lt);
332
-
333
- index_block_addresses[index_block_count] = table.address;
334
- index_block_checksums[index_block_count] = table.checksum;
335
- }
336
- }
337
-
338
- if (index_block_count == 0) {
339
- callback(context, null);
340
- return;
341
- }
342
-
343
- // Hash the key to the fingerprint only once and reuse for all bloom filter checks.
344
- const fingerprint = bloom_filter.Fingerprint.create(mem.asBytes(&key));
345
-
346
- context.* = .{
347
- .tree = tree,
348
- .completion = undefined,
349
-
350
- .key = key,
351
- .fingerprint = fingerprint,
352
-
353
- .index_block_count = index_block_count,
354
- .index_block_addresses = index_block_addresses,
355
- .index_block_checksums = index_block_checksums,
356
-
357
- .callback = callback,
358
- };
359
-
360
- context.read_index_block();
361
- }
362
-
363
- pub const LookupContext = struct {
364
- const Read = Grid.Read;
365
- const BlockPtrConst = Grid.BlockPtrConst;
366
-
367
- tree: *Tree,
368
- completion: Read,
369
-
370
- key: Key,
371
- fingerprint: bloom_filter.Fingerprint,
372
-
373
- /// This value is an index into the index_block_addresses/checksums arrays.
374
- index_block: u8 = 0,
375
- index_block_count: u8,
376
- index_block_addresses: [constants.lsm_levels]u64,
377
- index_block_checksums: [constants.lsm_levels]u128,
378
-
379
- data_block: ?struct {
380
- address: u64,
381
- checksum: u128,
382
- } = null,
383
-
384
- callback: fn (*Tree.LookupContext, ?*const Value) void,
385
-
386
- fn read_index_block(context: *LookupContext) void {
387
- assert(context.data_block == null);
388
- assert(context.index_block < context.index_block_count);
389
- assert(context.index_block_count > 0);
390
- assert(context.index_block_count <= constants.lsm_levels);
391
-
392
- context.tree.grid.read_block(
393
- read_index_block_callback,
394
- &context.completion,
395
- context.index_block_addresses[context.index_block],
396
- context.index_block_checksums[context.index_block],
397
- .index,
398
- );
399
- }
400
-
401
- fn read_index_block_callback(completion: *Read, index_block: BlockPtrConst) void {
402
- const context = @fieldParentPtr(LookupContext, "completion", completion);
403
- assert(context.data_block == null);
404
- assert(context.index_block < context.index_block_count);
405
- assert(context.index_block_count > 0);
406
- assert(context.index_block_count <= constants.lsm_levels);
407
-
408
- const blocks = Table.index_blocks_for_key(index_block, context.key);
409
-
410
- context.data_block = .{
411
- .address = blocks.data_block_address,
412
- .checksum = blocks.data_block_checksum,
413
- };
414
-
415
- context.tree.grid.read_block(
416
- read_filter_block_callback,
417
- completion,
418
- blocks.filter_block_address,
419
- blocks.filter_block_checksum,
420
- .filter,
421
- );
422
- }
423
-
424
- fn read_filter_block_callback(completion: *Read, filter_block: BlockPtrConst) void {
425
- const context = @fieldParentPtr(LookupContext, "completion", completion);
426
- assert(context.data_block != null);
427
- assert(context.index_block < context.index_block_count);
428
- assert(context.index_block_count > 0);
429
- assert(context.index_block_count <= constants.lsm_levels);
430
-
431
- const filter_bytes = Table.filter_block_filter_const(filter_block);
432
- if (bloom_filter.may_contain(context.fingerprint, filter_bytes)) {
433
- context.tree.grid.read_block(
434
- read_data_block_callback,
435
- completion,
436
- context.data_block.?.address,
437
- context.data_block.?.checksum,
438
- .data,
439
- );
440
- } else {
441
- // The key is not present in this table, check the next level.
442
- context.advance_to_next_level();
443
- }
444
- }
445
-
446
- fn read_data_block_callback(completion: *Read, data_block: BlockPtrConst) void {
447
- const context = @fieldParentPtr(LookupContext, "completion", completion);
448
- assert(context.data_block != null);
449
- assert(context.index_block < context.index_block_count);
450
- assert(context.index_block_count > 0);
451
- assert(context.index_block_count <= constants.lsm_levels);
452
-
453
- if (Table.data_block_search(data_block, context.key)) |value| {
454
- context.callback(context, unwrap_tombstone(value));
455
- } else {
456
- // The key is not present in this table, check the next level.
457
- context.advance_to_next_level();
458
- }
459
- }
460
-
461
- fn advance_to_next_level(context: *LookupContext) void {
462
- assert(context.data_block != null);
463
- assert(context.index_block < context.index_block_count);
464
- assert(context.index_block_count > 0);
465
- assert(context.index_block_count <= constants.lsm_levels);
466
-
467
- context.index_block += 1;
468
- if (context.index_block == context.index_block_count) {
469
- context.callback(context, null);
470
- return;
471
- }
472
- assert(context.index_block < context.index_block_count);
473
-
474
- context.data_block = null;
475
- context.read_index_block();
476
- }
477
- };
478
-
479
- /// Returns null if the value is null or a tombstone, otherwise returns the value.
480
- /// We use tombstone values internally, but expose them as null to the user.
481
- /// This distinction enables us to cache a null result as a tombstone in our hash maps.
482
- pub inline fn unwrap_tombstone(value: ?*const Value) ?*const Value {
483
- return if (value == null or tombstone(value.?)) null else value.?;
484
- }
485
-
486
- pub fn open(tree: *Tree, callback: fn (*Tree) void) void {
487
- assert(tree.open_callback == null);
488
- tree.open_callback = callback;
489
-
490
- tree.manifest.open(manifest_open_callback);
491
- }
492
-
493
- fn manifest_open_callback(manifest: *Manifest) void {
494
- const tree = @fieldParentPtr(Tree, "manifest", manifest);
495
- assert(tree.open_callback != null);
496
-
497
- const callback = tree.open_callback.?;
498
- tree.open_callback = null;
499
- callback(tree);
500
- }
501
-
502
- const CompactionTableContext = struct {
503
- compaction: *CompactionTable,
504
- level_a: u8,
505
- level_b: u8,
506
- };
507
-
508
- const CompactionTableIterator = struct {
509
- tree: *Tree,
510
- index: u8 = 0,
511
-
512
- fn next(it: *CompactionTableIterator) ?CompactionTableContext {
513
- assert(it.tree.compaction_callback != null);
514
-
515
- const compaction_beat = it.tree.compaction_op % constants.lsm_batch_multiple;
516
- const even_levels = compaction_beat < half_bar_beat_count;
517
- const level_a = (it.index * 2) + @boolToInt(!even_levels);
518
- const level_b = level_a + 1;
519
-
520
- if (level_a >= constants.lsm_levels - 1) return null;
521
- assert(level_b < constants.lsm_levels);
522
-
523
- defer it.index += 1;
524
- return CompactionTableContext{
525
- .compaction = &it.tree.compaction_table[it.index],
526
- .level_a = level_a,
527
- .level_b = level_b,
528
- };
529
- }
530
- };
531
-
532
- /// Since concurrent compactions into and out of a level may contend for the same range:
533
- ///
534
- /// 1. compact level 0 to 1, level 2 to 3, level 4 to 5 etc., and then
535
- /// 2. compact the immutable table to level 0, level 1 to 2, level 3 to 4 etc.
536
- ///
537
- /// This order (even levels, then odd levels) is significant, since it reduces the number of
538
- /// level 0 tables that overlap with the immutable table, reducing write amplification.
539
- ///
540
- /// We therefore take the bar, during which all compactions run, and divide by two,
541
- /// running the compactions from even levels in the first half bar, and then the odd.
542
- ///
543
- /// Compactions start on the down beat of a half bar, using 0-based beats.
544
- /// For example, if there are 4 beats in a bar, start on beat 0 or beat 2.
545
- pub fn compact(tree: *Tree, callback: fn (*Tree) void, op: u64) void {
546
- assert(tree.compaction_callback == null);
547
- assert(op != 0);
548
- assert(op == tree.compaction_op + 1);
549
- assert(op > tree.grid.superblock.working.vsr_state.commit_min);
550
-
551
- tree.compaction_op = op;
552
-
553
- if (op < constants.lsm_batch_multiple) {
554
- // There is nothing to compact for the first measure.
555
- // We skip the main compaction code path first compaction bar entirely because it
556
- // is a special case — its first beat is 1, not 0.
557
-
558
- tree.lookup_snapshot_max = op + 1;
559
- if (op + 1 == constants.lsm_batch_multiple) {
560
- tree.compact_mutable_table_into_immutable();
561
- }
562
-
563
- callback(tree);
564
- return;
565
- }
566
-
567
- if (tree.grid.superblock.working.vsr_state.op_compacted(op)) {
568
- // We recovered from a checkpoint, and must avoid replaying one bar of
569
- // compactions that were applied before the checkpoint. Repeating these ops'
570
- // compactions would actually perform different compactions than before,
571
- // causing the storage state of the replica to diverge from the cluster.
572
- // See also: lookup_snapshot_max_for_checkpoint().
573
-
574
- if (op + 1 == tree.lookup_snapshot_max) {
575
- // This is the last op of the skipped compaction bar.
576
- // Prepare the immutable table for the next bar — since this state is
577
- // in-memory, it cannot be skipped.
578
- tree.compact_mutable_table_into_immutable();
579
- }
580
-
581
- // TODO Defer this callback until tick() to avoid stack growth.
582
- callback(tree);
583
- return;
584
- }
585
- assert(op == tree.lookup_snapshot_max);
586
-
587
- tree.compact_start(callback);
588
- tree.compact_drive();
589
- }
590
-
591
- fn compact_start(tree: *Tree, callback: fn (*Tree) void) void {
592
- assert(tree.compaction_io_pending == 0);
593
- assert(tree.compaction_callback == null);
594
-
595
- if (constants.verify) {
596
- tree.manifest.verify(tree.compaction_op);
597
- }
598
-
599
- tracer.start(
600
- &tree.tracer_slot,
601
- .{ .tree = .{ .tree_name = tree_name } },
602
- .tree_compaction_beat,
603
- @src(),
604
- );
605
-
606
- tree.compaction_callback = callback;
607
-
608
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
609
- const start = (compaction_beat == 0) or
610
- (compaction_beat == half_bar_beat_count);
611
-
612
- const op_min = compaction_op_min(tree.compaction_op);
613
- assert(op_min < snapshot_latest);
614
- assert(op_min % half_bar_beat_count == 0);
615
-
616
- log.debug(tree_name ++ ": compact_start: op={d} op_min={d} beat={d}/{d}", .{
617
- tree.compaction_op,
618
- op_min,
619
- compaction_beat + 1,
620
- constants.lsm_batch_multiple,
621
- });
622
-
623
- if (start) tree.manifest.reserve();
624
-
625
- // Try to start compacting the immutable table.
626
- const even_levels = compaction_beat < half_bar_beat_count;
627
- if (even_levels) {
628
- assert(tree.compaction_table_immutable.status == .idle);
629
- } else {
630
- if (start) tree.compact_start_table_immutable(op_min);
631
- }
632
-
633
- // Try to start compacting the other levels.
634
- var it = CompactionTableIterator{ .tree = tree };
635
- while (it.next()) |context| {
636
- if (start) tree.compact_start_table(op_min, context);
637
- }
638
- }
639
-
640
- fn compact_start_table_immutable(tree: *Tree, op_min: u64) void {
641
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
642
- assert(compaction_beat == half_bar_beat_count);
643
-
644
- // Do not start compaction if the immutable table does not require compaction.
645
- if (tree.table_immutable.free) return;
646
-
647
- assert(tree.table_immutable.snapshot_min % half_bar_beat_count == 0);
648
-
649
- const values_count = tree.table_immutable.values.len;
650
- assert(values_count > 0);
651
-
652
- const level_b: u8 = 0;
653
- const table_a: ?*const Manifest.TableInfo = null;
654
- const range = tree.manifest.compaction_range(
655
- level_b,
656
- tree.table_immutable.key_min(),
657
- tree.table_immutable.key_max(),
658
- );
659
-
660
- assert(range.table_count >= 1);
661
- assert(range.table_count <= compaction_tables_input_max);
662
- assert(compare_keys(range.key_min, tree.table_immutable.key_min()) != .gt);
663
- assert(compare_keys(range.key_max, tree.table_immutable.key_max()) != .lt);
664
-
665
- log.debug(tree_name ++
666
- ": compacting immutable table to level 0 " ++
667
- "(values.len={d} snapshot_min={d} compaction.op_min={d} table_count={d})", .{
668
- tree.table_immutable.values.len,
669
- tree.table_immutable.snapshot_min,
670
- op_min,
671
- range.table_count,
672
- });
673
-
674
- tree.compaction_table_immutable.start(
675
- tree.grid,
676
- &tree.manifest,
677
- op_min,
678
- range,
679
- table_a,
680
- level_b,
681
- .{ .table = &tree.table_immutable },
682
- );
683
- }
684
-
685
- fn compact_start_table(tree: *Tree, op_min: u64, context: CompactionTableContext) void {
686
- const compaction_beat = tree.compaction_op % half_bar_beat_count;
687
- assert(compaction_beat == 0);
688
-
689
- assert(context.level_a < constants.lsm_levels);
690
- assert(context.level_b < constants.lsm_levels);
691
- assert(context.level_a + 1 == context.level_b);
692
-
693
- // Do not start compaction if level A does not require compaction.
694
- const table_range = tree.manifest.compaction_table(context.level_a) orelse return;
695
- const table = table_range.table;
696
-
697
- assert(table_range.range.table_count >= 1);
698
- assert(table_range.range.table_count <= compaction_tables_input_max);
699
- assert(compare_keys(table.key_min, table.key_max) != .gt);
700
- assert(compare_keys(table_range.range.key_min, table.key_min) != .gt);
701
- assert(compare_keys(table_range.range.key_max, table.key_max) != .lt);
702
-
703
- log.debug(tree_name ++ ": compacting {d} tables from level {d} to level {d}", .{
704
- table_range.range.table_count,
705
- context.level_a,
706
- context.level_b,
707
- });
708
-
709
- context.compaction.start(
710
- tree.grid,
711
- &tree.manifest,
712
- op_min,
713
- table_range.range,
714
- table_range.table,
715
- context.level_b,
716
- .{
717
- .grid = tree.grid,
718
- .address = table.address,
719
- .checksum = table.checksum,
720
- },
721
- );
722
- }
723
-
724
- fn compact_drive(tree: *Tree) void {
725
- assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
726
- assert(tree.compaction_callback != null);
727
-
728
- // Always start one fake io_pending that is resolved right after
729
- // to handle the case where this compaction tick triggers no IO.
730
- // (For example, ticking the immutable table, or level B is already done).
731
- tree.compaction_io_pending += 1;
732
- defer tree.compact_tick_done();
733
-
734
- // Try to tick the immutable table compaction:
735
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
736
- const even_levels = compaction_beat < half_bar_beat_count;
737
- if (even_levels) {
738
- assert(tree.compaction_table_immutable.status == .idle);
739
- } else {
740
- tree.compact_tick(&tree.compaction_table_immutable);
741
- }
742
-
743
- // Try to tick the compaction for each level:
744
- var it = CompactionTableIterator{ .tree = tree };
745
- while (it.next()) |context| {
746
- tree.compact_tick(context.compaction);
747
- }
748
- }
749
-
750
- fn compact_tick(tree: *Tree, compaction: anytype) void {
751
- if (compaction.status != .processing) return;
752
- tree.compaction_io_pending += 1;
753
-
754
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
755
- const even_levels = compaction_beat < half_bar_beat_count;
756
- assert(compaction.level_b < constants.lsm_levels);
757
- assert(compaction.level_b % 2 == @boolToInt(even_levels));
758
-
759
- if (@TypeOf(compaction.*) == CompactionTableImmutable) {
760
- assert(compaction.level_b == 0);
761
- log.debug(tree_name ++ ": compact_tick() for immutable table to level 0", .{});
762
- compaction.compact_tick(Tree.compact_tick_callback_table_immutable);
763
- } else {
764
- assert(@TypeOf(compaction.*) == CompactionTable);
765
- log.debug(tree_name ++ ": compact_tick() for level {d} to level {d}", .{
766
- compaction.level_b - 1,
767
- compaction.level_b,
768
- });
769
- compaction.compact_tick(Tree.compact_tick_callback_table);
770
- }
771
- }
772
-
773
- fn compact_tick_callback_table_immutable(compaction: *CompactionTableImmutable) void {
774
- assert(compaction.status == .processing or compaction.status == .done);
775
- assert(compaction.level_b < constants.lsm_levels);
776
- assert(compaction.level_b == 0);
777
-
778
- const tree = @fieldParentPtr(Tree, "compaction_table_immutable", compaction);
779
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
780
- assert(compaction_beat >= half_bar_beat_count);
781
-
782
- log.debug(tree_name ++ ": compact_tick() complete for immutable table to level 0", .{});
783
- tree.compact_tick_done();
784
- }
785
-
786
- fn compact_tick_callback_table(compaction: *CompactionTable) void {
787
- assert(compaction.status == .processing or compaction.status == .done);
788
- assert(compaction.level_b < constants.lsm_levels);
789
- assert(compaction.level_b > 0);
790
-
791
- const table_offset = @divFloor(compaction.level_b - 1, 2);
792
- const table_ptr = @ptrCast([*]CompactionTable, compaction) - table_offset;
793
-
794
- const table_size = @divFloor(constants.lsm_levels, 2);
795
- const table: *[table_size]CompactionTable = table_ptr[0..table_size];
796
-
797
- log.debug(tree_name ++ ": compact_tick() complete for level {d} to level {d}", .{
798
- compaction.level_b - 1,
799
- compaction.level_b,
800
- });
801
-
802
- const tree = @fieldParentPtr(Tree, "compaction_table", table);
803
- tree.compact_tick_done();
804
- }
805
-
806
- fn compact_tick_done(tree: *Tree) void {
807
- assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
808
- assert(tree.compaction_callback != null);
809
-
810
- // compact_done() is called after all compact_tick()'s complete.
811
- tree.compaction_io_pending -= 1;
812
- if (tree.compaction_io_pending == 0) tree.compact_done();
813
- }
814
-
815
- /// Called at the end of each compaction tick.
816
- fn compact_done(tree: *Tree) void {
817
- assert(tree.compaction_io_pending == 0);
818
- assert(tree.compaction_callback != null);
819
- assert(tree.compaction_op == tree.lookup_snapshot_max);
820
-
821
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
822
- const even_levels = compaction_beat < half_bar_beat_count;
823
- const compacted_levels_even = compaction_beat == half_bar_beat_count - 1;
824
- const compacted_levels_odd = compaction_beat == constants.lsm_batch_multiple - 1;
825
- if (!compacted_levels_even and !compacted_levels_odd) {
826
- // TODO(Deterministic Beats): Remove this when compact_done() is called exactly
827
- // once when the beat finishes.
828
- tree.lookup_snapshot_max = tree.compaction_op + 1;
829
-
830
- tree.compact_finish();
831
- return;
832
- }
833
-
834
- // At the end of the second and fourth beat:
835
- // 1. Tick the Compactions until all have completed.
836
- // 2. Remove invisible tables from the manifest.
837
- // 3. Compact the manifest.
838
- // Then at the end of the fourth beat, freeze the mutable table.
839
- assert(compacted_levels_even or compacted_levels_odd);
840
- assert(compacted_levels_even != compacted_levels_odd);
841
-
842
- const still_compacting = blk: {
843
- if (even_levels) {
844
- assert(tree.compaction_table_immutable.status == .idle);
845
- } else {
846
- if (tree.compaction_table_immutable.status == .processing) break :blk true;
847
- }
848
-
849
- var it = CompactionTableIterator{ .tree = tree };
850
- while (it.next()) |context| {
851
- if (context.compaction.status == .processing) break :blk true;
852
- }
853
- break :blk false;
854
- };
855
-
856
- if (still_compacting) {
857
- // We are at the end of a half-bar, but the compactions have not finished.
858
- // We keep ticking them until they finish.
859
- log.debug(tree_name ++ ": compact_done: driving outstanding compactions", .{});
860
- tree.compact_drive();
861
- return;
862
- }
863
-
864
- // TODO(Deterministic Beats): Move this to the top of the function when compact_done()
865
- // is called exactly once when the beat finishes.
866
- tree.lookup_snapshot_max = tree.compaction_op + 1;
867
-
868
- // All compactions have finished for the current half-bar.
869
- // We couldn't remove the (invisible) input tables until now because prefetch()
870
- // needs a complete set of tables for lookups to avoid missing data.
871
-
872
- // Reset the immutable table Compaction.
873
- // Also clear any tables made invisible by the compaction.
874
- if (!even_levels) {
875
- switch (tree.compaction_table_immutable.status) {
876
- // The compaction wasn't started for this half bar.
877
- .idle => assert(tree.table_immutable.free),
878
- .processing => unreachable,
879
- .done => {
880
- tree.compaction_table_immutable.reset();
881
- tree.table_immutable.clear();
882
- tree.manifest.remove_invisible_tables(
883
- tree.compaction_table_immutable.level_b,
884
- tree.lookup_snapshot_max,
885
- tree.compaction_table_immutable.range.key_min,
886
- tree.compaction_table_immutable.range.key_max,
887
- );
888
- },
889
- }
890
- }
891
-
892
- // Reset all the other Compactions.
893
- // Also clear any tables made invisible by the compactions.
894
- var it = CompactionTableIterator{ .tree = tree };
895
- while (it.next()) |context| {
896
- switch (context.compaction.status) {
897
- .idle => {}, // The compaction wasn't started for this half bar.
898
- .processing => unreachable,
899
- .done => {
900
- context.compaction.reset();
901
- tree.manifest.remove_invisible_tables(
902
- context.compaction.level_b,
903
- tree.lookup_snapshot_max,
904
- context.compaction.range.key_min,
905
- context.compaction.range.key_max,
906
- );
907
- if (context.compaction.level_b > 0) {
908
- tree.manifest.remove_invisible_tables(
909
- context.compaction.level_b - 1,
910
- tree.lookup_snapshot_max,
911
- context.compaction.range.key_min,
912
- context.compaction.range.key_max,
913
- );
914
- }
915
- },
916
- }
917
- }
918
-
919
- assert(tree.compaction_table_immutable.status == .idle);
920
- it = CompactionTableIterator{ .tree = tree };
921
- while (it.next()) |context| {
922
- assert(context.compaction.status == .idle);
923
- }
924
-
925
- // At the end of the fourth/last beat:
926
- // - Assert all visible tables haven't overflowed their max per level.
927
- // - Convert mutable table to immutable table for next bar.
928
- if (compacted_levels_odd) {
929
- tree.manifest.assert_level_table_counts();
930
- tree.compact_mutable_table_into_immutable();
931
- }
932
-
933
- // At the end of the second/fourth beat:
934
- // - Compact the manifest before invoking the compact() callback.
935
- tree.manifest.compact(compact_manifest_callback);
936
- }
937
-
938
- /// Called after the last beat of a full compaction bar.
939
- fn compact_mutable_table_into_immutable(tree: *Tree) void {
940
- assert(tree.table_immutable.free);
941
- assert((tree.compaction_op + 1) % constants.lsm_batch_multiple == 0);
942
- assert(tree.compaction_op + 1 == tree.lookup_snapshot_max);
943
-
944
- if (tree.table_mutable.count() == 0) return;
945
-
946
- // Sort the mutable table values directly into the immutable table's array.
947
- const values_max = tree.table_immutable.values_max();
948
- const values = tree.table_mutable.sort_into_values_and_clear(values_max);
949
- assert(values.ptr == values_max.ptr);
950
-
951
- // The immutable table must be visible to the next bar — setting its snapshot_min to
952
- // lookup_snapshot_max guarantees.
953
- //
954
- // In addition, the immutable table is conceptually an output table of this compaction
955
- // bar, and now its snapshot_min matches the snapshot_min of the Compactions' output
956
- // tables.
957
- tree.table_immutable.reset_with_sorted_values(tree.lookup_snapshot_max, values);
958
-
959
- assert(tree.table_mutable.count() == 0);
960
- assert(!tree.table_immutable.free);
961
- }
962
-
963
- fn compact_manifest_callback(manifest: *Manifest) void {
964
- const tree = @fieldParentPtr(Tree, "manifest", manifest);
965
- assert(tree.compaction_io_pending == 0);
966
- assert(tree.compaction_callback != null);
967
- tree.compact_finish();
968
- }
969
-
970
- /// Called at the end of each compaction beat.
971
- fn compact_finish(tree: *Tree) void {
972
- assert(tree.compaction_io_pending == 0);
973
- assert(tree.table_mutable.can_commit_batch(tree.options.commit_entries_max));
974
-
975
- tracer.end(
976
- &tree.tracer_slot,
977
- .{ .tree = .{ .tree_name = tree_name } },
978
- .tree_compaction_beat,
979
- );
980
-
981
- if (constants.verify) {
982
- tree.manifest.verify(tree.compaction_op);
983
- }
984
-
985
- // Invoke the compact() callback after the manifest compacts at the end of the beat.
986
- const callback = tree.compaction_callback.?;
987
- tree.compaction_callback = null;
988
- callback(tree);
989
- }
990
-
991
- pub fn checkpoint(tree: *Tree, callback: fn (*Tree) void) void {
992
- // Assert no outstanding compact_tick() work.
993
- assert(tree.compaction_io_pending == 0);
994
- assert(tree.compaction_callback == null);
995
- assert(tree.compaction_op > 0);
996
- assert(tree.compaction_op + 1 == tree.lookup_snapshot_max);
997
- // Don't re-run the checkpoint we recovered from.
998
- assert(!tree.grid.superblock.working.vsr_state.op_compacted(tree.compaction_op));
999
-
1000
- // Assert that this is the last beat in the compaction bar.
1001
- const compaction_beat = tree.compaction_op % constants.lsm_batch_multiple;
1002
- const last_beat_in_bar = constants.lsm_batch_multiple - 1;
1003
- assert(last_beat_in_bar == compaction_beat);
1004
-
1005
- // Assert no outstanding compactions.
1006
- assert(tree.compaction_table_immutable.status == .idle);
1007
- for (tree.compaction_table) |*compaction| {
1008
- assert(compaction.status == .idle);
1009
- }
1010
-
1011
- // Assert all manifest levels haven't overflowed their table counts.
1012
- tree.manifest.assert_level_table_counts();
1013
-
1014
- // Assert that we're checkpointing only after invisible tables have been removed.
1015
- if (constants.verify) {
1016
- tree.manifest.assert_no_invisible_tables(compaction_op_min(tree.compaction_op));
1017
- }
1018
-
1019
- // Start an asynchronous checkpoint on the manifest.
1020
- assert(tree.checkpoint_callback == null);
1021
- tree.checkpoint_callback = callback;
1022
- tree.manifest.checkpoint(manifest_checkpoint_callback);
1023
- }
1024
-
1025
- fn manifest_checkpoint_callback(manifest: *Manifest) void {
1026
- const tree = @fieldParentPtr(Tree, "manifest", manifest);
1027
- assert(tree.checkpoint_callback != null);
1028
-
1029
- const callback = tree.checkpoint_callback.?;
1030
- tree.checkpoint_callback = null;
1031
- callback(tree);
1032
- }
1033
-
1034
- pub const RangeQuery = union(enum) {
1035
- bounded: struct {
1036
- start: Key,
1037
- end: Key,
1038
- },
1039
- open: struct {
1040
- start: Key,
1041
- order: enum {
1042
- ascending,
1043
- descending,
1044
- },
1045
- },
1046
- };
1047
-
1048
- pub const RangeQueryIterator = struct {
1049
- tree: *Tree,
1050
- snapshot: u64,
1051
- query: RangeQuery,
1052
-
1053
- pub fn next(callback: fn (result: ?Value) void) void {
1054
- _ = callback;
1055
- }
1056
- };
1057
-
1058
- pub fn range_query(
1059
- tree: *Tree,
1060
- /// The snapshot timestamp, if any
1061
- snapshot: ?u64,
1062
- query: RangeQuery,
1063
- ) RangeQueryIterator {
1064
- _ = tree;
1065
- _ = snapshot;
1066
- _ = query;
1067
- }
1068
- };
1069
- }
1070
-
1071
- /// Returns the first op of the compaction (Compaction.op_min) for a given op/beat.
1072
- ///
1073
- /// After this compaction finishes:
1074
- /// - `op_min + half_bar_beat_count - 1` will be the input tables' snapshot_max.
1075
- /// - `op_min + half_bar_beat_count` will be the output tables' snapshot_min.
1076
- ///
1077
- /// Each half-bar has a separate op_min (for deriving the output snapshot_min) instead of each full
1078
- /// bar because this allows the output tables of the first half-bar's compaction to be prefetched
1079
- /// against earlier — hopefully while they are still warm in the cache from being written.
1080
- pub fn compaction_op_min(op: u64) u64 {
1081
- return op - op % half_bar_beat_count;
1082
- }
1083
-
1084
- /// These charts depict the commit/compact ops and `lookup_snapshot_max` over a series of
1085
- /// commits and compactions (with lsm_batch_multiple=8).
1086
- ///
1087
- /// Legend:
1088
- ///
1089
- /// ┼ full bar (first half-bar start)
1090
- /// ┬ half bar (second half-bar start)
1091
- /// $ lookup_snapshot_max (prefetch reads from the current snapshot)
1092
- /// This is incremented at the end of each compact().
1093
- /// . op is in mutable table (in memory)
1094
- /// , op is in immutable table (in memory)
1095
- /// # op is on disk
1096
- /// ✓ checkpoint() may follow compact()
1097
- ///
1098
- /// 0 2 4 6 8 0 2 4 6
1099
- /// ┼───┬───┼───┬───┼
1100
- /// .$ ╷ ╷ init(superblock.commit_min=0)⎤ Compaction is effectively a noop for the
1101
- /// .$ ╷ ╷ commit;compact( 1) start/end ⎥ first bar because there are no tables on
1102
- /// ..$ ╷ ╷ commit;compact( 2) start/end ⎥ disk yet, and no immutable table to
1103
- /// ...$ ╷ ╷ commit;compact( 3) start/end ⎥ flush.
1104
- /// ....$ ╷ ╷ commit;compact( 4) start/end ⎥
1105
- /// .....$ ╷ ╷ commit;compact( 5) start/end ⎥ This applies:
1106
- /// ......$ ╷ ╷ commit;compact( 6) start/end ⎥ - when the LSM is starting on a freshly
1107
- /// .......$╷ ╷ commit;compact( 7) start ⎤⎥ formatted data file, and also
1108
- /// ,,,,,,,,$ ╷ ✓ compact( 7) end⎦⎦ - when the LSM is recovering from a crash
1109
- /// ,,,,,,,,$ ╷ commit;compact( 8) start/end (see below).
1110
- /// ,,,,,,,,.$ ╷ commit;compact( 9) start/end
1111
- /// ,,,,,,,,..$ ╷ commit;compact(10) start/end
1112
- /// ,,,,,,,,...$ ╷ commit;compact(11) start/end
1113
- /// ,,,,,,,,....$ ╷ commit;compact(12) start/end
1114
- /// ,,,,,,,,.....$ ╷ commit;compact(13) start/end
1115
- /// ,,,,,,,,......$ ╷ commit;compact(14) start/end
1116
- /// ,,,,,,,,.......$╷ commit;compact(15) start ⎤
1117
- /// ########,,,,,,,,$ ✓ compact(15) end⎦
1118
- /// ########,,,,,,,,$ commit;compact(16) start/end
1119
- /// ┼───┬───┼───┬───┼
1120
- /// 0 2 4 6 8 0 2 4 6
1121
- /// ┼───┬───┼───┬───┼ Recover with a checkpoint taken at op 15.
1122
- /// ######## $ init(superblock.commit_min=7) At op 15, ops 8…15 are in memory, so they
1123
- /// ########. $ commit ( 8) start/end ⎤ were dropped by the crash.
1124
- /// ########.. $ commit ( 9) start/end ⎥
1125
- /// ########... $ commit (10) start/end ⎥ But compaction is not run for ops 8…15
1126
- /// ########.... $ commit (11) start/end ⎥ because it was already performed
1127
- /// ########..... $ commit (12) start/end ⎥ before the checkpoint.
1128
- /// ########...... $ commit (13) start/end ⎥
1129
- /// ########....... $ commit (14) start/end ⎥ We can begin to compact again at op 16,
1130
- /// ########........$ commit (15) start ⎤⎥ because those compactions (if previously
1131
- /// ########,,,,,,,,$ ✓ (15) end⎦⎦ performed) are not included in the
1132
- /// ########,,,,,,,,$ commit;compact(16) start/end checkpoint.
1133
- /// ┼───┬───┼───┬───┼
1134
- /// 0 2 4 6 8 0 2 4 6
1135
- ///
1136
- /// Notice how in the checkpoint recovery example above, we are careful not to `compact(op)` twice
1137
- /// for any op (even if we crash/recover), since that could lead to differences between replicas'
1138
- /// storage. The last bar of `commit()`s is always only in memory, so it is safe to repeat.
1139
- ///
1140
- /// Additionally, while skipping compactions during recovery, we use a `lookup_snapshot_max`
1141
- /// different than the original compactions — the old tables may have been removed during the
1142
- /// checkpoint.
1143
- fn lookup_snapshot_max_for_checkpoint(op_checkpoint: u64) u64 {
1144
- if (op_checkpoint == 0) {
1145
- // Start from 1 because we never commit op 0.
1146
- return 1;
1147
- } else {
1148
- return op_checkpoint + constants.lsm_batch_multiple + 1;
1149
- }
1150
- }
1151
-
1152
- /// The total number of tables that can be supported by the tree across so many levels.
1153
- pub fn table_count_max_for_tree(growth_factor: u32, levels_count: u32) u32 {
1154
- assert(growth_factor >= 4);
1155
- assert(growth_factor <= 16); // Limit excessive write amplification.
1156
- assert(levels_count >= 2);
1157
- assert(levels_count <= 10); // Limit excessive read amplification.
1158
- assert(levels_count <= constants.lsm_levels);
1159
-
1160
- var count: u32 = 0;
1161
- var level: u32 = 0;
1162
- while (level < levels_count) : (level += 1) {
1163
- count += table_count_max_for_level(growth_factor, level);
1164
- }
1165
- return count;
1166
- }
1167
-
1168
- /// The total number of tables that can be supported by the level alone.
1169
- pub fn table_count_max_for_level(growth_factor: u32, level: u32) u32 {
1170
- assert(level >= 0);
1171
- assert(level < constants.lsm_levels);
1172
-
1173
- return math.pow(u32, growth_factor, level + 1);
1174
- }
1175
-
1176
- test "table_count_max_for_level/tree" {
1177
- const expectEqual = std.testing.expectEqual;
1178
-
1179
- try expectEqual(@as(u32, 8), table_count_max_for_level(8, 0));
1180
- try expectEqual(@as(u32, 64), table_count_max_for_level(8, 1));
1181
- try expectEqual(@as(u32, 512), table_count_max_for_level(8, 2));
1182
- try expectEqual(@as(u32, 4096), table_count_max_for_level(8, 3));
1183
- try expectEqual(@as(u32, 32768), table_count_max_for_level(8, 4));
1184
- try expectEqual(@as(u32, 262144), table_count_max_for_level(8, 5));
1185
- try expectEqual(@as(u32, 2097152), table_count_max_for_level(8, 6));
1186
-
1187
- try expectEqual(@as(u32, 8 + 64), table_count_max_for_tree(8, 2));
1188
- try expectEqual(@as(u32, 72 + 512), table_count_max_for_tree(8, 3));
1189
- try expectEqual(@as(u32, 584 + 4096), table_count_max_for_tree(8, 4));
1190
- try expectEqual(@as(u32, 4680 + 32768), table_count_max_for_tree(8, 5));
1191
- try expectEqual(@as(u32, 37448 + 262144), table_count_max_for_tree(8, 6));
1192
- try expectEqual(@as(u32, 299592 + 2097152), table_count_max_for_tree(8, 7));
1193
- }