tigerbeetle-node 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. package/README.md +302 -101
  2. package/dist/index.d.ts +70 -72
  3. package/dist/index.js +70 -72
  4. package/dist/index.js.map +1 -1
  5. package/package.json +6 -6
  6. package/scripts/download_node_headers.sh +14 -7
  7. package/src/index.ts +6 -10
  8. package/src/node.zig +6 -3
  9. package/src/tigerbeetle/scripts/benchmark.sh +4 -4
  10. package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
  11. package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
  12. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
  13. package/src/tigerbeetle/scripts/install.sh +19 -4
  14. package/src/tigerbeetle/scripts/install_zig.bat +5 -1
  15. package/src/tigerbeetle/scripts/install_zig.sh +24 -14
  16. package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
  17. package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
  18. package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
  19. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
  20. package/src/tigerbeetle/src/benchmark.zig +4 -2
  21. package/src/tigerbeetle/src/benchmark_array_search.zig +3 -3
  22. package/src/tigerbeetle/src/c/tb_client/thread.zig +8 -9
  23. package/src/tigerbeetle/src/c/tb_client.h +100 -80
  24. package/src/tigerbeetle/src/c/tb_client.zig +4 -1
  25. package/src/tigerbeetle/src/cli.zig +1 -1
  26. package/src/tigerbeetle/src/config.zig +48 -16
  27. package/src/tigerbeetle/src/demo.zig +3 -1
  28. package/src/tigerbeetle/src/eytzinger_benchmark.zig +3 -3
  29. package/src/tigerbeetle/src/io/linux.zig +1 -1
  30. package/src/tigerbeetle/src/lsm/README.md +214 -0
  31. package/src/tigerbeetle/src/lsm/binary_search.zig +137 -10
  32. package/src/tigerbeetle/src/lsm/bloom_filter.zig +43 -0
  33. package/src/tigerbeetle/src/lsm/compaction.zig +352 -398
  34. package/src/tigerbeetle/src/lsm/composite_key.zig +2 -0
  35. package/src/tigerbeetle/src/lsm/eytzinger.zig +1 -1
  36. package/src/tigerbeetle/src/lsm/forest.zig +21 -447
  37. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +412 -0
  38. package/src/tigerbeetle/src/lsm/grid.zig +145 -69
  39. package/src/tigerbeetle/src/lsm/groove.zig +196 -133
  40. package/src/tigerbeetle/src/lsm/k_way_merge.zig +40 -18
  41. package/src/tigerbeetle/src/lsm/level_iterator.zig +28 -9
  42. package/src/tigerbeetle/src/lsm/manifest.zig +81 -181
  43. package/src/tigerbeetle/src/lsm/manifest_level.zig +210 -454
  44. package/src/tigerbeetle/src/lsm/manifest_log.zig +77 -28
  45. package/src/tigerbeetle/src/lsm/posted_groove.zig +64 -76
  46. package/src/tigerbeetle/src/lsm/segmented_array.zig +561 -241
  47. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
  48. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
  49. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +62 -12
  50. package/src/tigerbeetle/src/lsm/table.zig +83 -48
  51. package/src/tigerbeetle/src/lsm/table_immutable.zig +30 -23
  52. package/src/tigerbeetle/src/lsm/table_iterator.zig +25 -14
  53. package/src/tigerbeetle/src/lsm/table_mutable.zig +63 -12
  54. package/src/tigerbeetle/src/lsm/test.zig +49 -55
  55. package/src/tigerbeetle/src/lsm/tree.zig +407 -402
  56. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +457 -0
  57. package/src/tigerbeetle/src/main.zig +28 -6
  58. package/src/tigerbeetle/src/message_bus.zig +2 -2
  59. package/src/tigerbeetle/src/message_pool.zig +14 -17
  60. package/src/tigerbeetle/src/simulator.zig +145 -112
  61. package/src/tigerbeetle/src/state_machine.zig +338 -228
  62. package/src/tigerbeetle/src/static_allocator.zig +65 -0
  63. package/src/tigerbeetle/src/storage.zig +3 -7
  64. package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
  65. package/src/tigerbeetle/src/test/accounting/workload.zig +819 -0
  66. package/src/tigerbeetle/src/test/cluster.zig +18 -48
  67. package/src/tigerbeetle/src/test/conductor.zig +365 -0
  68. package/src/tigerbeetle/src/test/fuzz.zig +121 -0
  69. package/src/tigerbeetle/src/test/id.zig +89 -0
  70. package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
  71. package/src/tigerbeetle/src/test/state_checker.zig +93 -69
  72. package/src/tigerbeetle/src/test/state_machine.zig +11 -35
  73. package/src/tigerbeetle/src/test/storage.zig +29 -8
  74. package/src/tigerbeetle/src/tigerbeetle.zig +14 -16
  75. package/src/tigerbeetle/src/unit_tests.zig +7 -0
  76. package/src/tigerbeetle/src/vopr.zig +494 -0
  77. package/src/tigerbeetle/src/vopr_hub/README.md +58 -0
  78. package/src/tigerbeetle/src/vopr_hub/SETUP.md +199 -0
  79. package/src/tigerbeetle/src/vopr_hub/go.mod +3 -0
  80. package/src/tigerbeetle/src/vopr_hub/main.go +1022 -0
  81. package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +3 -0
  82. package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +403 -0
  83. package/src/tigerbeetle/src/vsr/client.zig +13 -0
  84. package/src/tigerbeetle/src/vsr/journal.zig +16 -13
  85. package/src/tigerbeetle/src/vsr/replica.zig +924 -491
  86. package/src/tigerbeetle/src/vsr/superblock.zig +55 -37
  87. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -10
  88. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +2 -2
  89. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +18 -3
  90. package/src/tigerbeetle/src/vsr.zig +75 -55
  91. package/src/tigerbeetle/scripts/vopr.bat +0 -48
  92. package/src/tigerbeetle/scripts/vopr.sh +0 -33
package/src/tigerbeetle/src/lsm/tree.zig
@@ -1,3 +1,4 @@
+ //! An LSM tree.
  const std = @import("std");
  const builtin = @import("builtin");
  const assert = std.debug.assert;
@@ -11,13 +12,11 @@ const config = @import("../config.zig");
  const div_ceil = @import("../util.zig").div_ceil;
  const eytzinger = @import("eytzinger.zig").eytzinger;
  const vsr = @import("../vsr.zig");
- const binary_search = @import("binary_search.zig");
  const bloom_filter = @import("bloom_filter.zig");

  const CompositeKey = @import("composite_key.zig").CompositeKey;
  const NodePool = @import("node_pool.zig").NodePool(config.lsm_manifest_node_size, 16);
  const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
- const SuperBlockType = vsr.SuperBlockType;

  /// We reserve maxInt(u64) to indicate that a table has not been deleted.
  /// Tables that have not been deleted have snapshot_max of maxInt(u64).
@@ -26,6 +25,8 @@ const SuperBlockType = vsr.SuperBlockType;
  /// to query all non-deleted tables.
  pub const snapshot_latest: u64 = math.maxInt(u64) - 1;

+ const half_bar_beat_count = @divExact(config.lsm_batch_multiple, 2);
+
  // StateMachine:
  //
  // /// state machine will pass this on to all object stores
@@ -35,7 +36,7 @@ pub const snapshot_latest: u64 = math.maxInt(u64) - 1;
  // /// write the ops in batch to the memtable/objcache, previously called commit()
  // pub fn write(batch) void
  //
- // /// Flush in memory state to disk, preform merges, etc
+ // /// Flush in memory state to disk, perform merges, etc
  // /// Only function that triggers Write I/O in LSMs, as well as some Read
  // /// Make as incremental as possible, don't block the main thread, avoid high latency/spikes
  // pub fn flush(callback) void
@@ -47,14 +48,14 @@ pub const snapshot_latest: u64 = math.maxInt(u64) - 1;
  // pub fn decode_superblock(buffer) void
  //

+ /// The maximum number of tables for a single tree.
  pub const table_count_max = table_count_max_for_tree(config.lsm_growth_factor, config.lsm_levels);

- pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name: []const u8) type {
- const Key = Table.Key;
- const Value = Table.Value;
- const compare_keys = Table.compare_keys;
- const tombstone = Table.tombstone;
- const tombstone_from_key = Table.tombstone_from_key;
+ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_name: []const u8) type {
+ const Key = TreeTable.Key;
+ const Value = TreeTable.Value;
+ const compare_keys = TreeTable.compare_keys;
+ const tombstone = TreeTable.tombstone;

  const tree_hash = blk: {
  // Blake3 hash does alot at comptime..
@@ -69,34 +70,25 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
69
70
  const Tree = @This();
70
71
 
71
72
  // Expose the Table & hash for the Groove.
72
- pub const TableType = Table;
73
+ pub const Table = TreeTable;
73
74
  pub const name = tree_name;
74
75
  pub const hash = tree_hash;
75
76
 
76
77
  const Grid = @import("grid.zig").GridType(Storage);
77
78
  const Manifest = @import("manifest.zig").ManifestType(Table, Storage);
78
- const TableMutable = @import("table_mutable.zig").TableMutableType(Table);
79
+ pub const TableMutable = @import("table_mutable.zig").TableMutableType(Table);
79
80
  const TableImmutable = @import("table_immutable.zig").TableImmutableType(Table);
80
81
 
81
82
  const CompactionType = @import("compaction.zig").CompactionType;
82
83
  const TableIteratorType = @import("table_iterator.zig").TableIteratorType;
83
84
  const TableImmutableIteratorType = @import("table_immutable.zig").TableImmutableIteratorType;
84
85
 
85
- pub const ValueCache = std.HashMapUnmanaged(Value, void, Table.HashMapContextValue, 70);
86
-
87
86
  const CompactionTable = CompactionType(Table, Storage, TableIteratorType);
88
87
  const CompactionTableImmutable = CompactionType(Table, Storage, TableImmutableIteratorType);
89
88
 
90
89
  grid: *Grid,
91
90
  options: Options,
92
91
 
93
- /// TODO(ifreund) Replace this with SetAssociativeCache:
94
- /// A set associative cache of values shared by trees with the same key/value sizes.
95
- /// This is used to accelerate point lookups and is not used for range queries.
96
- /// Secondary index trees used only for range queries can therefore set this to null.
97
- /// The value type will be []u8 and this will be shared by trees with the same value size.
98
- value_cache: ?*ValueCache,
99
-
100
92
  table_mutable: TableMutable,
101
93
  table_immutable: TableImmutable,
102
94
 
@@ -111,7 +103,26 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
111
103
  /// This means, that for odd lsm_levels, the last CompactionTable is unused.
112
104
  compaction_table: [@divFloor(config.lsm_levels, 2)]CompactionTable,
113
105
 
106
+ /// While a compaction is running, this is the op of the last compact().
107
+ /// While no compaction is running, this is the op of the last compact() to complete.
108
+ /// (When recovering from a checkpoint, compaction_op starts at op_checkpoint).
114
109
  compaction_op: u64,
110
+
111
+ /// The maximum snapshot which is safe to prefetch from.
112
+ /// The minimum snapshot which can see the mutable table.
113
+ ///
114
+ /// This field ensures that the tree never queries the output tables of a running
115
+ /// compaction; they are incomplete.
116
+ ///
117
+ /// See lookup_snapshot_max_for_checkpoint().
118
+ ///
119
+ /// Invariants:
120
+ /// * `lookup_snapshot_max = compaction_op` while any compaction beat is in progress.
121
+ /// * `lookup_snapshot_max = compaction_op + 1` after a compaction beat finishes.
122
+ /// * `lookup_snapshot_max ≥ op_checkpoint + 1 + lsm_batch_multiple`
123
+ /// when `op_checkpoint ≠ 0`.
124
+ lookup_snapshot_max: u64,
125
+
115
126
  compaction_io_pending: usize,
116
127
  compaction_callback: ?fn (*Tree) void,
117
128
 
@@ -120,22 +131,45 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
120
131
 
121
132
  pub const Options = struct {
122
133
  /// The maximum number of keys that may be committed per batch.
123
- commit_count_max: u32,
134
+ ///
135
+ /// In general, the commit count max for a field depends on the field's object —
136
+ /// how many objects might be inserted/updated/removed by a batch:
137
+ /// (config.message_size_max - sizeOf(vsr.header))
138
+ /// For example, there are at most 8191 transfers in a batch.
139
+ /// So commit_entries_max=8191 for transfer objects and indexes.
140
+ ///
141
+ /// However, if a transfer is ever mutated, then this will double commit_entries_max
142
+ /// since the old index might need to be removed, and the new index inserted.
143
+ ///
144
+ /// A way to see this is by looking at the state machine. If a transfer is inserted,
145
+ /// how many accounts and transfer put/removes will be generated?
146
+ ///
147
+ /// This also means looking at the state machine operation that will generate the
148
+ /// most put/removes in the worst case.
149
+ /// For example, create_accounts will put at most 8191 accounts.
150
+ /// However, create_transfers will put 2 accounts (8191 * 2) for every transfer, and
151
+ /// some of these accounts may exist, requiring a remove/put to update the index.
152
+ commit_entries_max: u32,
124
153
  };
125
154
 
126
155
  pub fn init(
127
156
  allocator: mem.Allocator,
128
157
  node_pool: *NodePool,
129
158
  grid: *Grid,
130
- value_cache: ?*ValueCache,
159
+ values_cache: ?*TableMutable.ValuesCache,
131
160
  options: Options,
132
161
  ) !Tree {
133
- var table_mutable = try TableMutable.init(allocator, options.commit_count_max);
162
+ assert(options.commit_entries_max > 0);
163
+ assert(grid.superblock.opened);
164
+
165
+ var table_mutable = try TableMutable.init(allocator, values_cache, options.commit_entries_max);
134
166
  errdefer table_mutable.deinit(allocator);
135
167
 
136
- var table_immutable = try TableImmutable.init(allocator, options.commit_count_max);
168
+ var table_immutable = try TableImmutable.init(allocator, options.commit_entries_max);
137
169
  errdefer table_immutable.deinit(allocator);
138
170
 
171
+ assert(table_immutable.value_count_max == table_mutable.value_count_max);
172
+
139
173
  var manifest = try Manifest.init(allocator, node_pool, grid, tree_hash);
140
174
  errdefer manifest.deinit(allocator);
141
175
 
@@ -149,16 +183,21 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
149
183
  }
150
184
  errdefer for (compaction_table) |*c| c.deinit(allocator);
151
185
 
186
+ // Compaction is one bar ahead of superblock's commit_min.
187
+ const op_checkpoint = grid.superblock.working.vsr_state.commit_min;
188
+ const lookup_snapshot_max = lookup_snapshot_max_for_checkpoint(op_checkpoint);
189
+ const compaction_op = op_checkpoint;
190
+
152
191
  return Tree{
153
192
  .grid = grid,
154
193
  .options = options,
155
- .value_cache = value_cache,
156
194
  .table_mutable = table_mutable,
157
195
  .table_immutable = table_immutable,
158
196
  .manifest = manifest,
159
197
  .compaction_table_immutable = compaction_table_immutable,
160
198
  .compaction_table = compaction_table,
161
- .compaction_op = 0,
199
+ .compaction_op = compaction_op,
200
+ .lookup_snapshot_max = lookup_snapshot_max,
162
201
  .compaction_io_pending = 0,
163
202
  .compaction_callback = null,
164
203
  .checkpoint_callback = null,
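
The 8191 figure in the new commit_entries_max documentation above comes from how many objects fit in one batch: the message body (message_size_max minus the vsr.Header) divided by the object size. A minimal sketch of that arithmetic, assuming a 1 MiB message_size_max and 128-byte header and Transfer sizes (defaults not shown in this diff):

const std = @import("std");

test "commit_entries_max: at most 8191 transfers per batch" {
    // Editorial sketch, not part of the package. The three sizes below are
    // assumptions; only the resulting 8191 appears in the diff above.
    const message_size_max: u32 = 1024 * 1024;
    const header_size: u32 = 128; // assumed @sizeOf(vsr.Header)
    const transfer_size: u32 = 128; // assumed @sizeOf(Transfer)

    const batch_body_max = message_size_max - header_size;
    const transfers_per_batch = @divFloor(batch_body_max, transfer_size);
    try std.testing.expectEqual(@as(u32, 8191), transfers_per_batch);
}

As the doc comment notes, a create_transfers batch can touch two accounts per transfer, so index trees may need to budget for 8191 * 2 entries.
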
@@ -176,15 +215,6 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
176
215
  tree.manifest.deinit(allocator);
177
216
  }
178
217
 
179
- /// Get a cached value/tombstone for the given key.
180
- /// Returns null if no value/tombstone for the given key is cached.
181
- pub fn get_cached(tree: *const Tree, key: Key) ?*const Value {
182
- const value = tree.table_mutable.get(key) orelse
183
- tree.value_cache.?.getKeyPtr(tombstone_from_key(key));
184
-
185
- return value;
186
- }
187
-
188
218
  pub fn put(tree: *Tree, value: *const Value) void {
189
219
  tree.table_mutable.put(value);
190
220
  }
@@ -193,33 +223,41 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
193
223
  tree.table_mutable.remove(value);
194
224
  }
195
225
 
196
- pub fn lookup(
226
+ /// Returns the value from the mutable or immutable table (possibly a tombstone),
227
+ /// if one is available for the specified snapshot.
228
+ pub fn lookup_from_memory(tree: *Tree, snapshot: u64, key: Key) ?*const Value {
229
+ assert(tree.lookup_snapshot_max >= snapshot);
230
+
231
+ if (tree.lookup_snapshot_max == snapshot) {
232
+ if (tree.table_mutable.get(key)) |value| return value;
233
+ } else {
234
+ // The mutable table is converted to an immutable table when a snapshot is created.
235
+ // This means that a past snapshot will never be able to see the mutable table.
236
+ // This simplifies the mutable table and eliminates compaction for duplicate puts.
237
+ }
238
+
239
+ if (!tree.table_immutable.free and tree.table_immutable.snapshot_min <= snapshot) {
240
+ if (tree.table_immutable.get(key)) |value| return value;
241
+ } else {
242
+ // If the immutable table is invisible, then the mutable table is also invisible.
243
+ assert(tree.table_immutable.free or snapshot != tree.lookup_snapshot_max);
244
+ }
245
+
246
+ return null;
247
+ }
248
+
249
+ /// Call this function only after checking `lookup_from_memory()`.
250
+ pub fn lookup_from_levels(
197
251
  tree: *Tree,
198
252
  callback: fn (*LookupContext, ?*const Value) void,
199
253
  context: *LookupContext,
200
254
  snapshot: u64,
201
255
  key: Key,
202
256
  ) void {
203
- assert(snapshot <= snapshot_latest);
204
- if (snapshot == snapshot_latest) {
205
- // The mutable table is converted to an immutable table when a snapshot is created.
206
- // This means that a snapshot will never be able to see the mutable table.
207
- // This simplifies the mutable table and eliminates compaction for duplicate puts.
208
- // The value cache is only used for the latest snapshot for simplicity.
209
- // Earlier snapshots will still be able to utilize the block cache.
210
- if (tree.table_mutable.get(key) orelse
211
- tree.value_cache.?.getKeyPtr(tombstone_from_key(key))) |value|
212
- {
213
- callback(context, unwrap_tombstone(value));
214
- return;
215
- }
216
- }
217
-
218
- if (!tree.table_immutable.free and tree.table_immutable.snapshot_min < snapshot) {
219
- if (tree.table_immutable.get(key)) |value| {
220
- callback(context, unwrap_tombstone(value));
221
- return;
222
- }
257
+ assert(tree.lookup_snapshot_max >= snapshot);
258
+ if (config.verify) {
259
+ // The caller is responsible for checking the mutable table.
260
+ assert(tree.lookup_from_memory(snapshot, key) == null);
223
261
  }
224
262
 
225
263
  var index_block_count: u8 = 0;
@@ -285,12 +323,6 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
285
323
 
286
324
  callback: fn (*Tree.LookupContext, ?*const Value) void,
287
325
 
288
- fn finish(context: *LookupContext, value: ?*const Value) void {
289
- const callback = context.callback;
290
- context.* = undefined;
291
- callback(context, value);
292
- }
293
-
294
326
  fn read_index_block(context: *LookupContext) void {
295
327
  assert(context.data_block == null);
296
328
  assert(context.index_block < context.index_block_count);
@@ -302,6 +334,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
302
334
  &context.completion,
303
335
  context.index_block_addresses[context.index_block],
304
336
  context.index_block_checksums[context.index_block],
337
+ .index,
305
338
  );
306
339
  }
307
340
 
@@ -324,6 +357,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
324
357
  completion,
325
358
  blocks.filter_block_address,
326
359
  blocks.filter_block_checksum,
360
+ .filter,
327
361
  );
328
362
  }
329
363
 
@@ -341,6 +375,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
341
375
  completion,
342
376
  context.data_block.?.address,
343
377
  context.data_block.?.checksum,
378
+ .data,
344
379
  );
345
380
  } else {
346
381
  // The key is not present in this table, check the next level.
@@ -356,7 +391,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
356
391
  assert(context.index_block_count <= config.lsm_levels);
357
392
 
358
393
  if (Table.data_block_search(data_block, context.key)) |value| {
359
- context.finish(unwrap_tombstone(value));
394
+ context.callback(context, unwrap_tombstone(value));
360
395
  } else {
361
396
  // The key is not present in this table, check the next level.
362
397
  context.advance_to_next_level();
@@ -371,7 +406,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
371
406
 
372
407
  context.index_block += 1;
373
408
  if (context.index_block == context.index_block_count) {
374
- context.finish(null);
409
+ context.callback(context, null);
375
410
  return;
376
411
  }
377
412
  assert(context.index_block < context.index_block_count);
@@ -384,7 +419,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
384
419
  /// Returns null if the value is null or a tombstone, otherwise returns the value.
385
420
  /// We use tombstone values internally, but expose them as null to the user.
386
421
  /// This distinction enables us to cache a null result as a tombstone in our hash maps.
387
- inline fn unwrap_tombstone(value: ?*const Value) ?*const Value {
422
+ pub inline fn unwrap_tombstone(value: ?*const Value) ?*const Value {
388
423
  return if (value == null or tombstone(value.?)) null else value.?;
389
424
  }
390
425
 
@@ -404,53 +439,6 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
404
439
  callback(tree);
405
440
  }
406
441
 
407
- // Tree compaction runs to the sound of music!
408
- //
409
- // Compacting LSM trees involves merging and moving tables into the next levels as needed.
410
- // To avoid write amplification stalls and bound latency, compaction is done incrementally.
411
- //
412
- // A full compaction phase is denoted as a bar or measure, using terms from music notation.
413
- // Each measure consists of `lsm_batch_multiple` beats or "compaction ticks" of work.
414
- // A compaction beat is started asynchronously with `compact_io` which takes a callback.
415
- // After `compact_io` is called, `compact_cpu` should be called to enable pipelining.
416
- // The compaction beat completes when the `compact_io` callback is invoked.
417
- //
418
- // A measure is split in half according to the "first" down beat and "middle" down beat.
419
- // The first half of the measure compacts even levels while the latter compacts odd levels.
420
- // Mutable table changes are sorted and compacted into the immutable table.
421
- // The immutable table is compacted into level 0 during the odd level half of the measure.
422
- //
423
- // At any given point, there's only levels/2 max compactions happening concurrently.
424
- // The source level is denoted as `level_a` with the target level being `level_b`.
425
- // The last level in the LSM tree has no target level so it's not compaction-from.
426
- //
427
- // Assuming a measure/`lsm_batch_multiple` of 4, the invariants can be described as follows:
428
- // * assert: at the end of every beat, there's space in mutable table for the next beat.
429
- // * manifest info for the tables compacted are updating during the compaction.
430
- // * manifest is compacted at the end of every beat.
431
- //
432
- // - (first) down beat of the measure:
433
- // * assert: no compactions are currently running.
434
- // * compact immutable table if contains any sorted values (could be empty).
435
- // * allow level visible table counts to overflow if needed.
436
- // * start even level compactions if there's any tables to compact.
437
- //
438
- // - (second) up beat of the measure:
439
- // * finish ticking running even-level compactions.
440
- // * assert: on callback completion, all compactions should be completed.
441
- //
442
- // - (third) down beat of the measure:
443
- // * assert: no compactions are currently running.
444
- // * start odd level and immutable table compactions.
445
- //
446
- // - (fourth) last beat of the measure:
447
- // * finish ticking running odd-level and immutable table compactions.
448
- // * assert: on callback completion, all compactions should be completed.
449
- // * assert: on callback completion, all level visible table counts shouldn't overflow.
450
- // * flush, clear, and sort mutable table values into immutable table for next measure.
451
-
452
- const half_measure_beat_count = @divExact(config.lsm_batch_multiple, 2);
453
-
454
442
  const CompactionTableContext = struct {
455
443
  compaction: *CompactionTable,
456
444
  level_a: u8,
@@ -465,7 +453,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
465
453
  assert(it.tree.compaction_callback != null);
466
454
 
467
455
  const compaction_beat = it.tree.compaction_op % config.lsm_batch_multiple;
468
- const even_levels = compaction_beat < half_measure_beat_count;
456
+ const even_levels = compaction_beat < half_bar_beat_count;
469
457
  const level_a = (it.index * 2) + @boolToInt(!even_levels);
470
458
  const level_b = level_a + 1;
471
459
 
@@ -489,77 +477,93 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
489
477
  /// This order (even levels, then odd levels) is significant, since it reduces the number of
490
478
  /// level 0 tables that overlap with the immutable table, reducing write amplification.
491
479
  ///
492
- /// We therefore take the measure, during which all compactions run, and divide by two,
493
- /// running the compactions from even levels in the first half measure, and then the odd.
480
+ /// We therefore take the bar, during which all compactions run, and divide by two,
481
+ /// running the compactions from even levels in the first half bar, and then the odd.
494
482
  ///
495
- /// Compactions start on the down beat of a half measure, using 0-based beats.
496
- /// For example, if there are 4 beats in a measure, start on beat 0 or beat 2.
483
+ /// Compactions start on the down beat of a half bar, using 0-based beats.
484
+ /// For example, if there are 4 beats in a bar, start on beat 0 or beat 2.
497
485
  pub fn compact(tree: *Tree, callback: fn (*Tree) void, op: u64) void {
498
- tree.compact_start(callback, op);
499
- tree.compact_drive();
500
- }
486
+ assert(tree.compaction_callback == null);
487
+ assert(op != 0);
488
+ assert(op == tree.compaction_op + 1);
489
+ assert(op > tree.grid.superblock.working.vsr_state.commit_min);
501
490
 
502
- fn compact_drive(tree: *Tree) void {
503
- tree.compact_io();
504
- // tree.manifest.manifest_log.superblock.storage.tick();
505
- tree.compact_cpu();
491
+ tree.compaction_op = op;
492
+
493
+ if (tree.grid.superblock.working.vsr_state.op_compacted(op)) {
494
+ // We recovered from a checkpoint, and must avoid replaying one bar of
495
+ // compactions that were applied before the checkpoint. Repeating these ops'
496
+ // compactions would actually perform different compactions than before,
497
+ // causing the storage state of the replica to diverge from the cluster.
498
+ // See also: lookup_snapshot_max_for_checkpoint().
499
+
500
+ if (tree.compaction_op + 1 == tree.lookup_snapshot_max) {
501
+ // This is the last op of the skipped compaction bar.
502
+ // Prepare the immutable table for the next bar — since this state is
503
+ // in-memory, it cannot be skipped.
504
+ tree.compact_mutable_table_into_immutable();
505
+ }
506
+
507
+ // TODO Defer this callback until tick() to avoid stack growth.
508
+ callback(tree);
509
+ return;
510
+ }
511
+ assert(op == tree.lookup_snapshot_max);
512
+
513
+ tree.compact_start(callback);
514
+ tree.compact_drive();
506
515
  }
507
516
 
508
- fn compact_start(tree: *Tree, callback: fn (*Tree) void, op: u64) void {
517
+ fn compact_start(tree: *Tree, callback: fn (*Tree) void) void {
509
518
  assert(tree.compaction_io_pending == 0);
510
519
  assert(tree.compaction_callback == null);
511
-
512
- if (op > 0) assert(op > tree.compaction_op);
513
- tree.compaction_op = op;
520
+
514
521
  tree.compaction_callback = callback;
515
522
 
516
523
  const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
517
524
  const start = (compaction_beat == 0) or
518
- (compaction_beat == half_measure_beat_count);
525
+ (compaction_beat == half_bar_beat_count);
519
526
 
520
- // The target snapshot of a compaction is actually the previous batch minus one.
521
- //
522
- // At the start of the current batch, mutable table inserts from the previous batch
523
- // would be in the immutable table. This means the current batch compaction will
524
- // actually be flushing to disk (levels) mutable table updates from the previous batch.
525
- //
526
- // -1 as the ops are zero based so the "last" op from previous batch is reflected.
527
- const snapshot = std.mem.alignBackward(op, config.lsm_batch_multiple) -| 1;
528
- assert(snapshot != snapshot_latest);
527
+ const op_min = compaction_op_min(tree.compaction_op);
528
+ assert(op_min < snapshot_latest);
529
+ assert(op_min % half_bar_beat_count == 0);
529
530
 
530
- log.debug(tree_name ++ ": compact_start: op={d} snapshot={d} beat={d}/{d}", .{
531
- op,
532
- snapshot,
531
+ log.debug(tree_name ++ ": compact_start: op={d} op_min={d} beat={d}/{d}", .{
532
+ tree.compaction_op,
533
+ op_min,
533
534
  compaction_beat + 1,
534
535
  config.lsm_batch_multiple,
535
536
  });
536
537
 
537
538
  // Try to start compacting the immutable table.
538
- const even_levels = compaction_beat < half_measure_beat_count;
539
+ const even_levels = compaction_beat < half_bar_beat_count;
539
540
  if (even_levels) {
540
541
  assert(tree.compaction_table_immutable.status == .idle);
541
542
  } else {
542
- if (start) tree.compact_io_start_table_immutable(snapshot);
543
+ if (start) tree.compact_start_table_immutable(op_min);
543
544
  }
544
545
 
545
546
  // Try to start compacting the other levels.
546
547
  var it = CompactionTableIterator{ .tree = tree };
547
548
  while (it.next()) |context| {
548
- if (start) tree.compact_io_start_table(snapshot, context);
549
+ if (start) tree.compact_start_table(op_min, context);
549
550
  }
550
551
  }
551
552
 
552
- fn compact_io_start_table_immutable(tree: *Tree, snapshot: u64) void {
553
+ fn compact_start_table_immutable(tree: *Tree, op_min: u64) void {
553
554
  const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
554
- assert(compaction_beat == half_measure_beat_count);
555
+ assert(compaction_beat == half_bar_beat_count);
555
556
 
556
557
  // Do not start compaction if the immutable table does not require compaction.
557
558
  if (tree.table_immutable.free) return;
558
559
 
560
+ assert(tree.table_immutable.snapshot_min % half_bar_beat_count == 0);
561
+
559
562
  const values_count = tree.table_immutable.values.len;
560
563
  assert(values_count > 0);
561
564
 
562
565
  const level_b: u8 = 0;
566
+ const table_a: ?*const Manifest.TableInfo = null;
563
567
  const range = tree.manifest.compaction_range(
564
568
  level_b,
565
569
  tree.table_immutable.key_min(),
@@ -572,24 +576,25 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
572
576
 
573
577
  log.debug(tree_name ++
574
578
  ": compacting immutable table to level 0 " ++
575
- "(values.len={d} snapshot_min={d} compaction.snapshot={d} table_count={d})", .{
579
+ "(values.len={d} snapshot_min={d} compaction.op_min={d} table_count={d})", .{
576
580
  tree.table_immutable.values.len,
577
581
  tree.table_immutable.snapshot_min,
578
- snapshot,
582
+ op_min,
579
583
  range.table_count,
580
584
  });
581
585
 
582
586
  tree.compaction_table_immutable.start(
583
587
  tree.grid,
584
588
  &tree.manifest,
585
- level_b,
589
+ op_min,
586
590
  range,
587
- snapshot,
591
+ table_a,
592
+ level_b,
588
593
  .{ .table = &tree.table_immutable },
589
594
  );
590
595
  }
591
596
 
592
- fn compact_io_start_table(tree: *Tree, snapshot: u64, context: CompactionTableContext) void {
597
+ fn compact_start_table(tree: *Tree, op_min: u64, context: CompactionTableContext) void {
593
598
  assert(context.level_a < config.lsm_levels);
594
599
  assert(context.level_b < config.lsm_levels);
595
600
  assert(context.level_a + 1 == context.level_b);
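
To make the even/odd half-bar split above concrete: the CompactionTableIterator maps each compaction slot to a source level based on the current beat. A small sketch of that mapping (the helper name level_a_for is ours, and lsm_batch_multiple = 4 is an assumed example value):

const std = @import("std");

// Editorial sketch, not part of the package.
const lsm_batch_multiple = 4; // assumed example value
const half_bar_beat_count = lsm_batch_multiple / 2;

fn level_a_for(slot: u8, compaction_beat: u64) u8 {
    const even_levels = compaction_beat < half_bar_beat_count;
    return (slot * 2) + @boolToInt(!even_levels);
}

test "even levels compact in the first half-bar, odd levels in the second" {
    // Beats 0 and 1: compact levels 0, 2, 4 ... into their successors.
    try std.testing.expectEqual(@as(u8, 0), level_a_for(0, 0));
    try std.testing.expectEqual(@as(u8, 2), level_a_for(1, 1));
    // Beats 2 and 3: compact levels 1, 3, 5 ... (plus the immutable table into level 0).
    try std.testing.expectEqual(@as(u8, 1), level_a_for(0, 2));
    try std.testing.expectEqual(@as(u8, 3), level_a_for(1, 3));
}
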
@@ -613,9 +618,10 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
613
618
  context.compaction.start(
614
619
  tree.grid,
615
620
  &tree.manifest,
616
- context.level_b,
621
+ op_min,
617
622
  table_range.range,
618
- snapshot,
623
+ table_range.table,
624
+ context.level_b,
619
625
  .{
620
626
  .grid = tree.grid,
621
627
  .address = table.address,
@@ -624,72 +630,70 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
624
630
  );
625
631
  }
626
632
 
627
- fn compact_io(tree: *Tree) void {
633
+ fn compact_drive(tree: *Tree) void {
628
634
  assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
629
635
  assert(tree.compaction_callback != null);
630
636
 
631
- // Try to tick the cpu portion of the immutable table compaction:
637
+ // Always start one fake io_pending that is resolved right after
638
+ // to handle the case where this compaction tick triggers no IO.
639
+ // (For example, ticking the immutable table, or level B is already done).
640
+ tree.compaction_io_pending += 1;
641
+ defer tree.compact_tick_done();
642
+
643
+ // Try to tick the immutable table compaction:
632
644
  const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
633
- const even_levels = compaction_beat < half_measure_beat_count;
645
+ const even_levels = compaction_beat < half_bar_beat_count;
634
646
  if (even_levels) {
635
647
  assert(tree.compaction_table_immutable.status == .idle);
636
648
  } else {
637
- if (tree.compaction_table_immutable.status == .compacting) {
638
- tree.compact_io_tick(&tree.compaction_table_immutable);
639
- }
649
+ tree.compact_tick(&tree.compaction_table_immutable);
640
650
  }
641
651
 
642
- // Try to tick the cpu portion of the level compactions:
652
+ // Try to tick the compaction for each level:
643
653
  var it = CompactionTableIterator{ .tree = tree };
644
654
  while (it.next()) |context| {
645
- if (context.compaction.status == .compacting) {
646
- assert(context.compaction.level_b == context.level_b);
647
- tree.compact_io_tick(context.compaction);
648
- }
655
+ tree.compact_tick(context.compaction);
649
656
  }
650
-
651
- // Always start one io_pending that is resolved in compact_cpu()
652
- // to handle the case of no level or immutable table being selected for compaction
653
- tree.compaction_io_pending += 1;
654
- assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
655
657
  }
656
658
 
657
- fn compact_io_tick(tree: *Tree, compaction: anytype) void {
659
+ fn compact_tick(tree: *Tree, compaction: anytype) void {
660
+ if (compaction.status != .processing) return;
658
661
  tree.compaction_io_pending += 1;
659
662
 
660
663
  const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
661
- const even_levels = compaction_beat < half_measure_beat_count;
664
+ const even_levels = compaction_beat < half_bar_beat_count;
662
665
  assert(compaction.level_b < config.lsm_levels);
663
666
  assert(compaction.level_b % 2 == @boolToInt(even_levels));
664
667
 
665
668
  if (@TypeOf(compaction.*) == CompactionTableImmutable) {
666
669
  assert(compaction.level_b == 0);
667
- compaction.tick_io(Tree.compact_io_tick_callback_table_immutable);
668
- log.debug(tree_name ++ ": queued compaction for immutable table to level 0", .{});
670
+ log.debug(tree_name ++ ": compact_tick() for immutable table to level 0", .{});
671
+ compaction.compact_tick(Tree.compact_tick_callback_table_immutable);
669
672
  } else {
670
- compaction.tick_io(Tree.compact_io_tick_callback_table);
671
- log.debug(tree_name ++ ": queued compaction for level {d} to level {d}", .{
673
+ assert(@TypeOf(compaction.*) == CompactionTable);
674
+ log.debug(tree_name ++ ": compact_tick() for level {d} to level {d}", .{
672
675
  compaction.level_b - 1,
673
676
  compaction.level_b,
674
677
  });
678
+ compaction.compact_tick(Tree.compact_tick_callback_table);
675
679
  }
676
680
  }
677
681
 
678
- fn compact_io_tick_callback_table_immutable(compaction: *CompactionTableImmutable) void {
679
- assert(compaction.status == .compacting or compaction.status == .done);
682
+ fn compact_tick_callback_table_immutable(compaction: *CompactionTableImmutable) void {
683
+ assert(compaction.status == .processing or compaction.status == .done);
680
684
  assert(compaction.level_b < config.lsm_levels);
681
685
  assert(compaction.level_b == 0);
682
686
 
683
687
  const tree = @fieldParentPtr(Tree, "compaction_table_immutable", compaction);
684
688
  const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
685
- assert(compaction_beat >= half_measure_beat_count);
689
+ assert(compaction_beat >= half_bar_beat_count);
686
690
 
687
- log.debug(tree_name ++ ": compact_io complete for immutable table to level 0", .{});
688
- tree.compact_io_tick_done();
691
+ log.debug(tree_name ++ ": compact_tick() complete for immutable table to level 0", .{});
692
+ tree.compact_tick_done();
689
693
  }
690
694
 
691
- fn compact_io_tick_callback_table(compaction: *CompactionTable) void {
692
- assert(compaction.status == .compacting or compaction.status == .done);
695
+ fn compact_tick_callback_table(compaction: *CompactionTable) void {
696
+ assert(compaction.status == .processing or compaction.status == .done);
693
697
  assert(compaction.level_b < config.lsm_levels);
694
698
  assert(compaction.level_b > 0);
695
699
 
@@ -699,182 +703,201 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
699
703
  const table_size = @divFloor(config.lsm_levels, 2);
700
704
  const table: *[table_size]CompactionTable = table_ptr[0..table_size];
701
705
 
702
- const tree = @fieldParentPtr(Tree, "compaction_table", table);
703
- log.debug(tree_name ++ ": compact_io complete for level {d} to level {d}", .{
706
+ log.debug(tree_name ++ ": compact_tick() complete for level {d} to level {d}", .{
704
707
  compaction.level_b - 1,
705
708
  compaction.level_b,
706
709
  });
707
710
 
708
- tree.compact_io_tick_done();
711
+ const tree = @fieldParentPtr(Tree, "compaction_table", table);
712
+ tree.compact_tick_done();
709
713
  }
710
714
 
711
- fn compact_io_tick_done(tree: *Tree) void {
715
+ fn compact_tick_done(tree: *Tree) void {
712
716
  assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
713
717
  assert(tree.compaction_callback != null);
714
718
 
715
- // compact_done() is called after all compact_io_tick()'s complete.
716
- // This function can be triggered asynchronously or by compact_cpu() below.
719
+ // compact_done() is called after all compact_tick()'s complete.
717
720
  tree.compaction_io_pending -= 1;
718
721
  if (tree.compaction_io_pending == 0) tree.compact_done();
719
722
  }
720
723
 
721
- fn compact_cpu(tree: *Tree) void {
722
- assert(tree.compaction_io_pending <= 2 + tree.compaction_table.len);
724
+ /// Called at the end of each compaction tick.
725
+ fn compact_done(tree: *Tree) void {
726
+ assert(tree.compaction_io_pending == 0);
723
727
  assert(tree.compaction_callback != null);
728
+ assert(tree.compaction_op == tree.lookup_snapshot_max);
724
729
 
725
- // Try to tick the cpu portion of the immutable table compaction:
726
730
  const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
727
- const even_levels = compaction_beat < half_measure_beat_count;
728
- if (even_levels) {
729
- assert(tree.compaction_table_immutable.status == .idle);
730
- } else {
731
- if (tree.compaction_table_immutable.status == .compacting) {
732
- tree.compaction_table_immutable.tick_cpu();
733
- }
731
+ const even_levels = compaction_beat < half_bar_beat_count;
732
+ const compacted_levels_even = compaction_beat == half_bar_beat_count - 1;
733
+ const compacted_levels_odd = compaction_beat == config.lsm_batch_multiple - 1;
734
+ if (!compacted_levels_even and !compacted_levels_odd) {
735
+ // TODO(Deterministic Beats): Remove this when compact_done() is called exactly
736
+ // once when the beat finishes.
737
+ tree.lookup_snapshot_max = tree.compaction_op + 1;
738
+
739
+ tree.compact_finish();
740
+ return;
734
741
  }
735
742
 
736
- // Try to tick the cpu portion of the level compactions:
737
- var it = CompactionTableIterator{ .tree = tree };
738
- while (it.next()) |context| {
739
- if (context.compaction.status == .compacting) {
740
- assert(context.compaction.level_b == context.level_b);
741
- context.compaction.tick_cpu();
743
+ // At the end of the second and fourth beat:
744
+ // 1. Tick the Compactions until all have completed.
745
+ // 2. Remove invisible tables from the manifest.
746
+ // 3. Compact the manifest.
747
+ // Then at the end of the fourth beat, freeze the mutable table.
748
+ assert(compacted_levels_even or compacted_levels_odd);
749
+ assert(compacted_levels_even != compacted_levels_odd);
750
+
751
+ const still_compacting = blk: {
752
+ if (even_levels) {
753
+ assert(tree.compaction_table_immutable.status == .idle);
754
+ } else {
755
+ if (tree.compaction_table_immutable.status == .processing) break :blk true;
742
756
  }
743
- }
744
757
 
745
- // Resolve the io_pending added by compact_io(). This may trigger compact_done().
746
- tree.compact_io_tick_done();
747
- }
748
-
749
- fn compact_done(tree: *Tree) void {
750
- assert(tree.compaction_io_pending == 0);
751
- assert(tree.compaction_callback != null);
758
+ var it = CompactionTableIterator{ .tree = tree };
759
+ while (it.next()) |context| {
760
+ if (context.compaction.status == .processing) break :blk true;
761
+ }
762
+ break :blk false;
763
+ };
752
764
 
753
- var still_compacting = false;
765
+ if (still_compacting) {
766
+ // We are at the end of a half-bar, but the compactions have not finished.
767
+ // We keep ticking them until they finish.
768
+ log.debug(tree_name ++ ": compact_done: driving outstanding compactions", .{});
769
+ tree.compact_drive();
770
+ return;
771
+ }
754
772
 
755
- // Mark immutable compaction that reported done in their callback as "completed".
756
- const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
757
- const even_levels = compaction_beat < half_measure_beat_count;
758
- if (even_levels) {
759
- assert(tree.compaction_table_immutable.status == .idle);
760
- } else {
761
- if (tree.compaction_table_immutable.status == .done) {
762
- tree.compaction_table_immutable.reset();
763
- tree.table_immutable.clear();
764
- } else if (tree.compaction_table_immutable.status == .compacting) {
765
- still_compacting = true;
773
+ // TODO(Deterministic Beats): Move this to the top of the function when compact_done()
774
+ // is called exactly once when the beat finishes.
775
+ tree.lookup_snapshot_max = tree.compaction_op + 1;
776
+
777
+ // All compactions have finished for the current half-bar.
778
+ // We couldn't remove the (invisible) input tables until now because prefetch()
779
+ // needs a complete set of tables for lookups to avoid missing data.
780
+
781
+ // Reset the immutable table Compaction.
782
+ // Also clear any tables made invisible by the compaction.
783
+ if (!even_levels) {
784
+ switch (tree.compaction_table_immutable.status) {
785
+ // The compaction wasn't started for this half bar.
786
+ .idle => assert(tree.table_immutable.free),
787
+ .processing => unreachable,
788
+ .done => {
789
+ tree.compaction_table_immutable.reset();
790
+ tree.table_immutable.clear();
791
+ tree.manifest.remove_invisible_tables(
792
+ tree.compaction_table_immutable.level_b,
793
+ tree.lookup_snapshot_max,
794
+ tree.compaction_table_immutable.range.key_min,
795
+ tree.compaction_table_immutable.range.key_max,
796
+ );
797
+ },
766
798
  }
767
799
  }
768
800
 
769
- // Mark compactions that reported done in their callback as "completed" (done = null).
801
+ // Reset all the other Compactions.
802
+ // Also clear any tables made invisible by the compactions.
770
803
  var it = CompactionTableIterator{ .tree = tree };
771
804
  while (it.next()) |context| {
772
- if (context.compaction.status == .done) {
773
- assert(context.compaction.level_b == context.level_b);
774
- context.compaction.reset();
775
- } else if (context.compaction.status == .compacting) {
776
- still_compacting = true;
805
+ switch (context.compaction.status) {
806
+ .idle => {}, // The compaction wasn't started for this half bar.
807
+ .processing => unreachable,
808
+ .done => {
809
+ context.compaction.reset();
810
+ tree.manifest.remove_invisible_tables(
811
+ context.compaction.level_b,
812
+ tree.lookup_snapshot_max,
813
+ context.compaction.range.key_min,
814
+ context.compaction.range.key_max,
815
+ );
816
+ if (context.compaction.level_b > 0) {
817
+ tree.manifest.remove_invisible_tables(
818
+ context.compaction.level_b - 1,
819
+ tree.lookup_snapshot_max,
820
+ context.compaction.range.key_min,
821
+ context.compaction.range.key_max,
822
+ );
823
+ }
824
+ },
777
825
  }
778
826
  }
779
827
 
780
- // At the end of every beat, ensure mutable table can be flushed to immutable table.
781
- assert(tree.table_mutable.can_commit_batch(tree.options.commit_count_max));
782
-
783
- // At end of second/half measure:
784
- // - assert: even compactions from previous tick are finished.
785
- // - remove tables made invisible during compaction of even levels.
786
- if (compaction_beat == half_measure_beat_count - 1) {
787
- if (still_compacting) {
788
- log.debug(tree_name ++ ": compact_done: driving outstanding compactions", .{});
789
- return tree.compact_drive();
790
- }
791
-
792
- log.debug(tree_name ++ ": compact_done: compacted even levels", .{});
793
-
794
- it = CompactionTableIterator{ .tree = tree };
795
- while (it.next()) |context| {
796
- assert(context.compaction.status == .idle);
797
- tree.manifest.remove_invisible_tables(
798
- context.level_a,
799
- context.compaction.snapshot,
800
- context.compaction.range.key_min,
801
- context.compaction.range.key_max,
802
- );
803
- }
828
+ assert(tree.compaction_table_immutable.status == .idle);
829
+ it = CompactionTableIterator{ .tree = tree };
830
+ while (it.next()) |context| {
831
+ assert(context.compaction.status == .idle);
804
832
  }
805
833
 
806
- // At end of fourth/last measure:
807
- // - assert: immutable table and odd level compactions from previous tick are finished.
808
- // - remove tables made invisible during compaction of odd levels.
809
- // - assert: all visible levels haven't overflowed their max.
810
- // - convert mutable table to immutable tables for next measure.
811
- if (compaction_beat == config.lsm_batch_multiple - 1) {
812
- if (still_compacting) {
813
- log.debug(tree_name ++ ": compact_done: driving outstanding compactions", .{});
814
- return tree.compact_drive();
815
- }
816
-
817
- // TODO Make log message more accurate according to what was compacted.
818
- log.debug(tree_name ++ ": compact_done: compacted immutable table and odd levels", .{});
819
-
820
- assert(tree.compaction_table_immutable.status == .idle);
821
- it = CompactionTableIterator{ .tree = tree };
822
- while (it.next()) |context| {
823
- assert(context.compaction.status == .idle);
824
- tree.manifest.remove_invisible_tables(
825
- context.level_a,
826
- context.compaction.snapshot,
827
- context.compaction.range.key_min,
828
- context.compaction.range.key_max,
829
- );
830
- }
831
-
834
+ // At the end of the fourth/last beat:
835
+ // - Assert all visible tables haven't overflowed their max per level.
836
+ // - Convert mutable table to immutable table for next bar.
837
+ if (compacted_levels_odd) {
832
838
  tree.manifest.assert_level_table_counts();
833
839
  tree.compact_mutable_table_into_immutable();
834
840
  }
835
841
 
836
- // At the end of every beat, call manifest.compact before invoking the compact callback.
842
+ // At the end of the second/fourth beat:
843
+ // - Compact the manifest before invoking the compact() callback.
837
844
  tree.manifest.compact(compact_manifest_callback);
838
845
  }
839
846
 
840
- fn compact_manifest_callback(manifest: *Manifest) void {
841
- const tree = @fieldParentPtr(Tree, "manifest", manifest);
842
- assert(tree.compaction_io_pending == 0);
843
- assert(tree.compaction_callback != null);
844
-
845
- // Invoke the compact_io() callback after the manifest compacts at the end of the beat.
846
- const callback = tree.compaction_callback.?;
847
- tree.compaction_callback = null;
848
- callback(tree);
849
- }
850
-
847
+ /// Called after the last beat of a full compaction bar.
851
848
  fn compact_mutable_table_into_immutable(tree: *Tree) void {
852
- // Ensure mutable table can be flushed into immutable table.
853
- if (tree.table_mutable.count() == 0) return;
854
849
  assert(tree.table_immutable.free);
850
+ assert((tree.compaction_op + 1) % config.lsm_batch_multiple == 0);
851
+ assert(tree.compaction_op + 1 == tree.lookup_snapshot_max);
852
+
853
+ if (tree.table_mutable.count() == 0) return;
855
854
 
856
855
  // Sort the mutable table values directly into the immutable table's array.
857
856
  const values_max = tree.table_immutable.values_max();
858
857
  const values = tree.table_mutable.sort_into_values_and_clear(values_max);
859
858
  assert(values.ptr == values_max.ptr);
860
859
 
861
- // Take a manifest snapshot and setup the immutable table with the sorted values.
862
- const snapshot_min = tree.compaction_table_immutable.snapshot;
863
- tree.table_immutable.reset_with_sorted_values(snapshot_min, values);
860
+ // The immutable table must be visible to the next bar setting its snapshot_min to
861
+ // lookup_snapshot_max guarantees.
862
+ //
863
+ // In addition, the immutable table is conceptually an output table of this compaction
864
+ // bar, and now its snapshot_min matches the snapshot_min of the Compactions' output
865
+ // tables.
866
+ tree.table_immutable.reset_with_sorted_values(tree.lookup_snapshot_max, values);
864
867
 
865
868
  assert(tree.table_mutable.count() == 0);
866
869
  assert(!tree.table_immutable.free);
867
870
  }
868
871
 
872
+ fn compact_manifest_callback(manifest: *Manifest) void {
873
+ const tree = @fieldParentPtr(Tree, "manifest", manifest);
874
+ assert(tree.compaction_io_pending == 0);
875
+ assert(tree.compaction_callback != null);
876
+ tree.compact_finish();
877
+ }
878
+
879
+ /// Called at the end of each compaction beat.
880
+ fn compact_finish(tree: *Tree) void {
881
+ assert(tree.compaction_io_pending == 0);
882
+ assert(tree.table_mutable.can_commit_batch(tree.options.commit_entries_max));
883
+
884
+ // Invoke the compact() callback after the manifest compacts at the end of the beat.
885
+ const callback = tree.compaction_callback.?;
886
+ tree.compaction_callback = null;
887
+ callback(tree);
888
+ }
889
+
869
890
  pub fn checkpoint(tree: *Tree, callback: fn (*Tree) void) void {
870
- // Assert no outstanding compact_io() work..
891
+ // Assert no outstanding compact_tick() work..
871
892
  assert(tree.compaction_io_pending == 0);
872
893
  assert(tree.compaction_callback == null);
894
+ assert(tree.compaction_op > 0);
895
+ assert(tree.compaction_op + 1 == tree.lookup_snapshot_max);
873
896
 
874
- // Avoid checkpointing if this is not the last beat in the compaction measure.
897
+ // Assert that this is the last beat in the compaction bar.
875
898
  const compaction_beat = tree.compaction_op % config.lsm_batch_multiple;
876
- const last_beat_in_measure = config.lsm_batch_multiple - 1;
877
- if (compaction_beat != last_beat_in_measure) return callback(tree);
899
+ const last_beat_in_bar = config.lsm_batch_multiple - 1;
900
+ assert(last_beat_in_bar == compaction_beat);
878
901
 
879
902
  // Assert no outstanding compactions.
880
903
  assert(tree.compaction_table_immutable.status == .idle);
@@ -887,7 +910,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
887
910
 
888
911
  // Assert that we're checkpointing only after invisible tables have been removed.
889
912
  if (config.verify) {
890
- tree.manifest.assert_no_invisible_tables(tree.compaction_op);
913
+ tree.manifest.assert_no_invisible_tables(compaction_op_min(tree.compaction_op));
891
914
  }
892
915
 
893
916
  // Start an asynchronous checkpoint on the manifest.
@@ -932,7 +955,7 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
932
955
  pub fn range_query(
933
956
  tree: *Tree,
934
957
  /// The snapshot timestamp, if any
935
- snapshot: u64,
958
+ snapshot: ?u64,
936
959
  query: RangeQuery,
937
960
  ) RangeQueryIterator {
938
961
  _ = tree;
@@ -942,6 +965,87 @@ pub fn TreeType(comptime Table: type, comptime Storage: type, comptime tree_name
942
965
  };
943
966
  }
944
967
 
968
+ /// Returns the first op of the compaction (Compaction.op_min) for a given op/beat.
969
+ ///
970
+ /// After this compaction finishes:
971
+ /// - `op_min + half_bar_beat_count - 1` will be the input tables' snapshot_max.
972
+ /// - `op_min + half_bar_beat_count` will be the output tables' snapshot_min.
973
+ ///
974
+ /// Each half-bar has a separate op_min (for deriving the output snapshot_min) instead of each full
975
+ /// bar because this allows the output tables of the first half-bar's compaction to be prefetched
976
+ /// against earlier — hopefully while they are still warm in the cache from being written.
977
+ pub fn compaction_op_min(op: u64) u64 {
978
+ return op - op % half_bar_beat_count;
979
+ }
980
+
981
+ /// These charts depict the commit/compact ops and `lookup_snapshot_max` over a series of
982
+ /// commits and compactions (with lsm_batch_multiple=8).
983
+ ///
984
+ /// Legend:
985
+ ///
986
+ /// ┼ full bar (first half-bar start)
987
+ /// ┬ half bar (second half-bar start)
988
+ /// $ lookup_snapshot_max (prefetch reads from the current snapshot)
989
+ /// This is incremented at the end of each compact().
990
+ /// . op is in mutable table (in memory)
991
+ /// , op is in immutable table (in memory)
992
+ /// # op is on disk
993
+ /// ✓ checkpoint() may follow compact()
994
+ ///
995
+ /// 0 2 4 6 8 0 2 4 6
996
+ /// ┼───┬───┼───┬───┼
997
+ /// .$ ╷ ╷ init(superblock.commit_min=0)⎤ Compaction is effectively a noop for the
998
+ /// .$ ╷ ╷ commit;compact( 1) start/end ⎥ first bar because there are no tables on
999
+ /// ..$ ╷ ╷ commit;compact( 2) start/end ⎥ disk yet, and no immutable table to
1000
+ /// ...$ ╷ ╷ commit;compact( 3) start/end ⎥ flush.
1001
+ /// ....$ ╷ ╷ commit;compact( 4) start/end ⎥
1002
+ /// .....$ ╷ ╷ commit;compact( 5) start/end ⎥ This applies:
1003
+ /// ......$ ╷ ╷ commit;compact( 6) start/end ⎥ - when the LSM is starting on a freshly
1004
+ /// .......$╷ ╷ commit;compact( 7) start ⎤⎥ formatted data file, and also
1005
+ /// ,,,,,,,,$ ╷ ✓ compact( 7) end⎦⎦ - when the LSM is recovering from a crash
1006
+ /// ,,,,,,,,$ ╷ commit;compact( 8) start/end (see below).
1007
+ /// ,,,,,,,,.$ ╷ commit;compact( 9) start/end
1008
+ /// ,,,,,,,,..$ ╷ commit;compact(10) start/end
1009
+ /// ,,,,,,,,...$ ╷ commit;compact(11) start/end
1010
+ /// ,,,,,,,,....$ ╷ commit;compact(12) start/end
1011
+ /// ,,,,,,,,.....$ ╷ commit;compact(13) start/end
1012
+ /// ,,,,,,,,......$ ╷ commit;compact(14) start/end
1013
+ /// ,,,,,,,,.......$╷ commit;compact(15) start ⎤
1014
+ /// ########,,,,,,,,$ ✓ compact(15) end⎦
1015
+ /// ########,,,,,,,,$ commit;compact(16) start/end
1016
+ /// ┼───┬───┼───┬───┼
1017
+ /// 0 2 4 6 8 0 2 4 6
1018
+ /// ┼───┬───┼───┬───┼ Recover with a checkpoint taken at op 15.
1019
+ /// ######## $ init(superblock.commit_min=7) At op 15, ops 8…15 are in memory, so they
1020
+ /// ########. $ commit ( 8) start/end ⎤ were dropped by the crash.
1021
+ /// ########.. $ commit ( 9) start/end ⎥
1022
+ /// ########... $ commit (10) start/end ⎥ But compaction is not run for ops 8…15
1023
+ /// ########.... $ commit (11) start/end ⎥ because it was already performed
1024
+ /// ########..... $ commit (12) start/end ⎥ before the checkpoint.
1025
+ /// ########...... $ commit (13) start/end ⎥
1026
+ /// ########....... $ commit (14) start/end ⎥ We can begin to compact again at op 16,
1027
+ /// ########........$ commit (15) start ⎤⎥ because those compactions (if previously
1028
+ /// ########,,,,,,,,$ ✓ (15) end⎦⎦ performed) are not included in the
1029
+ /// ########,,,,,,,,$ commit;compact(16) start/end checkpoint.
1030
+ /// ┼───┬───┼───┬───┼
1031
+ /// 0 2 4 6 8 0 2 4 6
1032
+ ///
1033
+ /// Notice how in the checkpoint recovery example above, we are careful not to `compact(op)` twice
1034
+ /// for any op (even if we crash/recover), since that could lead to differences between replicas'
1035
+ /// storage. The last bar of `commit()`s is always only in memory, so it is safe to repeat.
1036
+ ///
1037
+ /// Additionally, while skipping compactions during recovery, we use a `lookup_snapshot_max`
1038
+ /// different than the original compactions — the old tables may have been removed during the
1039
+ /// checkpoint.
1040
+ fn lookup_snapshot_max_for_checkpoint(op_checkpoint: u64) u64 {
1041
+ if (op_checkpoint == 0) {
1042
+ // Start from 1 because we never commit op 0.
1043
+ return 1;
1044
+ } else {
1045
+ return op_checkpoint + config.lsm_batch_multiple + 1;
1046
+ }
1047
+ }
1048
+
945
1049
  /// The total number of tables that can be supported by the tree across so many levels.
946
1050
  pub fn table_count_max_for_tree(growth_factor: u32, levels_count: u32) u32 {
947
1051
  assert(growth_factor >= 4);
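
The chart above assumes lsm_batch_multiple = 8. A short sketch (an editorial illustration, not the package's code) of the two helpers it documents, with the values the chart implies:

const std = @import("std");

// Editorial sketch, not part of the package.
const lsm_batch_multiple = 8; // matches the chart above
const half_bar_beat_count = lsm_batch_multiple / 2;

fn compaction_op_min(op: u64) u64 {
    return op - op % half_bar_beat_count;
}

fn lookup_snapshot_max_for_checkpoint(op_checkpoint: u64) u64 {
    if (op_checkpoint == 0) return 1; // We never commit op 0.
    return op_checkpoint + lsm_batch_multiple + 1;
}

test "beat/bar arithmetic implied by the chart" {
    // Ops 8…11 form the first half-bar of the second bar; ops 12…15 the second half-bar.
    try std.testing.expectEqual(@as(u64, 8), compaction_op_min(11));
    try std.testing.expectEqual(@as(u64, 12), compaction_op_min(15));

    // A freshly formatted data file prefetches from snapshot 1. Recovering from the
    // checkpoint taken at op 15 (superblock.commit_min = 7) resumes at snapshot 16,
    // skipping the bar of compactions already applied before the checkpoint.
    try std.testing.expectEqual(@as(u64, 1), lookup_snapshot_max_for_checkpoint(0));
    try std.testing.expectEqual(@as(u64, 16), lookup_snapshot_max_for_checkpoint(7));
}
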
@@ -984,102 +1088,3 @@ test "table_count_max_for_level/tree" {
984
1088
  try expectEqual(@as(u32, 37448 + 262144), table_count_max_for_tree(8, 6));
985
1089
  try expectEqual(@as(u32, 299592 + 2097152), table_count_max_for_tree(8, 7));
986
1090
  }
987
-
988
- pub fn main() !void {
989
- const testing = std.testing;
990
- const allocator = testing.allocator;
991
-
992
- const IO = @import("../io.zig").IO;
993
- const Storage = @import("../storage.zig").Storage;
994
- const Grid = @import("grid.zig").GridType(Storage);
995
-
996
- const data_file_size_min = @import("../vsr/superblock.zig").data_file_size_min;
997
-
998
- const dir_fd = try IO.open_dir(".");
999
- const storage_fd = try IO.open_file(dir_fd, "test_tree", data_file_size_min, true);
1000
- defer std.fs.cwd().deleteFile("test_tree") catch {};
1001
-
1002
- var io = try IO.init(128, 0);
1003
- defer io.deinit();
1004
-
1005
- var storage = try Storage.init(&io, storage_fd);
1006
- defer storage.deinit();
1007
-
1008
- const Key = CompositeKey(u128);
1009
- const Table = @import("table.zig").TableType(
1010
- Key,
1011
- Key.Value,
1012
- Key.compare_keys,
1013
- Key.key_from_value,
1014
- Key.sentinel_key,
1015
- Key.tombstone,
1016
- Key.tombstone_from_key,
1017
- );
1018
-
1019
- const Tree = TreeType(Table, Storage, @typeName(Table) ++ "_test");
1020
-
1021
- // Check out our spreadsheet to see how we calculate node_count for a forest of trees.
1022
- const node_count = 1024;
1023
- var node_pool = try NodePool.init(allocator, node_count);
1024
- defer node_pool.deinit(allocator);
1025
-
1026
- var value_cache = Tree.ValueCache{};
1027
- try value_cache.ensureTotalCapacity(allocator, 10000);
1028
- defer value_cache.deinit(allocator);
1029
-
1030
- const batch_size_max = config.message_size_max - @sizeOf(vsr.Header);
1031
- const commit_count_max = @divFloor(batch_size_max, 128);
1032
-
1033
- var sort_buffer = try allocator.allocAdvanced(
1034
- u8,
1035
- 16,
1036
- // This must be the greatest commit_count_max and value_size across trees:
1037
- commit_count_max * config.lsm_batch_multiple * 128,
1038
- .exact,
1039
- );
1040
- defer allocator.free(sort_buffer);
1041
-
1042
- // TODO Initialize SuperBlock:
1043
- var superblock: SuperBlockType(Storage) = undefined;
1044
-
1045
- var grid = try Grid.init(allocator, &superblock);
1046
- defer grid.deinit(allocator);
1047
-
1048
- var tree = try Tree.init(
1049
- allocator,
1050
- &node_pool,
1051
- &grid,
1052
- &value_cache,
1053
- .{
1054
- .prefetch_count_max = commit_count_max * 2,
1055
- .commit_count_max = commit_count_max,
1056
- },
1057
- );
1058
- defer tree.deinit(allocator);
1059
-
1060
- testing.refAllDecls(@This());
1061
-
1062
- // TODO: more references
1063
- _ = Table;
1064
- _ = Table.Builder.data_block_finish;
1065
-
1066
- // TODO: more references
1067
- _ = Tree.CompactionTable;
1068
-
1069
- _ = tree.prefetch_enqueue;
1070
- _ = tree.prefetch;
1071
- _ = tree.prefetch_key;
1072
- _ = tree.get;
1073
- _ = tree.put;
1074
- _ = tree.remove;
1075
- _ = tree.lookup;
1076
- _ = tree.compact_io;
1077
- _ = tree.compact_cpu;
1078
-
1079
- _ = Tree.Manifest.LookupIterator.next;
1080
- _ = tree.manifest;
1081
- _ = tree.manifest.lookup;
1082
- _ = tree.manifest.insert_tables;
1083
-
1084
- std.debug.print("table_count_max={}\n", .{table_count_max});
1085
- }
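
For reference, the table_count_max_for_tree values asserted in the retained test above form a geometric series: level l (1-based) may hold growth_factor^l tables, and the tree-wide maximum is their sum. A sketch of that sum (ours, not the package's implementation):

const std = @import("std");

// Editorial sketch, not part of the package: the per-tree table budget as a sum of
// per-level budgets, where level l (1-based) may hold growth_factor^l tables.
fn table_count_max_for_tree(growth_factor: u32, levels_count: u32) u32 {
    var count: u32 = 0;
    var tables: u32 = 1;
    var level: u32 = 0;
    while (level < levels_count) : (level += 1) {
        tables *= growth_factor;
        count += tables;
    }
    return count;
}

test "matches the values asserted in the diff above" {
    try std.testing.expectEqual(@as(u32, 37448 + 262144), table_count_max_for_tree(8, 6));
    try std.testing.expectEqual(@as(u32, 299592 + 2097152), table_count_max_for_tree(8, 7));
}
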