tigerbeetle-node 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. package/README.md +302 -101
  2. package/dist/index.d.ts +70 -72
  3. package/dist/index.js +70 -72
  4. package/dist/index.js.map +1 -1
  5. package/package.json +6 -6
  6. package/scripts/download_node_headers.sh +14 -7
  7. package/src/index.ts +6 -10
  8. package/src/node.zig +6 -3
  9. package/src/tigerbeetle/scripts/benchmark.sh +4 -4
  10. package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
  11. package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
  12. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
  13. package/src/tigerbeetle/scripts/install.sh +19 -4
  14. package/src/tigerbeetle/scripts/install_zig.bat +5 -1
  15. package/src/tigerbeetle/scripts/install_zig.sh +24 -14
  16. package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
  17. package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
  18. package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
  19. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
  20. package/src/tigerbeetle/src/benchmark.zig +4 -2
  21. package/src/tigerbeetle/src/benchmark_array_search.zig +3 -3
  22. package/src/tigerbeetle/src/c/tb_client/thread.zig +8 -9
  23. package/src/tigerbeetle/src/c/tb_client.h +100 -80
  24. package/src/tigerbeetle/src/c/tb_client.zig +4 -1
  25. package/src/tigerbeetle/src/cli.zig +1 -1
  26. package/src/tigerbeetle/src/config.zig +48 -16
  27. package/src/tigerbeetle/src/demo.zig +3 -1
  28. package/src/tigerbeetle/src/eytzinger_benchmark.zig +3 -3
  29. package/src/tigerbeetle/src/io/linux.zig +1 -1
  30. package/src/tigerbeetle/src/lsm/README.md +214 -0
  31. package/src/tigerbeetle/src/lsm/binary_search.zig +137 -10
  32. package/src/tigerbeetle/src/lsm/bloom_filter.zig +43 -0
  33. package/src/tigerbeetle/src/lsm/compaction.zig +352 -398
  34. package/src/tigerbeetle/src/lsm/composite_key.zig +2 -0
  35. package/src/tigerbeetle/src/lsm/eytzinger.zig +1 -1
  36. package/src/tigerbeetle/src/lsm/forest.zig +21 -447
  37. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +412 -0
  38. package/src/tigerbeetle/src/lsm/grid.zig +145 -69
  39. package/src/tigerbeetle/src/lsm/groove.zig +196 -133
  40. package/src/tigerbeetle/src/lsm/k_way_merge.zig +40 -18
  41. package/src/tigerbeetle/src/lsm/level_iterator.zig +28 -9
  42. package/src/tigerbeetle/src/lsm/manifest.zig +81 -181
  43. package/src/tigerbeetle/src/lsm/manifest_level.zig +210 -454
  44. package/src/tigerbeetle/src/lsm/manifest_log.zig +77 -28
  45. package/src/tigerbeetle/src/lsm/posted_groove.zig +64 -76
  46. package/src/tigerbeetle/src/lsm/segmented_array.zig +561 -241
  47. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
  48. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
  49. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +62 -12
  50. package/src/tigerbeetle/src/lsm/table.zig +83 -48
  51. package/src/tigerbeetle/src/lsm/table_immutable.zig +30 -23
  52. package/src/tigerbeetle/src/lsm/table_iterator.zig +25 -14
  53. package/src/tigerbeetle/src/lsm/table_mutable.zig +63 -12
  54. package/src/tigerbeetle/src/lsm/test.zig +49 -55
  55. package/src/tigerbeetle/src/lsm/tree.zig +407 -402
  56. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +457 -0
  57. package/src/tigerbeetle/src/main.zig +28 -6
  58. package/src/tigerbeetle/src/message_bus.zig +2 -2
  59. package/src/tigerbeetle/src/message_pool.zig +14 -17
  60. package/src/tigerbeetle/src/simulator.zig +145 -112
  61. package/src/tigerbeetle/src/state_machine.zig +338 -228
  62. package/src/tigerbeetle/src/static_allocator.zig +65 -0
  63. package/src/tigerbeetle/src/storage.zig +3 -7
  64. package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
  65. package/src/tigerbeetle/src/test/accounting/workload.zig +819 -0
  66. package/src/tigerbeetle/src/test/cluster.zig +18 -48
  67. package/src/tigerbeetle/src/test/conductor.zig +365 -0
  68. package/src/tigerbeetle/src/test/fuzz.zig +121 -0
  69. package/src/tigerbeetle/src/test/id.zig +89 -0
  70. package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
  71. package/src/tigerbeetle/src/test/state_checker.zig +93 -69
  72. package/src/tigerbeetle/src/test/state_machine.zig +11 -35
  73. package/src/tigerbeetle/src/test/storage.zig +29 -8
  74. package/src/tigerbeetle/src/tigerbeetle.zig +14 -16
  75. package/src/tigerbeetle/src/unit_tests.zig +7 -0
  76. package/src/tigerbeetle/src/vopr.zig +494 -0
  77. package/src/tigerbeetle/src/vopr_hub/README.md +58 -0
  78. package/src/tigerbeetle/src/vopr_hub/SETUP.md +199 -0
  79. package/src/tigerbeetle/src/vopr_hub/go.mod +3 -0
  80. package/src/tigerbeetle/src/vopr_hub/main.go +1022 -0
  81. package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +3 -0
  82. package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +403 -0
  83. package/src/tigerbeetle/src/vsr/client.zig +13 -0
  84. package/src/tigerbeetle/src/vsr/journal.zig +16 -13
  85. package/src/tigerbeetle/src/vsr/replica.zig +924 -491
  86. package/src/tigerbeetle/src/vsr/superblock.zig +55 -37
  87. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -10
  88. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +2 -2
  89. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +18 -3
  90. package/src/tigerbeetle/src/vsr.zig +75 -55
  91. package/src/tigerbeetle/scripts/vopr.bat +0 -48
  92. package/src/tigerbeetle/scripts/vopr.sh +0 -33
@@ -1,8 +1,41 @@
1
+ //! Compaction moves or merges a table's values into the next level.
2
+ //!
3
+ //! Each Compaction is paced to run in one half-bar.
4
+ //!
5
+ //!
6
+ //! Compaction overview:
7
+ //!
8
+ //! 1. Given:
9
+ //!
10
+ //! - levels A and B, where A+1=B
11
+ //! - a single table in level A ("table A")
12
+ //! - all tables from level B which intersect table A's key range ("tables B")
13
+ //! (This can include anything between 0 tables and all of level B's tables.)
14
+ //!
15
+ //! 2. If table A's key range is disjoint from the keys in level B, move table A into level B.
16
+ //! All done! (But if the key ranges intersect, jump to step 3).
17
+ //!
18
+ //! 3. Create an iterator from the sort-merge of table A and the concatenation of tables B.
19
+ //! If the same key exists in level A and B, take A's and discard B's. †
20
+ //!
21
+ //! 4. Write the sort-merge iterator into a sequence of new tables on disk.
22
+ //!
23
+ //! 5. Update the input tables in the Manifest with their new `snapshot_max` so that they become
24
+ //! invisible to subsequent read transactions.
25
+ //!
26
+ //! 6. Insert the new level-B tables into the Manifest.
27
+ //!
28
+ //! † When A's value is a tombstone, there is a special case for garbage collection. When either:
29
+ //! * level B is the final level, or
30
+ //! * A's key does not exist in B or any deeper level,
31
+ //! then the tombstone is omitted from the compacted output (see: `compaction_must_drop_tombstones`).
32
+ //!
1
33
  const std = @import("std");
2
34
  const mem = std.mem;
3
35
  const math = std.math;
4
36
  const assert = std.debug.assert;
5
37
 
38
+ const log = std.log.scoped(.compaction);
6
39
  const config = @import("../config.zig");
7
40
 
8
41
  const GridType = @import("grid.zig").GridType;
@@ -14,84 +47,110 @@ const LevelIteratorType = @import("level_iterator.zig").LevelIteratorType;
14
47
  pub fn CompactionType(
15
48
  comptime Table: type,
16
49
  comptime Storage: type,
17
- comptime IteratorAType: anytype, // fn (Table: type, Storage: type) type
50
+ comptime IteratorAType: anytype,
18
51
  ) type {
19
52
  const Key = Table.Key;
20
53
  const Value = Table.Value;
21
54
  const tombstone = Table.tombstone;
22
- const compare_keys = Table.compare_keys;
23
55
 
24
56
  return struct {
25
57
  const Compaction = @This();
26
58
 
27
59
  const Grid = GridType(Storage);
28
60
  const BlockPtr = Grid.BlockPtr;
61
+ const BlockPtrConst = Grid.BlockPtrConst;
62
+ const BlockWrite = struct {
63
+ write: Grid.Write = undefined,
64
+ block: BlockPtr = undefined,
65
+ writable: bool = false,
66
+ };
67
+
29
68
  const Manifest = ManifestType(Table, Storage);
30
69
  const TableInfo = Manifest.TableInfo;
31
70
 
32
71
  const IteratorA = IteratorAType(Table, Storage);
33
72
  const IteratorB = LevelIteratorType(Table, Storage);
34
73
 
35
- pub const Callback = fn (it: *Compaction) void;
36
-
37
74
  const k = 2;
38
75
  const MergeIterator = KWayMergeIterator(
39
76
  Compaction,
40
- Key,
41
- Value,
77
+ Table.Key,
78
+ Table.Value,
42
79
  Table.key_from_value,
43
80
  Table.compare_keys,
44
81
  k,
45
- stream_peek,
46
- stream_pop,
47
- stream_precedence,
82
+ MergeStreamSelector.peek,
83
+ MergeStreamSelector.pop,
84
+ MergeStreamSelector.precedence,
48
85
  );
49
86
 
50
- const Status = enum {
51
- idle,
52
- compacting,
53
- done,
54
- };
87
+ const MergeStreamSelector = struct {
88
+ fn peek(compaction: *const Compaction, stream_id: u32) error{Empty, Drained}!Key {
89
+ return switch (stream_id) {
90
+ 0 => compaction.iterator_a.peek(),
91
+ 1 => compaction.iterator_b.peek(),
92
+ else => unreachable,
93
+ };
94
+ }
55
95
 
56
- const BlockPtrConst = *align(config.sector_size) const [config.block_size]u8;
57
- const BlockWrite = struct {
58
- block: BlockPtr,
59
- write: Grid.Write = undefined,
60
- ready: bool = false,
96
+ fn pop(compaction: *Compaction, stream_id: u32) Value {
97
+ return switch (stream_id) {
98
+ 0 => compaction.iterator_a.pop(),
99
+ 1 => compaction.iterator_b.pop(),
100
+ else => unreachable,
101
+ };
102
+ }
103
+
104
+ /// Returns true if stream A has higher precedence than stream B.
105
+ /// This is used to deduplicate values across streams.
106
+ fn precedence(compaction: *const Compaction, stream_a: u32, stream_b: u32) bool {
107
+ _ = compaction;
108
+ assert(stream_a + stream_b == 1);
109
+
110
+ // All tables in iterator_a (stream=0) have a higher precedence.
111
+ return stream_a == 0;
112
+ }
61
113
  };
62
114
 
63
- const TableInfoBuffer = @import("manifest.zig").TableInfoBufferType(Table, .ascending);
115
+ pub const Callback = fn (it: *Compaction) void;
64
116
 
65
- status: Status,
117
+ const Status = enum {
118
+ idle,
119
+ processing,
120
+ done,
121
+ };
66
122
 
67
123
  grid: *Grid,
68
- manifest: *Manifest,
69
- level_b: u8,
70
124
  range: Manifest.CompactionRange,
71
- snapshot: u64,
125
+
126
+ /// `op_min` is the first op/beat of this compaction's half-bar.
127
+ /// `op_min` is used as a snapshot — the compaction's input tables must be visible
128
+ /// to `op_min`.
129
+ ///
130
+ /// After this compaction finishes:
131
+ /// - `op_min + half_bar_beat_count - 1` will be the input tables' snapshot_max.
132
+ /// - `op_min + half_bar_beat_count` will be the output tables' snapshot_min.
133
+ op_min: u64,
72
134
  drop_tombstones: bool,
73
135
 
136
+ status: Status,
74
137
  callback: ?Callback = null,
75
- ticks: u32 = 0,
76
138
  io_pending: u32 = 0,
77
139
 
78
140
  iterator_a: IteratorA,
79
141
  iterator_b: IteratorB,
80
142
 
81
- /// Private:
82
- /// The caller must use the Callback's `done` argument to know when compaction is done,
83
- /// because a write I/O may yet follow even after the merge is done.
84
- merge_done: bool = false,
85
- merge_iterator: MergeIterator,
86
- table_builder: Table.Builder,
143
+ merge_done: bool,
144
+ merge_iterator: ?MergeIterator,
87
145
 
146
+ table_builder: Table.Builder,
88
147
  index: BlockWrite,
89
148
  filter: BlockWrite,
90
149
  data: BlockWrite,
91
150
 
92
- remove_level_a: ?*const TableInfo = null,
93
- update_level_b: TableInfoBuffer,
94
- insert_level_b: TableInfoBuffer,
151
+ manifest: *Manifest,
152
+ level_b: u8,
153
+ level_a_input: ?TableInfo,
95
154
 
96
155
  pub fn init(allocator: mem.Allocator) !Compaction {
97
156
  var iterator_a = try IteratorA.init(allocator);
@@ -103,162 +162,122 @@ pub fn CompactionType(
103
162
  var table_builder = try Table.Builder.init(allocator);
104
163
  errdefer table_builder.deinit(allocator);
105
164
 
106
- const index = BlockWrite{ .block = try allocate_block(allocator) };
107
- errdefer allocator.free(index.block);
108
-
109
- const filter = BlockWrite{ .block = try allocate_block(allocator) };
110
- errdefer allocator.free(filter.block);
111
-
112
- const data = BlockWrite{ .block = try allocate_block(allocator) };
113
- errdefer allocator.free(data.block);
114
-
115
- // The average number of tables involved in a compaction is the 1 table from level A,
116
- // plus the growth_factor number of tables from level B, plus 1 on either side,
117
- // since the overlap may not be perfectly aligned to table boundaries.
118
- // However, the worst case number of tables may approach all tables in level B,
119
- // since key ranges may be skewed and not evenly distributed across a level.
120
- const table_buffer_count_max = 1 + config.lsm_growth_factor + 2;
121
-
122
- var update_level_b = try TableInfoBuffer.init(allocator, table_buffer_count_max);
123
- errdefer update_level_b.deinit(allocator);
124
-
125
- var insert_level_b = try TableInfoBuffer.init(allocator, table_buffer_count_max);
126
- errdefer insert_level_b.deinit(allocator);
127
-
128
165
  return Compaction{
129
- .status = .idle,
130
-
131
- // Assigned by start():
166
+ // Assigned by start()
132
167
  .grid = undefined,
133
- .manifest = undefined,
134
- .level_b = undefined,
135
168
  .range = undefined,
136
- .snapshot = undefined,
169
+ .op_min = undefined,
137
170
  .drop_tombstones = undefined,
138
171
 
172
+ .status = .idle,
139
173
  .iterator_a = iterator_a,
140
174
  .iterator_b = iterator_b,
141
175
 
142
- .merge_iterator = undefined, // This can only be initialized on tick 1.
143
- .table_builder = table_builder,
176
+ .merge_done = false,
177
+ .merge_iterator = null,
144
178
 
145
- .index = index,
146
- .filter = filter,
147
- .data = data,
179
+ .table_builder = table_builder,
180
+ .index = .{},
181
+ .filter = .{},
182
+ .data = .{},
148
183
 
149
- .update_level_b = update_level_b,
150
- .insert_level_b = insert_level_b,
184
+ // Assigned by start()
185
+ .manifest = undefined,
186
+ .level_b = undefined,
187
+ .level_a_input = null,
151
188
  };
152
189
  }
153
190
 
154
- fn allocate_block(allocator: mem.Allocator) !BlockPtr {
155
- const block = try allocator.alignedAlloc(u8, config.sector_size, config.block_size);
156
- return block[0..config.block_size];
157
- }
158
-
159
191
  pub fn deinit(compaction: *Compaction, allocator: mem.Allocator) void {
160
- compaction.iterator_a.deinit(allocator);
161
- compaction.iterator_b.deinit(allocator);
162
192
  compaction.table_builder.deinit(allocator);
163
- compaction.update_level_b.deinit(allocator);
164
- compaction.insert_level_b.deinit(allocator);
165
193
 
166
- allocator.free(compaction.index.block);
167
- allocator.free(compaction.filter.block);
168
- allocator.free(compaction.data.block);
194
+ compaction.iterator_b.deinit(allocator);
195
+ compaction.iterator_a.deinit(allocator);
169
196
  }
170
197
 
198
+ /// The compaction's input tables are:
199
+ /// * table_a (which is null when level B is 0), and
200
+ /// * any level-B tables visible to `op_min` within `range`.
171
201
  pub fn start(
172
202
  compaction: *Compaction,
173
203
  grid: *Grid,
174
204
  manifest: *Manifest,
175
- // TODO level_a_table: ?TableInfo,
176
- level_b: u8,
205
+ op_min: u64,
177
206
  range: Manifest.CompactionRange,
178
- snapshot: u64,
207
+ table_a: ?*const TableInfo,
208
+ level_b: u8,
179
209
  iterator_a_context: IteratorA.Context,
180
210
  ) void {
181
211
  assert(compaction.status == .idle);
182
212
  assert(compaction.callback == null);
183
213
  assert(compaction.io_pending == 0);
184
- assert(level_b < config.lsm_levels);
214
+ assert(!compaction.merge_done and compaction.merge_iterator == null);
215
+
216
+ assert(op_min % @divExact(config.lsm_batch_multiple, 2) == 0);
185
217
  assert(range.table_count > 0);
218
+ if (table_a) |t| assert(t.visible(op_min));
186
219
 
220
+ assert(level_b < config.lsm_levels);
221
+ assert((level_b == 0) == (table_a == null));
222
+
223
+ // Levels may choose to drop tombstones if keys aren't included in the lower levels.
224
+ // This invariant is always true for the last level as it doesn't have any lower ones.
187
225
  const drop_tombstones = manifest.compaction_must_drop_tombstones(level_b, range);
188
226
  assert(drop_tombstones or level_b < config.lsm_levels - 1);
189
227
 
190
228
  compaction.* = .{
191
- .status = .compacting,
192
-
193
229
  .grid = grid,
194
- .manifest = manifest,
195
- .level_b = level_b,
196
230
  .range = range,
197
- .snapshot = snapshot,
231
+ .op_min = op_min,
198
232
  .drop_tombstones = drop_tombstones,
199
233
 
234
+ .status = .processing,
200
235
  .iterator_a = compaction.iterator_a,
201
236
  .iterator_b = compaction.iterator_b,
202
237
 
203
- .merge_iterator = undefined,
204
- .table_builder = compaction.table_builder,
238
+ .merge_done = false,
239
+ .merge_iterator = null,
205
240
 
241
+ .table_builder = compaction.table_builder,
206
242
  .index = compaction.index,
207
243
  .filter = compaction.filter,
208
244
  .data = compaction.data,
209
245
 
210
- .update_level_b = compaction.update_level_b,
211
- .insert_level_b = compaction.insert_level_b,
246
+ .manifest = manifest,
247
+ .level_b = level_b,
248
+ .level_a_input = if (table_a) |table| table.* else null,
212
249
  };
213
250
 
214
- assert(!compaction.data.ready);
215
- assert(!compaction.filter.ready);
216
- assert(!compaction.index.ready);
217
-
218
- // TODO Reset builder.
219
-
220
- // TODO: Enable when move_table() can fetch TableInfo from address/checksum.
221
- //
222
- // Perform a "compaction move" to the next level inline if certain factors allow:
223
- // - Can only do the specialization if there's a single table to compact.
224
- // - Must be compacting from a table iterator which has an address and checksum.
225
- // - Cannot drop tombstones as then we have to go through the normal compaction path.
226
- // - Cannot be performing the immutable table -> level 0 compaction
227
- // as it requires the table being moved to reside on disk (tracked by manifest).
228
- if (false and IteratorA.Context == TableIteratorType(Table, Storage)) {
229
- if (!drop_tombstones and range.table_count == 1) {
230
- assert(compaction.level_b != 0);
231
- assert(compaction.status == .compacting);
232
-
233
- const level_a = level_b - 1;
234
- assert(level_a < config.lsm_levels - 1);
235
-
236
- compaction.manifest.move_table(
237
- level_a,
238
- level_b,
239
- snapshot,
240
- iterator_a_context.address,
241
- iterator_a_context.checksum,
242
- );
243
-
244
- compaction.status = .done;
245
- return;
246
- }
247
- }
251
+ assert(!compaction.index.writable);
252
+ assert(!compaction.filter.writable);
253
+ assert(!compaction.data.writable);
254
+
255
+ // TODO Implement manifest.move_table() optimization if there's only range.table_count == 1.
256
+ // This would do update_tables + insert_tables inline without going through the iterators.
248
257
 
249
258
  const iterator_b_context = .{
250
259
  .grid = grid,
251
260
  .manifest = manifest,
252
261
  .level = level_b,
253
- .snapshot = snapshot,
262
+ .snapshot = op_min,
254
263
  .key_min = range.key_min,
255
264
  .key_max = range.key_max,
256
265
  .direction = .ascending,
257
- .table_info_callback = iterator_b_table_info_callback, // TODO
266
+ .table_info_callback = iterator_b_table_info_callback,
258
267
  };
259
268
 
260
- compaction.iterator_a.start(iterator_a_context, iterator_a_callback);
261
- compaction.iterator_b.start(iterator_b_context, iterator_b_callback);
269
+ compaction.iterator_a.start(iterator_a_context, iterator_a_io_callback);
270
+ compaction.iterator_b.start(iterator_b_context, iterator_b_io_callback);
271
+ }
272
+
273
+ fn iterator_a_io_callback(iterator_a: *IteratorA) void {
274
+ const compaction = @fieldParentPtr(Compaction, "iterator_a", iterator_a);
275
+ compaction.io_finish();
276
+ }
277
+
278
+ fn iterator_b_io_callback(iterator_b: *IteratorB) void {
279
+ const compaction = @fieldParentPtr(Compaction, "iterator_b", iterator_b);
280
+ compaction.io_finish();
262
281
  }
263
282
 
264
283
  fn iterator_b_table_info_callback(
@@ -267,337 +286,272 @@ pub fn CompactionType(
267
286
  index_block: BlockPtrConst,
268
287
  ) void {
269
288
  const compaction = @fieldParentPtr(Compaction, "iterator_b", iterator_b);
270
- compaction.queue_manifest_update(&compaction.update_level_b, table);
271
-
272
- // Release the table's block addresses if it's invisible to the compaction.
273
- if (table.invisible(&.{compaction.snapshot})) {
274
- compaction.grid.release(Table.index_block_address(index_block));
275
-
276
- for (Table.index_filter_addresses_used(index_block)) |address| {
277
- compaction.grid.release(address);
278
- }
279
-
280
- for (Table.index_data_addresses_used(index_block)) |address| {
281
- compaction.grid.release(address);
282
- }
283
- }
284
- }
285
-
286
- fn queue_manifest_update(
287
- compaction: *Compaction,
288
- buffer: *TableInfoBuffer,
289
- table: *const TableInfo,
290
- ) void {
291
- assert(buffer == &compaction.update_level_b or buffer == &compaction.insert_level_b);
292
- if (buffer.full()) compaction.update_manifest(buffer);
293
- buffer.push(table);
294
- }
295
-
296
- fn update_manifest(compaction: *Compaction, buffer: *TableInfoBuffer) void {
297
- assert(buffer == &compaction.update_level_b or buffer == &compaction.insert_level_b);
298
-
299
- const tables: []const TableInfo = buffer.drain();
300
- if (tables.len == 0) return;
301
-
302
- for (tables) |table| {
303
- assert(compare_keys(table.key_min, compaction.range.key_min) != .lt);
304
- assert(compare_keys(table.key_max, compaction.range.key_max) != .gt);
289
+ assert(compaction.status == .processing);
290
+ assert(compaction.callback != null);
291
+ assert(!compaction.merge_done);
292
+ assert(table.visible(compaction.op_min));
293
+
294
+ // Tables discovered by iterator_b that are visible at the start of compaction.
295
+ var table_copy = table.*;
296
+ compaction.manifest.update_table(
297
+ compaction.level_b,
298
+ snapshot_max_for_table_input(compaction.op_min),
299
+ &table_copy,
300
+ );
301
+
302
+ // Release the table's block addresses in the Grid as it will be made invisible.
303
+ // This is safe; iterator_b makes a copy of the block before calling us.
304
+ const grid = compaction.grid;
305
+ for (Table.index_data_addresses_used(index_block)) |address| {
306
+ grid.release_at_checkpoint(address);
305
307
  }
306
-
307
- if (buffer == &compaction.update_level_b) {
308
- compaction.manifest.update_tables(compaction.level_b, compaction.snapshot, tables);
309
- } else {
310
- compaction.manifest.insert_tables(compaction.level_b, tables);
308
+ for (Table.index_filter_addresses_used(index_block)) |address| {
309
+ grid.release_at_checkpoint(address);
311
310
  }
311
+ grid.release_at_checkpoint(Table.index_block_address(index_block));
312
312
  }
313
313
 
314
- /// Compaction.ticks stages at which each action may be performed.
315
- const pipeline_tick_read = 0;
316
- const pipeline_tick_merge = 1;
317
- const pipeline_tick_write = 2;
318
-
319
- /// Submits all read/write I/O before starting the CPU-intensive k-way merge.
320
- /// This allows the I/O to happen in parallel with the merge.
321
- ///
322
- /// The caller must call:
323
- ///
324
- /// 1. tick_io() across all trees,
325
- /// 2. IO.tick() to submit these I/O operations to the kernel,
326
- /// 3. tick_cpu() across all trees.
327
- pub fn tick_io(compaction: *Compaction, callback: Callback) void {
328
- assert(compaction.status == .compacting);
314
+ pub fn compact_tick(compaction: *Compaction, callback: Callback) void {
315
+ assert(compaction.status == .processing);
329
316
  assert(compaction.callback == null);
330
317
  assert(compaction.io_pending == 0);
331
318
  assert(!compaction.merge_done);
332
319
 
333
320
  compaction.callback = callback;
334
321
 
335
- if (compaction.ticks >= pipeline_tick_read) compaction.tick_io_read();
336
- if (compaction.ticks >= pipeline_tick_write) compaction.tick_io_write();
337
-
338
- // All values may be eclipsed by tombstones, with no write I/O pending here.
322
+ // Generate fake IO to make sure io_pending doesn't reach zero multiple times from
323
+ // IO being completed inline down below.
324
+ // The fake IO is immediately resolved and triggers the cpu_merge_start if all
325
+ // IO completes inline or if no IO was started.
326
+ compaction.io_start();
327
+ defer compaction.io_finish();
328
+
329
+ // Start reading blocks from the iterators to merge them.
330
+ if (compaction.iterator_a.tick()) compaction.io_start();
331
+ if (compaction.iterator_b.tick()) compaction.io_start();
332
+
333
+ // Start writing blocks prepared by the merge iterator from a previous compact_tick().
334
+ compaction.io_write_start(.data);
335
+ compaction.io_write_start(.filter);
336
+ compaction.io_write_start(.index);
339
337
  }
340
338
 
341
- pub fn tick_cpu(compaction: *Compaction) void {
342
- assert(compaction.status == .compacting);
343
- assert(compaction.callback != null);
344
- assert(compaction.io_pending >= 0);
345
- assert(!compaction.merge_done);
339
+ const BlockWriteField = enum { data, filter, index };
346
340
 
347
- if (compaction.ticks == pipeline_tick_merge) {
348
- // We cannot initialize the merge until we can peek() a value from each stream,
349
- // which depends on tick 0 (to read blocks) having happened.
350
- compaction.merge_iterator = MergeIterator.init(compaction, k, .ascending);
351
- }
352
-
353
- if (compaction.ticks >= pipeline_tick_merge) {
354
- if (compaction.merge_iterator.empty()) {
355
- assert(!compaction.merge_done);
341
+ fn io_write_start(compaction: *Compaction, comptime field: BlockWriteField) void {
342
+ const write_callback = struct {
343
+ fn callback(write: *Grid.Write) void {
344
+ const block_write = @fieldParentPtr(BlockWrite, "write", write);
345
+ block_write.block = undefined;
356
346
 
357
- // We must distinguish between merge_iterator.empty() and merge_done.
358
- // The former cannot be accessed before MergeIterator.init() on tick 1.
359
- compaction.merge_done = true;
360
- } else {
361
- compaction.tick_cpu_merge();
347
+ const _compaction = @fieldParentPtr(Compaction, @tagName(field), block_write);
348
+ _compaction.io_finish();
362
349
  }
363
- }
350
+ }.callback;
364
351
 
365
- compaction.ticks += 1;
352
+ const block_write: *BlockWrite = &@field(compaction, @tagName(field));
353
+ if (block_write.writable) {
354
+ block_write.writable = false;
366
355
 
367
- // Normally, a tick completes only after a read/write I/O.
368
- // However, the compaction may drop only tombstones, resulting in no write I/O.
369
- if (compaction.io_pending == 0) compaction.tick_done();
356
+ compaction.io_start();
357
+ compaction.grid.write_block(
358
+ write_callback,
359
+ &block_write.write,
360
+ block_write.block,
361
+ Table.block_address(block_write.block),
362
+ );
363
+ }
370
364
  }
371
365
 
372
- fn tick_done(compaction: *Compaction) void {
373
- assert(compaction.status == .compacting);
366
+ fn io_start(compaction: *Compaction) void {
367
+ assert(compaction.status == .processing);
374
368
  assert(compaction.callback != null);
375
- assert(compaction.io_pending == 0);
376
-
377
- // Consume the callback and invoke it one finished updating state below.
378
- const callback = compaction.callback.?;
379
- compaction.callback = null;
380
- defer callback(compaction);
381
-
382
- // Once merge completes, the compaction is now officially over.
383
- if (compaction.merge_done) {
384
- compaction.status = .done;
369
+ assert(!compaction.merge_done);
385
370
 
386
- // Flush updates to the table infos discovered during compaction
387
- // TODO Handle compaction.remove_level_a
388
- compaction.update_manifest(&compaction.update_level_b);
389
- compaction.update_manifest(&compaction.insert_level_b);
390
- }
371
+ compaction.io_pending += 1;
391
372
  }
392
373
 
393
- pub fn reset(compaction: *Compaction) void {
394
- assert(compaction.callback == null);
395
- assert(compaction.io_pending == 0);
396
-
397
- assert(compaction.status == .done);
398
- compaction.status = .idle;
374
+ fn io_finish(compaction: *Compaction) void {
375
+ assert(compaction.status == .processing);
376
+ assert(compaction.callback != null);
377
+ assert(compaction.io_pending > 0);
378
+ assert(!compaction.merge_done);
399
379
 
400
- assert(compaction.update_level_b.drain().len == 0);
401
- assert(compaction.insert_level_b.drain().len == 0);
380
+ compaction.io_pending -= 1;
381
+ if (compaction.io_pending == 0) compaction.cpu_merge_start();
402
382
  }
403
383
 
404
- fn tick_io_read(compaction: *Compaction) void {
384
+ fn cpu_merge_start(compaction: *Compaction) void {
385
+ assert(compaction.status == .processing);
405
386
  assert(compaction.callback != null);
387
+ assert(compaction.io_pending == 0);
388
+ assert(!compaction.merge_done);
406
389
 
407
- if (compaction.iterator_a.tick()) compaction.io_pending += 1;
408
- if (compaction.iterator_b.tick()) compaction.io_pending += 1;
390
+ // Create the merge iterator only when we can peek() from the read iterators.
391
+ // This happens after IO for the first reads complete.
392
+ if (compaction.merge_iterator == null) {
393
+ compaction.merge_iterator = MergeIterator.init(compaction, k, .ascending);
394
+ assert(!compaction.merge_iterator.?.empty());
395
+ }
409
396
 
410
- if (compaction.merge_done) assert(compaction.io_pending == 0);
411
- }
397
+ assert(!compaction.data.writable);
398
+ assert(!compaction.filter.writable);
399
+ assert(!compaction.index.writable);
412
400
 
413
- fn tick_io_write(compaction: *Compaction) void {
414
- assert(compaction.callback != null);
415
- assert(compaction.ticks >= pipeline_tick_write);
416
- // There may be no data block to write if all values are eclipsed by tombstones.
417
- assert(compaction.data.ready or !compaction.data.ready);
401
+ if (!compaction.merge_iterator.?.empty()) {
402
+ compaction.cpu_merge();
403
+ } else {
404
+ compaction.cpu_merge_finish();
405
+ }
418
406
 
419
- compaction.write_block_if_ready(&compaction.data, write_block_callback("data"));
420
- compaction.write_block_if_ready(&compaction.filter, write_block_callback("filter"));
421
- compaction.write_block_if_ready(&compaction.index, write_block_callback("index"));
407
+ // TODO Implement pacing here by deciding if we should do another compact_tick()
408
+ // instead of invoking the callback, using compaction.range.table_count as the heuristic.
422
409
 
423
- assert(!compaction.data.ready);
424
- assert(!compaction.filter.ready);
425
- assert(!compaction.index.ready);
410
+ const callback = compaction.callback.?;
411
+ compaction.callback = null;
412
+ callback(compaction);
426
413
  }
427
414
 
428
- fn tick_cpu_merge(compaction: *Compaction) void {
415
+ fn cpu_merge(compaction: *Compaction) void {
416
+ // Ensure this is the result of a compact_tick() call that finished processing IO.
417
+ assert(compaction.status == .processing);
429
418
  assert(compaction.callback != null);
430
- assert(compaction.ticks >= pipeline_tick_merge);
419
+ assert(compaction.io_pending == 0);
431
420
  assert(!compaction.merge_done);
432
- assert(!compaction.merge_iterator.empty());
433
421
 
434
- assert(!compaction.data.ready);
435
- assert(!compaction.filter.ready);
436
- assert(!compaction.index.ready);
422
+ // Ensure there are values to merge and that is it safe to do so.
423
+ const merge_iterator = &compaction.merge_iterator.?;
424
+ assert(!merge_iterator.empty());
425
+ assert(!compaction.data.writable);
426
+ assert(!compaction.filter.writable);
427
+ assert(!compaction.index.writable);
437
428
 
438
- var tombstones_dropped: u32 = 0;
429
+ // Build up a data block with values merged from the read iterators.
430
+ // This skips tombstone values if compaction was started with the intent to drop them.
439
431
  while (!compaction.table_builder.data_block_full()) {
440
- const value = compaction.merge_iterator.pop() orelse {
441
- compaction.assert_read_iterators_empty();
442
- break;
443
- };
444
- if (compaction.drop_tombstones and tombstone(&value)) {
445
- tombstones_dropped += 1;
446
- } else {
447
- compaction.table_builder.data_block_append(&value);
448
- }
432
+ const value = merge_iterator.pop() orelse break;
433
+ if (compaction.drop_tombstones and tombstone(&value)) continue;
434
+ compaction.table_builder.data_block_append(&value);
449
435
  }
450
436
 
451
- if (compaction.table_builder.data_block_empty()) {
452
- assert(compaction.drop_tombstones);
453
- assert(tombstones_dropped > 0);
454
- } else {
437
+ // Finalize the data block if it's full or if it contains pending values when there's
438
+ // no more left to merge.
439
+ if (compaction.table_builder.data_block_full() or
440
+ (merge_iterator.empty() and !compaction.table_builder.data_block_empty()))
441
+ {
455
442
  compaction.table_builder.data_block_finish(.{
456
443
  .cluster = compaction.grid.superblock.working.cluster,
457
444
  .address = compaction.grid.acquire(),
458
445
  });
459
- swap_buffers(&compaction.data, &compaction.table_builder.data_block);
460
- assert(compaction.data.ready);
461
446
 
462
- if (!compaction.merge_iterator.empty()) {
463
- // Ensure that the block was filled completely.
464
- const values_used = Table.data_block_values_used(compaction.data.block).len;
465
- assert(values_used == Table.data.value_count_max);
466
- }
447
+ // Mark the finished data block as writable for the next compact_tick() call.
448
+ compaction.data.block = compaction.table_builder.data_block;
449
+ assert(!compaction.data.writable);
450
+ compaction.data.writable = true;
467
451
  }
468
452
 
453
+ // Finalize the filter block if it's full or if it contains pending data blocks
454
+ // when there's no more merged values to fill them.
469
455
  if (compaction.table_builder.filter_block_full() or
470
- compaction.table_builder.index_block_full() or
471
- compaction.merge_iterator.empty())
456
+ (merge_iterator.empty() and !compaction.table_builder.filter_block_empty()))
472
457
  {
473
- if (compaction.table_builder.filter_block_empty()) {
474
- assert(compaction.drop_tombstones);
475
- assert(tombstones_dropped > 0);
476
- } else {
477
- compaction.table_builder.filter_block_finish(.{
478
- .cluster = compaction.grid.superblock.working.cluster,
479
- .address = compaction.grid.acquire(),
480
- });
481
- swap_buffers(&compaction.filter, &compaction.table_builder.filter_block);
482
- assert(compaction.filter.ready);
483
- }
458
+ compaction.table_builder.filter_block_finish(.{
459
+ .cluster = compaction.grid.superblock.working.cluster,
460
+ .address = compaction.grid.acquire(),
461
+ });
462
+
463
+ // Mark the finished filter block as writable for the next compact_tick() call.
464
+ compaction.filter.block = compaction.table_builder.filter_block;
465
+ assert(!compaction.filter.writable);
466
+ compaction.filter.writable = true;
484
467
  }
485
468
 
469
+ // Finalize the index block if it's full or if it contains pending data blocks
470
+ // when there's no more merged values to fill them.
486
471
  if (compaction.table_builder.index_block_full() or
487
- compaction.merge_iterator.empty())
472
+ (merge_iterator.empty() and !compaction.table_builder.index_block_empty()))
488
473
  {
489
- if (compaction.table_builder.index_block_empty()) {
490
- assert(compaction.drop_tombstones);
491
- assert(tombstones_dropped > 0);
492
- } else {
493
- const snapshot_min = compaction.snapshot;
494
- const table = compaction.table_builder.index_block_finish(.{
495
- .cluster = compaction.grid.superblock.working.cluster,
496
- .address = compaction.grid.acquire(),
497
- .snapshot_min = snapshot_min,
498
- });
499
- compaction.queue_manifest_update(&compaction.insert_level_b, &table);
500
-
501
- swap_buffers(&compaction.index, &compaction.table_builder.index_block);
502
- assert(compaction.index.ready);
503
- }
504
- }
505
- }
506
-
507
- fn iterator_a_callback(iterator_a: *IteratorA) void {
508
- const compaction = @fieldParentPtr(Compaction, "iterator_a", iterator_a);
509
- compaction.io_callback();
510
- }
511
-
512
- fn iterator_b_callback(iterator_b: *IteratorB) void {
513
- const compaction = @fieldParentPtr(Compaction, "iterator_b", iterator_b);
514
- compaction.io_callback();
515
- }
516
-
517
- fn io_callback(compaction: *Compaction) void {
518
- compaction.io_pending -= 1;
519
- if (compaction.io_pending == 0) compaction.tick_done();
520
- }
521
-
522
- fn write_block_if_ready(
523
- compaction: *Compaction,
524
- block_write: *BlockWrite,
525
- callback: fn (*Grid.Write) void,
526
- ) void {
527
- if (block_write.ready) {
528
- block_write.ready = false;
474
+ const table = compaction.table_builder.index_block_finish(.{
475
+ .cluster = compaction.grid.superblock.working.cluster,
476
+ .address = compaction.grid.acquire(),
477
+ .snapshot_min = snapshot_min_for_table_output(compaction.op_min),
478
+ // TODO(Persistent Snapshots) set snapshot_max to the minimum snapshot_max of
479
+ // all the (original) input tables.
480
+ });
481
+ compaction.manifest.insert_table(compaction.level_b, &table);
529
482
 
530
- compaction.io_pending += 1;
531
- compaction.grid.write_block(
532
- callback,
533
- &block_write.write,
534
- block_write.block,
535
- Table.block_address(block_write.block),
536
- );
483
+ // Mark the finished index block as writable for the next compact_tick() call.
484
+ compaction.index.block = compaction.table_builder.index_block;
485
+ assert(!compaction.index.writable);
486
+ compaction.index.writable = true;
537
487
  }
538
488
  }
539
489
 
540
- fn write_block_callback(comptime field: []const u8) fn (*Grid.Write) void {
541
- return struct {
542
- fn callback(write: *Grid.Write) void {
543
- const block_write = @fieldParentPtr(BlockWrite, "write", write);
544
- const compaction = @fieldParentPtr(Compaction, field, block_write);
490
+ fn cpu_merge_finish(compaction: *Compaction) void {
491
+ // Ensure this is the result of a compact_tick() call that finished processing IO.
492
+ assert(compaction.status == .processing);
493
+ assert(compaction.callback != null);
494
+ assert(compaction.io_pending == 0);
495
+ assert(!compaction.merge_done);
545
496
 
546
- io_callback(compaction);
497
+ // Ensure merging is truly finished.
498
+ assert(compaction.merge_iterator.?.empty());
499
+ assert(!compaction.data.writable);
500
+ assert(!compaction.filter.writable);
501
+ assert(!compaction.index.writable);
502
+
503
+ // Double check the iterators are finished as well.
504
+ const stream_empty = struct {
505
+ fn empty(it: anytype) bool {
506
+ _ = it.peek() catch |err| switch (err) {
507
+ error.Drained => {},
508
+ error.Empty => {
509
+ assert(it.buffered_all_values());
510
+ return true;
511
+ },
512
+ };
513
+ return false;
547
514
  }
548
- }.callback;
549
- }
550
-
551
- fn swap_buffers(block_write: *BlockWrite, block_ready: *BlockPtr) void {
552
- mem.swap(BlockPtr, &block_write.block, block_ready);
553
-
554
- assert(!block_write.ready);
555
- block_write.ready = true;
556
- }
557
-
558
- fn assert_read_iterators_empty(compaction: Compaction) void {
559
- assert(compaction.iterator_a.buffered_all_values());
560
- assert(compaction.iterator_a.peek() == null);
561
-
562
- assert(compaction.iterator_b.buffered_all_values());
563
- assert(compaction.iterator_b.peek() == null);
564
- }
565
-
566
- fn stream_peek(compaction: *Compaction, stream_id: u32) ?Key {
567
- assert(stream_id <= 1);
568
-
569
- if (stream_id == 0) {
570
- return compaction.iterator_a.peek();
515
+ }.empty;
516
+ assert(stream_empty(&compaction.iterator_a));
517
+ assert(stream_empty(&compaction.iterator_b));
518
+
519
+ // Mark the level_a table as invisible if it was provided;
520
+ // it has been merged into level_b.
521
+ // TODO: Release the grid blocks associated with level_a as well
522
+ if (compaction.level_a_input) |*level_a_table| {
523
+ const level_a = compaction.level_b - 1;
524
+ const snapshot_max = snapshot_max_for_table_input(compaction.op_min);
525
+ compaction.manifest.update_table(level_a, snapshot_max, level_a_table);
526
+ assert(level_a_table.snapshot_max == snapshot_max);
571
527
  } else {
572
- return compaction.iterator_b.peek();
528
+ assert(compaction.level_b == 0);
573
529
  }
574
- }
575
-
576
- fn stream_pop(compaction: *Compaction, stream_id: u32) Value {
577
- assert(stream_id <= 1);
578
530
 
579
- if (stream_id == 0) {
580
- return compaction.iterator_a.pop();
581
- } else {
582
- return compaction.iterator_b.pop();
583
- }
531
+ // Finally, mark Compaction as officially complete and ready to be reset().
532
+ compaction.merge_iterator = null;
533
+ compaction.merge_done = true;
534
+ compaction.status = .done;
584
535
  }
585
536
 
586
- /// Returns true if stream A has higher precedence than stream B.
587
- /// This is used to deduplicate values across streams.
588
- ///
589
- /// This assumes that all overlapping tables in level A at the time the compaction was
590
- /// started are included in the compaction. If this is not the case, the older table
591
- /// in a pair of overlapping tables could be left in level A and shadow the newer table
592
- /// in level B, resulting in data loss/invalid data.
593
- fn stream_precedence(compaction: *Compaction, a: u32, b: u32) bool {
594
- _ = compaction;
595
-
596
- assert(a + b == 1);
597
-
598
- // A stream_id of 0 indicates the level A iterator.
599
- // All tables in level A have higher precedence.
600
- return a == 0;
537
+ pub fn reset(compaction: *Compaction) void {
538
+ assert(compaction.status == .done);
539
+ assert(compaction.callback == null);
540
+ assert(compaction.io_pending == 0);
541
+ assert(compaction.merge_done);
542
+
543
+ compaction.status = .idle;
544
+ compaction.merge_done = false;
601
545
  }
602
546
  };
603
547
  }
548
+
549
+ fn snapshot_max_for_table_input(op_min: u64) u64 {
550
+ assert(op_min % @divExact(config.lsm_batch_multiple, 2) == 0);
551
+ return op_min + @divExact(config.lsm_batch_multiple, 2) - 1;
552
+ }
553
+
554
+ fn snapshot_min_for_table_output(op_min: u64) u64 {
555
+ assert(op_min % @divExact(config.lsm_batch_multiple, 2) == 0);
556
+ return op_min + @divExact(config.lsm_batch_multiple, 2);
557
+ }