tigerbeetle-node 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. package/README.md +3 -2
  2. package/dist/index.d.ts +66 -61
  3. package/dist/index.js +66 -61
  4. package/dist/index.js.map +1 -1
  5. package/package.json +1 -1
  6. package/src/index.ts +5 -0
  7. package/src/node.zig +17 -18
  8. package/src/tigerbeetle/scripts/benchmark.bat +4 -3
  9. package/src/tigerbeetle/scripts/benchmark.sh +25 -10
  10. package/src/tigerbeetle/scripts/install.sh +2 -1
  11. package/src/tigerbeetle/scripts/install_zig.sh +14 -18
  12. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +12 -3
  13. package/src/tigerbeetle/scripts/vopr.sh +5 -5
  14. package/src/tigerbeetle/src/benchmark.zig +17 -9
  15. package/src/tigerbeetle/src/benchmark_array_search.zig +317 -0
  16. package/src/tigerbeetle/src/benchmarks/perf.zig +299 -0
  17. package/src/tigerbeetle/src/c/tb_client/context.zig +103 -0
  18. package/src/tigerbeetle/src/c/tb_client/packet.zig +80 -0
  19. package/src/tigerbeetle/src/c/tb_client/signal.zig +288 -0
  20. package/src/tigerbeetle/src/c/tb_client/thread.zig +329 -0
  21. package/src/tigerbeetle/src/c/tb_client.h +201 -0
  22. package/src/tigerbeetle/src/c/tb_client.zig +101 -0
  23. package/src/tigerbeetle/src/c/test.zig +1 -0
  24. package/src/tigerbeetle/src/cli.zig +142 -83
  25. package/src/tigerbeetle/src/config.zig +119 -10
  26. package/src/tigerbeetle/src/demo.zig +12 -8
  27. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +2 -2
  28. package/src/tigerbeetle/src/ewah.zig +318 -0
  29. package/src/tigerbeetle/src/ewah_benchmark.zig +121 -0
  30. package/src/tigerbeetle/src/eytzinger_benchmark.zig +317 -0
  31. package/src/tigerbeetle/src/fifo.zig +17 -1
  32. package/src/tigerbeetle/src/io/darwin.zig +12 -10
  33. package/src/tigerbeetle/src/io/linux.zig +25 -9
  34. package/src/tigerbeetle/src/io/windows.zig +13 -9
  35. package/src/tigerbeetle/src/iops.zig +101 -0
  36. package/src/tigerbeetle/src/lsm/binary_search.zig +214 -0
  37. package/src/tigerbeetle/src/lsm/bloom_filter.zig +82 -0
  38. package/src/tigerbeetle/src/lsm/compaction.zig +603 -0
  39. package/src/tigerbeetle/src/lsm/composite_key.zig +75 -0
  40. package/src/tigerbeetle/src/lsm/direction.zig +11 -0
  41. package/src/tigerbeetle/src/lsm/eytzinger.zig +587 -0
  42. package/src/tigerbeetle/src/lsm/forest.zig +630 -0
  43. package/src/tigerbeetle/src/lsm/grid.zig +473 -0
  44. package/src/tigerbeetle/src/lsm/groove.zig +939 -0
  45. package/src/tigerbeetle/src/lsm/k_way_merge.zig +452 -0
  46. package/src/tigerbeetle/src/lsm/level_iterator.zig +296 -0
  47. package/src/tigerbeetle/src/lsm/manifest.zig +680 -0
  48. package/src/tigerbeetle/src/lsm/manifest_level.zig +1169 -0
  49. package/src/tigerbeetle/src/lsm/manifest_log.zig +904 -0
  50. package/src/tigerbeetle/src/lsm/node_pool.zig +231 -0
  51. package/src/tigerbeetle/src/lsm/posted_groove.zig +399 -0
  52. package/src/tigerbeetle/src/lsm/segmented_array.zig +998 -0
  53. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +844 -0
  54. package/src/tigerbeetle/src/lsm/table.zig +932 -0
  55. package/src/tigerbeetle/src/lsm/table_immutable.zig +196 -0
  56. package/src/tigerbeetle/src/lsm/table_iterator.zig +295 -0
  57. package/src/tigerbeetle/src/lsm/table_mutable.zig +123 -0
  58. package/src/tigerbeetle/src/lsm/test.zig +429 -0
  59. package/src/tigerbeetle/src/lsm/tree.zig +1085 -0
  60. package/src/tigerbeetle/src/main.zig +119 -109
  61. package/src/tigerbeetle/src/message_bus.zig +49 -48
  62. package/src/tigerbeetle/src/message_pool.zig +15 -2
  63. package/src/tigerbeetle/src/ring_buffer.zig +126 -30
  64. package/src/tigerbeetle/src/simulator.zig +76 -44
  65. package/src/tigerbeetle/src/state_machine.zig +1022 -585
  66. package/src/tigerbeetle/src/storage.zig +46 -16
  67. package/src/tigerbeetle/src/test/cluster.zig +109 -63
  68. package/src/tigerbeetle/src/test/message_bus.zig +15 -24
  69. package/src/tigerbeetle/src/test/network.zig +26 -17
  70. package/src/tigerbeetle/src/test/state_checker.zig +7 -5
  71. package/src/tigerbeetle/src/test/state_machine.zig +159 -69
  72. package/src/tigerbeetle/src/test/storage.zig +57 -28
  73. package/src/tigerbeetle/src/tigerbeetle.zig +5 -0
  74. package/src/tigerbeetle/src/unit_tests.zig +8 -0
  75. package/src/tigerbeetle/src/util.zig +51 -0
  76. package/src/tigerbeetle/src/vsr/client.zig +21 -7
  77. package/src/tigerbeetle/src/vsr/journal.zig +154 -167
  78. package/src/tigerbeetle/src/vsr/replica.zig +744 -226
  79. package/src/tigerbeetle/src/vsr/superblock.zig +1743 -0
  80. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +258 -0
  81. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +644 -0
  82. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +546 -0
  83. package/src/tigerbeetle/src/vsr.zig +43 -115
@@ -0,0 +1,603 @@
+ const std = @import("std");
+ const mem = std.mem;
+ const math = std.math;
+ const assert = std.debug.assert;
+
+ const config = @import("../config.zig");
+
+ const GridType = @import("grid.zig").GridType;
+ const ManifestType = @import("manifest.zig").ManifestType;
+ const KWayMergeIterator = @import("k_way_merge.zig").KWayMergeIterator;
+ const TableIteratorType = @import("table_iterator.zig").TableIteratorType;
+ const LevelIteratorType = @import("level_iterator.zig").LevelIteratorType;
+
+ pub fn CompactionType(
+     comptime Table: type,
+     comptime Storage: type,
+     comptime IteratorAType: anytype, // fn (Table: type, Storage: type) type
+ ) type {
+     const Key = Table.Key;
+     const Value = Table.Value;
+     const tombstone = Table.tombstone;
+     const compare_keys = Table.compare_keys;
+
+     return struct {
+         const Compaction = @This();
+
+         const Grid = GridType(Storage);
+         const BlockPtr = Grid.BlockPtr;
+         const Manifest = ManifestType(Table, Storage);
+         const TableInfo = Manifest.TableInfo;
+
+         const IteratorA = IteratorAType(Table, Storage);
+         const IteratorB = LevelIteratorType(Table, Storage);
+
+         pub const Callback = fn (it: *Compaction) void;
+
+         const k = 2;
+         const MergeIterator = KWayMergeIterator(
+             Compaction,
+             Key,
+             Value,
+             Table.key_from_value,
+             Table.compare_keys,
+             k,
+             stream_peek,
+             stream_pop,
+             stream_precedence,
+         );
+
+         const Status = enum {
+             idle,
+             compacting,
+             done,
+         };
+
+         const BlockPtrConst = *align(config.sector_size) const [config.block_size]u8;
+         const BlockWrite = struct {
+             block: BlockPtr,
+             write: Grid.Write = undefined,
+             ready: bool = false,
+         };
+
+         const TableInfoBuffer = @import("manifest.zig").TableInfoBufferType(Table, .ascending);
+
+         status: Status,
+
+         grid: *Grid,
+         manifest: *Manifest,
+         level_b: u8,
+         range: Manifest.CompactionRange,
+         snapshot: u64,
+         drop_tombstones: bool,
+
+         callback: ?Callback = null,
+         ticks: u32 = 0,
+         io_pending: u32 = 0,
+
+         iterator_a: IteratorA,
+         iterator_b: IteratorB,
+
+         /// Private:
+         /// The caller must use the Callback's `done` argument to know when compaction is done,
+         /// because a write I/O may yet follow even after the merge is done.
+         merge_done: bool = false,
+         merge_iterator: MergeIterator,
+         table_builder: Table.Builder,
+
+         index: BlockWrite,
+         filter: BlockWrite,
+         data: BlockWrite,
+
+         remove_level_a: ?*const TableInfo = null,
+         update_level_b: TableInfoBuffer,
+         insert_level_b: TableInfoBuffer,
+
+         pub fn init(allocator: mem.Allocator) !Compaction {
+             var iterator_a = try IteratorA.init(allocator);
+             errdefer iterator_a.deinit(allocator);
+
+             var iterator_b = try IteratorB.init(allocator);
+             errdefer iterator_b.deinit(allocator);
+
+             var table_builder = try Table.Builder.init(allocator);
+             errdefer table_builder.deinit(allocator);
+
+             const index = BlockWrite{ .block = try allocate_block(allocator) };
+             errdefer allocator.free(index.block);
+
+             const filter = BlockWrite{ .block = try allocate_block(allocator) };
+             errdefer allocator.free(filter.block);
+
+             const data = BlockWrite{ .block = try allocate_block(allocator) };
+             errdefer allocator.free(data.block);
+
+             // The average number of tables involved in a compaction is the 1 table from level A,
+             // plus the growth_factor number of tables from level B, plus 1 on either side,
+             // since the overlap may not be perfectly aligned to table boundaries.
+             // However, the worst case number of tables may approach all tables in level B,
+             // since key ranges may be skewed and not evenly distributed across a level.
+             const table_buffer_count_max = 1 + config.lsm_growth_factor + 2;
+
+             var update_level_b = try TableInfoBuffer.init(allocator, table_buffer_count_max);
+             errdefer update_level_b.deinit(allocator);
+
+             var insert_level_b = try TableInfoBuffer.init(allocator, table_buffer_count_max);
+             errdefer insert_level_b.deinit(allocator);
+
+             return Compaction{
+                 .status = .idle,
+
+                 // Assigned by start():
+                 .grid = undefined,
+                 .manifest = undefined,
+                 .level_b = undefined,
+                 .range = undefined,
+                 .snapshot = undefined,
+                 .drop_tombstones = undefined,
+
+                 .iterator_a = iterator_a,
+                 .iterator_b = iterator_b,
+
+                 .merge_iterator = undefined, // This can only be initialized on tick 1.
+                 .table_builder = table_builder,
+
+                 .index = index,
+                 .filter = filter,
+                 .data = data,
+
+                 .update_level_b = update_level_b,
+                 .insert_level_b = insert_level_b,
+             };
+         }
+
+         fn allocate_block(allocator: mem.Allocator) !BlockPtr {
+             const block = try allocator.alignedAlloc(u8, config.sector_size, config.block_size);
+             return block[0..config.block_size];
+         }
+
+         pub fn deinit(compaction: *Compaction, allocator: mem.Allocator) void {
+             compaction.iterator_a.deinit(allocator);
+             compaction.iterator_b.deinit(allocator);
+             compaction.table_builder.deinit(allocator);
+             compaction.update_level_b.deinit(allocator);
+             compaction.insert_level_b.deinit(allocator);
+
+             allocator.free(compaction.index.block);
+             allocator.free(compaction.filter.block);
+             allocator.free(compaction.data.block);
+         }
+
+         pub fn start(
+             compaction: *Compaction,
+             grid: *Grid,
+             manifest: *Manifest,
+             // TODO level_a_table: ?TableInfo,
+             level_b: u8,
+             range: Manifest.CompactionRange,
+             snapshot: u64,
+             iterator_a_context: IteratorA.Context,
+         ) void {
+             assert(compaction.status == .idle);
+             assert(compaction.callback == null);
+             assert(compaction.io_pending == 0);
+             assert(level_b < config.lsm_levels);
+             assert(range.table_count > 0);
+
+             const drop_tombstones = manifest.compaction_must_drop_tombstones(level_b, range);
+             assert(drop_tombstones or level_b < config.lsm_levels - 1);
+
+             compaction.* = .{
+                 .status = .compacting,
+
+                 .grid = grid,
+                 .manifest = manifest,
+                 .level_b = level_b,
+                 .range = range,
+                 .snapshot = snapshot,
+                 .drop_tombstones = drop_tombstones,
+
+                 .iterator_a = compaction.iterator_a,
+                 .iterator_b = compaction.iterator_b,
+
+                 .merge_iterator = undefined,
+                 .table_builder = compaction.table_builder,
+
+                 .index = compaction.index,
+                 .filter = compaction.filter,
+                 .data = compaction.data,
+
+                 .update_level_b = compaction.update_level_b,
+                 .insert_level_b = compaction.insert_level_b,
+             };
+
+             assert(!compaction.data.ready);
+             assert(!compaction.filter.ready);
+             assert(!compaction.index.ready);
+
+             // TODO Reset builder.
+
+             // TODO: Enable when move_table() can fetch TableInfo from address/checksum.
+             //
+             // Perform a "compaction move" to the next level inline if certain factors allow:
+             // - Can only do the specialization if there's a single table to compact.
+             // - Must be compacting from a table iterator which has an address and checksum.
+             // - Cannot drop tombstones as then we have to go through the normal compaction path.
+             // - Cannot be performing the immutable table -> level 0 compaction
+             //   as it requires the table being moved to reside on disk (tracked by manifest).
+             if (false and IteratorA.Context == TableIteratorType(Table, Storage)) {
+                 if (!drop_tombstones and range.table_count == 1) {
+                     assert(compaction.level_b != 0);
+                     assert(compaction.status == .compacting);
+
+                     const level_a = level_b - 1;
+                     assert(level_a < config.lsm_levels - 1);
+
+                     compaction.manifest.move_table(
+                         level_a,
+                         level_b,
+                         snapshot,
+                         iterator_a_context.address,
+                         iterator_a_context.checksum,
+                     );
+
+                     compaction.status = .done;
+                     return;
+                 }
+             }
+
+             const iterator_b_context = .{
+                 .grid = grid,
+                 .manifest = manifest,
+                 .level = level_b,
+                 .snapshot = snapshot,
+                 .key_min = range.key_min,
+                 .key_max = range.key_max,
+                 .direction = .ascending,
+                 .table_info_callback = iterator_b_table_info_callback, // TODO
+             };
+
+             compaction.iterator_a.start(iterator_a_context, iterator_a_callback);
+             compaction.iterator_b.start(iterator_b_context, iterator_b_callback);
+         }
+
+         fn iterator_b_table_info_callback(
+             iterator_b: *IteratorB,
+             table: *const TableInfo,
+             index_block: BlockPtrConst,
+         ) void {
+             const compaction = @fieldParentPtr(Compaction, "iterator_b", iterator_b);
+             compaction.queue_manifest_update(&compaction.update_level_b, table);
+
+             // Release the table's block addresses if it's invisible to the compaction.
+             if (table.invisible(&.{compaction.snapshot})) {
+                 compaction.grid.release(Table.index_block_address(index_block));
+
+                 for (Table.index_filter_addresses_used(index_block)) |address| {
+                     compaction.grid.release(address);
+                 }
+
+                 for (Table.index_data_addresses_used(index_block)) |address| {
+                     compaction.grid.release(address);
+                 }
+             }
+         }
+
+         fn queue_manifest_update(
+             compaction: *Compaction,
+             buffer: *TableInfoBuffer,
+             table: *const TableInfo,
+         ) void {
+             assert(buffer == &compaction.update_level_b or buffer == &compaction.insert_level_b);
+             if (buffer.full()) compaction.update_manifest(buffer);
+             buffer.push(table);
+         }
+
+         fn update_manifest(compaction: *Compaction, buffer: *TableInfoBuffer) void {
+             assert(buffer == &compaction.update_level_b or buffer == &compaction.insert_level_b);
+
+             const tables: []const TableInfo = buffer.drain();
+             if (tables.len == 0) return;
+
+             for (tables) |table| {
+                 assert(compare_keys(table.key_min, compaction.range.key_min) != .lt);
+                 assert(compare_keys(table.key_max, compaction.range.key_max) != .gt);
+             }
+
+             if (buffer == &compaction.update_level_b) {
+                 compaction.manifest.update_tables(compaction.level_b, compaction.snapshot, tables);
+             } else {
+                 compaction.manifest.insert_tables(compaction.level_b, tables);
+             }
+         }
+
+         /// Compaction.ticks stages at which each action may be performed.
+         const pipeline_tick_read = 0;
+         const pipeline_tick_merge = 1;
+         const pipeline_tick_write = 2;
+
+         /// Submits all read/write I/O before starting the CPU-intensive k-way merge.
+         /// This allows the I/O to happen in parallel with the merge.
+         ///
+         /// The caller must call:
+         ///
+         /// 1. tick_io() across all trees,
+         /// 2. IO.tick() to submit these I/O operations to the kernel,
+         /// 3. tick_cpu() across all trees.
+         pub fn tick_io(compaction: *Compaction, callback: Callback) void {
+             assert(compaction.status == .compacting);
+             assert(compaction.callback == null);
+             assert(compaction.io_pending == 0);
+             assert(!compaction.merge_done);
+
+             compaction.callback = callback;
+
+             if (compaction.ticks >= pipeline_tick_read) compaction.tick_io_read();
+             if (compaction.ticks >= pipeline_tick_write) compaction.tick_io_write();
+
+             // All values may be eclipsed by tombstones, with no write I/O pending here.
+         }
+
+         pub fn tick_cpu(compaction: *Compaction) void {
+             assert(compaction.status == .compacting);
+             assert(compaction.callback != null);
+             assert(compaction.io_pending >= 0);
+             assert(!compaction.merge_done);
+
+             if (compaction.ticks == pipeline_tick_merge) {
+                 // We cannot initialize the merge until we can peek() a value from each stream,
+                 // which depends on tick 0 (to read blocks) having happened.
+                 compaction.merge_iterator = MergeIterator.init(compaction, k, .ascending);
+             }
+
+             if (compaction.ticks >= pipeline_tick_merge) {
+                 if (compaction.merge_iterator.empty()) {
+                     assert(!compaction.merge_done);
+
+                     // We must distinguish between merge_iterator.empty() and merge_done.
+                     // The former cannot be accessed before MergeIterator.init() on tick 1.
+                     compaction.merge_done = true;
+                 } else {
+                     compaction.tick_cpu_merge();
+                 }
+             }
+
+             compaction.ticks += 1;
+
+             // Normally, a tick completes only after a read/write I/O.
+             // However, the compaction may drop only tombstones, resulting in no write I/O.
+             if (compaction.io_pending == 0) compaction.tick_done();
+         }
+
+         fn tick_done(compaction: *Compaction) void {
+             assert(compaction.status == .compacting);
+             assert(compaction.callback != null);
+             assert(compaction.io_pending == 0);
+
+             // Consume the callback and invoke it once finished updating state below.
+             const callback = compaction.callback.?;
+             compaction.callback = null;
+             defer callback(compaction);
+
+             // Once merge completes, the compaction is now officially over.
+             if (compaction.merge_done) {
+                 compaction.status = .done;
+
+                 // Flush updates to the table infos discovered during compaction
+                 // TODO Handle compaction.remove_level_a
+                 compaction.update_manifest(&compaction.update_level_b);
+                 compaction.update_manifest(&compaction.insert_level_b);
+             }
+         }
+
+         pub fn reset(compaction: *Compaction) void {
+             assert(compaction.callback == null);
+             assert(compaction.io_pending == 0);
+
+             assert(compaction.status == .done);
+             compaction.status = .idle;
+
+             assert(compaction.update_level_b.drain().len == 0);
+             assert(compaction.insert_level_b.drain().len == 0);
+         }
+
+         fn tick_io_read(compaction: *Compaction) void {
+             assert(compaction.callback != null);
+
+             if (compaction.iterator_a.tick()) compaction.io_pending += 1;
+             if (compaction.iterator_b.tick()) compaction.io_pending += 1;
+
+             if (compaction.merge_done) assert(compaction.io_pending == 0);
+         }
+
+         fn tick_io_write(compaction: *Compaction) void {
+             assert(compaction.callback != null);
+             assert(compaction.ticks >= pipeline_tick_write);
+             // There may be no data block to write if all values are eclipsed by tombstones.
+             assert(compaction.data.ready or !compaction.data.ready);
+
+             compaction.write_block_if_ready(&compaction.data, write_block_callback("data"));
+             compaction.write_block_if_ready(&compaction.filter, write_block_callback("filter"));
+             compaction.write_block_if_ready(&compaction.index, write_block_callback("index"));
+
+             assert(!compaction.data.ready);
+             assert(!compaction.filter.ready);
+             assert(!compaction.index.ready);
+         }
+
+         fn tick_cpu_merge(compaction: *Compaction) void {
+             assert(compaction.callback != null);
+             assert(compaction.ticks >= pipeline_tick_merge);
+             assert(!compaction.merge_done);
+             assert(!compaction.merge_iterator.empty());
+
+             assert(!compaction.data.ready);
+             assert(!compaction.filter.ready);
+             assert(!compaction.index.ready);
+
+             var tombstones_dropped: u32 = 0;
+             while (!compaction.table_builder.data_block_full()) {
+                 const value = compaction.merge_iterator.pop() orelse {
+                     compaction.assert_read_iterators_empty();
+                     break;
+                 };
+                 if (compaction.drop_tombstones and tombstone(&value)) {
+                     tombstones_dropped += 1;
+                 } else {
+                     compaction.table_builder.data_block_append(&value);
+                 }
+             }
+
+             if (compaction.table_builder.data_block_empty()) {
+                 assert(compaction.drop_tombstones);
+                 assert(tombstones_dropped > 0);
+             } else {
+                 compaction.table_builder.data_block_finish(.{
+                     .cluster = compaction.grid.superblock.working.cluster,
+                     .address = compaction.grid.acquire(),
+                 });
+                 swap_buffers(&compaction.data, &compaction.table_builder.data_block);
+                 assert(compaction.data.ready);
+
+                 if (!compaction.merge_iterator.empty()) {
+                     // Ensure that the block was filled completely.
+                     const values_used = Table.data_block_values_used(compaction.data.block).len;
+                     assert(values_used == Table.data.value_count_max);
+                 }
+             }
+
+             if (compaction.table_builder.filter_block_full() or
+                 compaction.table_builder.index_block_full() or
+                 compaction.merge_iterator.empty())
+             {
+                 if (compaction.table_builder.filter_block_empty()) {
+                     assert(compaction.drop_tombstones);
+                     assert(tombstones_dropped > 0);
+                 } else {
+                     compaction.table_builder.filter_block_finish(.{
+                         .cluster = compaction.grid.superblock.working.cluster,
+                         .address = compaction.grid.acquire(),
+                     });
+                     swap_buffers(&compaction.filter, &compaction.table_builder.filter_block);
+                     assert(compaction.filter.ready);
+                 }
+             }
+
+             if (compaction.table_builder.index_block_full() or
+                 compaction.merge_iterator.empty())
+             {
+                 if (compaction.table_builder.index_block_empty()) {
+                     assert(compaction.drop_tombstones);
+                     assert(tombstones_dropped > 0);
+                 } else {
+                     const snapshot_min = compaction.snapshot;
+                     const table = compaction.table_builder.index_block_finish(.{
+                         .cluster = compaction.grid.superblock.working.cluster,
+                         .address = compaction.grid.acquire(),
+                         .snapshot_min = snapshot_min,
+                     });
+                     compaction.queue_manifest_update(&compaction.insert_level_b, &table);
+
+                     swap_buffers(&compaction.index, &compaction.table_builder.index_block);
+                     assert(compaction.index.ready);
+                 }
+             }
+         }
+
+         fn iterator_a_callback(iterator_a: *IteratorA) void {
+             const compaction = @fieldParentPtr(Compaction, "iterator_a", iterator_a);
+             compaction.io_callback();
+         }
+
+         fn iterator_b_callback(iterator_b: *IteratorB) void {
+             const compaction = @fieldParentPtr(Compaction, "iterator_b", iterator_b);
+             compaction.io_callback();
+         }
+
+         fn io_callback(compaction: *Compaction) void {
+             compaction.io_pending -= 1;
+             if (compaction.io_pending == 0) compaction.tick_done();
+         }
+
+         fn write_block_if_ready(
+             compaction: *Compaction,
+             block_write: *BlockWrite,
+             callback: fn (*Grid.Write) void,
+         ) void {
+             if (block_write.ready) {
+                 block_write.ready = false;
+
+                 compaction.io_pending += 1;
+                 compaction.grid.write_block(
+                     callback,
+                     &block_write.write,
+                     block_write.block,
+                     Table.block_address(block_write.block),
+                 );
+             }
+         }
+
+         fn write_block_callback(comptime field: []const u8) fn (*Grid.Write) void {
+             return struct {
+                 fn callback(write: *Grid.Write) void {
+                     const block_write = @fieldParentPtr(BlockWrite, "write", write);
+                     const compaction = @fieldParentPtr(Compaction, field, block_write);
+
+                     io_callback(compaction);
+                 }
+             }.callback;
+         }
+
+         fn swap_buffers(block_write: *BlockWrite, block_ready: *BlockPtr) void {
+             mem.swap(BlockPtr, &block_write.block, block_ready);
+
+             assert(!block_write.ready);
+             block_write.ready = true;
+         }
+
+         fn assert_read_iterators_empty(compaction: Compaction) void {
+             assert(compaction.iterator_a.buffered_all_values());
+             assert(compaction.iterator_a.peek() == null);
+
+             assert(compaction.iterator_b.buffered_all_values());
+             assert(compaction.iterator_b.peek() == null);
+         }
+
+         fn stream_peek(compaction: *Compaction, stream_id: u32) ?Key {
+             assert(stream_id <= 1);
+
+             if (stream_id == 0) {
+                 return compaction.iterator_a.peek();
+             } else {
+                 return compaction.iterator_b.peek();
+             }
+         }
+
+         fn stream_pop(compaction: *Compaction, stream_id: u32) Value {
+             assert(stream_id <= 1);
+
+             if (stream_id == 0) {
+                 return compaction.iterator_a.pop();
+             } else {
+                 return compaction.iterator_b.pop();
+             }
+         }
+
+         /// Returns true if stream A has higher precedence than stream B.
+         /// This is used to deduplicate values across streams.
+         ///
+         /// This assumes that all overlapping tables in level A at the time the compaction was
+         /// started are included in the compaction. If this is not the case, the older table
+         /// in a pair of overlapping tables could be left in level A and shadow the newer table
+         /// in level B, resulting in data loss/invalid data.
+         fn stream_precedence(compaction: *Compaction, a: u32, b: u32) bool {
+             _ = compaction;
+
+             assert(a + b == 1);
+
+             // A stream_id of 0 indicates the level A iterator.
+             // All tables in level A have higher precedence.
+             return a == 0;
+         }
+     };
+ }
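The tick_io()/tick_cpu() doc comment above defines the pipeline contract: queue I/O for every tree, submit it to the kernel, then run the CPU-bound merge while that I/O is in flight. The following is a minimal sketch of a driver loop honouring that contract; the `io` handle, the `compactions` slice, the `compact_tick` and `on_compaction_tick` names, and the completion handling are illustrative assumptions, not code from this package (the actual callers live in the tree/forest modules).

// Sketch only.
// Assume: const Compaction = CompactionType(Table, Storage, TableIteratorType);
// `io.tick()` stands in for whatever submits queued storage I/O to the kernel.
fn compact_tick(io: anytype, compactions: []*Compaction) !void {
    // 1. Queue this tick's reads (and any writes produced by the previous tick).
    for (compactions) |compaction| {
        if (compaction.status == .compacting) compaction.tick_io(on_compaction_tick);
    }

    // 2. Submit the queued I/O so it proceeds concurrently with the merge below.
    try io.tick();

    // 3. Run the CPU-intensive k-way merge while that I/O is in flight.
    for (compactions) |compaction| {
        if (compaction.status == .compacting) compaction.tick_cpu();
    }
}

// Invoked once per compaction when all I/O for the tick has completed.
// A compaction is only finished when its status has reached .done; a trailing
// write I/O may still complete after the merge itself is done.
fn on_compaction_tick(compaction: *Compaction) void {
    if (compaction.status == .done) compaction.reset();
}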
@@ -0,0 +1,75 @@
+ const std = @import("std");
+ const assert = std.debug.assert;
+ const math = std.math;
+
+ pub fn CompositeKey(comptime Field: type) type {
+     assert(Field == u128 or Field == u64);
+
+     return packed struct {
+         const Self = @This();
+
+         pub const sentinel_key: Self = .{
+             .field = math.maxInt(Field),
+             .timestamp = math.maxInt(u64),
+         };
+
+         const tombstone_bit = 1 << 63;
+
+         // If zeroed padding is needed after the timestamp field.
+         const pad = Field == u128;
+
+         pub const Value = packed struct {
+             field: Field align(@alignOf(Field)),
+             /// The most significant bit indicates if the value is a tombstone.
+             timestamp: u64 align(@alignOf(u64)),
+             padding: (if (pad) u64 else u0) = 0,
+
+             comptime {
+                 assert(@sizeOf(Value) == @sizeOf(Field) * 2);
+                 assert(@alignOf(Value) == @alignOf(Field));
+             }
+         };
+
+         field: Field align(@alignOf(Field)),
+         /// The most significant bit must be unset as it is used to indicate a tombstone.
+         timestamp: u64 align(@alignOf(u64)),
+         padding: (if (pad) u64 else u0) = 0,
+
+         comptime {
+             assert(@sizeOf(Self) == @sizeOf(Field) * 2);
+             assert(@alignOf(Self) == @alignOf(Field));
+         }
+
+         pub inline fn compare_keys(a: Self, b: Self) math.Order {
+             if (a.field < b.field) {
+                 return .lt;
+             } else if (a.field > b.field) {
+                 return .gt;
+             } else if (a.timestamp < b.timestamp) {
+                 return .lt;
+             } else if (a.timestamp > b.timestamp) {
+                 return .gt;
+             } else {
+                 return .eq;
+             }
+         }
+
+         pub inline fn key_from_value(value: *const Value) Self {
+             return .{
+                 .field = value.field,
+                 .timestamp = @truncate(u63, value.timestamp),
+             };
+         }
+
+         pub inline fn tombstone(value: *const Value) bool {
+             return (value.timestamp & tombstone_bit) != 0;
+         }
+
+         pub inline fn tombstone_from_key(key: Self) Value {
+             return .{
+                 .field = key.field,
+                 .timestamp = key.timestamp | tombstone_bit,
+             };
+         }
+     };
+ }
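CompositeKey packs a secondary-index field together with the object timestamp, reserving the timestamp's most significant bit to mark tombstones; key_from_value truncates that bit away so a tombstone sorts at the same position as the value it deletes. A small illustrative test of that contract (assuming it sits alongside composite_key.zig; it is not part of the published diff):

const std = @import("std");
const CompositeKey = @import("composite_key.zig").CompositeKey;

test "CompositeKey: ordering and tombstone encoding (illustrative)" {
    const Key = CompositeKey(u64);

    // Keys order by field first, then by timestamp.
    const a = Key{ .field = 1, .timestamp = 10 };
    const b = Key{ .field = 1, .timestamp = 20 };
    const c = Key{ .field = 2, .timestamp = 5 };
    try std.testing.expectEqual(std.math.Order.lt, Key.compare_keys(a, b));
    try std.testing.expectEqual(std.math.Order.lt, Key.compare_keys(b, c));

    // A tombstone sets the timestamp's high bit in the Value; key_from_value
    // truncates it back off, so the tombstone sorts at the same key position
    // as the value it deletes.
    const dead = Key.tombstone_from_key(a);
    try std.testing.expect(Key.tombstone(&dead));
    try std.testing.expectEqual(a, Key.key_from_value(&dead));
}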
@@ -0,0 +1,11 @@
+ pub const Direction = enum {
+     ascending,
+     descending,
+
+     pub fn reverse(d: Direction) Direction {
+         return switch (d) {
+             .ascending => .descending,
+             .descending => .ascending,
+         };
+     }
+ };
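Direction is the ordering parameter threaded through the level and table iterators (for example, the `.direction = .ascending` field in the compaction's iterator_b_context above), and reverse() flips a scan's order. A trivial illustrative test, again assuming it lives next to direction.zig rather than being part of the published diff:

const std = @import("std");
const Direction = @import("direction.zig").Direction;

test "Direction.reverse flips the scan order" {
    try std.testing.expectEqual(Direction.descending, Direction.ascending.reverse());
    try std.testing.expectEqual(Direction.ascending, Direction.descending.reverse());
}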