tigerbeetle-node 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/README.md +302 -101
  2. package/dist/index.d.ts +70 -72
  3. package/dist/index.js +70 -72
  4. package/dist/index.js.map +1 -1
  5. package/package.json +6 -6
  6. package/scripts/download_node_headers.sh +14 -7
  7. package/src/index.ts +6 -10
  8. package/src/node.zig +6 -3
  9. package/src/tigerbeetle/scripts/benchmark.sh +4 -4
  10. package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
  11. package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
  12. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
  13. package/src/tigerbeetle/scripts/install.sh +19 -4
  14. package/src/tigerbeetle/scripts/install_zig.bat +5 -1
  15. package/src/tigerbeetle/scripts/install_zig.sh +24 -14
  16. package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
  17. package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
  18. package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
  19. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
  20. package/src/tigerbeetle/src/benchmark.zig +4 -2
  21. package/src/tigerbeetle/src/benchmark_array_search.zig +3 -3
  22. package/src/tigerbeetle/src/c/tb_client/thread.zig +8 -9
  23. package/src/tigerbeetle/src/c/tb_client.h +100 -80
  24. package/src/tigerbeetle/src/c/tb_client.zig +4 -1
  25. package/src/tigerbeetle/src/cli.zig +1 -1
  26. package/src/tigerbeetle/src/config.zig +48 -16
  27. package/src/tigerbeetle/src/demo.zig +3 -1
  28. package/src/tigerbeetle/src/eytzinger_benchmark.zig +3 -3
  29. package/src/tigerbeetle/src/io/linux.zig +1 -1
  30. package/src/tigerbeetle/src/lsm/README.md +214 -0
  31. package/src/tigerbeetle/src/lsm/binary_search.zig +137 -10
  32. package/src/tigerbeetle/src/lsm/bloom_filter.zig +43 -0
  33. package/src/tigerbeetle/src/lsm/compaction.zig +352 -398
  34. package/src/tigerbeetle/src/lsm/composite_key.zig +2 -0
  35. package/src/tigerbeetle/src/lsm/eytzinger.zig +1 -1
  36. package/src/tigerbeetle/src/lsm/forest.zig +21 -447
  37. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +412 -0
  38. package/src/tigerbeetle/src/lsm/grid.zig +145 -69
  39. package/src/tigerbeetle/src/lsm/groove.zig +196 -133
  40. package/src/tigerbeetle/src/lsm/k_way_merge.zig +40 -18
  41. package/src/tigerbeetle/src/lsm/level_iterator.zig +28 -9
  42. package/src/tigerbeetle/src/lsm/manifest.zig +81 -181
  43. package/src/tigerbeetle/src/lsm/manifest_level.zig +210 -454
  44. package/src/tigerbeetle/src/lsm/manifest_log.zig +77 -28
  45. package/src/tigerbeetle/src/lsm/posted_groove.zig +64 -76
  46. package/src/tigerbeetle/src/lsm/segmented_array.zig +561 -241
  47. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
  48. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
  49. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +62 -12
  50. package/src/tigerbeetle/src/lsm/table.zig +83 -48
  51. package/src/tigerbeetle/src/lsm/table_immutable.zig +30 -23
  52. package/src/tigerbeetle/src/lsm/table_iterator.zig +25 -14
  53. package/src/tigerbeetle/src/lsm/table_mutable.zig +63 -12
  54. package/src/tigerbeetle/src/lsm/test.zig +49 -55
  55. package/src/tigerbeetle/src/lsm/tree.zig +407 -402
  56. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +457 -0
  57. package/src/tigerbeetle/src/main.zig +28 -6
  58. package/src/tigerbeetle/src/message_bus.zig +2 -2
  59. package/src/tigerbeetle/src/message_pool.zig +14 -17
  60. package/src/tigerbeetle/src/simulator.zig +145 -112
  61. package/src/tigerbeetle/src/state_machine.zig +338 -228
  62. package/src/tigerbeetle/src/static_allocator.zig +65 -0
  63. package/src/tigerbeetle/src/storage.zig +3 -7
  64. package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
  65. package/src/tigerbeetle/src/test/accounting/workload.zig +819 -0
  66. package/src/tigerbeetle/src/test/cluster.zig +18 -48
  67. package/src/tigerbeetle/src/test/conductor.zig +365 -0
  68. package/src/tigerbeetle/src/test/fuzz.zig +121 -0
  69. package/src/tigerbeetle/src/test/id.zig +89 -0
  70. package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
  71. package/src/tigerbeetle/src/test/state_checker.zig +93 -69
  72. package/src/tigerbeetle/src/test/state_machine.zig +11 -35
  73. package/src/tigerbeetle/src/test/storage.zig +29 -8
  74. package/src/tigerbeetle/src/tigerbeetle.zig +14 -16
  75. package/src/tigerbeetle/src/unit_tests.zig +7 -0
  76. package/src/tigerbeetle/src/vopr.zig +494 -0
  77. package/src/tigerbeetle/src/vopr_hub/README.md +58 -0
  78. package/src/tigerbeetle/src/vopr_hub/SETUP.md +199 -0
  79. package/src/tigerbeetle/src/vopr_hub/go.mod +3 -0
  80. package/src/tigerbeetle/src/vopr_hub/main.go +1022 -0
  81. package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +3 -0
  82. package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +403 -0
  83. package/src/tigerbeetle/src/vsr/client.zig +13 -0
  84. package/src/tigerbeetle/src/vsr/journal.zig +16 -13
  85. package/src/tigerbeetle/src/vsr/replica.zig +924 -491
  86. package/src/tigerbeetle/src/vsr/superblock.zig +55 -37
  87. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -10
  88. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +2 -2
  89. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +18 -3
  90. package/src/tigerbeetle/src/vsr.zig +75 -55
  91. package/src/tigerbeetle/scripts/vopr.bat +0 -48
  92. package/src/tigerbeetle/scripts/vopr.sh +0 -33
@@ -31,8 +31,15 @@ const vsr = @import("../vsr.zig");
31
31
 
32
32
  const SuperBlockType = vsr.SuperBlockType;
33
33
  const GridType = @import("grid.zig").GridType;
34
+ const BlockType = @import("grid.zig").BlockType;
34
35
  const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
35
36
 
37
+ /// ManifestLog block schema:
38
+ /// │ vsr.Header │ operation=BlockType.manifest
39
+ /// │ [entry_count_max]Label │ level index, insert|remove
40
+ /// │ [≤entry_count_max]TableInfo │
41
+ /// │ […]u8{0} │ padding (to end of block)
42
+ /// Label and TableInfo entries correspond.
36
43
  pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
37
44
  return struct {
38
45
  const ManifestLog = @This();
@@ -40,8 +47,8 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
40
47
  const SuperBlock = SuperBlockType(Storage);
41
48
  const Grid = GridType(Storage);
42
49
 
43
- const BlockPtr = *align(config.sector_size) [config.block_size]u8;
44
- const BlockPtrConst = *align(config.sector_size) const [config.block_size]u8;
50
+ const BlockPtr = Grid.BlockPtr;
51
+ const BlockPtrConst = Grid.BlockPtrConst;
45
52
 
46
53
  pub const Callback = fn (manifest_log: *ManifestLog) void;
47
54
 
@@ -99,20 +106,27 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
99
106
  blocks_closed: u8 = 0,
100
107
 
101
108
  /// The number of entries in the open block.
109
+ ///
110
+ /// Invariants:
111
+ /// - When `entry_count = 0`, there is no open block.
112
+ /// - `entry_count < entry_count_max`. When `entry_count` reaches the maximum, the open
113
+ /// block is closed, and `entry_count` resets to 0.
102
114
  entry_count: u32 = 0,
103
115
 
104
116
  opened: bool = false,
105
117
  open_event: OpenEvent = undefined,
106
118
  open_iterator: SuperBlock.Manifest.IteratorReverse = undefined,
107
119
 
120
+ /// Set for the duration of `compact`.
108
121
  reading: bool = false,
109
122
  read: Grid.Read = undefined,
110
- read_callback: Callback = undefined,
123
+ read_callback: ?Callback = null,
111
124
  read_block_reference: ?SuperBlock.Manifest.BlockReference = null,
112
125
 
126
+ /// Set for the duration of `flush` and `checkpoint`.
113
127
  writing: bool = false,
114
128
  write: Grid.Write = undefined,
115
- write_callback: Callback = undefined,
129
+ write_callback: ?Callback = null,
116
130
 
117
131
  pub fn init(allocator: mem.Allocator, grid: *Grid, tree_hash: u128) !ManifestLog {
118
132
  // TODO RingBuffer for .pointer should be extended to take care of alignment:
@@ -154,6 +168,11 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
154
168
  assert(!manifest_log.opened);
155
169
  assert(!manifest_log.reading);
156
170
  assert(!manifest_log.writing);
171
+ assert(manifest_log.read_callback == null);
172
+
173
+ assert(manifest_log.blocks.count == 0);
174
+ assert(manifest_log.blocks_closed == 0);
175
+ assert(manifest_log.entry_count == 0);
157
176
 
158
177
  manifest_log.open_event = event;
159
178
  manifest_log.open_iterator = manifest_log.superblock.manifest.iterator_reverse(
@@ -171,6 +190,10 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
171
190
  assert(manifest_log.reading);
172
191
  assert(!manifest_log.writing);
173
192
 
193
+ assert(manifest_log.blocks.count == 0);
194
+ assert(manifest_log.blocks_closed == 0);
195
+ assert(manifest_log.entry_count == 0);
196
+
174
197
  manifest_log.read_block_reference = manifest_log.open_iterator.next();
175
198
 
176
199
  if (manifest_log.read_block_reference) |block| {
@@ -182,15 +205,16 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
182
205
  &manifest_log.read,
183
206
  block.address,
184
207
  block.checksum,
208
+ .manifest,
185
209
  );
186
210
  } else {
187
211
  manifest_log.opened = true;
188
212
  manifest_log.open_event = undefined;
189
213
  manifest_log.open_iterator = undefined;
190
214
 
191
- const callback = manifest_log.read_callback;
215
+ const callback = manifest_log.read_callback.?;
192
216
  manifest_log.reading = false;
193
- manifest_log.read_callback = undefined;
217
+ manifest_log.read_callback = null;
194
218
  assert(manifest_log.read_block_reference == null);
195
219
 
196
220
  callback(manifest_log);
@@ -229,6 +253,10 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
229
253
  }
230
254
  }
231
255
 
256
+ if (block_entry_count(block) < entry_count_max) {
257
+ manifest.queue_for_compaction(block_reference.address);
258
+ }
259
+
232
260
  log.debug("{}: opened: checksum={} address={} entries={}", .{
233
261
  manifest_log.tree_hash,
234
262
  block_reference.checksum,
@@ -259,11 +287,8 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
259
287
  assert(table.snapshot_min > 0);
260
288
  assert(table.snapshot_max > table.snapshot_min);
261
289
 
262
- if (manifest_log.blocks.empty()) {
263
- manifest_log.acquire_block();
264
- } else if (manifest_log.entry_count == entry_count_max) {
265
- assert(manifest_log.blocks.count > 0);
266
- manifest_log.close_block();
290
+ if (manifest_log.entry_count == 0) {
291
+ assert(manifest_log.blocks.count == manifest_log.blocks_closed);
267
292
  manifest_log.acquire_block();
268
293
  } else if (manifest_log.entry_count > 0) {
269
294
  assert(manifest_log.blocks.count > 0);
@@ -302,12 +327,18 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
302
327
  }
303
328
 
304
329
  manifest_log.entry_count += 1;
330
+ if (manifest_log.entry_count == entry_count_max) {
331
+ manifest_log.close_block();
332
+ assert(manifest_log.entry_count == 0);
333
+ }
305
334
  }
306
335
 
307
- pub fn flush(manifest_log: *ManifestLog, callback: Callback) void {
336
+ /// `flush` does not close a partial block; that is only necessary during `checkpoint`.
337
+ fn flush(manifest_log: *ManifestLog, callback: Callback) void {
308
338
  assert(manifest_log.opened);
309
339
  assert(!manifest_log.reading);
310
340
  assert(!manifest_log.writing);
341
+ assert(manifest_log.write_callback == null);
311
342
 
312
343
  manifest_log.writing = true;
313
344
  manifest_log.write_callback = callback;
@@ -332,8 +363,8 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
332
363
  assert(manifest_log.entry_count < entry_count_max);
333
364
  }
334
365
 
335
- const callback = manifest_log.write_callback;
336
- manifest_log.write_callback = undefined;
366
+ const callback = manifest_log.write_callback.?;
367
+ manifest_log.write_callback = null;
337
368
  manifest_log.writing = false;
338
369
 
339
370
  callback(manifest_log);
@@ -350,6 +381,7 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
350
381
  const entry_count = block_entry_count(block);
351
382
 
352
383
  if (manifest_log.blocks_closed == 1 and manifest_log.blocks.count == 1) {
384
+ // This might be the last block of a checkpoint, which can be a partial block.
353
385
  assert(entry_count > 0);
354
386
  } else {
355
387
  assert(entry_count == entry_count_max);
@@ -395,41 +427,48 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
395
427
  }
396
428
 
397
429
  pub fn compact(manifest_log: *ManifestLog, callback: Callback) void {
430
+ assert(manifest_log.opened);
398
431
  assert(!manifest_log.reading);
432
+ assert(!manifest_log.writing);
433
+ assert(manifest_log.read_callback == null);
399
434
  manifest_log.read_callback = callback;
400
- manifest_log.flush(flush_callback);
435
+ manifest_log.flush(compact_flush_callback);
401
436
  }
402
437
 
403
- fn flush_callback(manifest_log: *ManifestLog) void {
404
- const callback = manifest_log.read_callback;
405
- manifest_log.read_callback = undefined;
438
+ fn compact_flush_callback(manifest_log: *ManifestLog) void {
439
+ const callback = manifest_log.read_callback.?;
406
440
 
407
441
  assert(manifest_log.opened);
408
442
  assert(!manifest_log.reading);
409
443
  assert(!manifest_log.writing);
444
+ assert(manifest_log.blocks_closed == 0);
410
445
 
411
446
  const manifest: *SuperBlock.Manifest = &manifest_log.superblock.manifest;
412
447
 
448
+ // Compact a single manifest block — to minimize latency spikes, we want to do the bare
449
+ // minimum of compaction work required.
450
+ // TODO Compact more than 1 block if fragmentation is outstripping the compaction rate.
413
451
  if (manifest.oldest_block_queued_for_compaction(manifest_log.tree_hash)) |block| {
414
452
  assert(block.tree == manifest_log.tree_hash);
415
453
  assert(block.address > 0);
416
454
 
417
455
  manifest_log.reading = true;
418
- manifest_log.read_callback = callback;
419
456
  manifest_log.read_block_reference = block;
420
457
 
421
458
  manifest_log.grid.read_block(
422
- compact_callback,
459
+ compact_read_block_callback,
423
460
  &manifest_log.read,
424
461
  block.address,
425
462
  block.checksum,
463
+ .manifest,
426
464
  );
427
465
  } else {
466
+ manifest_log.read_callback = null;
428
467
  callback(manifest_log);
429
468
  }
430
469
  }
431
470
 
432
- fn compact_callback(read: *Grid.Read, block: BlockPtrConst) void {
471
+ fn compact_read_block_callback(read: *Grid.Read, block: BlockPtrConst) void {
433
472
  const manifest_log = @fieldParentPtr(ManifestLog, "read", read);
434
473
  assert(manifest_log.opened);
435
474
  assert(manifest_log.reading);
@@ -489,11 +528,11 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
489
528
  );
490
529
  assert(!manifest.queued_for_compaction(block_reference.address));
491
530
 
492
- manifest_log.superblock.free_set.release_at_checkpoint(block_reference.address);
531
+ manifest_log.grid.release_at_checkpoint(block_reference.address);
493
532
 
494
- const callback = manifest_log.read_callback;
533
+ const callback = manifest_log.read_callback.?;
495
534
  manifest_log.reading = false;
496
- manifest_log.read_callback = undefined;
535
+ manifest_log.read_callback = null;
497
536
  manifest_log.read_block_reference = null;
498
537
 
499
538
  callback(manifest_log);
@@ -503,6 +542,7 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
503
542
  assert(manifest_log.opened);
504
543
  assert(!manifest_log.reading);
505
544
  assert(!manifest_log.writing);
545
+ assert(manifest_log.write_callback == null);
506
546
 
507
547
  manifest_log.writing = true;
508
548
  manifest_log.write_callback = callback;
@@ -519,7 +559,9 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
519
559
  }
520
560
 
521
561
  fn acquire_block(manifest_log: *ManifestLog) void {
562
+ assert(manifest_log.opened);
522
563
  assert(manifest_log.entry_count == 0);
564
+ assert(manifest_log.blocks.count == manifest_log.blocks_closed);
523
565
  assert(!manifest_log.blocks.full());
524
566
 
525
567
  manifest_log.blocks.advance_tail();
@@ -529,15 +571,16 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
529
571
  const header = mem.bytesAsValue(vsr.Header, block[0..@sizeOf(vsr.Header)]);
530
572
  header.* = .{
531
573
  .cluster = manifest_log.superblock.working.cluster,
532
- .op = manifest_log.superblock.free_set.acquire().?,
574
+ .op = manifest_log.grid.acquire(),
533
575
  .size = undefined,
534
576
  .command = .block,
535
577
  };
536
578
  }
537
579
 
538
580
  fn close_block(manifest_log: *ManifestLog) void {
539
- const block: BlockPtr = manifest_log.blocks.tail().?;
581
+ assert(manifest_log.blocks.count == manifest_log.blocks_closed + 1);
540
582
 
583
+ const block: BlockPtr = manifest_log.blocks.tail().?;
541
584
  const entry_count = manifest_log.entry_count;
542
585
  assert(entry_count > 0);
543
586
  assert(entry_count <= entry_count_max);
@@ -554,6 +597,7 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
554
597
  // Zero unused tables, and padding:
555
598
  mem.set(u8, block[header.size..], 0);
556
599
 
600
+ header.operation = BlockType.manifest.operation();
557
601
  header.set_checksum_body(block[@sizeOf(vsr.Header)..header.size]);
558
602
  header.set_checksum();
559
603
 
@@ -569,10 +613,12 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
569
613
 
570
614
  manifest_log.blocks_closed += 1;
571
615
  manifest_log.entry_count = 0;
616
+ assert(manifest_log.blocks.count == manifest_log.blocks_closed);
572
617
  }
573
618
 
574
619
  fn verify_block(block: BlockPtrConst, checksum: ?u128, address: ?u64) void {
575
620
  const header = mem.bytesAsValue(vsr.Header, block[0..@sizeOf(vsr.Header)]);
621
+ assert(BlockType.from(header.operation) == .manifest);
576
622
 
577
623
  if (config.verify) {
578
624
  assert(header.valid_checksum());
@@ -623,6 +669,8 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
623
669
 
624
670
  // Encode the smaller type first because this will be multiplied by entry_count_max.
625
671
  const labels_size = entry_count_max * @sizeOf(Label);
672
+ assert(labels_size == labels_size_max);
673
+ assert((@sizeOf(vsr.Header) + labels_size) % @alignOf(TableInfo) == 0);
626
674
  const tables_size = entry_count * @sizeOf(TableInfo);
627
675
 
628
676
  return @sizeOf(vsr.Header) + labels_size + tables_size;
@@ -649,14 +697,14 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
649
697
  fn tables(block: BlockPtr) *[entry_count_max]TableInfo {
650
698
  return mem.bytesAsSlice(
651
699
  TableInfo,
652
- block[@sizeOf(vsr.Header) + entry_count_max ..][0..tables_size_max],
700
+ block[@sizeOf(vsr.Header) + labels_size_max ..][0..tables_size_max],
653
701
  )[0..entry_count_max];
654
702
  }
655
703
 
656
704
  fn tables_const(block: BlockPtrConst) *const [entry_count_max]TableInfo {
657
705
  return mem.bytesAsSlice(
658
706
  TableInfo,
659
- block[@sizeOf(vsr.Header) + entry_count_max ..][0..tables_size_max],
707
+ block[@sizeOf(vsr.Header) + labels_size_max ..][0..tables_size_max],
660
708
  )[0..entry_count_max];
661
709
  }
662
710
  };
@@ -893,6 +941,7 @@ pub fn main() !void {
893
941
  };
894
942
  assert(@sizeOf(TableInfo) == 48 + 16 * 2);
895
943
  assert(@alignOf(TableInfo) == 16);
944
+ assert(@bitSizeOf(TableInfo) == @sizeOf(TableInfo) * 8);
896
945
 
897
946
  const ManifestLogTest = ManifestLogTestType(Storage, TableInfo);
898
947
 
@@ -12,6 +12,7 @@ const GridType = @import("grid.zig").GridType;
12
12
  const NodePool = @import("node_pool.zig").NodePool(config.lsm_manifest_node_size, 16);
13
13
 
14
14
  const snapshot_latest = @import("tree.zig").snapshot_latest;
15
+ const compaction_snapshot_for_op = @import("tree.zig").compaction_snapshot_for_op;
15
16
 
16
17
  /// This type wraps a single LSM tree in the API needed to integrate it with the Forest.
17
18
  /// TigerBeetle's state machine requires a map from u128 ID to posted boolean for transfers
@@ -44,7 +45,6 @@ pub fn PostedGrooveType(comptime Storage: type) type {
44
45
  return value.id;
45
46
  }
46
47
 
47
- // TODO(ifreund): disallow this id in the state machine.
48
48
  const sentinel_key = math.maxInt(u128);
49
49
 
50
50
  inline fn tombstone(value: *const Value) bool {
@@ -69,13 +69,13 @@ pub fn PostedGrooveType(comptime Storage: type) type {
69
69
  Value.tombstone_from_key,
70
70
  );
71
71
 
72
- const Tree = TreeType(Table, Storage, "groove");
72
+ const Tree = TreeType(Table, Storage, "posted_groove");
73
73
  const Grid = GridType(Storage);
74
74
 
75
75
  const PrefetchIDs = std.AutoHashMapUnmanaged(u128, void);
76
- const PrefetchObjects = std.AutoHashMapUnmanaged(u128, bool);
76
+ const PrefetchObjects = std.AutoHashMapUnmanaged(u128, bool); // true:posted, false:voided
77
77
 
78
- cache: *Tree.ValueCache,
78
+ cache: *Tree.TableMutable.ValuesCache,
79
79
  tree: Tree,
80
80
 
81
81
  /// Object IDs enqueued to be prefetched.
@@ -90,42 +90,31 @@ pub fn PostedGrooveType(comptime Storage: type) type {
90
90
  /// sufficient to query this hashmap alone to know the state of the LSM trees.
91
91
  prefetch_objects: PrefetchObjects,
92
92
 
93
- /// This field is necessary to expose the same open()/compact_cpu()/compact_io() function
93
+ /// The snapshot to prefetch from.
94
+ prefetch_snapshot: ?u64,
95
+
96
+ /// This field is necessary to expose the same open()/compact()/checkpoint() function
94
97
  /// signatures as the real Groove type.
95
98
  callback: ?fn (*PostedGroove) void = null,
96
99
 
100
+ /// See comments for Groove.Options.
101
+ pub const Options = struct {
102
+ cache_entries_max: u32,
103
+ prefetch_entries_max: u32,
104
+ commit_entries_max: u32,
105
+ };
106
+
97
107
  pub fn init(
98
108
  allocator: mem.Allocator,
99
109
  node_pool: *NodePool,
100
110
  grid: *Grid,
101
- // The cache size is meant to be computed based on the left over available memory
102
- // that tigerbeetle was given to allocate from CLI arguments.
103
- cache_size: u32,
104
- // In general, the commit count max for a field, depends on the field's object,
105
- // how many objects might be changed by a batch:
106
- // (config.message_size_max - sizeOf(vsr.header))
107
- // For example, there are at most 8191 transfers in a batch.
108
- // So commit_count_max=8191 for transfer objects and indexes.
109
- //
110
- // However, if a transfer is ever mutated, then this will double commit_count_max
111
- // since the old index might need to be removed, and the new index inserted.
112
- //
113
- // A way to see this is by looking at the state machine. If a transfer is inserted,
114
- // how many accounts and transfer put/removes will be generated?
115
- //
116
- // This also means looking at the state machine operation that will generate the
117
- // most put/removes in the worst case.
118
- // For example, create_accounts will put at most 8191 accounts.
119
- // However, create_transfers will put 2 accounts (8191 * 2) for every transfer, and
120
- // some of these accounts may exist, requiring a remove/put to update the index.
121
- commit_count_max: u32,
111
+ options: Options,
122
112
  ) !PostedGroove {
123
- // Cache is dynamically allocated to pass a pointer into the Object tree.
124
- const cache = try allocator.create(Tree.ValueCache);
113
+ // Cache is heap-allocated to pass a pointer into the Object tree.
114
+ const cache = try allocator.create(Tree.TableMutable.ValuesCache);
125
115
  errdefer allocator.destroy(cache);
126
116
 
127
- cache.* = .{};
128
- try cache.ensureTotalCapacity(allocator, cache_size);
117
+ cache.* = try Tree.TableMutable.ValuesCache.init(allocator, options.cache_entries_max);
129
118
  errdefer cache.deinit(allocator);
130
119
 
131
120
  var tree = try Tree.init(
@@ -134,20 +123,17 @@ pub fn PostedGrooveType(comptime Storage: type) type {
134
123
  grid,
135
124
  cache,
136
125
  .{
137
- .commit_count_max = commit_count_max,
126
+ .commit_entries_max = options.commit_entries_max,
138
127
  },
139
128
  );
140
129
  errdefer tree.deinit(allocator);
141
130
 
142
- // TODO: document why this is twice the commit count max.
143
- const prefetch_count_max = commit_count_max * 2;
144
-
145
131
  var prefetch_ids = PrefetchIDs{};
146
- try prefetch_ids.ensureTotalCapacity(allocator, prefetch_count_max);
132
+ try prefetch_ids.ensureTotalCapacity(allocator, options.prefetch_entries_max);
147
133
  errdefer prefetch_ids.deinit(allocator);
148
134
 
149
135
  var prefetch_objects = PrefetchObjects{};
150
- try prefetch_objects.ensureTotalCapacity(allocator, prefetch_count_max);
136
+ try prefetch_objects.ensureTotalCapacity(allocator, options.prefetch_entries_max);
151
137
  errdefer prefetch_objects.deinit(allocator);
152
138
 
153
139
  return PostedGroove{
@@ -156,12 +142,11 @@ pub fn PostedGrooveType(comptime Storage: type) type {
156
142
 
157
143
  .prefetch_ids = prefetch_ids,
158
144
  .prefetch_objects = prefetch_objects,
145
+ .prefetch_snapshot = null,
159
146
  };
160
147
  }
161
148
 
162
149
  pub fn deinit(groove: *PostedGroove, allocator: mem.Allocator) void {
163
- assert(groove.callback == null);
164
-
165
150
  groove.tree.deinit(allocator);
166
151
  groove.cache.deinit(allocator);
167
152
  allocator.destroy(groove.cache);
@@ -176,10 +161,24 @@ pub fn PostedGrooveType(comptime Storage: type) type {
176
161
  return groove.prefetch_objects.get(id);
177
162
  }
178
163
 
179
- /// Must be called directly after the state machine commit is finished and prefetch results
180
- /// are no longer needed.
181
- pub fn prefetch_clear(groove: *PostedGroove) void {
182
- groove.prefetch_objects.clearRetainingCapacity();
164
+ /// Must be called directly before the state machine begins queuing ids for prefetch.
165
+ /// When `snapshot` is null, prefetch from the current snapshot.
166
+ pub fn prefetch_setup(groove: *PostedGroove, snapshot: ?u64) void {
167
+ // We may query the input tables of an ongoing compaction, but must not query the
168
+ // output tables until the compaction is complete. (Until then, the output tables may
169
+ // be in the manifest but not yet on disk).
170
+ const snapshot_max = groove.tree.lookup_snapshot_max;
171
+ const snapshot_target = snapshot orelse snapshot_max;
172
+ assert(snapshot_target <= snapshot_max);
173
+
174
+ if (groove.prefetch_snapshot == null) {
175
+ groove.prefetch_objects.clearRetainingCapacity();
176
+ } else {
177
+ // If there is a snapshot already set from the previous prefetch_setup(), then its
178
+ // prefetch() was never called, so there must already be no queued objects or ids.
179
+ }
180
+
181
+ groove.prefetch_snapshot = snapshot_target;
183
182
  assert(groove.prefetch_objects.count() == 0);
184
183
  assert(groove.prefetch_ids.count() == 0);
185
184
  }
@@ -188,7 +187,7 @@ pub fn PostedGrooveType(comptime Storage: type) type {
188
187
  /// We tolerate duplicate IDs enqueued by the state machine.
189
188
  /// For example, if all unique operations require the same two dependencies.
190
189
  pub fn prefetch_enqueue(groove: *PostedGroove, id: u128) void {
191
- if (groove.tree.get_cached(id)) |value| {
190
+ if (groove.tree.lookup_from_memory(groove.prefetch_snapshot.?, id)) |value| {
192
191
  switch (value.data) {
193
192
  .posted => groove.prefetch_objects.putAssumeCapacity(value.id, true),
194
193
  .voided => groove.prefetch_objects.putAssumeCapacity(value.id, false),
@@ -200,8 +199,7 @@ pub fn PostedGrooveType(comptime Storage: type) type {
200
199
  }
201
200
 
202
201
  /// Ensure the objects corresponding to all ids enqueued with prefetch_enqueue() are
203
- /// in memory, either in the value cache of the object tree or in the prefetch_objects
204
- /// backup hash map.
202
+ /// available in `prefetch_objects`.
205
203
  pub fn prefetch(
206
204
  groove: *PostedGroove,
207
205
  callback: fn (*PrefetchContext) void,
@@ -210,14 +208,17 @@ pub fn PostedGrooveType(comptime Storage: type) type {
210
208
  context.* = .{
211
209
  .groove = groove,
212
210
  .callback = callback,
211
+ .snapshot = groove.prefetch_snapshot.?,
213
212
  .id_iterator = groove.prefetch_ids.keyIterator(),
214
213
  };
214
+ groove.prefetch_snapshot = null;
215
215
  context.start_workers();
216
216
  }
217
217
 
218
218
  pub const PrefetchContext = struct {
219
219
  groove: *PostedGroove,
220
220
  callback: fn (*PrefetchContext) void,
221
+ snapshot: u64,
221
222
 
222
223
  id_iterator: PrefetchIDs.KeyIterator,
223
224
 
@@ -233,17 +234,16 @@ pub fn PostedGrooveType(comptime Storage: type) type {
233
234
 
234
235
  // Track an extra "worker" that will finish after the loop.
235
236
  //
236
- // This prevents `context.finish()` from being called within the loop body when every
237
- // worker finishes synchronously. `context.finish()` sets the `context` to undefined,
238
- // but `context` is required for the last loop condition check.
237
+ // This prevents `context.finish()` from being called within the loop body when
238
+ // every worker finishes synchronously. `context.finish()` calls the user-provided
239
+ // callback which may re-use the memory of this `PrefetchContext`. However, we
240
+ // rely on `context` being well-defined for the loop condition.
239
241
  context.workers_busy += 1;
240
242
 
241
- // -1 to ignore the extra worker.
242
- while (context.workers_busy - 1 < context.workers.len) {
243
- const worker = &context.workers[context.workers_busy - 1];
243
+ for (context.workers) |*worker| {
244
244
  worker.* = .{ .context = context };
245
245
  context.workers_busy += 1;
246
- if (!worker.lookup_start()) break;
246
+ worker.lookup_start_next();
247
247
  }
248
248
 
249
249
  assert(context.workers_busy >= 1);
@@ -257,12 +257,12 @@ pub fn PostedGrooveType(comptime Storage: type) type {
257
257
 
258
258
  fn finish(context: *PrefetchContext) void {
259
259
  assert(context.workers_busy == 0);
260
- assert(context.groove.prefetch_ids.count() == 0);
260
+
261
261
  assert(context.id_iterator.next() == null);
262
+ context.groove.prefetch_ids.clearRetainingCapacity();
263
+ assert(context.groove.prefetch_ids.count() == 0);
262
264
 
263
- const callback = context.callback;
264
- context.* = undefined;
265
- callback(context);
265
+ context.callback(context);
266
266
  }
267
267
  };
268
268
 
@@ -272,33 +272,27 @@ pub fn PostedGrooveType(comptime Storage: type) type {
272
272
 
273
273
  /// Starts the lookup for the next enqueued ID, if there is one; `lookup_id_callback` continues the chain.
274
274
  /// When there are no more IDs to prefetch, reports this worker as finished instead.
275
- fn lookup_start(worker: *PrefetchWorker) bool {
276
- const groove = worker.context.groove;
277
-
275
+ fn lookup_start_next(worker: *PrefetchWorker) void {
278
276
  const id = worker.context.id_iterator.next() orelse {
279
- groove.prefetch_ids.clearRetainingCapacity();
280
- assert(groove.prefetch_ids.count() == 0);
281
277
  worker.context.worker_finished();
282
- return false;
278
+ return;
283
279
  };
284
280
 
285
281
  if (config.verify) {
286
- // This is checked in prefetch_enqueue()
287
- assert(groove.tree.get_cached(id.*) == null);
282
+ // This was checked in prefetch_enqueue().
283
+ assert(worker.context.groove.tree.lookup_from_memory(worker.context.snapshot, id.*) == null);
288
284
  }
289
285
 
290
286
  // If not in the LSM tree's cache, the object must be read from disk and added
291
287
  // to the auxiliary prefetch_objects hash map.
292
288
  // TODO: this LSM tree function needlessly checks the LSM tree's cache a
293
289
  // second time. Adding API to the LSM tree to avoid this may be worthwhile.
294
- groove.tree.lookup(
290
+ worker.context.groove.tree.lookup_from_levels(
295
291
  lookup_id_callback,
296
292
  &worker.lookup_id,
297
- snapshot_latest,
293
+ worker.context.snapshot,
298
294
  id.*,
299
295
  );
300
-
301
- return true;
302
296
  }
303
297
 
304
298
  fn lookup_id_callback(
@@ -321,13 +315,7 @@ pub fn PostedGrooveType(comptime Storage: type) type {
321
315
  },
322
316
  }
323
317
  }
324
- worker.lookup_finish();
325
- }
326
-
327
- fn lookup_finish(worker: *PrefetchWorker) void {
328
- if (!worker.lookup_start()) {
329
- worker.* = undefined;
330
- }
318
+ worker.lookup_start_next();
331
319
  }
332
320
  };
333
321
 
@@ -392,7 +380,7 @@ test "PostedGroove" {
392
380
 
393
381
  _ = PostedGroove.prefetch_enqueue;
394
382
  _ = PostedGroove.prefetch;
395
- _ = PostedGroove.prefetch_clear;
383
+ _ = PostedGroove.prefetch_setup;
396
384
 
397
385
  std.testing.refAllDecls(PostedGroove.PrefetchWorker);
398
386
  std.testing.refAllDecls(PostedGroove.PrefetchContext);