tigerbeetle-node 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. package/README.md +305 -103
  2. package/dist/index.d.ts +70 -67
  3. package/dist/index.js +70 -67
  4. package/dist/index.js.map +1 -1
  5. package/package.json +6 -6
  6. package/scripts/download_node_headers.sh +14 -7
  7. package/src/index.ts +11 -10
  8. package/src/node.zig +22 -20
  9. package/src/tigerbeetle/scripts/benchmark.bat +4 -3
  10. package/src/tigerbeetle/scripts/benchmark.sh +25 -10
  11. package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
  12. package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
  13. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
  14. package/src/tigerbeetle/scripts/install.sh +20 -4
  15. package/src/tigerbeetle/scripts/install_zig.bat +5 -1
  16. package/src/tigerbeetle/scripts/install_zig.sh +32 -26
  17. package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
  18. package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
  19. package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
  20. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
  21. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +12 -3
  22. package/src/tigerbeetle/src/benchmark.zig +19 -9
  23. package/src/tigerbeetle/src/benchmark_array_search.zig +317 -0
  24. package/src/tigerbeetle/src/benchmarks/perf.zig +299 -0
  25. package/src/tigerbeetle/src/c/tb_client/context.zig +103 -0
  26. package/src/tigerbeetle/src/c/tb_client/packet.zig +80 -0
  27. package/src/tigerbeetle/src/c/tb_client/signal.zig +288 -0
  28. package/src/tigerbeetle/src/c/tb_client/thread.zig +328 -0
  29. package/src/tigerbeetle/src/c/tb_client.h +221 -0
  30. package/src/tigerbeetle/src/c/tb_client.zig +104 -0
  31. package/src/tigerbeetle/src/c/test.zig +1 -0
  32. package/src/tigerbeetle/src/cli.zig +143 -84
  33. package/src/tigerbeetle/src/config.zig +161 -20
  34. package/src/tigerbeetle/src/demo.zig +14 -8
  35. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +2 -2
  36. package/src/tigerbeetle/src/ewah.zig +318 -0
  37. package/src/tigerbeetle/src/ewah_benchmark.zig +121 -0
  38. package/src/tigerbeetle/src/eytzinger_benchmark.zig +317 -0
  39. package/src/tigerbeetle/src/fifo.zig +17 -1
  40. package/src/tigerbeetle/src/io/darwin.zig +12 -10
  41. package/src/tigerbeetle/src/io/linux.zig +25 -9
  42. package/src/tigerbeetle/src/io/windows.zig +13 -9
  43. package/src/tigerbeetle/src/iops.zig +101 -0
  44. package/src/tigerbeetle/src/lsm/README.md +214 -0
  45. package/src/tigerbeetle/src/lsm/binary_search.zig +341 -0
  46. package/src/tigerbeetle/src/lsm/bloom_filter.zig +125 -0
  47. package/src/tigerbeetle/src/lsm/compaction.zig +557 -0
  48. package/src/tigerbeetle/src/lsm/composite_key.zig +77 -0
  49. package/src/tigerbeetle/src/lsm/direction.zig +11 -0
  50. package/src/tigerbeetle/src/lsm/eytzinger.zig +587 -0
  51. package/src/tigerbeetle/src/lsm/forest.zig +204 -0
  52. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +412 -0
  53. package/src/tigerbeetle/src/lsm/grid.zig +549 -0
  54. package/src/tigerbeetle/src/lsm/groove.zig +1002 -0
  55. package/src/tigerbeetle/src/lsm/k_way_merge.zig +474 -0
  56. package/src/tigerbeetle/src/lsm/level_iterator.zig +315 -0
  57. package/src/tigerbeetle/src/lsm/manifest.zig +580 -0
  58. package/src/tigerbeetle/src/lsm/manifest_level.zig +925 -0
  59. package/src/tigerbeetle/src/lsm/manifest_log.zig +953 -0
  60. package/src/tigerbeetle/src/lsm/node_pool.zig +231 -0
  61. package/src/tigerbeetle/src/lsm/posted_groove.zig +387 -0
  62. package/src/tigerbeetle/src/lsm/segmented_array.zig +1318 -0
  63. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
  64. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
  65. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +894 -0
  66. package/src/tigerbeetle/src/lsm/table.zig +967 -0
  67. package/src/tigerbeetle/src/lsm/table_immutable.zig +203 -0
  68. package/src/tigerbeetle/src/lsm/table_iterator.zig +306 -0
  69. package/src/tigerbeetle/src/lsm/table_mutable.zig +174 -0
  70. package/src/tigerbeetle/src/lsm/test.zig +423 -0
  71. package/src/tigerbeetle/src/lsm/tree.zig +1090 -0
  72. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +457 -0
  73. package/src/tigerbeetle/src/main.zig +141 -109
  74. package/src/tigerbeetle/src/message_bus.zig +49 -48
  75. package/src/tigerbeetle/src/message_pool.zig +22 -12
  76. package/src/tigerbeetle/src/ring_buffer.zig +126 -30
  77. package/src/tigerbeetle/src/simulator.zig +205 -140
  78. package/src/tigerbeetle/src/state_machine.zig +1268 -721
  79. package/src/tigerbeetle/src/static_allocator.zig +65 -0
  80. package/src/tigerbeetle/src/storage.zig +40 -14
  81. package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
  82. package/src/tigerbeetle/src/test/accounting/workload.zig +819 -0
  83. package/src/tigerbeetle/src/test/cluster.zig +104 -88
  84. package/src/tigerbeetle/src/test/conductor.zig +365 -0
  85. package/src/tigerbeetle/src/test/fuzz.zig +121 -0
  86. package/src/tigerbeetle/src/test/id.zig +89 -0
  87. package/src/tigerbeetle/src/test/message_bus.zig +15 -24
  88. package/src/tigerbeetle/src/test/network.zig +26 -17
  89. package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
  90. package/src/tigerbeetle/src/test/state_checker.zig +94 -68
  91. package/src/tigerbeetle/src/test/state_machine.zig +135 -69
  92. package/src/tigerbeetle/src/test/storage.zig +78 -28
  93. package/src/tigerbeetle/src/tigerbeetle.zig +19 -16
  94. package/src/tigerbeetle/src/unit_tests.zig +15 -0
  95. package/src/tigerbeetle/src/util.zig +51 -0
  96. package/src/tigerbeetle/src/vopr.zig +494 -0
  97. package/src/tigerbeetle/src/vopr_hub/README.md +58 -0
  98. package/src/tigerbeetle/src/vopr_hub/SETUP.md +199 -0
  99. package/src/tigerbeetle/src/vopr_hub/go.mod +3 -0
  100. package/src/tigerbeetle/src/vopr_hub/main.go +1022 -0
  101. package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +3 -0
  102. package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +403 -0
  103. package/src/tigerbeetle/src/vsr/client.zig +34 -7
  104. package/src/tigerbeetle/src/vsr/journal.zig +164 -174
  105. package/src/tigerbeetle/src/vsr/replica.zig +1602 -651
  106. package/src/tigerbeetle/src/vsr/superblock.zig +1761 -0
  107. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +255 -0
  108. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +644 -0
  109. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +561 -0
  110. package/src/tigerbeetle/src/vsr.zig +118 -170
  111. package/src/tigerbeetle/scripts/vopr.bat +0 -48
  112. package/src/tigerbeetle/scripts/vopr.sh +0 -33
@@ -0,0 +1,231 @@
1
+ const std = @import("std");
2
+ const assert = std.debug.assert;
3
+ const math = std.math;
4
+ const mem = std.mem;
5
+ const meta = std.meta;
6
+
7
+ pub fn NodePool(comptime _node_size: u32, comptime _node_alignment: u13) type {
8
+ return struct {
9
+ const Self = @This();
10
+
11
+ pub const node_size = _node_size;
12
+ pub const node_alignment = _node_alignment;
13
+ pub const Node = *align(node_alignment) [node_size]u8;
14
+
15
+ comptime {
16
+ assert(node_size > 0);
17
+ assert(node_alignment > 0);
18
+ assert(node_alignment <= 4096);
19
+ assert(math.isPowerOfTwo(node_size));
20
+ assert(math.isPowerOfTwo(node_alignment));
21
+ assert(node_size % node_alignment == 0);
22
+ }
23
+
24
+ buffer: []align(node_alignment) u8,
25
+ free: std.bit_set.DynamicBitSetUnmanaged,
26
+
27
+ pub fn init(allocator: mem.Allocator, node_count: u32) !Self {
28
+ assert(node_count > 0);
29
+
30
+ const size = node_size * node_count;
31
+ const buffer = try allocator.allocAdvanced(u8, node_alignment, size, .exact);
32
+ errdefer allocator.free(buffer);
33
+
34
+ const free = try std.bit_set.DynamicBitSetUnmanaged.initFull(allocator, node_count);
35
+ errdefer free.deinit(allocator);
36
+
37
+ return Self{
38
+ .buffer = buffer,
39
+ .free = free,
40
+ };
41
+ }
42
+
43
+ pub fn deinit(pool: *Self, allocator: mem.Allocator) void {
44
+ allocator.free(pool.buffer);
45
+ pool.free.deinit(allocator);
46
+ }
47
+
48
+ pub fn acquire(pool: *Self) Node {
49
+ // TODO: To ensure this "unreachable" is never reached, the primary must reject
50
+ // new requests when storage space is too low to fulfill them.
51
+ const node_index = pool.free.findFirstSet() orelse unreachable;
52
+ assert(pool.free.isSet(node_index));
53
+ pool.free.unset(node_index);
54
+
55
+ return pool.buffer[node_index * node_size ..][0..node_size];
56
+ }
57
+
58
+ pub fn release(pool: *Self, node: Node) void {
59
+ // Our pointer arithmetic assumes that the unit of node_size is a u8.
60
+ comptime assert(meta.Elem(Node) == u8);
61
+ comptime assert(meta.Elem(@TypeOf(pool.buffer)) == u8);
62
+
63
+ assert(@ptrToInt(node) >= @ptrToInt(pool.buffer.ptr));
64
+ assert(@ptrToInt(node) + node_size <= @ptrToInt(pool.buffer.ptr) + pool.buffer.len);
65
+
66
+ const node_index = @divExact(@ptrToInt(node) - @ptrToInt(pool.buffer.ptr), node_size);
67
+ assert(!pool.free.isSet(node_index));
68
+ pool.free.set(node_index);
69
+ }
70
+ };
71
+ }
72
+
73
+ fn TestContext(comptime node_size: usize, comptime node_alignment: u12) type {
74
+ const testing = std.testing;
75
+ const TestPool = NodePool(node_size, node_alignment);
76
+
77
+ const log = false;
78
+
79
+ return struct {
80
+ const Self = @This();
81
+
82
+ node_count: u32,
83
+ random: std.rand.Random,
84
+ node_pool: TestPool,
85
+ node_map: std.AutoArrayHashMap(TestPool.Node, u64),
86
+ sentinel: u64,
87
+
88
+ acquires: u64 = 0,
89
+ releases: u64 = 0,
90
+
91
+ fn init(random: std.rand.Random, node_count: u32) !Self {
92
+ var node_pool = try TestPool.init(testing.allocator, node_count);
93
+ errdefer node_pool.deinit(testing.allocator);
94
+
95
+ var node_map = std.AutoArrayHashMap(TestPool.Node, u64).init(testing.allocator);
96
+ errdefer node_map.deinit();
97
+
98
+ const sentinel = random.int(u64);
99
+ mem.set(u64, mem.bytesAsSlice(u64, node_pool.buffer), sentinel);
100
+
101
+ return Self{
102
+ .node_count = node_count,
103
+ .random = random,
104
+ .node_pool = node_pool,
105
+ .node_map = node_map,
106
+ .sentinel = sentinel,
107
+ };
108
+ }
109
+
110
+ fn deinit(context: *Self) void {
111
+ context.node_pool.deinit(testing.allocator);
112
+ context.node_map.deinit();
113
+ }
114
+
115
+ fn run(context: *Self) !void {
116
+ {
117
+ var i: usize = 0;
118
+ while (i < context.node_count * 4) : (i += 1) {
119
+ switch (context.random.uintLessThanBiased(u32, 100)) {
120
+ 0...59 => try context.acquire(),
121
+ 60...99 => try context.release(),
122
+ else => unreachable,
123
+ }
124
+ }
125
+ }
126
+
127
+ {
128
+ var i: usize = 0;
129
+ while (i < context.node_count * 4) : (i += 1) {
130
+ switch (context.random.uintLessThanBiased(u32, 100)) {
131
+ 0...39 => try context.acquire(),
132
+ 40...99 => try context.release(),
133
+ else => unreachable,
134
+ }
135
+ }
136
+ }
137
+
138
+ try context.release_all();
139
+ }
140
+
141
+ fn acquire(context: *Self) !void {
142
+ if (context.node_map.count() == context.node_count) return;
143
+
144
+ const node = context.node_pool.acquire();
145
+
146
+ // Verify that this node has not already been acquired.
147
+ for (mem.bytesAsSlice(u64, node)) |word| {
148
+ try testing.expectEqual(context.sentinel, word);
149
+ }
150
+
151
+ const gop = try context.node_map.getOrPut(node);
152
+ try testing.expect(!gop.found_existing);
153
+
154
+ // Write unique data into the node so we can test that it doesn't get overwritten.
155
+ const id = context.random.int(u64);
156
+ mem.set(u64, mem.bytesAsSlice(u64, node), id);
157
+ gop.value_ptr.* = id;
158
+
159
+ context.acquires += 1;
160
+ }
161
+
162
+ fn release(context: *Self) !void {
163
+ if (context.node_map.count() == 0) return;
164
+
165
+ const index = context.random.uintLessThanBiased(usize, context.node_map.count());
166
+ const node = context.node_map.keys()[index];
167
+ const id = context.node_map.values()[index];
168
+
169
+ // Verify that the data of this node has not been overwritten since we acquired it.
170
+ for (mem.bytesAsSlice(u64, node)) |word| {
171
+ try testing.expectEqual(id, word);
172
+ }
173
+
174
+ mem.set(u64, mem.bytesAsSlice(u64, node), context.sentinel);
175
+ context.node_pool.release(node);
176
+ context.node_map.swapRemoveAt(index);
177
+
178
+ context.releases += 1;
179
+ }
180
+
181
+ fn release_all(context: *Self) !void {
182
+ while (context.node_map.count() > 0) try context.release();
183
+
184
+ // Verify that nothing in the entire buffer has been acquired.
185
+ for (mem.bytesAsSlice(u64, context.node_pool.buffer)) |word| {
186
+ try testing.expectEqual(context.sentinel, word);
187
+ }
188
+
189
+ if (log) {
190
+ std.debug.print("\nacquires: {}, releases: {}\n", .{
191
+ context.acquires,
192
+ context.releases,
193
+ });
194
+ }
195
+
196
+ try testing.expect(context.acquires > 0);
197
+ try testing.expect(context.acquires == context.releases);
198
+ }
199
+ };
200
+ }
201
+
202
+ test "NodePool" {
203
+ const seed = 42;
204
+
205
+ var prng = std.rand.DefaultPrng.init(seed);
206
+ const random = prng.random();
207
+
208
+ const Tuple = struct {
209
+ node_size: u32,
210
+ node_alignment: u12,
211
+ };
212
+
213
+ inline for (.{
214
+ Tuple{ .node_size = 8, .node_alignment = 8 },
215
+ Tuple{ .node_size = 16, .node_alignment = 8 },
216
+ Tuple{ .node_size = 64, .node_alignment = 8 },
217
+ Tuple{ .node_size = 16, .node_alignment = 16 },
218
+ Tuple{ .node_size = 32, .node_alignment = 16 },
219
+ Tuple{ .node_size = 128, .node_alignment = 16 },
220
+ }) |tuple| {
221
+ const Context = TestContext(tuple.node_size, tuple.node_alignment);
222
+
223
+ var i: u32 = 1;
224
+ while (i < 64) : (i += 1) {
225
+ var context = try Context.init(random, i);
226
+ defer context.deinit();
227
+
228
+ try context.run();
229
+ }
230
+ }
231
+ }
@@ -0,0 +1,387 @@
1
+ const std = @import("std");
2
+ const builtin = @import("builtin");
3
+ const assert = std.debug.assert;
4
+ const math = std.math;
5
+ const mem = std.mem;
6
+
7
+ const config = @import("../config.zig");
8
+
9
+ const TableType = @import("table.zig").TableType;
10
+ const TreeType = @import("tree.zig").TreeType;
11
+ const GridType = @import("grid.zig").GridType;
12
+ const NodePool = @import("node_pool.zig").NodePool(config.lsm_manifest_node_size, 16);
13
+
14
+ const snapshot_latest = @import("tree.zig").snapshot_latest;
15
+ const compaction_snapshot_for_op = @import("tree.zig").compaction_snapshot_for_op;
16
+
17
+ /// This type wraps a single LSM tree in the API needed to integrate it with the Forest.
18
+ /// TigerBeetle's state machine requires a map from u128 ID to posted boolean for transfers
19
+ /// and this type implements that.
20
+ /// TODO Make the LSM Forest library flexible enough to be able to get rid of this special case.
21
+ pub fn PostedGrooveType(comptime Storage: type) type {
22
+ return struct {
23
+ const PostedGroove = @This();
24
+
25
+ const Value = extern struct {
26
+ id: u128,
27
+ data: enum(u8) {
28
+ posted,
29
+ voided,
30
+ tombstone,
31
+ },
32
+ padding: [15]u8 = [_]u8{0} ** 15,
33
+
34
+ comptime {
35
+ // Assert that there is no implicit padding.
36
+ assert(@sizeOf(Value) == 32);
37
+ assert(@bitSizeOf(Value) == 32 * 8);
38
+ }
39
+
40
+ inline fn compare_keys(a: u128, b: u128) math.Order {
41
+ return math.order(a, b);
42
+ }
43
+
44
+ inline fn key_from_value(value: *const Value) u128 {
45
+ return value.id;
46
+ }
47
+
48
+ const sentinel_key = math.maxInt(u128);
49
+
50
+ inline fn tombstone(value: *const Value) bool {
51
+ return value.data == .tombstone;
52
+ }
53
+
54
+ inline fn tombstone_from_key(id: u128) Value {
55
+ return .{
56
+ .id = id,
57
+ .data = .tombstone,
58
+ };
59
+ }
60
+ };
61
+
62
+ const Table = TableType(
63
+ u128,
64
+ Value,
65
+ Value.compare_keys,
66
+ Value.key_from_value,
67
+ Value.sentinel_key,
68
+ Value.tombstone,
69
+ Value.tombstone_from_key,
70
+ );
71
+
72
+ const Tree = TreeType(Table, Storage, "posted_groove");
73
+ const Grid = GridType(Storage);
74
+
75
+ const PrefetchIDs = std.AutoHashMapUnmanaged(u128, void);
76
+ const PrefetchObjects = std.AutoHashMapUnmanaged(u128, bool); // true:posted, false:voided
77
+
78
+ cache: *Tree.TableMutable.ValuesCache,
79
+ tree: Tree,
80
+
81
+ /// Object IDs enqueued to be prefetched.
82
+ /// Prefetching ensures that point lookups against the latest snapshot are synchronous.
83
+ /// This shields state machine implementations from the challenges of concurrency and I/O,
84
+ /// and enables simple state machine function signatures that commit writes atomically.
85
+ prefetch_ids: PrefetchIDs,
86
+
87
+ /// The prefetched Objects. This hash map holds the subset of objects in the LSM tree
88
+ /// that are required for the current commit. All get()/put()/remove() operations during
89
+ /// the commit are both passed to the LSM trees and mirrored in this hash map. It is always
90
+ /// sufficient to query this hashmap alone to know the state of the LSM trees.
91
+ prefetch_objects: PrefetchObjects,
92
+
93
+ /// The snapshot to prefetch from.
94
+ prefetch_snapshot: ?u64,
95
+
96
+ /// This field is necessary to expose the same open()/compact()/checkpoint() function
97
+ /// signatures as the real Groove type.
98
+ callback: ?fn (*PostedGroove) void = null,
99
+
100
+ /// See comments for Groove.Options.
101
+ pub const Options = struct {
102
+ cache_entries_max: u32,
103
+ prefetch_entries_max: u32,
104
+ commit_entries_max: u32,
105
+ };
106
+
107
+ pub fn init(
108
+ allocator: mem.Allocator,
109
+ node_pool: *NodePool,
110
+ grid: *Grid,
111
+ options: Options,
112
+ ) !PostedGroove {
113
+ // Cache is heap-allocated to pass a pointer into the Object tree.
114
+ const cache = try allocator.create(Tree.TableMutable.ValuesCache);
115
+ errdefer allocator.destroy(cache);
116
+
117
+ cache.* = try Tree.TableMutable.ValuesCache.init(allocator, options.cache_entries_max);
118
+ errdefer cache.deinit(allocator);
119
+
120
+ var tree = try Tree.init(
121
+ allocator,
122
+ node_pool,
123
+ grid,
124
+ cache,
125
+ .{
126
+ .commit_entries_max = options.commit_entries_max,
127
+ },
128
+ );
129
+ errdefer tree.deinit(allocator);
130
+
131
+ var prefetch_ids = PrefetchIDs{};
132
+ try prefetch_ids.ensureTotalCapacity(allocator, options.prefetch_entries_max);
133
+ errdefer prefetch_ids.deinit(allocator);
134
+
135
+ var prefetch_objects = PrefetchObjects{};
136
+ try prefetch_objects.ensureTotalCapacity(allocator, options.prefetch_entries_max);
137
+ errdefer prefetch_objects.deinit(allocator);
138
+
139
+ return PostedGroove{
140
+ .cache = cache,
141
+ .tree = tree,
142
+
143
+ .prefetch_ids = prefetch_ids,
144
+ .prefetch_objects = prefetch_objects,
145
+ .prefetch_snapshot = null,
146
+ };
147
+ }
148
+
149
+ pub fn deinit(groove: *PostedGroove, allocator: mem.Allocator) void {
150
+ groove.tree.deinit(allocator);
151
+ groove.cache.deinit(allocator);
152
+ allocator.destroy(groove.cache);
153
+
154
+ groove.prefetch_ids.deinit(allocator);
155
+ groove.prefetch_objects.deinit(allocator);
156
+
157
+ groove.* = undefined;
158
+ }
159
+
160
+ pub fn get(groove: *const PostedGroove, id: u128) ?bool {
161
+ return groove.prefetch_objects.get(id);
162
+ }
163
+
164
+ /// Must be called directly before the state machine begins queuing ids for prefetch.
165
+ /// When `snapshot` is null, prefetch from the current snapshot.
166
+ pub fn prefetch_setup(groove: *PostedGroove, snapshot: ?u64) void {
167
+ // We may query the input tables of an ongoing compaction, but must not query the
168
+ // output tables until the compaction is complete. (Until then, the output tables may
169
+ // be in the manifest but not yet on disk).
170
+ const snapshot_max = groove.tree.lookup_snapshot_max;
171
+ const snapshot_target = snapshot orelse snapshot_max;
172
+ assert(snapshot_target <= snapshot_max);
173
+
174
+ if (groove.prefetch_snapshot == null) {
175
+ groove.prefetch_objects.clearRetainingCapacity();
176
+ } else {
177
+ // If there is a snapshot already set from the previous prefetch_setup(), then its
178
+ // prefetch() was never called, so there must already be no queued objects or ids.
179
+ }
180
+
181
+ groove.prefetch_snapshot = snapshot_target;
182
+ assert(groove.prefetch_objects.count() == 0);
183
+ assert(groove.prefetch_ids.count() == 0);
184
+ }
185
+
186
+ /// This must be called by the state machine for every key to be prefetched.
187
+ /// We tolerate duplicate IDs enqueued by the state machine.
188
+ /// For example, if all unique operations require the same two dependencies.
189
+ pub fn prefetch_enqueue(groove: *PostedGroove, id: u128) void {
190
+ if (groove.tree.lookup_from_memory(groove.prefetch_snapshot.?, id)) |value| {
191
+ switch (value.data) {
192
+ .posted => groove.prefetch_objects.putAssumeCapacity(value.id, true),
193
+ .voided => groove.prefetch_objects.putAssumeCapacity(value.id, false),
194
+ .tombstone => {}, // Leave the ID out of prefetch_objects.
195
+ }
196
+ } else {
197
+ groove.prefetch_ids.putAssumeCapacity(id, {});
198
+ }
199
+ }
200
+
201
+ /// Ensure the objects corresponding to all ids enqueued with prefetch_enqueue() are
202
+ /// available in `prefetch_objects`.
203
+ pub fn prefetch(
204
+ groove: *PostedGroove,
205
+ callback: fn (*PrefetchContext) void,
206
+ context: *PrefetchContext,
207
+ ) void {
208
+ context.* = .{
209
+ .groove = groove,
210
+ .callback = callback,
211
+ .snapshot = groove.prefetch_snapshot.?,
212
+ .id_iterator = groove.prefetch_ids.keyIterator(),
213
+ };
214
+ groove.prefetch_snapshot = null;
215
+ context.start_workers();
216
+ }
217
+
218
+ pub const PrefetchContext = struct {
219
+ groove: *PostedGroove,
220
+ callback: fn (*PrefetchContext) void,
221
+ snapshot: u64,
222
+
223
+ id_iterator: PrefetchIDs.KeyIterator,
224
+
225
+ /// The goal is to fully utilize the disk I/O to ensure the prefetch completes as
226
+ /// quickly as possible, so we run multiple lookups in parallel based on the max
227
+ /// I/O depth of the Grid.
228
+ workers: [Grid.read_iops_max]PrefetchWorker = undefined,
229
+ /// The number of workers that are currently running in parallel.
230
+ workers_busy: u32 = 0,
231
+
232
+ fn start_workers(context: *PrefetchContext) void {
233
+ assert(context.workers_busy == 0);
234
+
235
+ // Track an extra "worker" that will finish after the loop.
236
+ //
237
+ // This prevents `context.finish()` from being called within the loop body when
238
+ // every worker finishes synchronously. `context.finish()` calls the user-provided
239
+ // callback which may re-use the memory of this `PrefetchContext`. However, we
240
+ // rely on `context` being well-defined for the loop condition.
241
+ context.workers_busy += 1;
242
+
243
+ for (context.workers) |*worker| {
244
+ worker.* = .{ .context = context };
245
+ context.workers_busy += 1;
246
+ worker.lookup_start_next();
247
+ }
248
+
249
+ assert(context.workers_busy >= 1);
250
+ context.worker_finished();
251
+ }
252
+
253
+ fn worker_finished(context: *PrefetchContext) void {
254
+ context.workers_busy -= 1;
255
+ if (context.workers_busy == 0) context.finish();
256
+ }
257
+
258
+ fn finish(context: *PrefetchContext) void {
259
+ assert(context.workers_busy == 0);
260
+
261
+ assert(context.id_iterator.next() == null);
262
+ context.groove.prefetch_ids.clearRetainingCapacity();
263
+ assert(context.groove.prefetch_ids.count() == 0);
264
+
265
+ context.callback(context);
266
+ }
267
+ };
268
+
269
+ pub const PrefetchWorker = struct {
270
+ context: *PrefetchContext,
271
+ lookup_id: Tree.LookupContext = undefined,
272
+
273
+ /// Returns true if asynchronous I/O has been started.
274
+ /// Returns false if there are no more IDs to prefetch.
275
+ fn lookup_start_next(worker: *PrefetchWorker) void {
276
+ const id = worker.context.id_iterator.next() orelse {
277
+ worker.context.worker_finished();
278
+ return;
279
+ };
280
+
281
+ if (config.verify) {
282
+ // This was checked in prefetch_enqueue().
283
+ assert(worker.context.groove.tree.lookup_from_memory(worker.context.snapshot, id.*) == null);
284
+ }
285
+
286
+ // If not in the LSM tree's cache, the object must be read from disk and added
287
+ // to the auxillary prefetch_objects hash map.
288
+ // TODO: this LSM tree function needlessly checks the LSM tree's cache a
289
+ // second time. Adding API to the LSM tree to avoid this may be worthwhile.
290
+ worker.context.groove.tree.lookup_from_levels(
291
+ lookup_id_callback,
292
+ &worker.lookup_id,
293
+ worker.context.snapshot,
294
+ id.*,
295
+ );
296
+ }
297
+
298
+ fn lookup_id_callback(
299
+ completion: *Tree.LookupContext,
300
+ result: ?*const Value,
301
+ ) void {
302
+ const worker = @fieldParentPtr(PrefetchWorker, "lookup_id", completion);
303
+ const groove = worker.context.groove;
304
+
305
+ if (result) |value| {
306
+ switch (value.data) {
307
+ .posted => {
308
+ groove.prefetch_objects.putAssumeCapacityNoClobber(value.id, true);
309
+ },
310
+ .voided => {
311
+ groove.prefetch_objects.putAssumeCapacityNoClobber(value.id, false);
312
+ },
313
+ .tombstone => {
314
+ // Leave the ID out of prefetch_objects.
315
+ },
316
+ }
317
+ }
318
+ worker.lookup_start_next();
319
+ }
320
+ };
321
+
322
+ pub fn put_no_clobber(groove: *PostedGroove, id: u128, posted: bool) void {
323
+ const gop = groove.prefetch_objects.getOrPutAssumeCapacity(id);
324
+ assert(!gop.found_existing);
325
+
326
+ const value: Value = .{
327
+ .id = id,
328
+ .data = if (posted) .posted else .voided,
329
+ };
330
+ groove.tree.put(&value);
331
+ gop.value_ptr.* = posted;
332
+ }
333
+
334
+ pub fn remove(groove: *PostedGroove, id: u128) void {
335
+ assert(groove.prefetch_objects.remove(id));
336
+ groove.tree.remove(&Value{ .id = id, .data = .tombstone });
337
+ }
338
+
339
+ fn tree_callback(tree: *Tree) void {
340
+ const groove = @fieldParentPtr(PostedGroove, "tree", tree);
341
+ const callback = groove.callback.?;
342
+ groove.callback = null;
343
+ callback(groove);
344
+ }
345
+
346
+ pub fn open(groove: *PostedGroove, callback: fn (*PostedGroove) void) void {
347
+ assert(groove.callback == null);
348
+ groove.callback = callback;
349
+ groove.tree.open(tree_callback);
350
+ }
351
+
352
+ pub fn compact(groove: *PostedGroove, callback: fn (*PostedGroove) void, op: u64) void {
353
+ assert(groove.callback == null);
354
+ groove.callback = callback;
355
+ groove.tree.compact(tree_callback, op);
356
+ }
357
+
358
+ pub fn checkpoint(groove: *PostedGroove, callback: fn (*PostedGroove) void) void {
359
+ assert(groove.callback == null);
360
+ groove.callback = callback;
361
+ groove.tree.checkpoint(tree_callback);
362
+ }
363
+ };
364
+ }
365
+
366
+ test "PostedGroove" {
367
+ const Storage = @import("../storage.zig").Storage;
368
+
369
+ const PostedGroove = PostedGrooveType(Storage);
370
+
371
+ _ = PostedGroove.init;
372
+ _ = PostedGroove.deinit;
373
+
374
+ _ = PostedGroove.get;
375
+ _ = PostedGroove.put_no_clobber;
376
+ _ = PostedGroove.remove;
377
+
378
+ _ = PostedGroove.compact;
379
+ _ = PostedGroove.checkpoint;
380
+
381
+ _ = PostedGroove.prefetch_enqueue;
382
+ _ = PostedGroove.prefetch;
383
+ _ = PostedGroove.prefetch_setup;
384
+
385
+ std.testing.refAllDecls(PostedGroove.PrefetchWorker);
386
+ std.testing.refAllDecls(PostedGroove.PrefetchContext);
387
+ }