tigerbeetle-node 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/README.md +3 -2
  2. package/dist/index.d.ts +66 -61
  3. package/dist/index.js +66 -61
  4. package/dist/index.js.map +1 -1
  5. package/package.json +1 -1
  6. package/src/index.ts +5 -0
  7. package/src/node.zig +17 -18
  8. package/src/tigerbeetle/scripts/benchmark.bat +4 -3
  9. package/src/tigerbeetle/scripts/benchmark.sh +25 -10
  10. package/src/tigerbeetle/scripts/install.sh +2 -1
  11. package/src/tigerbeetle/scripts/install_zig.sh +14 -18
  12. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +12 -3
  13. package/src/tigerbeetle/scripts/vopr.sh +5 -5
  14. package/src/tigerbeetle/src/benchmark.zig +17 -9
  15. package/src/tigerbeetle/src/benchmark_array_search.zig +317 -0
  16. package/src/tigerbeetle/src/benchmarks/perf.zig +299 -0
  17. package/src/tigerbeetle/src/c/tb_client/context.zig +103 -0
  18. package/src/tigerbeetle/src/c/tb_client/packet.zig +80 -0
  19. package/src/tigerbeetle/src/c/tb_client/signal.zig +288 -0
  20. package/src/tigerbeetle/src/c/tb_client/thread.zig +329 -0
  21. package/src/tigerbeetle/src/c/tb_client.h +201 -0
  22. package/src/tigerbeetle/src/c/tb_client.zig +101 -0
  23. package/src/tigerbeetle/src/c/test.zig +1 -0
  24. package/src/tigerbeetle/src/cli.zig +142 -83
  25. package/src/tigerbeetle/src/config.zig +119 -10
  26. package/src/tigerbeetle/src/demo.zig +12 -8
  27. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +2 -2
  28. package/src/tigerbeetle/src/ewah.zig +318 -0
  29. package/src/tigerbeetle/src/ewah_benchmark.zig +121 -0
  30. package/src/tigerbeetle/src/eytzinger_benchmark.zig +317 -0
  31. package/src/tigerbeetle/src/fifo.zig +17 -1
  32. package/src/tigerbeetle/src/io/darwin.zig +12 -10
  33. package/src/tigerbeetle/src/io/linux.zig +25 -9
  34. package/src/tigerbeetle/src/io/windows.zig +13 -9
  35. package/src/tigerbeetle/src/iops.zig +101 -0
  36. package/src/tigerbeetle/src/lsm/binary_search.zig +214 -0
  37. package/src/tigerbeetle/src/lsm/bloom_filter.zig +82 -0
  38. package/src/tigerbeetle/src/lsm/compaction.zig +603 -0
  39. package/src/tigerbeetle/src/lsm/composite_key.zig +75 -0
  40. package/src/tigerbeetle/src/lsm/direction.zig +11 -0
  41. package/src/tigerbeetle/src/lsm/eytzinger.zig +587 -0
  42. package/src/tigerbeetle/src/lsm/forest.zig +630 -0
  43. package/src/tigerbeetle/src/lsm/grid.zig +473 -0
  44. package/src/tigerbeetle/src/lsm/groove.zig +939 -0
  45. package/src/tigerbeetle/src/lsm/k_way_merge.zig +452 -0
  46. package/src/tigerbeetle/src/lsm/level_iterator.zig +296 -0
  47. package/src/tigerbeetle/src/lsm/manifest.zig +680 -0
  48. package/src/tigerbeetle/src/lsm/manifest_level.zig +1169 -0
  49. package/src/tigerbeetle/src/lsm/manifest_log.zig +904 -0
  50. package/src/tigerbeetle/src/lsm/node_pool.zig +231 -0
  51. package/src/tigerbeetle/src/lsm/posted_groove.zig +399 -0
  52. package/src/tigerbeetle/src/lsm/segmented_array.zig +998 -0
  53. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +844 -0
  54. package/src/tigerbeetle/src/lsm/table.zig +932 -0
  55. package/src/tigerbeetle/src/lsm/table_immutable.zig +196 -0
  56. package/src/tigerbeetle/src/lsm/table_iterator.zig +295 -0
  57. package/src/tigerbeetle/src/lsm/table_mutable.zig +123 -0
  58. package/src/tigerbeetle/src/lsm/test.zig +429 -0
  59. package/src/tigerbeetle/src/lsm/tree.zig +1085 -0
  60. package/src/tigerbeetle/src/main.zig +119 -109
  61. package/src/tigerbeetle/src/message_bus.zig +49 -48
  62. package/src/tigerbeetle/src/message_pool.zig +15 -2
  63. package/src/tigerbeetle/src/ring_buffer.zig +126 -30
  64. package/src/tigerbeetle/src/simulator.zig +76 -44
  65. package/src/tigerbeetle/src/state_machine.zig +1022 -585
  66. package/src/tigerbeetle/src/storage.zig +46 -16
  67. package/src/tigerbeetle/src/test/cluster.zig +109 -63
  68. package/src/tigerbeetle/src/test/message_bus.zig +15 -24
  69. package/src/tigerbeetle/src/test/network.zig +26 -17
  70. package/src/tigerbeetle/src/test/state_checker.zig +7 -5
  71. package/src/tigerbeetle/src/test/state_machine.zig +159 -69
  72. package/src/tigerbeetle/src/test/storage.zig +57 -28
  73. package/src/tigerbeetle/src/tigerbeetle.zig +5 -0
  74. package/src/tigerbeetle/src/unit_tests.zig +8 -0
  75. package/src/tigerbeetle/src/util.zig +51 -0
  76. package/src/tigerbeetle/src/vsr/client.zig +21 -7
  77. package/src/tigerbeetle/src/vsr/journal.zig +154 -167
  78. package/src/tigerbeetle/src/vsr/replica.zig +744 -226
  79. package/src/tigerbeetle/src/vsr/superblock.zig +1743 -0
  80. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +258 -0
  81. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +644 -0
  82. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +546 -0
  83. package/src/tigerbeetle/src/vsr.zig +43 -115
@@ -0,0 +1,939 @@
1
+ const std = @import("std");
2
+ const builtin = @import("builtin");
3
+ const assert = std.debug.assert;
4
+ const math = std.math;
5
+ const mem = std.mem;
6
+
7
+ const config = @import("../config.zig");
8
+
9
+ const TableType = @import("table.zig").TableType;
10
+ const TreeType = @import("tree.zig").TreeType;
11
+ const GridType = @import("grid.zig").GridType;
12
+ const CompositeKey = @import("composite_key.zig").CompositeKey;
13
+ const NodePool = @import("node_pool.zig").NodePool(config.lsm_manifest_node_size, 16);
14
+
15
+ const snapshot_latest = @import("tree.zig").snapshot_latest;
16
+
17
/// Comptime-generated key/tombstone helpers for an object tree keyed by timestamp.
/// The Object type must carry a `u128 id` and a `u64 timestamp` field.
fn ObjectTreeHelpers(comptime Object: type) type {
    assert(@hasField(Object, "id"));
    assert(std.meta.fieldInfo(Object, .id).field_type == u128);
    assert(@hasField(Object, "timestamp"));
    assert(std.meta.fieldInfo(Object, .timestamp).field_type == u64);

    return struct {
        // The key space is the timestamp, with the top bit reserved as a tombstone marker.
        const tombstone_bit = 1 << 63;
        const sentinel_key = std.math.maxInt(u64);

        inline fn compare_keys(timestamp_a: u64, timestamp_b: u64) std.math.Order {
            return std.math.order(timestamp_a, timestamp_b);
        }

        inline fn key_from_value(value: *const Object) u64 {
            // Strip the tombstone bit so a tombstone keys the same slot as the live value.
            return value.timestamp & ~@as(u64, tombstone_bit);
        }

        inline fn tombstone(value: *const Object) bool {
            return (value.timestamp & tombstone_bit) != 0;
        }

        inline fn tombstone_from_key(timestamp: u64) Object {
            // Start from an all-zero Object, then mark the timestamp as deleted.
            var deleted = std.mem.zeroes(Object);
            deleted.timestamp = timestamp | tombstone_bit;
            return deleted;
        }
    };
}
46
+
47
/// Value stored in the IdTree: maps an object's u128 id to its u64 timestamp.
const IdTreeValue = extern struct {
    id: u128,
    timestamp: u64,
    padding: u64 = 0,

    // The high bit of the timestamp marks the value as a tombstone.
    const tombstone_bit = 1 << 63;
    const sentinel_key = std.math.maxInt(u128);

    inline fn compare_keys(a: u128, b: u128) std.math.Order {
        return std.math.order(a, b);
    }

    inline fn key_from_value(value: *const IdTreeValue) u128 {
        return value.id;
    }

    inline fn tombstone(value: *const IdTreeValue) bool {
        return (value.timestamp & tombstone_bit) != 0;
    }

    inline fn tombstone_from_key(id: u128) IdTreeValue {
        // padding keeps its zero default.
        return .{ .id = id, .timestamp = tombstone_bit };
    }
};
74
+
75
/// Normalizes index tree field types into either u64 or u128 for CompositeKey.
/// Only unsigned ints and enums (via their integer tag) up to 128 bits are supported.
fn IndexCompositeKeyType(comptime Field: type) type {
    switch (@typeInfo(Field)) {
        .Enum => |e| {
            return switch (@bitSizeOf(e.tag_type)) {
                0...@bitSizeOf(u64) => u64,
                // +1: switch ranges must not overlap, and 64-bit tags are
                // already covered by the u64 arm above.
                @bitSizeOf(u64) + 1...@bitSizeOf(u128) => u128,
                else => @compileError("Unsupported enum tag for index: " ++ @typeName(e.tag_type)),
            };
        },
        .Int => |i| {
            if (i.signedness != .unsigned) {
                @compileError("Index int type (" ++ @typeName(Field) ++ ") is not unsigned");
            }
            return switch (@bitSizeOf(Field)) {
                0...@bitSizeOf(u64) => u64,
                // +1: see the enum arm above; 64 bits belongs to the u64 bucket.
                @bitSizeOf(u64) + 1...@bitSizeOf(u128) => u128,
                else => @compileError("Unsupported int type for index: " ++ @typeName(Field)),
            };
        },
        else => @compileError("Index type " ++ @typeName(Field) ++ " is not supported"),
    }
}
98
+
99
comptime {
    // Pin the normalization performed by IndexCompositeKeyType at compile time.
    // u64 bucket: everything up to and including 64 bits, enum tags included.
    assert(IndexCompositeKeyType(u1) == u64);
    assert(IndexCompositeKeyType(u16) == u64);
    assert(IndexCompositeKeyType(u32) == u64);
    assert(IndexCompositeKeyType(u63) == u64);
    assert(IndexCompositeKeyType(u64) == u64);
    assert(IndexCompositeKeyType(enum(u16) { x }) == u64);

    // u128 bucket: anything wider than 64 bits.
    assert(IndexCompositeKeyType(u65) == u128);
    assert(IndexCompositeKeyType(u128) == u128);
    assert(IndexCompositeKeyType(enum(u65) { x }) == u128);
}
112
+
113
/// Builds the LSM tree type for a secondary index over `Field`:
/// the table is keyed by a CompositeKey of the normalized (u64/u128) field type.
fn IndexTreeType(
    comptime Storage: type,
    comptime Field: type,
    comptime tree_name: []const u8,
) type {
    const Key = CompositeKey(IndexCompositeKeyType(Field));

    return TreeType(
        TableType(
            Key,
            Key.Value,
            Key.compare_keys,
            Key.key_from_value,
            Key.sentinel_key,
            Key.tombstone,
            Key.tombstone_from_key,
        ),
        Storage,
        tree_name,
    );
}
131
+
132
+ /// A Groove is a collection of LSM trees auto generated for fields on a struct type
133
+ /// as well as custom derived fields from said struct type.
134
+ pub fn GrooveType(
135
+ comptime Storage: type,
136
+ comptime Object: type,
137
+ /// An anonymous struct instance which contains the following:
138
+ ///
139
+ /// - ignored: [][]const u8:
140
+ /// An array of fields on the Object type that should not be given index trees
141
+ ///
142
+ /// - derived: { .field = fn (*const Object) ?DerivedType }:
143
+ /// An anonymous struct which contains fields that don't exist on the Object
144
+ /// but can be derived from an Object instance using the field's corresponding function.
145
+ comptime options: anytype,
146
+ ) type {
147
+ @setEvalBranchQuota(64000);
148
+
149
+ assert(@hasField(Object, "id"));
150
+ assert(std.meta.fieldInfo(Object, .id).field_type == u128);
151
+ assert(@hasField(Object, "timestamp"));
152
+ assert(std.meta.fieldInfo(Object, .timestamp).field_type == u64);
153
+
154
+ comptime var index_fields: []const std.builtin.TypeInfo.StructField = &.{};
155
+
156
+ // Generate index LSM trees from the struct fields.
157
+ inline for (std.meta.fields(Object)) |field| {
158
+ // See if we should ignore this field from the options.
159
+ //
160
+ // By default, we ignore the "timestamp" field since it's a special identifier.
161
+ // Since the "timestamp" is ignored by default, it shouldn't be provided in options.ignored.
162
+ comptime var ignored = mem.eql(u8, field.name, "timestamp") or mem.eql(u8, field.name, "id");
163
+ inline for (options.ignored) |ignored_field_name| {
164
+ comptime assert(!std.mem.eql(u8, ignored_field_name, "timestamp"));
165
+ comptime assert(!std.mem.eql(u8, ignored_field_name, "id"));
166
+ ignored = ignored or std.mem.eql(u8, field.name, ignored_field_name);
167
+ }
168
+
169
+ if (!ignored) {
170
+ const tree_name = @typeName(Object) ++ "." ++ field.name;
171
+ const IndexTree = IndexTreeType(Storage, field.field_type, tree_name);
172
+ index_fields = index_fields ++ [_]std.builtin.TypeInfo.StructField{
173
+ .{
174
+ .name = field.name,
175
+ .field_type = IndexTree,
176
+ .default_value = null,
177
+ .is_comptime = false,
178
+ .alignment = @alignOf(IndexTree),
179
+ },
180
+ };
181
+ }
182
+ }
183
+
184
+ // Generate IndexTrees for fields derived from the Object in options.
185
+ const derived_fields = std.meta.fields(@TypeOf(options.derived));
186
+ inline for (derived_fields) |field| {
187
+ // Get the function info for the derived field.
188
+ const derive_func = @field(options.derived, field.name);
189
+ const derive_func_info = @typeInfo(@TypeOf(derive_func)).Fn;
190
+
191
+ // Make sure it has only one argument.
192
+ if (derive_func_info.args.len != 1) {
193
+ @compileError("expected derive fn to take in *const " ++ @typeName(Object));
194
+ }
195
+
196
+ // Make sure the function takes in a reference to the Value:
197
+ const derive_arg = derive_func_info.args[0];
198
+ if (derive_arg.is_generic) @compileError("expected derive fn arg to not be generic");
199
+ if (derive_arg.arg_type != *const Object) {
200
+ @compileError("expected derive fn to take in *const " ++ @typeName(Object));
201
+ }
202
+
203
+ // Get the return value from the derived field as the DerivedType.
204
+ const derive_return_type = derive_func_info.return_type orelse {
205
+ @compileError("expected derive fn to return valid tree index type");
206
+ };
207
+
208
+ // Create an IndexTree for the DerivedType:
209
+ const tree_name = @typeName(Object) ++ "." ++ field.name;
210
+ const DerivedType = @typeInfo(derive_return_type).Optional.child;
211
+ const IndexTree = IndexTreeType(Storage, DerivedType, tree_name);
212
+
213
+ index_fields = index_fields ++ &.{
214
+ .{
215
+ .name = field.name,
216
+ .field_type = IndexTree,
217
+ .default_value = null,
218
+ .is_comptime = false,
219
+ .alignment = @alignOf(IndexTree),
220
+ },
221
+ };
222
+ }
223
+
224
+ const ObjectTree = blk: {
225
+ const Table = TableType(
226
+ u64, // key = timestamp
227
+ Object,
228
+ ObjectTreeHelpers(Object).compare_keys,
229
+ ObjectTreeHelpers(Object).key_from_value,
230
+ ObjectTreeHelpers(Object).sentinel_key,
231
+ ObjectTreeHelpers(Object).tombstone,
232
+ ObjectTreeHelpers(Object).tombstone_from_key,
233
+ );
234
+
235
+ const tree_name = @typeName(Object);
236
+ break :blk TreeType(Table, Storage, tree_name);
237
+ };
238
+
239
+ const IdTree = blk: {
240
+ const Table = TableType(
241
+ u128,
242
+ IdTreeValue,
243
+ IdTreeValue.compare_keys,
244
+ IdTreeValue.key_from_value,
245
+ IdTreeValue.sentinel_key,
246
+ IdTreeValue.tombstone,
247
+ IdTreeValue.tombstone_from_key,
248
+ );
249
+
250
+ const tree_name = @typeName(Object) ++ ".id";
251
+ break :blk TreeType(Table, Storage, tree_name);
252
+ };
253
+
254
+ const IndexTrees = @Type(.{
255
+ .Struct = .{
256
+ .layout = .Auto,
257
+ .fields = index_fields,
258
+ .decls = &.{},
259
+ .is_tuple = false,
260
+ },
261
+ });
262
+
263
+ // Verify no hash collisions between all the trees.
+ // NOTE(review): `containsAtLeast(u128, hashes, 0, hash)` below is vacuously true
+ // (expected_count of 0 always matches); to actually detect a collision it should be
+ // `assert(!std.mem.containsAtLeast(u128, hashes, 1, hash));` — confirm and fix.
264
+ comptime var hashes: []const u128 = &.{ObjectTree.hash};
265
+
266
+ inline for (std.meta.fields(IndexTrees)) |field| {
267
+ const IndexTree = @TypeOf(@field(@as(IndexTrees, undefined), field.name));
268
+ const hash: []const u128 = &.{IndexTree.hash};
269
+
270
+ assert(std.mem.containsAtLeast(u128, hashes, 0, hash));
271
+ hashes = hashes ++ hash;
272
+ }
273
+
274
+ // Verify groove index count:
275
+ const indexes_count_actual = std.meta.fields(IndexTrees).len;
276
+ const indexes_count_expect = std.meta.fields(Object).len -
277
+ options.ignored.len -
278
+ // The id and timestamp fields (hence the -2) are implicitly ignored, since they
+ // are the primary keys for the IdTree and ObjectTree respectively:
279
+ 2 +
280
+ std.meta.fields(@TypeOf(options.derived)).len;
281
+
282
+ assert(indexes_count_actual == indexes_count_expect);
283
+
284
+ // Generate a helper function for interacting with an Index field type.
285
+ const IndexTreeFieldHelperType = struct {
286
+ /// Returns true if the field is a derived field.
287
+ fn is_derived(comptime field_name: []const u8) bool {
288
+ comptime var derived = false;
289
+ inline for (derived_fields) |derived_field| {
290
+ derived = derived or std.mem.eql(u8, derived_field.name, field_name);
291
+ }
292
+ return derived;
293
+ }
294
+
295
+ /// Gets the index type from the index name (even if the index is derived).
296
+ fn IndexType(comptime field_name: []const u8) type {
297
+ if (!is_derived(field_name)) {
298
+ return @TypeOf(@field(@as(Object, undefined), field_name));
299
+ }
300
+
301
+ const derived_fn = @TypeOf(@field(options.derived, field_name));
302
+ return @typeInfo(derived_fn).Fn.return_type.?.Optional.child;
303
+ }
304
+
305
+ fn HelperType(comptime field_name: []const u8) type {
306
+ return struct {
307
+ const Index = IndexType(field_name);
308
+
309
+ /// Try to extract an index from the object, deriving it when necessary.
310
+ pub fn derive_index(object: *const Object) ?Index {
311
+ if (comptime is_derived(field_name)) {
312
+ return @field(options.derived, field_name)(object);
313
+ } else {
314
+ return @field(object, field_name);
315
+ }
316
+ }
317
+
318
+ /// Create a Value from the index that can be used in the IndexTree.
319
+ pub fn derive_value(
320
+ object: *const Object,
321
+ index: Index,
322
+ ) CompositeKey(IndexCompositeKeyType(Index)).Value {
323
+ return .{
324
+ .timestamp = object.timestamp,
325
+ .field = switch (@typeInfo(Index)) {
326
+ .Int => index,
327
+ .Enum => @enumToInt(index),
328
+ else => @compileError("Unsupported index type for " ++ field_name),
329
+ },
330
+ };
331
+ }
332
+ };
333
+ }
334
+ }.HelperType;
335
+
336
+ return struct {
337
+ const Groove = @This();
338
+
339
+ const Grid = GridType(Storage);
340
+
341
+ const Callback = fn (*Groove) void;
342
+ const JoinOp = enum {
343
+ compacting,
344
+ checkpoint,
345
+ open,
346
+ };
347
+
348
+ const PrefetchIDs = std.AutoHashMapUnmanaged(u128, void);
349
+
350
+ const PrefetchObjectsContext = struct {
351
+ pub fn hash(_: PrefetchObjectsContext, object: Object) u64 {
352
+ return std.hash.Wyhash.hash(0, mem.asBytes(&object.id));
353
+ }
354
+
355
+ pub fn eql(_: PrefetchObjectsContext, a: Object, b: Object) bool {
356
+ return a.id == b.id;
357
+ }
358
+ };
359
+ const PrefetchObjectsAdapter = struct {
360
+ pub fn hash(_: PrefetchObjectsAdapter, id: u128) u64 {
361
+ return std.hash.Wyhash.hash(0, mem.asBytes(&id));
362
+ }
363
+
364
+ pub fn eql(_: PrefetchObjectsAdapter, a_id: u128, b_object: Object) bool {
365
+ return a_id == b_object.id;
366
+ }
367
+ };
368
+ const PrefetchObjects = std.HashMapUnmanaged(Object, void, PrefetchObjectsContext, 70);
369
+
370
+ join_op: ?JoinOp = null,
371
+ join_pending: usize = 0,
372
+ join_callback: ?Callback = null,
373
+
374
+ objects_cache: *ObjectTree.ValueCache,
375
+ objects: ObjectTree,
376
+
377
+ ids_cache: *IdTree.ValueCache,
378
+ ids: IdTree,
379
+
380
+ indexes: IndexTrees,
381
+
382
+ /// Object IDs enqueued to be prefetched.
383
+ /// Prefetching ensures that point lookups against the latest snapshot are synchronous.
384
+ /// This shields state machine implementations from the challenges of concurrency and I/O,
385
+ /// and enables simple state machine function signatures that commit writes atomically.
386
+ prefetch_ids: PrefetchIDs,
387
+
388
+ /// The prefetched Objects. This hash map holds the subset of objects in the LSM trees
389
+ /// that are required for the current commit. All get()/put()/remove() operations during
390
+ /// the commit are both passed to the LSM trees and mirrored in this hash map. It is always
391
+ /// sufficient to query this hashmap alone to know the state of the LSM trees.
392
+ prefetch_objects: PrefetchObjects,
393
+
394
+ pub fn init(
395
+ allocator: mem.Allocator,
396
+ node_pool: *NodePool,
397
+ grid: *Grid,
398
+ // The cache size is meant to be computed based on the left over available memory
399
+ // that tigerbeetle was given to allocate from CLI arguments.
400
+ // TODO Improve unit in this name to make more clear what should be passed.
401
+ // For example, is this a size in bytes or a count in objects? It's a count in objects,
402
+ // but the name poorly reflects this.
403
+ cache_size: u32,
404
+ // In general, the commit count max for a field, depends on the field's object,
405
+ // how many objects might be changed by a batch:
406
+ // (config.message_size_max - sizeOf(vsr.header))
407
+ // For example, there are at most 8191 transfers in a batch.
408
+ // So commit_count_max=8191 for transfer objects and indexes.
409
+ //
410
+ // However, if a transfer is ever mutated, then this will double commit_count_max
411
+ // since the old index might need to be removed, and the new index inserted.
412
+ //
413
+ // A way to see this is by looking at the state machine. If a transfer is inserted,
414
+ // how many accounts and transfer put/removes will be generated?
415
+ //
416
+ // This also means looking at the state machine operation that will generate the
417
+ // most put/removes in the worst case.
418
+ // For example, create_accounts will put at most 8191 accounts.
419
+ // However, create_transfers will put 2 accounts (8191 * 2) for every transfer, and
420
+ // some of these accounts may exist, requiring a remove/put to update the index.
421
+ commit_count_max: u32,
422
+ ) !Groove {
423
+ // Cache is dynamically allocated to pass a pointer into the Object tree.
424
+ const objects_cache = try allocator.create(ObjectTree.ValueCache);
425
+ errdefer allocator.destroy(objects_cache);
426
+
427
+ objects_cache.* = .{};
428
+ try objects_cache.ensureTotalCapacity(allocator, cache_size);
429
+ errdefer objects_cache.deinit(allocator);
430
+
431
+ // Initialize the object LSM tree.
432
+ var object_tree = try ObjectTree.init(
433
+ allocator,
434
+ node_pool,
435
+ grid,
436
+ objects_cache,
437
+ .{
438
+ .commit_count_max = commit_count_max,
439
+ },
440
+ );
441
+ errdefer object_tree.deinit(allocator);
442
+
443
+ // Cache is dynamically allocated to pass a pointer into the ID tree.
444
+ const ids_cache = try allocator.create(IdTree.ValueCache);
445
+ errdefer allocator.destroy(ids_cache);
446
+
447
+ ids_cache.* = .{};
448
+ try ids_cache.ensureTotalCapacity(allocator, cache_size);
449
+ errdefer ids_cache.deinit(allocator);
450
+
451
+ var id_tree = try IdTree.init(
452
+ allocator,
453
+ node_pool,
454
+ grid,
455
+ ids_cache,
456
+ .{
457
+ .commit_count_max = commit_count_max,
458
+ },
459
+ );
460
+ errdefer id_tree.deinit(allocator);
461
+
462
+ var index_trees_initialized: usize = 0;
463
+ var index_trees: IndexTrees = undefined;
464
+
465
+ // Make sure to deinit initialized index LSM trees on error.
466
+ errdefer inline for (std.meta.fields(IndexTrees)) |field, field_index| {
467
+ if (index_trees_initialized >= field_index + 1) {
468
+ @field(index_trees, field.name).deinit(allocator);
469
+ }
470
+ };
471
+
472
+ // Initialize index LSM trees.
473
+ inline for (std.meta.fields(IndexTrees)) |field| {
474
+ @field(index_trees, field.name) = try field.field_type.init(
475
+ allocator,
476
+ node_pool,
477
+ grid,
478
+ null, // No value cache for index trees.
479
+ .{
480
+ .commit_count_max = commit_count_max,
481
+ },
482
+ );
483
+ index_trees_initialized += 1;
484
+ }
485
+
486
+ // TODO: document why this is twice the commit count max.
487
+ const prefetch_count_max = commit_count_max * 2;
488
+
489
+ var prefetch_ids = PrefetchIDs{};
490
+ try prefetch_ids.ensureTotalCapacity(allocator, prefetch_count_max);
491
+ errdefer prefetch_ids.deinit(allocator);
492
+
493
+ var prefetch_objects = PrefetchObjects{};
494
+ try prefetch_objects.ensureTotalCapacity(allocator, prefetch_count_max);
495
+ errdefer prefetch_objects.deinit(allocator);
496
+
497
+ return Groove{
498
+ .objects_cache = objects_cache,
499
+ .objects = object_tree,
500
+
501
+ .ids_cache = ids_cache,
502
+ .ids = id_tree,
503
+
504
+ .indexes = index_trees,
505
+
506
+ .prefetch_ids = prefetch_ids,
507
+ .prefetch_objects = prefetch_objects,
508
+ };
509
+ }
510
+
511
+ pub fn deinit(groove: *Groove, allocator: mem.Allocator) void {
512
+ assert(groove.join_op == null);
513
+ assert(groove.join_pending == 0);
514
+ assert(groove.join_callback == null);
515
+
516
+ inline for (std.meta.fields(IndexTrees)) |field| {
517
+ @field(groove.indexes, field.name).deinit(allocator);
518
+ }
519
+
520
+ groove.objects.deinit(allocator);
521
+ groove.objects_cache.deinit(allocator);
522
+ allocator.destroy(groove.objects_cache);
523
+
524
+ groove.ids.deinit(allocator);
525
+ groove.ids_cache.deinit(allocator);
526
+ allocator.destroy(groove.ids_cache);
527
+
528
+ groove.prefetch_ids.deinit(allocator);
529
+ groove.prefetch_objects.deinit(allocator);
530
+
531
+ groove.* = undefined;
532
+ }
533
+
534
+ pub fn get(groove: *const Groove, id: u128) ?*const Object {
535
+ return groove.prefetch_objects.getKeyPtrAdapted(id, PrefetchObjectsAdapter{});
536
+ }
537
+
538
+ /// Must be called directly after the state machine commit is finished and prefetch results
539
+ /// are no longer needed.
540
+ pub fn prefetch_clear(groove: *Groove) void {
541
+ groove.prefetch_objects.clearRetainingCapacity();
542
+ assert(groove.prefetch_objects.count() == 0);
543
+ assert(groove.prefetch_ids.count() == 0);
544
+ }
545
+
546
+ /// This must be called by the state machine for every key to be prefetched.
547
+ /// We tolerate duplicate IDs enqueued by the state machine.
548
+ /// For example, if all unique operations require the same two dependencies.
549
+ pub fn prefetch_enqueue(groove: *Groove, id: u128) void {
550
+ if (groove.ids.get_cached(id)) |id_tree_value| {
551
+ if (!id_tree_value.tombstone()) {
552
+ const object = groove.objects.get_cached(id_tree_value.timestamp).?;
553
+ assert(!ObjectTreeHelpers(Object).tombstone(object));
554
+ groove.prefetch_objects.putAssumeCapacity(object.*, {});
555
+ } else {
556
+ // Do nothing, a prefetched ID not present in prefetch_objects indicates
557
+ // that the object has either been deleted or never existed.
558
+ }
559
+ } else {
560
+ groove.prefetch_ids.putAssumeCapacity(id, {});
561
+ }
562
+ }
563
+
564
+ /// Ensure the objects corresponding to all ids enqueued with prefetch_enqueue() are
565
+ /// in memory, either in the value cache of the object tree or in the prefetch_objects
566
+ /// backup hash map.
567
+ pub fn prefetch(
568
+ groove: *Groove,
569
+ callback: fn (*PrefetchContext) void,
570
+ context: *PrefetchContext,
571
+ ) void {
572
+ context.* = .{
573
+ .groove = groove,
574
+ .callback = callback,
575
+ .id_iterator = groove.prefetch_ids.keyIterator(),
576
+ };
577
+ context.start_workers();
578
+ }
579
+
580
+ pub const PrefetchContext = struct {
581
+ groove: *Groove,
582
+ callback: fn (*PrefetchContext) void,
583
+
584
+ id_iterator: PrefetchIDs.KeyIterator,
585
+
586
+ /// The goal is to fully utilize the disk I/O to ensure the prefetch completes as
587
+ /// quickly as possible, so we run multiple lookups in parallel based on the max
588
+ /// I/O depth of the Grid.
589
+ workers: [Grid.read_iops_max]PrefetchWorker = undefined,
590
+ /// The number of workers that are currently running in parallel.
591
+ workers_busy: u32 = 0,
592
+
593
+ fn start_workers(context: *PrefetchContext) void {
594
+ assert(context.workers_busy == 0);
595
+
596
+ // Track an extra "worker" that will finish after the loop.
597
+ //
598
+ // This prevents `context.finish()` from being called within the loop body when every
599
+ // worker finishes synchronously. `context.finish()` sets the `context` to undefined,
600
+ // but `context` is required for the last loop condition check.
601
+ context.workers_busy += 1;
602
+
603
+ // -1 to ignore the extra worker.
604
+ while (context.workers_busy - 1 < context.workers.len) {
605
+ const worker = &context.workers[context.workers_busy - 1];
606
+ worker.* = .{ .context = context };
607
+ context.workers_busy += 1;
608
+ if (!worker.lookup_start()) break;
609
+ }
610
+
611
+ assert(context.workers_busy >= 1);
612
+ context.worker_finished();
613
+ }
614
+
615
+ fn worker_finished(context: *PrefetchContext) void {
616
+ context.workers_busy -= 1;
617
+ if (context.workers_busy == 0) context.finish();
618
+ }
619
+
620
+ fn finish(context: *PrefetchContext) void {
621
+ assert(context.workers_busy == 0);
622
+ assert(context.groove.prefetch_ids.count() == 0);
623
+ assert(context.id_iterator.next() == null);
624
+
625
+ const callback = context.callback;
626
+ context.* = undefined;
627
+ callback(context);
628
+ }
629
+ };
630
+
631
+ pub const PrefetchWorker = struct {
632
+ context: *PrefetchContext,
633
+ // TODO(ifreund): use a union for these to save memory, likely an extern union
634
+ // so that we can safely @ptrCast() until @fieldParentPtr() is implemented
635
+ // for unions. See: https://github.com/ziglang/zig/issues/6611
636
+ lookup_id: IdTree.LookupContext = undefined,
637
+ lookup_object: ObjectTree.LookupContext = undefined,
638
+
639
+ /// Returns true if asynchronous I/O has been started.
640
+ /// Returns false if there are no more IDs to prefetch.
641
+ fn lookup_start(worker: *PrefetchWorker) bool {
642
+ const groove = worker.context.groove;
643
+
644
+ const id = worker.context.id_iterator.next() orelse {
645
+ groove.prefetch_ids.clearRetainingCapacity();
646
+ assert(groove.prefetch_ids.count() == 0);
647
+ worker.context.worker_finished();
648
+ return false;
649
+ };
650
+
651
+ if (config.verify) {
652
+ // This is checked in prefetch_enqueue()
653
+ assert(groove.ids.get_cached(id.*) == null);
654
+ }
655
+
656
+ // If not in the LSM tree's cache, the object must be read from disk and added
657
+ // to the auxiliary prefetch_objects hash map.
658
+ // TODO: this LSM tree function needlessly checks the LSM tree's cache a
659
+ // second time. Adding API to the LSM tree to avoid this may be worthwhile.
660
+ groove.ids.lookup(
661
+ lookup_id_callback,
662
+ &worker.lookup_id,
663
+ snapshot_latest,
664
+ id.*,
665
+ );
666
+ return true;
667
+ }
668
+
669
+ fn lookup_id_callback(
670
+ completion: *IdTree.LookupContext,
671
+ result: ?*const IdTreeValue,
672
+ ) void {
673
+ const worker = @fieldParentPtr(PrefetchWorker, "lookup_id", completion);
674
+
675
+ if (result) |id_tree_value| {
676
+ if (!id_tree_value.tombstone()) {
677
+ worker.context.groove.objects.lookup(
678
+ lookup_object_callback,
679
+ &worker.lookup_object,
680
+ snapshot_latest,
681
+ id_tree_value.timestamp,
682
+ );
683
+ } else {
684
+ worker.lookup_finish();
685
+ }
686
+ } else {
687
+ worker.lookup_finish();
688
+ }
689
+ }
690
+
691
+ fn lookup_object_callback(
692
+ completion: *ObjectTree.LookupContext,
693
+ result: ?*const Object,
694
+ ) void {
695
+ const worker = @fieldParentPtr(PrefetchWorker, "lookup_object", completion);
696
+
697
+ // The result must be non-null as we keep the ID and Object trees in sync.
698
+ const object = result.?;
699
+ assert(!ObjectTreeHelpers(Object).tombstone(object));
700
+
701
+ worker.context.groove.prefetch_objects.putAssumeCapacityNoClobber(object.*, {});
702
+ worker.lookup_finish();
703
+ }
704
+
705
+ fn lookup_finish(worker: *PrefetchWorker) void {
706
+ if (!worker.lookup_start()) {
707
+ worker.* = undefined;
708
+ }
709
+ }
710
+ };
711
+
712
+ pub fn put_no_clobber(groove: *Groove, object: *const Object) void {
713
+ const gop = groove.prefetch_objects.getOrPutAssumeCapacityAdapted(object.id, PrefetchObjectsAdapter{});
714
+ assert(!gop.found_existing);
715
+ groove.insert(object);
716
+ gop.key_ptr.* = object.*;
717
+ }
718
+
719
+ pub fn put(groove: *Groove, object: *const Object) void {
720
+ const gop = groove.prefetch_objects.getOrPutAssumeCapacityAdapted(object.id, PrefetchObjectsAdapter{});
721
+ if (gop.found_existing) {
722
+ groove.update(gop.key_ptr, object);
723
+ } else {
724
+ groove.insert(object);
725
+ }
726
+ gop.key_ptr.* = object.*;
727
+ }
728
+
729
+ /// Insert the value into the objects tree and its fields into the index trees.
730
+ fn insert(groove: *Groove, object: *const Object) void {
731
+ groove.objects.put(object);
732
+ groove.ids.put(&IdTreeValue{ .id = object.id, .timestamp = object.timestamp });
733
+
734
+ inline for (std.meta.fields(IndexTrees)) |field| {
735
+ const Helper = IndexTreeFieldHelperType(field.name);
736
+
737
+ if (Helper.derive_index(object)) |index| {
738
+ const index_value = Helper.derive_value(object, index);
739
+ @field(groove.indexes, field.name).put(&index_value);
740
+ }
741
+ }
742
+ }
743
+
744
/// Update the object and index trees by diff'ing the old and new values.
/// `old` points at the cached copy in prefetch_objects; `new` is the incoming
/// value. Requires that id and timestamp are unchanged across the update.
fn update(groove: *Groove, old: *const Object, new: *const Object) void {
    assert(old.id == new.id);
    assert(old.timestamp == new.timestamp);

    // Update the object tree entry if any of the fields (even ignored) are different.
    if (!std.mem.eql(u8, std.mem.asBytes(old), std.mem.asBytes(new))) {
        // Unlike the index trees, the new and old values in the object tree share the
        // same key. Therefore put() is sufficient to overwrite the old value.
        groove.objects.put(new);
    }

    inline for (std.meta.fields(IndexTrees)) |field| {
        const Helper = IndexTreeFieldHelperType(field.name);
        const old_index = Helper.derive_index(old);
        const new_index = Helper.derive_index(new);

        // Only update the indexes that change.
        if (!std.meta.eql(old_index, new_index)) {
            // A changed index means the old entry (if any) is removed and
            // the new one (if any) is inserted.
            if (old_index) |index| {
                const old_index_value = Helper.derive_value(old, index);
                @field(groove.indexes, field.name).remove(&old_index_value);
            }

            if (new_index) |index| {
                const new_index_value = Helper.derive_value(new, index);
                @field(groove.indexes, field.name).put(&new_index_value);
            }
        }
    }
}
775
+
776
/// Remove the object with the given id from every tree and the prefetch cache.
/// Asserts that the object with the given ID exists.
pub fn remove(groove: *Groove, id: u128) void {
    // Pointer into prefetch_objects; must stay valid until the
    // removeAdapted() at the bottom (nothing below mutates the map).
    const object = groove.prefetch_objects.getKeyPtrAdapted(id, PrefetchObjectsAdapter{}).?;

    groove.objects.remove(object);
    groove.ids.remove(&IdTreeValue{ .id = object.id, .timestamp = object.timestamp });

    // Remove each index entry that this object derived.
    inline for (std.meta.fields(IndexTrees)) |field| {
        const Helper = IndexTreeFieldHelperType(field.name);

        if (Helper.derive_index(object)) |index| {
            const index_value = Helper.derive_value(object, index);
            @field(groove.indexes, field.name).remove(&index_value);
        }
    }

    // TODO(zig) Replace this with a call to removeByPtr() after upgrading to 0.10.
    // removeByPtr() replaces an unnecessary lookup here with some pointer arithmetic.
    assert(groove.prefetch_objects.removeAdapted(object.id, PrefetchObjectsAdapter{}));
}
796
+
797
/// Maximum number of pending sync callbacks (ObjectTree + IdTree + IndexTrees).
const join_pending_max = 2 + std.meta.fields(IndexTrees).len;
799
+
800
/// Returns a namespace that tracks a fan-out/fan-in ("join") across all trees
/// in the groove for one operation kind. start() arms the join; tree_callback()
/// builds a per-tree completion callback that decrements the pending count and
/// fires the user callback once every tree has completed.
fn JoinType(comptime join_op: JoinOp) type {
    return struct {
        pub fn start(groove: *Groove, join_callback: Callback) void {
            // Make sure no sync op is currently running.
            assert(groove.join_op == null);
            assert(groove.join_pending == 0);
            assert(groove.join_callback == null);

            // Start the sync operations
            groove.join_op = join_op;
            groove.join_callback = join_callback;
            groove.join_pending = join_pending_max;
        }

        /// Identifies which tree a callback belongs to: the id tree, the
        /// object tree, or an index tree named by its field.
        const JoinField = union(enum) {
            ids,
            objects,
            index: []const u8,
        };

        /// Returns the LSM tree type for the given join_field.
        fn TreeFor(comptime join_field: JoinField) type {
            return switch (join_field) {
                .ids => IdTree,
                .objects => ObjectTree,
                .index => |field| @TypeOf(@field(@as(IndexTrees, undefined), field)),
            };
        }

        /// Builds the completion callback for one tree, specialized at
        /// comptime on which tree it is.
        pub fn tree_callback(
            comptime join_field: JoinField,
        ) fn (*TreeFor(join_field)) void {
            return struct {
                fn tree_cb(tree: *TreeFor(join_field)) void {
                    // Derive the groove pointer from the tree using the join_field.
                    const groove = switch (join_field) {
                        .ids => @fieldParentPtr(Groove, "ids", tree),
                        .objects => @fieldParentPtr(Groove, "objects", tree),
                        .index => |field| blk: {
                            const indexes = @fieldParentPtr(IndexTrees, field, tree);
                            break :blk @fieldParentPtr(Groove, "indexes", indexes);
                        },
                    };

                    // Make sure the sync operation is currently running.
                    assert(groove.join_op == join_op);
                    assert(groove.join_callback != null);
                    assert(groove.join_pending <= join_pending_max);

                    // Guard until all pending sync ops complete.
                    groove.join_pending -= 1;
                    if (groove.join_pending > 0) return;

                    // All trees finished: clear the join state before invoking
                    // the user callback so it may start a new join re-entrantly.
                    const callback = groove.join_callback.?;
                    groove.join_op = null;
                    groove.join_callback = null;
                    callback(groove);
                }
            }.tree_cb;
        }
    };
}
862
+
863
/// Open every tree in the groove, invoking `callback` once all have opened.
pub fn open(groove: *Groove, callback: fn (*Groove) void) void {
    // Arm a join across the id tree, object tree, and all index trees.
    const OpenJoin = JoinType(.open);
    OpenJoin.start(groove, callback);

    groove.ids.open(OpenJoin.tree_callback(.ids));
    groove.objects.open(OpenJoin.tree_callback(.objects));

    inline for (std.meta.fields(IndexTrees)) |index_field| {
        @field(groove.indexes, index_field.name).open(
            OpenJoin.tree_callback(.{ .index = index_field.name }),
        );
    }
}
875
+
876
/// Compact every tree in the groove at `op`, invoking `callback` when all done.
pub fn compact(groove: *Groove, callback: Callback, op: u64) void {
    // Arm a compacting join across the id tree, object tree, and index trees.
    const CompactJoin = JoinType(.compacting);
    CompactJoin.start(groove, callback);

    groove.ids.compact(CompactJoin.tree_callback(.ids), op);
    groove.objects.compact(CompactJoin.tree_callback(.objects), op);

    inline for (std.meta.fields(IndexTrees)) |index_field| {
        @field(groove.indexes, index_field.name).compact(
            CompactJoin.tree_callback(.{ .index = index_field.name }),
            op,
        );
    }
}
891
+
892
/// Checkpoint every tree in the groove, invoking `callback` when all done.
pub fn checkpoint(groove: *Groove, callback: fn (*Groove) void) void {
    // Arm a checkpoint join across the id tree, object tree, and index trees.
    const CheckpointJoin = JoinType(.checkpoint);
    CheckpointJoin.start(groove, callback);

    groove.ids.checkpoint(CheckpointJoin.tree_callback(.ids));
    groove.objects.checkpoint(CheckpointJoin.tree_callback(.objects));

    inline for (std.meta.fields(IndexTrees)) |index_field| {
        @field(groove.indexes, index_field.name).checkpoint(
            CheckpointJoin.tree_callback(.{ .index = index_field.name }),
        );
    }
}
907
+ };
908
+ }
909
+
910
test "Groove" {
    const Transfer = @import("../tigerbeetle.zig").Transfer;
    const Storage = @import("../storage.zig").Storage;

    // Instantiate the generic groove over Transfer to force semantic
    // analysis of the comptime machinery.
    const Groove = GrooveType(
        Storage,
        Transfer,
        .{
            .ignored = [_][]const u8{ "reserved", "user_data", "flags" },
            .derived = .{},
        },
    );

    // Reference the public API so it is analyzed (decls are only analyzed
    // when referenced).
    _ = Groove.init;
    _ = Groove.deinit;

    _ = Groove.get;
    _ = Groove.put;
    _ = Groove.remove;

    _ = Groove.compact;
    _ = Groove.checkpoint;

    _ = Groove.prefetch_enqueue;
    _ = Groove.prefetch;
    _ = Groove.prefetch_clear;

    std.testing.refAllDecls(Groove.PrefetchWorker);
    std.testing.refAllDecls(Groove.PrefetchContext);
}