tigerbeetle-node 0.10.0 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/README.md +302 -101
  2. package/dist/index.d.ts +70 -72
  3. package/dist/index.js +70 -72
  4. package/dist/index.js.map +1 -1
  5. package/package.json +9 -8
  6. package/scripts/download_node_headers.sh +14 -7
  7. package/src/index.ts +6 -10
  8. package/src/node.zig +6 -3
  9. package/src/tigerbeetle/scripts/benchmark.sh +4 -4
  10. package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
  11. package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
  12. package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
  13. package/src/tigerbeetle/scripts/install.sh +19 -4
  14. package/src/tigerbeetle/scripts/install_zig.bat +5 -1
  15. package/src/tigerbeetle/scripts/install_zig.sh +24 -14
  16. package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
  17. package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
  18. package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
  19. package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
  20. package/src/tigerbeetle/scripts/validate_docs.sh +17 -0
  21. package/src/tigerbeetle/src/benchmark.zig +29 -13
  22. package/src/tigerbeetle/src/c/tb_client/context.zig +248 -47
  23. package/src/tigerbeetle/src/c/tb_client/echo_client.zig +108 -0
  24. package/src/tigerbeetle/src/c/tb_client/packet.zig +2 -2
  25. package/src/tigerbeetle/src/c/tb_client/signal.zig +2 -4
  26. package/src/tigerbeetle/src/c/tb_client/thread.zig +17 -257
  27. package/src/tigerbeetle/src/c/tb_client.h +118 -84
  28. package/src/tigerbeetle/src/c/tb_client.zig +88 -23
  29. package/src/tigerbeetle/src/c/tb_client_header_test.zig +135 -0
  30. package/src/tigerbeetle/src/c/test.zig +371 -1
  31. package/src/tigerbeetle/src/cli.zig +37 -7
  32. package/src/tigerbeetle/src/config.zig +58 -17
  33. package/src/tigerbeetle/src/demo.zig +5 -2
  34. package/src/tigerbeetle/src/demo_01_create_accounts.zig +1 -1
  35. package/src/tigerbeetle/src/demo_03_create_transfers.zig +13 -0
  36. package/src/tigerbeetle/src/ewah.zig +11 -33
  37. package/src/tigerbeetle/src/ewah_benchmark.zig +8 -9
  38. package/src/tigerbeetle/src/io/linux.zig +1 -1
  39. package/src/tigerbeetle/src/lsm/README.md +308 -0
  40. package/src/tigerbeetle/src/lsm/binary_search.zig +137 -10
  41. package/src/tigerbeetle/src/lsm/bloom_filter.zig +43 -0
  42. package/src/tigerbeetle/src/lsm/compaction.zig +376 -397
  43. package/src/tigerbeetle/src/lsm/composite_key.zig +2 -0
  44. package/src/tigerbeetle/src/lsm/eytzinger.zig +1 -1
  45. package/src/tigerbeetle/src/{eytzinger_benchmark.zig → lsm/eytzinger_benchmark.zig} +34 -21
  46. package/src/tigerbeetle/src/lsm/forest.zig +21 -447
  47. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +414 -0
  48. package/src/tigerbeetle/src/lsm/grid.zig +170 -76
  49. package/src/tigerbeetle/src/lsm/groove.zig +197 -133
  50. package/src/tigerbeetle/src/lsm/k_way_merge.zig +40 -18
  51. package/src/tigerbeetle/src/lsm/level_iterator.zig +28 -9
  52. package/src/tigerbeetle/src/lsm/manifest.zig +93 -180
  53. package/src/tigerbeetle/src/lsm/manifest_level.zig +161 -454
  54. package/src/tigerbeetle/src/lsm/manifest_log.zig +243 -356
  55. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +665 -0
  56. package/src/tigerbeetle/src/lsm/node_pool.zig +4 -0
  57. package/src/tigerbeetle/src/lsm/posted_groove.zig +65 -76
  58. package/src/tigerbeetle/src/lsm/segmented_array.zig +580 -251
  59. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
  60. package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
  61. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +62 -12
  62. package/src/tigerbeetle/src/lsm/table.zig +115 -68
  63. package/src/tigerbeetle/src/lsm/table_immutable.zig +30 -23
  64. package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -17
  65. package/src/tigerbeetle/src/lsm/table_mutable.zig +63 -12
  66. package/src/tigerbeetle/src/lsm/test.zig +61 -56
  67. package/src/tigerbeetle/src/lsm/tree.zig +450 -407
  68. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +461 -0
  69. package/src/tigerbeetle/src/main.zig +83 -8
  70. package/src/tigerbeetle/src/message_bus.zig +20 -9
  71. package/src/tigerbeetle/src/message_pool.zig +22 -19
  72. package/src/tigerbeetle/src/ring_buffer.zig +7 -3
  73. package/src/tigerbeetle/src/simulator.zig +179 -119
  74. package/src/tigerbeetle/src/state_machine.zig +381 -246
  75. package/src/tigerbeetle/src/static_allocator.zig +65 -0
  76. package/src/tigerbeetle/src/storage.zig +3 -7
  77. package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
  78. package/src/tigerbeetle/src/test/accounting/workload.zig +823 -0
  79. package/src/tigerbeetle/src/test/cluster.zig +33 -81
  80. package/src/tigerbeetle/src/test/conductor.zig +366 -0
  81. package/src/tigerbeetle/src/test/fuzz.zig +121 -0
  82. package/src/tigerbeetle/src/test/id.zig +89 -0
  83. package/src/tigerbeetle/src/test/network.zig +45 -19
  84. package/src/tigerbeetle/src/test/packet_simulator.zig +40 -29
  85. package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
  86. package/src/tigerbeetle/src/test/state_checker.zig +91 -69
  87. package/src/tigerbeetle/src/test/state_machine.zig +11 -35
  88. package/src/tigerbeetle/src/test/storage.zig +470 -106
  89. package/src/tigerbeetle/src/test/storage_checker.zig +204 -0
  90. package/src/tigerbeetle/src/tigerbeetle.zig +15 -16
  91. package/src/tigerbeetle/src/unit_tests.zig +13 -1
  92. package/src/tigerbeetle/src/util.zig +97 -11
  93. package/src/tigerbeetle/src/vopr.zig +495 -0
  94. package/src/tigerbeetle/src/vsr/client.zig +21 -3
  95. package/src/tigerbeetle/src/vsr/journal.zig +293 -212
  96. package/src/tigerbeetle/src/vsr/replica.zig +1086 -515
  97. package/src/tigerbeetle/src/vsr/superblock.zig +382 -637
  98. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +14 -16
  99. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +416 -153
  100. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +332 -0
  101. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +349 -0
  102. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +62 -12
  103. package/src/tigerbeetle/src/vsr/superblock_quorums.zig +394 -0
  104. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +312 -0
  105. package/src/tigerbeetle/src/vsr.zig +94 -60
  106. package/src/tigerbeetle/scripts/vopr.bat +0 -48
  107. package/src/tigerbeetle/scripts/vopr.sh +0 -33
  108. package/src/tigerbeetle/src/benchmark_array_search.zig +0 -317
  109. package/src/tigerbeetle/src/benchmarks/perf.zig +0 -299
@@ -7,11 +7,10 @@ const meta = std.meta;
7
7
  const config = @import("../config.zig");
8
8
  const lsm = @import("tree.zig");
9
9
  const binary_search = @import("binary_search.zig");
10
- const binary_search_keys_raw = binary_search.binary_search_keys_raw;
11
10
 
12
11
  const Direction = @import("direction.zig").Direction;
13
12
  const SegmentedArray = @import("segmented_array.zig").SegmentedArray;
14
- const SegmentedArrayCursor = @import("segmented_array.zig").Cursor;
13
+ const SortedSegmentedArray = @import("segmented_array.zig").SortedSegmentedArray;
15
14
 
16
15
  pub fn ManifestLevelType(
17
16
  comptime NodePool: type,
@@ -23,23 +22,27 @@ pub fn ManifestLevelType(
23
22
  return struct {
24
23
  const Self = @This();
25
24
 
26
- const Keys = SegmentedArray(Key, NodePool, table_count_max);
27
- const Tables = SegmentedArray(TableInfo, NodePool, table_count_max);
28
-
29
- /// The maximum key of each key node in the keys segmented array.
30
- /// This is the starting point of our tiered lookup approach.
31
- /// Only the first keys.node_count elements are valid.
32
- root_keys_array: *[Keys.node_count_max]Key,
25
+ pub const Keys = SortedSegmentedArray(
26
+ Key,
27
+ NodePool,
28
+ table_count_max,
29
+ Key,
30
+ struct {
31
+ inline fn key_from_value(value: *const Key) Key {
32
+ return value.*;
33
+ }
34
+ }.key_from_value,
35
+ compare_keys,
36
+ .{},
37
+ );
33
38
 
34
- /// This is the index of the first table node that might contain the TableInfo
35
- /// corresponding to a given key node. This allows us to skip table nodes which cannot
36
- /// contain the target TableInfo when searching for the TableInfo with a given absolute
37
- /// index. Only the first keys.node_count elements are valid.
38
- root_table_nodes_array: *[Keys.node_count_max]u32,
39
+ pub const Tables = SegmentedArray(TableInfo, NodePool, table_count_max, .{});
39
40
 
40
41
  // These two segmented arrays are parallel. That is, the absolute indexes of maximum key
41
42
  // and corresponding TableInfo are the same. However, the number of nodes, node index, and
42
43
  // relative index into the node differ as the elements per node are different.
44
+ //
45
+ // Ordered by ascending (maximum) key. Keys may repeat due to snapshots.
43
46
  keys: Keys,
44
47
  tables: Tables,
45
48
 
@@ -48,12 +51,6 @@ pub fn ManifestLevelType(
48
51
  table_count_visible: u32 = 0,
49
52
 
50
53
  pub fn init(allocator: mem.Allocator) !Self {
51
- var root_keys_array = try allocator.create([Keys.node_count_max]Key);
52
- errdefer allocator.destroy(root_keys_array);
53
-
54
- var root_table_nodes_array = try allocator.create([Keys.node_count_max]u32);
55
- errdefer allocator.destroy(root_table_nodes_array);
56
-
57
54
  var keys = try Keys.init(allocator);
58
55
  errdefer keys.deinit(allocator, null);
59
56
 
@@ -61,190 +58,42 @@ pub fn ManifestLevelType(
61
58
  errdefer tables.deinit(allocator, null);
62
59
 
63
60
  return Self{
64
- .root_keys_array = root_keys_array,
65
- .root_table_nodes_array = root_table_nodes_array,
66
61
  .keys = keys,
67
62
  .tables = tables,
68
63
  };
69
64
  }
70
65
 
71
66
  pub fn deinit(level: *Self, allocator: mem.Allocator, node_pool: *NodePool) void {
72
- allocator.destroy(level.root_keys_array);
73
- allocator.destroy(level.root_table_nodes_array);
74
67
  level.keys.deinit(allocator, node_pool);
75
68
  level.tables.deinit(allocator, node_pool);
76
69
  }
77
70
 
78
71
  /// Inserts an ordered batch of tables into the level, then rebuilds the indexes.
79
- pub fn insert_tables(level: *Self, node_pool: *NodePool, tables: []const TableInfo) void {
80
- assert(tables.len > 0);
72
+ pub fn insert_table(level: *Self, node_pool: *NodePool, table: *const TableInfo) void {
81
73
  assert(level.keys.len() == level.tables.len());
82
74
 
83
- {
84
- var a = tables[0];
85
- assert(compare_keys(a.key_min, a.key_max) != .gt);
86
- for (tables[1..]) |b| {
87
- assert(compare_keys(b.key_min, b.key_max) != .gt);
88
- assert(compare_keys(a.key_max, b.key_min) == .lt);
89
- a = b;
90
- }
91
- }
92
-
93
- // Inserting multiple tables all at once is tricky due to duplicate keys via snapshots.
94
- // We therefore insert tables one by one, and then rebuild the indexes.
95
-
96
- var absolute_index = level.absolute_index_for_insert(tables[0].key_max);
75
+ const absolute_index = level.keys.insert_element(node_pool, table.key_max);
76
+ assert(absolute_index < level.keys.len());
77
+ level.tables.insert_elements(node_pool, absolute_index, &[_]TableInfo{table.*});
97
78
 
98
- var i: usize = 0;
99
- while (i < tables.len) : (i += 1) {
100
- const table = &tables[i];
101
-
102
- // Increment absolute_index until the key_max at absolute_index is greater than
103
- // or equal to table.key_max. This is the index we want to insert the table at.
104
- if (absolute_index < level.keys.len()) {
105
- var it = level.keys.iterator(absolute_index, 0, .ascending);
106
- while (it.next()) |key_max| : (absolute_index += 1) {
107
- if (compare_keys(key_max.*, table.key_max) != .lt) break;
108
- }
109
- }
110
-
111
- level.keys.insert_elements(node_pool, absolute_index, &[_]Key{table.key_max});
112
- level.tables.insert_elements(node_pool, absolute_index, tables[i..][0..1]);
113
-
114
- if (table.visible(lsm.snapshot_latest)) level.table_count_visible += 1;
115
- }
79
+ if (table.visible(lsm.snapshot_latest)) level.table_count_visible += 1;
116
80
 
117
81
  assert(level.keys.len() == level.tables.len());
118
-
119
- level.rebuild_root();
120
82
  }
121
83
 
122
- /// Return the index at which to insert a new table given the table's key_max.
123
- /// Requires all metadata/indexes to be valid.
124
- fn absolute_index_for_insert(level: Self, key_max: Key) u32 {
125
- const root = level.root_keys();
126
- if (root.len == 0) {
127
- assert(level.keys.len() == 0);
128
- assert(level.tables.len() == 0);
129
- return 0;
130
- }
131
-
132
- const key_node = binary_search_keys_raw(Key, compare_keys, root, key_max);
133
- assert(key_node <= level.keys.node_count);
134
- if (key_node == level.keys.node_count) {
135
- assert(level.keys.len() == level.tables.len());
136
- return level.keys.len();
137
- }
138
-
139
- const keys = level.keys.node_elements(key_node);
140
- const relative_index = binary_search_keys_raw(Key, compare_keys, keys, key_max);
141
-
142
- // The key must be less than or equal to the maximum key of this key node since the
143
- // first binary search checked this exact condition.
144
- assert(relative_index < keys.len);
145
-
146
- return level.keys.absolute_index_for_cursor(.{
147
- .node = key_node,
148
- .relative_index = relative_index,
149
- });
150
- }
151
-
152
- /// Rebuilds the root_keys and root_table_nodes arrays based on the current state of the
153
- /// keys and tables segmented arrays.
154
- fn rebuild_root(level: *Self) void {
155
- assert(level.keys.len() == level.tables.len());
156
-
157
- {
158
- mem.set(Key, level.root_keys_array, undefined);
159
- var key_node: u32 = 0;
160
- while (key_node < level.keys.node_count) : (key_node += 1) {
161
- level.root_keys_array[key_node] = level.keys.node_last_element(key_node);
162
- }
163
- }
164
-
165
- if (config.verify and level.keys.node_count > 1) {
166
- var a = level.root_keys_array[0];
167
- for (level.root_keys_array[1..level.keys.node_count]) |b| {
168
- assert(compare_keys(a, b) != .gt);
169
- a = b;
170
- }
171
- }
172
-
173
- {
174
- mem.set(u32, level.root_table_nodes_array, undefined);
175
- var key_node: u32 = 0;
176
- var table_node: u32 = 0;
177
- while (key_node < level.keys.node_count) : (key_node += 1) {
178
- const key_node_first_key = level.keys.node_elements(key_node)[0];
179
-
180
- // While the key_max of the table node is less than the first key_max of the
181
- // key_node, increment table_node.
182
- while (table_node < level.tables.node_count) : (table_node += 1) {
183
- const table_node_table_max = level.tables.node_last_element(table_node);
184
- const table_node_key_max = table_node_table_max.key_max;
185
- if (compare_keys(table_node_key_max, key_node_first_key) != .lt) {
186
- break;
187
- }
188
- } else {
189
- // Assert that we found the appropriate table_node and hit the break above.
190
- unreachable;
191
- }
192
-
193
- level.root_table_nodes_array[key_node] = table_node;
194
- }
195
- }
196
-
197
- if (config.verify and level.keys.node_count > 1) {
198
- var a = level.root_table_nodes_array[0];
199
- for (level.root_table_nodes_array[1..level.keys.node_count]) |b| {
200
- assert(a <= b);
201
- a = b;
202
- }
203
- }
204
-
205
- if (config.verify) {
206
- // Assert that the first key in each key node is in the range of the table
207
- // directly mapped to by root_table_nodes_array.
208
- for (level.root_table_nodes_array[0..level.keys.node_count]) |table_node, i| {
209
- const key_node = @intCast(u32, i);
210
- const key_node_first_key = level.keys.node_elements(key_node)[0];
211
-
212
- const table_node_key_min = level.tables.node_elements(table_node)[0].key_min;
213
- const table_node_key_max = level.tables.node_last_element(table_node).key_max;
214
-
215
- assert(compare_keys(table_node_key_min, table_node_key_max) != .gt);
216
-
217
- assert(compare_keys(key_node_first_key, table_node_key_min) != .lt);
218
- assert(compare_keys(key_node_first_key, table_node_key_max) != .gt);
219
- }
220
- }
221
- }
222
-
223
- /// Set snapshot_max for the given tables in the ManifestLevel.
224
- /// The tables slice must be sorted by table min/max key.
225
- /// Asserts that the tables currently have snapshot_max of math.maxInt(u64).
226
- /// Asserts that all tables in the ManifestLevel in the key range tables[0].key_min
227
- /// to tables[tables.len - 1].key_max are present in the tables slice.
228
- pub fn set_snapshot_max(level: *Self, snapshot: u64, tables: []const TableInfo) void {
84
+ /// Set snapshot_max for the given table in the ManifestLevel.
85
+ ///
86
+ /// * The table is mutable so that this function can update its snapshot.
87
+ /// * Asserts that the table currently has snapshot_max of math.maxInt(u64).
88
+ /// * Asserts that the table exists in the manifest.
89
+ pub fn set_snapshot_max(level: *Self, snapshot: u64, table: *TableInfo) void {
229
90
  assert(snapshot < lsm.snapshot_latest);
230
- assert(tables.len > 0);
231
- assert(level.table_count_visible >= tables.len);
232
-
233
- {
234
- var a = tables[0];
235
- assert(compare_keys(a.key_min, a.key_max) != .gt);
236
- for (tables[1..]) |b| {
237
- assert(compare_keys(b.key_min, b.key_max) != .gt);
238
- assert(compare_keys(a.key_max, b.key_min) == .lt);
239
- a = b;
240
- }
241
- }
91
+ assert(table.snapshot_max == math.maxInt(u64));
242
92
 
243
- const key_min = tables[0].key_min;
244
- const key_max = tables[tables.len - 1].key_max;
93
+ const key_min = table.key_min;
94
+ const key_max = table.key_max;
245
95
  assert(compare_keys(key_min, key_max) != .gt);
246
96
 
247
- var i: u32 = 0;
248
97
  var it = level.iterator(
249
98
  .visible,
250
99
  @as(*const [1]u64, &lsm.snapshot_latest),
@@ -252,135 +101,52 @@ pub fn ManifestLevelType(
252
101
  KeyRange{ .key_min = key_min, .key_max = key_max },
253
102
  );
254
103
 
255
- while (it.next()) |table_const| : (i += 1) {
256
- // This const cast is safe as we know that the memory pointed to is in fact
257
- // mutable. That is, the table is not in the .text or .rodata section. We do this
258
- // to avoid duplicating the iterator code in order to expose only a const iterator
259
- // in the public API.
260
- const table = @intToPtr(*TableInfo, @ptrToInt(table_const));
261
- assert(table.equal(&tables[i]));
104
+ const level_table_const = it.next().?;
105
+ // This const cast is safe as we know that the memory pointed to is in fact
106
+ // mutable. That is, the table is not in the .text or .rodata section. We do this
107
+ // to avoid duplicating the iterator code in order to expose only a const iterator
108
+ // in the public API.
109
+ const level_table = @intToPtr(*TableInfo, @ptrToInt(level_table_const));
110
+ assert(level_table.equal(table));
111
+ assert(level_table.snapshot_max == math.maxInt(u64));
262
112
 
263
- assert(table.snapshot_max == math.maxInt(u64));
264
- table.snapshot_max = snapshot;
265
- }
113
+ level_table.snapshot_max = snapshot;
114
+ table.snapshot_max = snapshot;
266
115
 
267
- assert(i == tables.len);
268
- level.table_count_visible -= @intCast(u32, tables.len);
116
+ assert(it.next() == null);
117
+ level.table_count_visible -= 1;
269
118
  }
270
119
 
271
- /// Remove the given tables from the ManifestLevel, asserting that they are not visible
272
- /// by any snapshot in snapshots or by lsm.snapshot_latest.
273
- /// The tables slice must be sorted by table min/max key.
274
- /// Asserts that all tables in the ManifestLevel in the key range tables[0].key_min
275
- /// to tables[tables.len - 1].key_max and not visible by any snapshot are present in
276
- /// the tables slice.
277
- pub fn remove_tables(
120
+ /// Remove the given table from the ManifestLevel, asserting that it is not visible
121
+ /// by any snapshot in `snapshots` or by `lsm.snapshot_latest`.
122
+ pub fn remove_table(
278
123
  level: *Self,
279
124
  node_pool: *NodePool,
280
125
  snapshots: []const u64,
281
- tables: []const TableInfo,
126
+ table: *const TableInfo,
282
127
  ) void {
283
- assert(tables.len > 0);
284
128
  assert(level.keys.len() == level.tables.len());
285
- assert(level.keys.len() - level.table_count_visible >= tables.len);
286
-
287
- {
288
- var a = tables[0];
289
- assert(compare_keys(a.key_min, a.key_max) != .gt);
290
- for (tables[1..]) |b| {
291
- assert(compare_keys(b.key_min, b.key_max) != .gt);
292
- assert(compare_keys(a.key_max, b.key_min) == .lt);
293
- a = b;
294
- }
295
- }
296
-
297
- const key_min = tables[0].key_min;
298
- const key_max = tables[tables.len - 1].key_max;
299
129
  // The batch may contain a single table, with a single key, i.e. key_min == key_max:
300
- assert(compare_keys(key_min, key_max) != .gt);
130
+ assert(compare_keys(table.key_min, table.key_max) != .gt);
301
131
 
302
- var absolute_index = level.absolute_index_for_remove(key_min);
132
+ // Use `key_min` for both ends of the iterator; we are looking for a single table.
133
+ const cursor_start = level.iterator_start(table.key_min, table.key_min, .ascending).?;
134
+ var absolute_index = level.keys.absolute_index_for_cursor(cursor_start);
303
135
 
304
- {
305
- var it = level.tables.iterator(absolute_index, 0, .ascending);
306
- while (it.next()) |table| : (absolute_index += 1) {
307
- if (table.invisible(snapshots)) {
308
- assert(table.equal(&tables[0]));
309
- break;
310
- }
311
- } else {
312
- unreachable;
313
- }
314
- }
136
+ var it = level.tables.iterator_from_index(absolute_index, .ascending);
137
+ while (it.next()) |level_table| : (absolute_index += 1) {
138
+ if (level_table.invisible(snapshots)) {
139
+ assert(level_table.equal(table));
315
140
 
316
- var i: u32 = 0;
317
- var safety_counter: u32 = 0;
318
- outer: while (safety_counter < tables.len) : (safety_counter += 1) {
319
- var it = level.tables.iterator(absolute_index, 0, .ascending);
320
- inner: while (it.next()) |table| : (absolute_index += 1) {
321
- if (table.invisible(snapshots)) {
322
- assert(table.equal(&tables[i]));
323
-
324
- const table_key_max = table.key_max;
325
- level.keys.remove_elements(node_pool, absolute_index, 1);
326
- level.tables.remove_elements(node_pool, absolute_index, 1);
327
- i += 1;
328
-
329
- switch (compare_keys(table_key_max, key_max)) {
330
- .lt => break :inner,
331
- .eq => break :outer,
332
- // We require the key_min/key_max to be exact, so the last table
333
- // matching the snapshot must have the provided key_max.
334
- .gt => unreachable,
335
- }
336
- } else {
337
- // We handle the first table to be removed specially before this main loop
338
- // in order to check for an exact key_min match.
339
- assert(i > 0);
340
- }
341
- } else {
342
- unreachable;
141
+ level.keys.remove_elements(node_pool, absolute_index, 1);
142
+ level.tables.remove_elements(node_pool, absolute_index, 1);
143
+ break;
343
144
  }
344
145
  } else {
345
146
  unreachable;
346
147
  }
347
- assert(i == tables.len);
348
- // The loop will never terminate naturally, only through the `break :outer`, which
349
- // means the +1 here is required as the continue safety_counter += 1 continue
350
- // expression isn't run on the last iteration of the loop.
351
- assert(safety_counter + 1 == tables.len);
352
148
 
353
149
  assert(level.keys.len() == level.tables.len());
354
-
355
- level.rebuild_root();
356
- }
357
-
358
- /// Return the index of the first table that could have the given key_min.
359
- /// Requires all metadata/indexes to be valid.
360
- fn absolute_index_for_remove(level: Self, key_min: Key) u32 {
361
- const root = level.root_keys();
362
- assert(root.len > 0);
363
-
364
- const key_node = binary_search_keys_raw(Key, compare_keys, root, key_min);
365
- assert(key_node < level.keys.node_count);
366
-
367
- const keys = level.keys.node_elements(key_node);
368
- assert(keys.len > 0);
369
-
370
- const relative_index = binary_search_keys_raw(Key, compare_keys, keys, key_min);
371
- assert(relative_index < keys.len);
372
-
373
- return level.keys.absolute_index_for_cursor(level.iterator_start_boundary(
374
- .{
375
- .node = key_node,
376
- .relative_index = relative_index,
377
- },
378
- .ascending,
379
- ));
380
- }
381
-
382
- inline fn root_keys(level: Self) []Key {
383
- return level.root_keys_array[0..level.keys.node_count];
384
150
  }
385
151
 
386
152
  pub const Visibility = enum {
@@ -389,8 +155,8 @@ pub fn ManifestLevelType(
389
155
  };
390
156
 
391
157
  pub const KeyRange = struct {
392
- key_min: Key,
393
- key_max: Key,
158
+ key_min: Key, // Inclusive.
159
+ key_max: Key, // Inclusive.
394
160
  };
395
161
 
396
162
  pub fn iterator(
@@ -409,9 +175,8 @@ pub fn ManifestLevelType(
409
175
  assert(compare_keys(range.key_min, range.key_max) != .gt);
410
176
 
411
177
  if (level.iterator_start(range.key_min, range.key_max, direction)) |start| {
412
- break :blk level.tables.iterator(
178
+ break :blk level.tables.iterator_from_index(
413
179
  level.keys.absolute_index_for_cursor(start),
414
- level.iterator_start_table_node_for_key_node(start.node, direction),
415
180
  direction,
416
181
  );
417
182
  } else {
@@ -424,14 +189,9 @@ pub fn ManifestLevelType(
424
189
  }
425
190
  } else {
426
191
  switch (direction) {
427
- .ascending => break :blk level.tables.iterator(0, 0, direction),
192
+ .ascending => break :blk level.tables.iterator_from_index(0, direction),
428
193
  .descending => {
429
- const last = level.tables.last();
430
- break :blk level.tables.iterator(
431
- level.tables.absolute_index_for_cursor(last),
432
- last.node,
433
- .descending,
434
- );
194
+ break :blk level.tables.iterator_from_cursor(level.tables.last(), .descending);
435
195
  },
436
196
  }
437
197
  }
@@ -496,7 +256,7 @@ pub fn ManifestLevelType(
496
256
  // Unlike in the ascending case, it is not guaranteed that
497
257
  // table.key_min is less than or equal to key_range.key_max on the
498
258
  // first iteration as only the key_max of a table is stored in our
499
- // root/key nodes. On subsequent iterations this check will always
259
+ // key nodes. On subsequent iterations this check will always
500
260
  // be false.
501
261
  if (compare_keys(table.key_min, key_range.key_max) == .gt) {
502
262
  continue;
@@ -519,9 +279,10 @@ pub fn ManifestLevelType(
519
279
  }
520
280
  };
521
281
 
522
- /// Returns the table segmented array cursor at which iteration should be started.
282
+ /// Returns the keys segmented array cursor at which iteration should be started.
523
283
  /// May return null if there is nothing to iterate because we know for sure that the key
524
284
  /// range is disjoint with the tables stored in this level.
285
+ ///
525
286
  /// However, the cursor returned is not guaranteed to be in range for the query as only
526
287
  /// the key_max is stored in the index structures, not the key_min, and only the start
527
288
  /// bound for the given direction is checked here.
@@ -530,66 +291,52 @@ pub fn ManifestLevelType(
530
291
  key_min: Key,
531
292
  key_max: Key,
532
293
  direction: Direction,
533
- ) ?SegmentedArrayCursor {
294
+ ) ?Keys.Cursor {
534
295
  assert(compare_keys(key_min, key_max) != .gt);
296
+ assert(level.keys.len() == level.tables.len());
535
297
 
536
- const root = level.root_keys();
537
- if (root.len == 0) {
538
- assert(level.keys.len() == 0);
539
- assert(level.tables.len() == 0);
540
- return null;
541
- }
298
+ if (level.keys.len() == 0) return null;
542
299
 
543
- const key = switch (direction) {
300
+ // Ascending: Find the first table where table.key_max ≥ iterator.key_min.
301
+ // Descending: Find the first table where table.key_max ≥ iterator.key_max.
302
+ const target = level.keys.search(switch (direction) {
544
303
  .ascending => key_min,
545
304
  .descending => key_max,
546
- };
305
+ });
306
+ assert(target.node <= level.keys.node_count);
547
307
 
548
- const key_node = binary_search_keys_raw(Key, compare_keys, root, key);
549
- assert(key_node <= level.keys.node_count);
550
- if (key_node == level.keys.node_count) {
551
- switch (direction) {
308
+ if (level.keys.absolute_index_for_cursor(target) == level.keys.len()) {
309
+ return switch (direction) {
552
310
  // The key_min of the target range is greater than the key_max of the last
553
311
  // table in the level and we are ascending, so this range matches no tables
554
312
  // on this level.
555
- .ascending => return null,
313
+ .ascending => null,
556
314
  // The key_max of the target range is greater than the key_max of the last
557
315
  // table in the level and we are descending, so we need to start iteration
558
316
  // at the last table in the level.
559
- .descending => return level.keys.last(),
560
- }
317
+ .descending => level.keys.last(),
318
+ };
319
+ } else {
320
+ // Multiple tables in the level may share a key.
321
+ // Scan to the edge so that the iterator will cover them all.
322
+ return level.iterator_start_boundary(target, direction);
561
323
  }
562
-
563
- const keys = level.keys.node_elements(key_node);
564
- const relative_index = binary_search_keys_raw(Key, compare_keys, keys, key);
565
-
566
- // The key must be less than or equal to the maximum key of this key node since the
567
- // first binary search checked this exact condition.
568
- assert(relative_index < keys.len);
569
-
570
- return level.iterator_start_boundary(
571
- .{
572
- .node = key_node,
573
- .relative_index = relative_index,
574
- },
575
- direction,
576
- );
577
324
  }
578
325
 
579
326
  /// This function exists because there may be tables in the level with the same
580
327
  /// key_max but non-overlapping snapshot visibility.
328
+ ///
329
+ /// Put differently, there may be several tables with different snapshots but the same
330
+ /// `key_max`, and `iterator_start`'s binary search (`key_cursor`) may have landed in the
331
+ /// middle of them.
581
332
  fn iterator_start_boundary(
582
333
  level: Self,
583
- key_cursor: SegmentedArrayCursor,
334
+ key_cursor: Keys.Cursor,
584
335
  direction: Direction,
585
- ) SegmentedArrayCursor {
586
- var reverse = level.keys.iterator(
587
- level.keys.absolute_index_for_cursor(key_cursor),
588
- key_cursor.node,
589
- direction.reverse(),
590
- );
591
-
336
+ ) Keys.Cursor {
337
+ var reverse = level.keys.iterator_from_cursor(key_cursor, direction.reverse());
592
338
  assert(meta.eql(reverse.cursor, key_cursor));
339
+
593
340
  // This cursor will always point to a key equal to start_key.
594
341
  var adjusted = reverse.cursor;
595
342
  const start_key = reverse.next().?.*;
@@ -611,26 +358,20 @@ pub fn ManifestLevelType(
611
358
  return adjusted;
612
359
  }
613
360
 
614
- inline fn iterator_start_table_node_for_key_node(
615
- level: Self,
616
- key_node: u32,
617
- direction: Direction,
618
- ) u32 {
619
- assert(key_node < level.keys.node_count);
620
-
621
- switch (direction) {
622
- .ascending => return level.root_table_nodes_array[key_node],
623
- .descending => {
624
- if (key_node + 1 < level.keys.node_count) {
625
- // Since the corresponding node in root_table_nodes_array is a lower bound,
626
- // we must add one to make it an upper bound when descending.
627
- return level.root_table_nodes_array[key_node + 1];
628
- } else {
629
- // However, if we are at the last key node, then return the last table node.
630
- return level.tables.node_count - 1;
631
- }
632
- },
361
+ /// The function is only used for verification; it is not performance-critical.
362
+ pub fn contains(level: Self, table: *const TableInfo) bool {
363
+ assert(config.verify);
364
+
365
+ var level_tables = level.iterator(.visible, &.{
366
+ table.snapshot_min,
367
+ }, .ascending, KeyRange{
368
+ .key_min = table.key_min,
369
+ .key_max = table.key_max,
370
+ });
371
+ while (level_tables.next()) |level_table| {
372
+ if (level_table.equal(table)) return true;
633
373
  }
374
+ return false;
634
375
  }
635
376
  };
636
377
  }
@@ -647,64 +388,38 @@ pub fn TestContext(
647
388
 
648
389
  const log = false;
649
390
 
391
+ const Value = struct {
392
+ key: Key,
393
+ tombstone: bool,
394
+ };
395
+
650
396
  inline fn compare_keys(a: Key, b: Key) math.Order {
651
397
  return math.order(a, b);
652
398
  }
653
399
 
654
- // TODO Import this type from lsm/tree.zig.
655
- const TableInfo = extern struct {
656
- checksum: u128,
657
- address: u64,
658
- flags: u64 = 0,
659
-
660
- /// The minimum snapshot that can see this table (with exclusive bounds).
661
- /// This value is set to the current snapshot tick on table creation.
662
- snapshot_min: u64,
663
-
664
- /// The maximum snapshot that can see this table (with exclusive bounds).
665
- /// This value is set to the current snapshot tick on table deletion.
666
- snapshot_max: u64 = math.maxInt(u64),
667
-
668
- key_min: Key,
669
- key_max: Key,
670
-
671
- comptime {
672
- assert(@sizeOf(TableInfo) == 48 + @sizeOf(Key) * 2);
673
- assert(@alignOf(TableInfo) == 16);
674
- }
675
-
676
- pub fn visible(table: *const @This(), snapshot: u64) bool {
677
- assert(table.address != 0);
678
- assert(table.snapshot_min < table.snapshot_max);
679
- assert(snapshot <= lsm.snapshot_latest);
680
-
681
- assert(snapshot != table.snapshot_min);
682
- assert(snapshot != table.snapshot_max);
400
+ inline fn key_from_value(value: *const Value) Key {
401
+ return value.key;
402
+ }
683
403
 
684
- return table.snapshot_min < snapshot and snapshot < table.snapshot_max;
685
- }
404
+ inline fn tombstone_from_key(key: Key) Value {
405
+ return .{ .key = key, .tombstone = true };
406
+ }
686
407
 
687
- pub fn invisible(table: *const TableInfo, snapshots: []const u64) bool {
688
- if (table.visible(lsm.snapshot_latest)) return false;
689
- for (snapshots) |snapshot| if (table.visible(snapshot)) return false;
690
- return true;
691
- }
408
+ inline fn tombstone(value: *const Value) bool {
409
+ return value.tombstone;
410
+ }
692
411
 
693
- pub fn equal(table: *const TableInfo, other: *const TableInfo) bool {
694
- // TODO since the layout of TableInfo is well defined, a direct memcmp might
695
- // be faster here. However, it's not clear if we can make the assumption that
696
- // compare_keys() will return .eq exactly when the memory of the keys are
697
- // equal. Consider defining the API to allow this and check the generated code.
698
- return table.checksum == other.checksum and
699
- table.address == other.address and
700
- table.flags == other.flags and
701
- table.snapshot_min == other.snapshot_min and
702
- table.snapshot_max == other.snapshot_max and
703
- compare_keys(table.key_min, other.key_min) == .eq and
704
- compare_keys(table.key_max, other.key_max) == .eq;
705
- }
706
- };
412
+ const Table = @import("table.zig").TableType(
413
+ Key,
414
+ Value,
415
+ compare_keys,
416
+ key_from_value,
417
+ std.math.maxInt(Key),
418
+ tombstone,
419
+ tombstone_from_key,
420
+ );
707
421
 
422
+ const TableInfo = @import("manifest.zig").TableInfoType(Table);
708
423
  const NodePool = @import("node_pool.zig").NodePool;
709
424
 
710
425
  const TestPool = NodePool(node_size, @alignOf(TableInfo));
@@ -807,23 +522,20 @@ pub fn TestContext(
807
522
  }
808
523
  }
809
524
 
810
- context.level.insert_tables(&context.pool, buffer[0..count]);
525
+ for (buffer[0..count]) |*table| {
526
+ context.level.insert_table(&context.pool, table);
527
+ }
811
528
 
812
529
  for (buffer[0..count]) |table| {
813
- const index = blk: {
814
- if (context.reference.items.len == 0) {
815
- break :blk 0;
816
- } else {
817
- break :blk binary_search.binary_search_values_raw(
818
- Key,
819
- TableInfo,
820
- key_min_from_table,
821
- compare_keys,
822
- context.reference.items,
823
- table.key_max,
824
- );
825
- }
826
- };
530
+ const index = binary_search.binary_search_values_raw(
531
+ Key,
532
+ TableInfo,
533
+ key_min_from_table,
534
+ compare_keys,
535
+ context.reference.items,
536
+ table.key_max,
537
+ .{},
538
+ );
827
539
  // Can't be equal as the tables may not overlap
828
540
  if (index < context.reference.items.len) {
829
541
  assert(context.reference.items[index].key_min > table.key_max);
@@ -841,20 +553,15 @@ pub fn TestContext(
841
553
 
842
554
  assert(compare_keys(new_key_min, key) == .gt);
843
555
 
844
- var i = blk: {
845
- if (context.reference.items.len == 0) {
846
- break :blk 0;
847
- } else {
848
- break :blk binary_search.binary_search_values_raw(
849
- Key,
850
- TableInfo,
851
- key_min_from_table,
852
- compare_keys,
853
- context.reference.items,
854
- new_key_min,
855
- );
856
- }
857
- };
556
+ var i = binary_search.binary_search_values_raw(
557
+ Key,
558
+ TableInfo,
559
+ key_min_from_table,
560
+ compare_keys,
561
+ context.reference.items,
562
+ new_key_min,
563
+ .{},
564
+ );
858
565
 
859
566
  if (i > 0) {
860
567
  if (compare_keys(new_key_min, context.reference.items[i - 1].key_max) != .gt) {
@@ -930,11 +637,9 @@ pub fn TestContext(
930
637
  }
931
638
 
932
639
  if (tables.items.len > 0) {
933
- context.level.remove_tables(
934
- &context.pool,
935
- snapshots,
936
- tables.items,
937
- );
640
+ for (tables.items) |*table| {
641
+ context.level.remove_table(&context.pool, snapshots, table);
642
+ }
938
643
  }
939
644
  }
940
645
 
@@ -950,10 +655,10 @@ pub fn TestContext(
950
655
 
951
656
  const snapshot = context.take_snapshot();
952
657
 
953
- context.level.set_snapshot_max(snapshot, context.reference.items[index..][0..count]);
954
658
  for (context.reference.items[index..][0..count]) |*table| {
955
- table.snapshot_max = snapshot;
659
+ context.level.set_snapshot_max(snapshot, table);
956
660
  }
661
+
957
662
  for (context.snapshot_tables.slice()) |tables| {
958
663
  for (tables.items) |*table| {
959
664
  for (context.reference.items[index..][0..count]) |modified| {
@@ -993,11 +698,13 @@ pub fn TestContext(
993
698
  }
994
699
 
995
700
  if (to_remove.items.len > 0) {
996
- context.level.remove_tables(
997
- &context.pool,
998
- context.snapshots.slice(),
999
- to_remove.items,
1000
- );
701
+ for (to_remove.items) |*table| {
702
+ context.level.remove_table(
703
+ &context.pool,
704
+ context.snapshots.slice(),
705
+ table,
706
+ );
707
+ }
1001
708
  }
1002
709
  }
1003
710