tigerbeetle-node 0.9.0 → 0.10.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/dist/index.d.ts +66 -61
- package/dist/index.js +66 -61
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/index.ts +5 -0
- package/src/node.zig +17 -18
- package/src/tigerbeetle/scripts/benchmark.bat +4 -3
- package/src/tigerbeetle/scripts/benchmark.sh +25 -10
- package/src/tigerbeetle/scripts/install.sh +2 -1
- package/src/tigerbeetle/scripts/install_zig.sh +14 -18
- package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +12 -3
- package/src/tigerbeetle/scripts/vopr.sh +5 -5
- package/src/tigerbeetle/src/benchmark.zig +17 -9
- package/src/tigerbeetle/src/benchmark_array_search.zig +317 -0
- package/src/tigerbeetle/src/benchmarks/perf.zig +299 -0
- package/src/tigerbeetle/src/c/tb_client/context.zig +103 -0
- package/src/tigerbeetle/src/c/tb_client/packet.zig +80 -0
- package/src/tigerbeetle/src/c/tb_client/signal.zig +288 -0
- package/src/tigerbeetle/src/c/tb_client/thread.zig +329 -0
- package/src/tigerbeetle/src/c/tb_client.h +201 -0
- package/src/tigerbeetle/src/c/tb_client.zig +101 -0
- package/src/tigerbeetle/src/c/test.zig +1 -0
- package/src/tigerbeetle/src/cli.zig +142 -83
- package/src/tigerbeetle/src/config.zig +119 -10
- package/src/tigerbeetle/src/demo.zig +12 -8
- package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +2 -2
- package/src/tigerbeetle/src/ewah.zig +318 -0
- package/src/tigerbeetle/src/ewah_benchmark.zig +121 -0
- package/src/tigerbeetle/src/eytzinger_benchmark.zig +317 -0
- package/src/tigerbeetle/src/fifo.zig +17 -1
- package/src/tigerbeetle/src/io/darwin.zig +12 -10
- package/src/tigerbeetle/src/io/linux.zig +25 -9
- package/src/tigerbeetle/src/io/windows.zig +13 -9
- package/src/tigerbeetle/src/iops.zig +101 -0
- package/src/tigerbeetle/src/lsm/binary_search.zig +214 -0
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +82 -0
- package/src/tigerbeetle/src/lsm/compaction.zig +603 -0
- package/src/tigerbeetle/src/lsm/composite_key.zig +75 -0
- package/src/tigerbeetle/src/lsm/direction.zig +11 -0
- package/src/tigerbeetle/src/lsm/eytzinger.zig +587 -0
- package/src/tigerbeetle/src/lsm/forest.zig +630 -0
- package/src/tigerbeetle/src/lsm/grid.zig +473 -0
- package/src/tigerbeetle/src/lsm/groove.zig +939 -0
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +452 -0
- package/src/tigerbeetle/src/lsm/level_iterator.zig +296 -0
- package/src/tigerbeetle/src/lsm/manifest.zig +680 -0
- package/src/tigerbeetle/src/lsm/manifest_level.zig +1169 -0
- package/src/tigerbeetle/src/lsm/manifest_log.zig +904 -0
- package/src/tigerbeetle/src/lsm/node_pool.zig +231 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +399 -0
- package/src/tigerbeetle/src/lsm/segmented_array.zig +998 -0
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +844 -0
- package/src/tigerbeetle/src/lsm/table.zig +932 -0
- package/src/tigerbeetle/src/lsm/table_immutable.zig +196 -0
- package/src/tigerbeetle/src/lsm/table_iterator.zig +295 -0
- package/src/tigerbeetle/src/lsm/table_mutable.zig +123 -0
- package/src/tigerbeetle/src/lsm/test.zig +429 -0
- package/src/tigerbeetle/src/lsm/tree.zig +1085 -0
- package/src/tigerbeetle/src/main.zig +119 -109
- package/src/tigerbeetle/src/message_bus.zig +49 -48
- package/src/tigerbeetle/src/message_pool.zig +15 -2
- package/src/tigerbeetle/src/ring_buffer.zig +126 -30
- package/src/tigerbeetle/src/simulator.zig +76 -44
- package/src/tigerbeetle/src/state_machine.zig +1022 -585
- package/src/tigerbeetle/src/storage.zig +46 -16
- package/src/tigerbeetle/src/test/cluster.zig +109 -63
- package/src/tigerbeetle/src/test/message_bus.zig +15 -24
- package/src/tigerbeetle/src/test/network.zig +26 -17
- package/src/tigerbeetle/src/test/state_checker.zig +7 -5
- package/src/tigerbeetle/src/test/state_machine.zig +159 -69
- package/src/tigerbeetle/src/test/storage.zig +57 -28
- package/src/tigerbeetle/src/tigerbeetle.zig +5 -0
- package/src/tigerbeetle/src/unit_tests.zig +8 -0
- package/src/tigerbeetle/src/util.zig +51 -0
- package/src/tigerbeetle/src/vsr/client.zig +21 -7
- package/src/tigerbeetle/src/vsr/journal.zig +154 -167
- package/src/tigerbeetle/src/vsr/replica.zig +744 -226
- package/src/tigerbeetle/src/vsr/superblock.zig +1743 -0
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +258 -0
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +644 -0
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +546 -0
- package/src/tigerbeetle/src/vsr.zig +43 -115
package/src/tigerbeetle/src/lsm/node_pool.zig
@@ -0,0 +1,231 @@
+const std = @import("std");
+const assert = std.debug.assert;
+const math = std.math;
+const mem = std.mem;
+const meta = std.meta;
+
+pub fn NodePool(comptime _node_size: u32, comptime _node_alignment: u13) type {
+    return struct {
+        const Self = @This();
+
+        pub const node_size = _node_size;
+        pub const node_alignment = _node_alignment;
+        pub const Node = *align(node_alignment) [node_size]u8;
+
+        comptime {
+            assert(node_size > 0);
+            assert(node_alignment > 0);
+            assert(node_alignment <= 4096);
+            assert(math.isPowerOfTwo(node_size));
+            assert(math.isPowerOfTwo(node_alignment));
+            assert(node_size % node_alignment == 0);
+        }
+
+        buffer: []align(node_alignment) u8,
+        free: std.bit_set.DynamicBitSetUnmanaged,
+
+        pub fn init(allocator: mem.Allocator, node_count: u32) !Self {
+            assert(node_count > 0);
+
+            const size = node_size * node_count;
+            const buffer = try allocator.allocAdvanced(u8, node_alignment, size, .exact);
+            errdefer allocator.free(buffer);
+
+            const free = try std.bit_set.DynamicBitSetUnmanaged.initFull(allocator, node_count);
+            errdefer free.deinit(allocator);
+
+            return Self{
+                .buffer = buffer,
+                .free = free,
+            };
+        }
+
+        pub fn deinit(pool: *Self, allocator: mem.Allocator) void {
+            allocator.free(pool.buffer);
+            pool.free.deinit(allocator);
+        }
+
+        pub fn acquire(pool: *Self) Node {
+            // TODO: To ensure this "unreachable" is never reached, the primary must reject
+            // new requests when storage space is too low to fulfill them.
+            const node_index = pool.free.findFirstSet() orelse unreachable;
+            assert(pool.free.isSet(node_index));
+            pool.free.unset(node_index);
+
+            return pool.buffer[node_index * node_size ..][0..node_size];
+        }
+
+        pub fn release(pool: *Self, node: Node) void {
+            // Our pointer arithmetic assumes that the unit of node_size is a u8.
+            comptime assert(meta.Elem(Node) == u8);
+            comptime assert(meta.Elem(@TypeOf(pool.buffer)) == u8);
+
+            assert(@ptrToInt(node) >= @ptrToInt(pool.buffer.ptr));
+            assert(@ptrToInt(node) + node_size <= @ptrToInt(pool.buffer.ptr) + pool.buffer.len);
+
+            const node_index = @divExact(@ptrToInt(node) - @ptrToInt(pool.buffer.ptr), node_size);
+            assert(!pool.free.isSet(node_index));
+            pool.free.set(node_index);
+        }
+    };
+}
+
+fn TestContext(comptime node_size: usize, comptime node_alignment: u12) type {
+    const testing = std.testing;
+    const TestPool = NodePool(node_size, node_alignment);
+
+    const log = false;
+
+    return struct {
+        const Self = @This();
+
+        node_count: u32,
+        random: std.rand.Random,
+        node_pool: TestPool,
+        node_map: std.AutoArrayHashMap(TestPool.Node, u64),
+        sentinel: u64,
+
+        acquires: u64 = 0,
+        releases: u64 = 0,
+
+        fn init(random: std.rand.Random, node_count: u32) !Self {
+            var node_pool = try TestPool.init(testing.allocator, node_count);
+            errdefer node_pool.deinit(testing.allocator);
+
+            var node_map = std.AutoArrayHashMap(TestPool.Node, u64).init(testing.allocator);
+            errdefer node_map.deinit();
+
+            const sentinel = random.int(u64);
+            mem.set(u64, mem.bytesAsSlice(u64, node_pool.buffer), sentinel);
+
+            return Self{
+                .node_count = node_count,
+                .random = random,
+                .node_pool = node_pool,
+                .node_map = node_map,
+                .sentinel = sentinel,
+            };
+        }
+
+        fn deinit(context: *Self) void {
+            context.node_pool.deinit(testing.allocator);
+            context.node_map.deinit();
+        }
+
+        fn run(context: *Self) !void {
+            {
+                var i: usize = 0;
+                while (i < context.node_count * 4) : (i += 1) {
+                    switch (context.random.uintLessThanBiased(u32, 100)) {
+                        0...59 => try context.acquire(),
+                        60...99 => try context.release(),
+                        else => unreachable,
+                    }
+                }
+            }
+
+            {
+                var i: usize = 0;
+                while (i < context.node_count * 4) : (i += 1) {
+                    switch (context.random.uintLessThanBiased(u32, 100)) {
+                        0...39 => try context.acquire(),
+                        40...99 => try context.release(),
+                        else => unreachable,
+                    }
+                }
+            }
+
+            try context.release_all();
+        }
+
+        fn acquire(context: *Self) !void {
+            if (context.node_map.count() == context.node_count) return;
+
+            const node = context.node_pool.acquire();
+
+            // Verify that this node has not already been acquired.
+            for (mem.bytesAsSlice(u64, node)) |word| {
+                try testing.expectEqual(context.sentinel, word);
+            }
+
+            const gop = try context.node_map.getOrPut(node);
+            try testing.expect(!gop.found_existing);
+
+            // Write unique data into the node so we can test that it doesn't get overwritten.
+            const id = context.random.int(u64);
+            mem.set(u64, mem.bytesAsSlice(u64, node), id);
+            gop.value_ptr.* = id;
+
+            context.acquires += 1;
+        }
+
+        fn release(context: *Self) !void {
+            if (context.node_map.count() == 0) return;
+
+            const index = context.random.uintLessThanBiased(usize, context.node_map.count());
+            const node = context.node_map.keys()[index];
+            const id = context.node_map.values()[index];
+
+            // Verify that the data of this node has not been overwritten since we acquired it.
+            for (mem.bytesAsSlice(u64, node)) |word| {
+                try testing.expectEqual(id, word);
+            }
+
+            mem.set(u64, mem.bytesAsSlice(u64, node), context.sentinel);
+            context.node_pool.release(node);
+            context.node_map.swapRemoveAt(index);
+
+            context.releases += 1;
+        }
+
+        fn release_all(context: *Self) !void {
+            while (context.node_map.count() > 0) try context.release();
+
+            // Verify that nothing in the entire buffer has been acquired.
+            for (mem.bytesAsSlice(u64, context.node_pool.buffer)) |word| {
+                try testing.expectEqual(context.sentinel, word);
+            }
+
+            if (log) {
+                std.debug.print("\nacquires: {}, releases: {}\n", .{
+                    context.acquires,
+                    context.releases,
+                });
+            }
+
+            try testing.expect(context.acquires > 0);
+            try testing.expect(context.acquires == context.releases);
+        }
+    };
+}
+
+test "NodePool" {
+    const seed = 42;
+
+    var prng = std.rand.DefaultPrng.init(seed);
+    const random = prng.random();
+
+    const Tuple = struct {
+        node_size: u32,
+        node_alignment: u12,
+    };
+
+    inline for (.{
+        Tuple{ .node_size = 8, .node_alignment = 8 },
+        Tuple{ .node_size = 16, .node_alignment = 8 },
+        Tuple{ .node_size = 64, .node_alignment = 8 },
+        Tuple{ .node_size = 16, .node_alignment = 16 },
+        Tuple{ .node_size = 32, .node_alignment = 16 },
+        Tuple{ .node_size = 128, .node_alignment = 16 },
+    }) |tuple| {
+        const Context = TestContext(tuple.node_size, tuple.node_alignment);
+
+        var i: u32 = 1;
+        while (i < 64) : (i += 1) {
+            var context = try Context.init(random, i);
+            defer context.deinit();

+            try context.run();
+        }
+    }
+}
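For orientation, a minimal usage sketch of the new pool (illustrative, not part of the package): the node size 512, alignment 16, and count 32 are arbitrary values, the import path assumes a file sitting next to the src/ tree, and the std calls target the Zig 0.9.x-era APIs this package builds against.

    const std = @import("std");
    const NodePool = @import("lsm/node_pool.zig").NodePool(512, 16);

    pub fn main() !void {
        var gpa = std.heap.GeneralPurposeAllocator(.{}){};
        defer _ = gpa.deinit();

        // Back the pool with 32 nodes of 512 bytes each, 16-byte aligned.
        var pool = try NodePool.init(gpa.allocator(), 32);
        defer pool.deinit(gpa.allocator());

        // acquire() returns a *align(16) [512]u8; release() asserts that the
        // node belongs to this pool's buffer and is not double-freed.
        const node = pool.acquire();
        std.mem.set(u8, node, 0);
        pool.release(node);
    }

The design trades flexibility for simplicity: every node shares one comptime-fixed size, so acquire() and release() reduce to a bit-set scan plus pointer arithmetic, with no per-node headers.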
package/src/tigerbeetle/src/lsm/posted_groove.zig
@@ -0,0 +1,399 @@
+const std = @import("std");
+const builtin = @import("builtin");
+const assert = std.debug.assert;
+const math = std.math;
+const mem = std.mem;
+
+const config = @import("../config.zig");
+
+const TableType = @import("table.zig").TableType;
+const TreeType = @import("tree.zig").TreeType;
+const GridType = @import("grid.zig").GridType;
+const NodePool = @import("node_pool.zig").NodePool(config.lsm_manifest_node_size, 16);
+
+const snapshot_latest = @import("tree.zig").snapshot_latest;
+
+/// This type wraps a single LSM tree in the API needed to integrate it with the Forest.
+/// TigerBeetle's state machine requires a map from u128 ID to posted boolean for transfers
+/// and this type implements that.
+/// TODO Make the LSM Forest library flexible enough to be able to get rid of this special case.
+pub fn PostedGrooveType(comptime Storage: type) type {
+    return struct {
+        const PostedGroove = @This();
+
+        const Value = extern struct {
+            id: u128,
+            data: enum(u8) {
+                posted,
+                voided,
+                tombstone,
+            },
+            padding: [15]u8 = [_]u8{0} ** 15,
+
+            comptime {
+                // Assert that there is no implicit padding.
+                assert(@sizeOf(Value) == 32);
+                assert(@bitSizeOf(Value) == 32 * 8);
+            }
+
+            inline fn compare_keys(a: u128, b: u128) math.Order {
+                return math.order(a, b);
+            }
+
+            inline fn key_from_value(value: *const Value) u128 {
+                return value.id;
+            }
+
+            // TODO(ifreund): disallow this id in the state machine.
+            const sentinel_key = math.maxInt(u128);
+
+            inline fn tombstone(value: *const Value) bool {
+                return value.data == .tombstone;
+            }
+
+            inline fn tombstone_from_key(id: u128) Value {
+                return .{
+                    .id = id,
+                    .data = .tombstone,
+                };
+            }
+        };
+
+        const Table = TableType(
+            u128,
+            Value,
+            Value.compare_keys,
+            Value.key_from_value,
+            Value.sentinel_key,
+            Value.tombstone,
+            Value.tombstone_from_key,
+        );
+
+        const Tree = TreeType(Table, Storage, "groove");
+        const Grid = GridType(Storage);
+
+        const PrefetchIDs = std.AutoHashMapUnmanaged(u128, void);
+        const PrefetchObjects = std.AutoHashMapUnmanaged(u128, bool);
+
+        cache: *Tree.ValueCache,
+        tree: Tree,
+
+        /// Object IDs enqueued to be prefetched.
+        /// Prefetching ensures that point lookups against the latest snapshot are synchronous.
+        /// This shields state machine implementations from the challenges of concurrency and I/O,
+        /// and enables simple state machine function signatures that commit writes atomically.
+        prefetch_ids: PrefetchIDs,
+
+        /// The prefetched Objects. This hash map holds the subset of objects in the LSM tree
+        /// that are required for the current commit. All get()/put()/remove() operations during
+        /// the commit are both passed to the LSM trees and mirrored in this hash map. It is always
+        /// sufficient to query this hashmap alone to know the state of the LSM trees.
+        prefetch_objects: PrefetchObjects,
+
+        /// This field is necessary to expose the same open()/compact_cpu()/compact_io() function
+        /// signatures as the real Groove type.
+        callback: ?fn (*PostedGroove) void = null,
+
+        pub fn init(
+            allocator: mem.Allocator,
+            node_pool: *NodePool,
+            grid: *Grid,
+            // The cache size is meant to be computed based on the left over available memory
+            // that tigerbeetle was given to allocate from CLI arguments.
+            cache_size: u32,
+            // In general, the commit count max for a field, depends on the field's object,
+            // how many objects might be changed by a batch:
+            // (config.message_size_max - sizeOf(vsr.header))
+            // For example, there are at most 8191 transfers in a batch.
+            // So commit_count_max=8191 for transfer objects and indexes.
+            //
+            // However, if a transfer is ever mutated, then this will double commit_count_max
+            // since the old index might need to be removed, and the new index inserted.
+            //
+            // A way to see this is by looking at the state machine. If a transfer is inserted,
+            // how many accounts and transfer put/removes will be generated?
+            //
+            // This also means looking at the state machine operation that will generate the
+            // most put/removes in the worst case.
+            // For example, create_accounts will put at most 8191 accounts.
+            // However, create_transfers will put 2 accounts (8191 * 2) for every transfer, and
+            // some of these accounts may exist, requiring a remove/put to update the index.
+            commit_count_max: u32,
+        ) !PostedGroove {
+            // Cache is dynamically allocated to pass a pointer into the Object tree.
+            const cache = try allocator.create(Tree.ValueCache);
+            errdefer allocator.destroy(cache);
+
+            cache.* = .{};
+            try cache.ensureTotalCapacity(allocator, cache_size);
+            errdefer cache.deinit(allocator);
+
+            var tree = try Tree.init(
+                allocator,
+                node_pool,
+                grid,
+                cache,
+                .{
+                    .commit_count_max = commit_count_max,
+                },
+            );
+            errdefer tree.deinit(allocator);
+
+            // TODO: document why this is twice the commit count max.
+            const prefetch_count_max = commit_count_max * 2;
+
+            var prefetch_ids = PrefetchIDs{};
+            try prefetch_ids.ensureTotalCapacity(allocator, prefetch_count_max);
+            errdefer prefetch_ids.deinit(allocator);
+
+            var prefetch_objects = PrefetchObjects{};
+            try prefetch_objects.ensureTotalCapacity(allocator, prefetch_count_max);
+            errdefer prefetch_objects.deinit(allocator);
+
+            return PostedGroove{
+                .cache = cache,
+                .tree = tree,
+
+                .prefetch_ids = prefetch_ids,
+                .prefetch_objects = prefetch_objects,
+            };
+        }
+
+        pub fn deinit(groove: *PostedGroove, allocator: mem.Allocator) void {
+            assert(groove.callback == null);
+
+            groove.tree.deinit(allocator);
+            groove.cache.deinit(allocator);
+            allocator.destroy(groove.cache);
+
+            groove.prefetch_ids.deinit(allocator);
+            groove.prefetch_objects.deinit(allocator);
+
+            groove.* = undefined;
+        }
+
+        pub fn get(groove: *const PostedGroove, id: u128) ?bool {
+            return groove.prefetch_objects.get(id);
+        }
+
+        /// Must be called directly after the state machine commit is finished and prefetch results
+        /// are no longer needed.
+        pub fn prefetch_clear(groove: *PostedGroove) void {
+            groove.prefetch_objects.clearRetainingCapacity();
+            assert(groove.prefetch_objects.count() == 0);
+            assert(groove.prefetch_ids.count() == 0);
+        }
+
+        /// This must be called by the state machine for every key to be prefetched.
+        /// We tolerate duplicate IDs enqueued by the state machine.
+        /// For example, if all unique operations require the same two dependencies.
+        pub fn prefetch_enqueue(groove: *PostedGroove, id: u128) void {
+            if (groove.tree.get_cached(id)) |value| {
+                switch (value.data) {
+                    .posted => groove.prefetch_objects.putAssumeCapacity(value.id, true),
+                    .voided => groove.prefetch_objects.putAssumeCapacity(value.id, false),
+                    .tombstone => {}, // Leave the ID out of prefetch_objects.
+                }
+            } else {
+                groove.prefetch_ids.putAssumeCapacity(id, {});
+            }
+        }
+
+        /// Ensure the objects corresponding to all ids enqueued with prefetch_enqueue() are
+        /// in memory, either in the value cache of the object tree or in the prefetch_objects
+        /// backup hash map.
+        pub fn prefetch(
+            groove: *PostedGroove,
+            callback: fn (*PrefetchContext) void,
+            context: *PrefetchContext,
+        ) void {
+            context.* = .{
+                .groove = groove,
+                .callback = callback,
+                .id_iterator = groove.prefetch_ids.keyIterator(),
+            };
+            context.start_workers();
+        }
+
+        pub const PrefetchContext = struct {
+            groove: *PostedGroove,
+            callback: fn (*PrefetchContext) void,
+
+            id_iterator: PrefetchIDs.KeyIterator,
+
+            /// The goal is to fully utilize the disk I/O to ensure the prefetch completes as
+            /// quickly as possible, so we run multiple lookups in parallel based on the max
+            /// I/O depth of the Grid.
+            workers: [Grid.read_iops_max]PrefetchWorker = undefined,
+            /// The number of workers that are currently running in parallel.
+            workers_busy: u32 = 0,
+
+            fn start_workers(context: *PrefetchContext) void {
+                assert(context.workers_busy == 0);
+
+                // Track an extra "worker" that will finish after the loop.
+                //
+                // This prevents `context.finish()` from being called within the loop body when every
+                // worker finishes synchronously. `context.finish()` sets the `context` to undefined,
+                // but `context` is required for the last loop condition check.
+                context.workers_busy += 1;
+
+                // -1 to ignore the extra worker.
+                while (context.workers_busy - 1 < context.workers.len) {
+                    const worker = &context.workers[context.workers_busy - 1];
+                    worker.* = .{ .context = context };
+                    context.workers_busy += 1;
+                    if (!worker.lookup_start()) break;
+                }
+
+                assert(context.workers_busy >= 1);
+                context.worker_finished();
+            }
+
+            fn worker_finished(context: *PrefetchContext) void {
+                context.workers_busy -= 1;
+                if (context.workers_busy == 0) context.finish();
+            }
+
+            fn finish(context: *PrefetchContext) void {
+                assert(context.workers_busy == 0);
+                assert(context.groove.prefetch_ids.count() == 0);
+                assert(context.id_iterator.next() == null);
+
+                const callback = context.callback;
+                context.* = undefined;
+                callback(context);
+            }
+        };
+
+        pub const PrefetchWorker = struct {
+            context: *PrefetchContext,
+            lookup_id: Tree.LookupContext = undefined,
+
+            /// Returns true if asynchronous I/O has been started.
+            /// Returns false if there are no more IDs to prefetch.
+            fn lookup_start(worker: *PrefetchWorker) bool {
+                const groove = worker.context.groove;
+
+                const id = worker.context.id_iterator.next() orelse {
+                    groove.prefetch_ids.clearRetainingCapacity();
+                    assert(groove.prefetch_ids.count() == 0);
+                    worker.context.worker_finished();
+                    return false;
+                };
+
+                if (config.verify) {
+                    // This is checked in prefetch_enqueue()
+                    assert(groove.tree.get_cached(id.*) == null);
+                }
+
+                // If not in the LSM tree's cache, the object must be read from disk and added
+                // to the auxillary prefetch_objects hash map.
+                // TODO: this LSM tree function needlessly checks the LSM tree's cache a
+                // second time. Adding API to the LSM tree to avoid this may be worthwhile.
+                groove.tree.lookup(
+                    lookup_id_callback,
+                    &worker.lookup_id,
+                    snapshot_latest,
+                    id.*,
+                );
+
+                return true;
+            }
+
+            fn lookup_id_callback(
+                completion: *Tree.LookupContext,
+                result: ?*const Value,
+            ) void {
+                const worker = @fieldParentPtr(PrefetchWorker, "lookup_id", completion);
+                const groove = worker.context.groove;
+
+                if (result) |value| {
+                    switch (value.data) {
+                        .posted => {
+                            groove.prefetch_objects.putAssumeCapacityNoClobber(value.id, true);
+                        },
+                        .voided => {
+                            groove.prefetch_objects.putAssumeCapacityNoClobber(value.id, false);
+                        },
+                        .tombstone => {
+                            // Leave the ID out of prefetch_objects.
+                        },
+                    }
+                }
+                worker.lookup_finish();
+            }
+
+            fn lookup_finish(worker: *PrefetchWorker) void {
+                if (!worker.lookup_start()) {
+                    worker.* = undefined;
+                }
+            }
+        };
+
+        pub fn put_no_clobber(groove: *PostedGroove, id: u128, posted: bool) void {
+            const gop = groove.prefetch_objects.getOrPutAssumeCapacity(id);
+            assert(!gop.found_existing);
+
+            const value: Value = .{
+                .id = id,
+                .data = if (posted) .posted else .voided,
+            };
+            groove.tree.put(&value);
+            gop.value_ptr.* = posted;
+        }
+
+        pub fn remove(groove: *PostedGroove, id: u128) void {
+            assert(groove.prefetch_objects.remove(id));
+            groove.tree.remove(&Value{ .id = id, .data = .tombstone });
+        }
+
+        fn tree_callback(tree: *Tree) void {
+            const groove = @fieldParentPtr(PostedGroove, "tree", tree);
+            const callback = groove.callback.?;
+            groove.callback = null;
+            callback(groove);
+        }
+
+        pub fn open(groove: *PostedGroove, callback: fn (*PostedGroove) void) void {
+            assert(groove.callback == null);
+            groove.callback = callback;
+            groove.tree.open(tree_callback);
+        }
+
+        pub fn compact(groove: *PostedGroove, callback: fn (*PostedGroove) void, op: u64) void {
+            assert(groove.callback == null);
+            groove.callback = callback;
+            groove.tree.compact(tree_callback, op);
+        }
+
+        pub fn checkpoint(groove: *PostedGroove, callback: fn (*PostedGroove) void) void {
+            assert(groove.callback == null);
+            groove.callback = callback;
+            groove.tree.checkpoint(tree_callback);
+        }
+    };
+}
+
+test "PostedGroove" {
+    const Storage = @import("../storage.zig").Storage;
+
+    const PostedGroove = PostedGrooveType(Storage);
+
+    _ = PostedGroove.init;
+    _ = PostedGroove.deinit;
+
+    _ = PostedGroove.get;
+    _ = PostedGroove.put_no_clobber;
+    _ = PostedGroove.remove;
+
+    _ = PostedGroove.compact;
+    _ = PostedGroove.checkpoint;
+
+    _ = PostedGroove.prefetch_enqueue;
+    _ = PostedGroove.prefetch;
+    _ = PostedGroove.prefetch_clear;
+
+    std.testing.refAllDecls(PostedGroove.PrefetchWorker);
+    std.testing.refAllDecls(PostedGroove.PrefetchContext);
+}
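To make the prefetch lifecycle concrete, here is a hedged sketch of the call sequence the API above implies (not part of the package: commit_begin, prefetch_callback, and the file-scope variables are hypothetical, while the instantiation mirrors the test block). Note that finish() sets the context to undefined before invoking the callback, so the callback recovers its state from file scope rather than from the context pointer.

    const std = @import("std");
    const Storage = @import("../storage.zig").Storage;
    const PostedGroove = @import("posted_groove.zig").PostedGrooveType(Storage);

    // Hypothetical driver state: assume groove was init()'d and open()'d elsewhere.
    var groove: PostedGroove = undefined;
    var prefetch_context: PostedGroove.PrefetchContext = undefined;

    fn commit_begin(ids: []const u128) void {
        // 1. Enqueue every ID this commit will read; duplicate IDs are tolerated.
        for (ids) |id| groove.prefetch_enqueue(id);

        // 2. Start the parallel lookups; the callback fires once all workers finish.
        groove.prefetch(prefetch_callback, &prefetch_context);
    }

    fn prefetch_callback(context: *PostedGroove.PrefetchContext) void {
        // finish() set the context to undefined before this ran, so don't read it.
        _ = context;

        // 3. Point lookups are now synchronous: get() consults only prefetch_objects.
        //    ... apply the commit via get()/put_no_clobber()/remove() ...

        // 4. Drop the prefetch results once the commit is applied.
        groove.prefetch_clear();
    }

The sequence is strict: enqueue, prefetch, commit with synchronous get() lookups, then prefetch_clear() before the next commit begins, as the assertions in prefetch_clear() enforce.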