tigerbeetle-node 0.11.6 → 0.11.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.client.node.sha256 +1 -1
- package/package.json +1 -1
- package/src/tigerbeetle/scripts/benchmark.bat +1 -2
- package/src/tigerbeetle/scripts/benchmark.sh +1 -2
- package/src/tigerbeetle/scripts/install.bat +7 -0
- package/src/tigerbeetle/scripts/install.sh +2 -3
- package/src/tigerbeetle/src/benchmark.zig +3 -3
- package/src/tigerbeetle/src/ewah.zig +6 -5
- package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
- package/src/tigerbeetle/src/io/darwin.zig +19 -0
- package/src/tigerbeetle/src/io/linux.zig +8 -0
- package/src/tigerbeetle/src/io/windows.zig +20 -2
- package/src/tigerbeetle/src/iops.zig +7 -1
- package/src/tigerbeetle/src/lsm/compaction.zig +18 -29
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +9 -5
- package/src/tigerbeetle/src/lsm/grid.zig +267 -267
- package/src/tigerbeetle/src/lsm/level_iterator.zig +18 -1
- package/src/tigerbeetle/src/lsm/manifest.zig +29 -1
- package/src/tigerbeetle/src/lsm/manifest_log.zig +5 -5
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +2 -2
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +26 -70
- package/src/tigerbeetle/src/lsm/table.zig +42 -0
- package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -0
- package/src/tigerbeetle/src/lsm/table_mutable.zig +1 -1
- package/src/tigerbeetle/src/lsm/test.zig +2 -3
- package/src/tigerbeetle/src/lsm/tree.zig +27 -6
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +1 -1
- package/src/tigerbeetle/src/simulator.zig +0 -5
- package/src/tigerbeetle/src/storage.zig +58 -6
- package/src/tigerbeetle/src/test/cluster.zig +3 -0
- package/src/tigerbeetle/src/test/state_checker.zig +1 -1
- package/src/tigerbeetle/src/test/storage.zig +22 -1
- package/src/tigerbeetle/src/tracer.zig +50 -28
- package/src/tigerbeetle/src/unit_tests.zig +9 -4
- package/src/tigerbeetle/src/vopr.zig +4 -4
- package/src/tigerbeetle/src/vsr/client.zig +11 -7
- package/src/tigerbeetle/src/vsr/journal.zig +153 -93
- package/src/tigerbeetle/src/vsr/replica.zig +10 -20
- package/src/tigerbeetle/src/vsr/superblock.zig +19 -16
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +114 -93
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +1 -3
- package/src/tigerbeetle/src/vsr.zig +55 -8
- package/src/tigerbeetle/src/c/tb_client/context.zig +0 -304
- package/src/tigerbeetle/src/c/tb_client/echo_client.zig +0 -108
- package/src/tigerbeetle/src/c/tb_client/packet.zig +0 -80
- package/src/tigerbeetle/src/c/tb_client/signal.zig +0 -286
- package/src/tigerbeetle/src/c/tb_client/thread.zig +0 -88
- package/src/tigerbeetle/src/c/tb_client.h +0 -220
- package/src/tigerbeetle/src/c/tb_client.zig +0 -177
- package/src/tigerbeetle/src/c/tb_client_header.zig +0 -218
- package/src/tigerbeetle/src/c/tb_client_header_test.zig +0 -135
- package/src/tigerbeetle/src/c/test.zig +0 -371
- package/src/tigerbeetle/src/cli.zig +0 -399
- package/src/tigerbeetle/src/main.zig +0 -242
--- package/src/tigerbeetle/src/lsm/grid.zig (0.11.6)
+++ package/src/tigerbeetle/src/lsm/grid.zig (0.11.8)
@@ -44,62 +44,26 @@ pub fn GridType(comptime Storage: type) type {
     const block_size = constants.block_size;
     const SuperBlock = SuperBlockType(Storage);
 
-    const cache_interface = struct {
-        inline fn address_from_block(block: *const [block_size]u8) u64 {
-            const header_bytes = block[0..@sizeOf(vsr.Header)];
-            const header = mem.bytesAsValue(vsr.Header, header_bytes);
-            const address = header.op;
-            assert(address > 0);
-            return address;
-        }
-
-        inline fn hash_address(address: u64) u64 {
-            assert(address > 0);
-            return std.hash.Wyhash.hash(0, mem.asBytes(&address));
-        }
-
-        inline fn equal_addresses(a: u64, b: u64) bool {
-            return a == b;
-        }
-    };
-
-    const set_associative_cache_ways = 16;
-    const Cache = SetAssociativeCache(
-        u64,
-        [block_size]u8,
-        cache_interface.address_from_block,
-        cache_interface.hash_address,
-        cache_interface.equal_addresses,
-        .{
-            .ways = set_associative_cache_ways,
-            .value_alignment = constants.sector_size,
-        },
-    );
-
     return struct {
         const Grid = @This();
 
-
-
-        // This + 1 ensures that it is always possible for writes to add the written block
-        // to the cache on completion, even if the maximum number of concurrent reads are in
-        // progress and have locked all but one way in the target set.
-        assert(read_iops_max + 1 <= set_associative_cache_ways);
-        }
-
-        // TODO put more thought into how low/high this limit should be.
+        // TODO put more thought into how low/high these limits should be.
+        pub const read_iops_max = 16;
         pub const write_iops_max = 16;
 
         pub const BlockPtr = *align(constants.sector_size) [block_size]u8;
         pub const BlockPtrConst = *align(constants.sector_size) const [block_size]u8;
         pub const Reservation = free_set.Reservation;
 
+        // Grid just reuses the Storage's NextTick abstraction for simplicity.
+        pub const NextTick = Storage.NextTick;
+
         pub const Write = struct {
             callback: fn (*Grid.Write) void,
             address: u64,
-            block:
+            block: *BlockPtr,
 
-            /// Link for the write_queue linked list.
+            /// Link for the Grid.write_queue linked list.
             next: ?*Write = null,
         };
 
@@ -115,81 +79,129 @@ pub fn GridType(comptime Storage: type) type {
             checksum: u128,
             block_type: BlockType,
 
-
+            pending: ReadPending = .{},
+            resolves: FIFO(ReadPending) = .{},
+
+            grid: *Grid,
+            next_tick: Grid.NextTick = undefined,
+
+            /// Link for Grid.read_queue/Grid.read_recovery_queue linked lists.
             next: ?*Read = null,
         };
 
+        const ReadPending = struct {
+            /// Link for Read.resolves linked lists.
+            next: ?*ReadPending = null,
+        };
+
         const ReadIOP = struct {
-            grid: *Grid,
             completion: Storage.Read,
-
-            /// This is a pointer to a value in the block cache.
-            block: BlockPtr,
+            read: *Read,
         };
 
+        const cache_interface = struct {
+            inline fn address_from_address(address: *const u64) u64 {
+                return address.*;
+            }
+
+            inline fn hash_address(address: u64) u64 {
+                assert(address > 0);
+                return std.hash.Wyhash.hash(0, mem.asBytes(&address));
+            }
+
+            inline fn equal_addresses(a: u64, b: u64) bool {
+                return a == b;
+            }
+        };
+
+        const set_associative_cache_ways = 16;
+
+        const Cache = SetAssociativeCache(
+            u64,
+            u64,
+            cache_interface.address_from_address,
+            cache_interface.hash_address,
+            cache_interface.equal_addresses,
+            .{
+                .ways = set_associative_cache_ways,
+                .value_alignment = @alignOf(u64),
+            },
+        );
+
         superblock: *SuperBlock,
+
+        // Each entry in cache has a corresponding block.
+        cache_blocks: []BlockPtr,
         cache: Cache,
 
         write_iops: IOPS(WriteIOP, write_iops_max) = .{},
         write_queue: FIFO(Write) = .{},
 
-
-
-        ///
-        /// Invariants:
-        /// * An address is listed in `read_iops` at most once. Multiple reads of the same address
-        ///   (past or present) are coalesced.
+        // Each read_iops has a corresponding block.
+        read_iop_blocks: [read_iops_max]BlockPtr,
         read_iops: IOPS(ReadIOP, read_iops_max) = .{},
         read_queue: FIFO(Read) = .{},
 
-
-
-        read_cached_queue: FIFO(Read) = .{},
+        // List if Read.pending's which are in `read_queue` but also waiting for a free `read_iops`.
+        read_pending_queue: FIFO(ReadPending) = .{},
         // TODO interrogate this list and do recovery in Replica.tick().
         read_recovery_queue: FIFO(Read) = .{},
+        // True if there's a read thats resolving callbacks. If so, the read cache must not be invalidated.
+        read_resolving: bool = false,
 
         pub fn init(allocator: mem.Allocator, superblock: *SuperBlock) !Grid {
            // TODO Determine this at runtime based on runtime configured maximum
            // memory usage of tigerbeetle.
-            const
+            const cache_blocks_count = 2048;
+
+            const cache_blocks = try allocator.alloc(BlockPtr, cache_blocks_count);
+            errdefer allocator.free(cache_blocks);
+
+            for (cache_blocks) |*cache_block, i| {
+                errdefer for (cache_blocks[0..i]) |block| allocator.free(block);
+                cache_block.* = try alloc_block(allocator);
+            }
 
-            var cache = try Cache.init(allocator,
+            var cache = try Cache.init(allocator, cache_blocks_count);
             errdefer cache.deinit(allocator);
 
+            var read_iop_blocks: [read_iops_max]BlockPtr = undefined;
+
+            for (&read_iop_blocks) |*read_iop_block, i| {
+                errdefer for (read_iop_blocks[0..i]) |block| allocator.free(block);
+                read_iop_block.* = try alloc_block(allocator);
+            }
+
             return Grid{
                 .superblock = superblock,
+                .cache_blocks = cache_blocks,
                 .cache = cache,
+                .read_iop_blocks = read_iop_blocks,
             };
         }
 
+        pub fn alloc_block(allocator: mem.Allocator) !BlockPtr {
+            const block = try allocator.alignedAlloc(u8, constants.sector_size, block_size);
+            return block[0..block_size];
+        }
+
         pub fn deinit(grid: *Grid, allocator: mem.Allocator) void {
+            for (&grid.read_iop_blocks) |block| allocator.free(block);
+
             grid.cache.deinit(allocator);
 
+            for (grid.cache_blocks) |block| allocator.free(block);
+            allocator.free(grid.cache_blocks);
+
             grid.* = undefined;
         }
 
-        pub fn
-
-
-
-
-
-            // (which may be milliseconds later due to IO.run_for_ns). This is necessary to ensure
-            // that groove prefetch completes promptly.
-            //
-            // Even still, we cap the reads processed to prevent going over
-            // any implicit time slice expected of Grid.tick(). This limit is fairly arbitrary.
-            var retry_max: u32 = 100_000;
-            while (grid.read_cached_queue.pop()) |read| {
-                if (grid.cache.get(read.address)) |block| {
-                    read.callback(read, block);
-                } else {
-                    grid.start_read(read);
-                }
-
-                retry_max -= 1;
-                if (retry_max == 0) break;
-            }
+        pub fn on_next_tick(
+            grid: *Grid,
+            callback: fn (*Grid.NextTick) void,
+            next_tick: *Grid.NextTick,
+        ) void {
+            grid.superblock.storage.on_next_tick(callback, next_tick);
         }
 
         /// Returning null indicates that there are not enough free blocks to fill the reservation.
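Note on the two hunks above: 0.11.6's SetAssociativeCache stored whole `[block_size]u8` values (sector-aligned, copied on insert), while 0.11.8 caches only `u64` addresses and keeps the blocks themselves in the parallel `cache_blocks` and `read_iop_blocks` arrays. A minimal sketch of the resulting zero-copy insert, in the same Zig 0.9-era syntax as the diff; `insert_index` here is a toy direct-mapped stand-in for the real set-associative lookup, and all names are illustrative, not the package's API:

```zig
const std = @import("std");
const assert = std.debug.assert;

const block_size = 4096;
const BlockPtr = *align(512) [block_size]u8;

// Toy stand-in for Cache.insert_index(): map an address to a cache slot.
fn insert_index(cache_addresses: []u64, address: u64) usize {
    assert(address > 0);
    return @intCast(usize, address % cache_addresses.len);
}

// Inserting updates the address cache and swaps the caller's block pointer
// with the evicted slot's block: no block_size memcpy, and the caller gets a
// free block back through the same pointer (hence Write.block: *BlockPtr).
fn cache_insert(
    cache_addresses: []u64,
    cache_blocks: []BlockPtr,
    address: u64,
    block: *BlockPtr,
) void {
    assert(cache_addresses.len == cache_blocks.len);
    const index = insert_index(cache_addresses, address);
    cache_addresses[index] = address;
    std.mem.swap(BlockPtr, &cache_blocks[index], block);
}
```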
@@ -231,16 +243,16 @@ pub fn GridType(comptime Storage: type) type {
             assert(address > 0);
             {
                 var it = grid.write_queue.peek();
-                while (it) |
-                    assert(address !=
-                    assert(block !=
+                while (it) |queued_write| : (it = queued_write.next) {
+                    assert(address != queued_write.address);
+                    assert(block != queued_write.block.*);
                 }
             }
             {
                 var it = grid.write_iops.iterate();
                 while (it.next()) |iop| {
                     assert(address != iop.write.address);
-                    assert(block != iop.write.block);
+                    assert(block != iop.write.block.*);
                 }
             }
         }
@@ -249,38 +261,39 @@ pub fn GridType(comptime Storage: type) type {
         /// Assert that the block pointer is not being used for any read if non-null.
         fn assert_not_reading(grid: *Grid, address: u64, block: ?BlockPtrConst) void {
             assert(address > 0);
-            for ([_]FIFO(Read){
-                grid.read_queue,
-                grid.
-                grid.read_recovery_queue,
+            for ([_]*const FIFO(Read){
+                &grid.read_queue,
+                &grid.read_recovery_queue,
             }) |queue| {
                 var it = queue.peek();
-                while (it) |
-                    assert(address !=
+                while (it) |queued_read| : (it = queued_read.next) {
+                    assert(address != queued_read.address);
                 }
             }
             {
                 var it = grid.read_iops.iterate();
                 while (it.next()) |iop| {
-
-
-                    assert(block !=
+                    assert(address != iop.read.address);
+                    const iop_block = grid.read_iop_blocks[grid.read_iops.index(iop)];
+                    assert(block != iop_block);
                 }
             }
         }
 
+        /// NOTE: This will consume `block` and replace it with a fresh block.
         pub fn write_block(
             grid: *Grid,
             callback: fn (*Grid.Write) void,
             write: *Grid.Write,
-            block:
+            block: *BlockPtr,
             address: u64,
         ) void {
-            assert(grid.superblock.opened);
             assert(address > 0);
+            grid.assert_not_writing(address, block.*);
+            grid.assert_not_reading(address, block.*);
+
+            assert(grid.superblock.opened);
             assert(!grid.superblock.free_set.is_free(address));
-            grid.assert_not_writing(address, block);
-            grid.assert_not_reading(address, block);
 
             write.* = .{
                 .callback = callback,
@@ -288,27 +301,15 @@ pub fn GridType(comptime Storage: type) type {
                 .block = block,
             };
 
-            const initial_iops_available = grid.write_iops.available();
-            if (initial_iops_available > 0) {
-                assert(grid.write_queue.empty());
-            }
-
-            grid.start_write(write);
-
-            if (initial_iops_available > 0) {
-                assert(grid.write_iops.available() == initial_iops_available - 1);
-            }
-        }
-
-        fn start_write(grid: *Grid, write: *Write) void {
-            grid.assert_not_writing(write.address, write.block);
-            grid.assert_not_reading(write.address, write.block);
-
             const iop = grid.write_iops.acquire() orelse {
                 grid.write_queue.push(write);
                 return;
             };
 
+            grid.write_block_with(iop, write);
+        }
+
+        fn write_block_with(grid: *Grid, iop: *WriteIOP, write: *Write) void {
             iop.* = .{
                 .grid = grid,
                 .completion = undefined,
@@ -318,7 +319,7 @@ pub fn GridType(comptime Storage: type) type {
             grid.superblock.storage.write_sectors(
                 write_block_callback,
                 &iop.completion,
-                write.block
+                write.block.*,
                 .grid,
                 block_offset(write.address),
             );
@@ -332,27 +333,27 @@ pub fn GridType(comptime Storage: type) type {
             const grid = iop.grid;
             const completed_write = iop.write;
 
-
-
-                block_locked,
-                grid,
-                completed_write.address,
-            );
-            util.copy_disjoint(.exact, u8, cached_block, completed_write.block);
+            // We can only update the cache if the Grid is not resolving callbacks with a cache block.
+            assert(!grid.read_resolving);
 
-
+            // Insert the write block into the cache, and give the evicted block to the writer.
+            const cache_index = grid.cache.insert_index(&completed_write.address);
+            const cache_block = &grid.cache_blocks[cache_index];
+            std.mem.swap(BlockPtr, cache_block, completed_write.block);
+            if (constants.verify) {
+                std.mem.set(u8, completed_write.block.*, undefined);
+            }
 
             // Start a queued write if possible *before* calling the completed
             // write's callback. This ensures that if the callback calls
             // Grid.write_block() it doesn't preempt the queue.
             if (grid.write_queue.pop()) |queued_write| {
-
-
-                grid.
-                assert(grid.write_iops.available() == initial_iops_available - 1);
+                grid.write_block_with(iop, queued_write);
+            } else {
+                grid.write_iops.release(iop);
             }
 
-            // This call must come after releasing the IOP. Otherwise we risk tripping
+            // This call must come after (logicall) releasing the IOP. Otherwise we risk tripping
             // assertions forbidding concurrent writes using the same block/address
             // if the callback calls write_block().
             completed_write.callback(completed_write);
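The write-path hunks above, together with the read path below, share one idiom: acquire an IOP or queue the work, and on completion hand the IOP to queued work (or release it) before invoking the finished callback. A distilled, generic sketch of that control flow; `pool`, `queue`, and `start` are stand-ins for `write_iops`/`write_queue`/`write_block_with` and their read-side counterparts, not the package's actual API:

```zig
// Submission: take a free IOP, or park the work until one completes.
fn submit(pool: anytype, queue: anytype, work: anytype, start: anytype) void {
    const iop = pool.acquire() orelse {
        queue.push(work);
        return;
    };
    start(iop, work);
}

// Completion: hand the IOP off (or release it) *before* running the finished
// callback, so a callback that immediately submits new work cannot preempt
// the queue or trip the not-writing/not-reading assertions.
fn complete(pool: anytype, queue: anytype, iop: anytype, start: anytype) void {
    if (queue.pop()) |queued| {
        start(iop, queued);
    } else {
        pool.release(iop);
    }
}
```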
@@ -370,11 +371,11 @@ pub fn GridType(comptime Storage: type) type {
             checksum: u128,
             block_type: BlockType,
         ) void {
-            assert(grid.superblock.opened);
             assert(address > 0);
             assert(block_type != .reserved);
-
             grid.assert_not_writing(address, null);
+
+            assert(grid.superblock.opened);
             assert(!grid.superblock.free_set.is_free(address));
 
             read.* = .{
@@ -382,180 +383,170 @@ pub fn GridType(comptime Storage: type) type {
                 .address = address,
                 .checksum = checksum,
                 .block_type = block_type,
+                .grid = grid,
             };
 
-
-
-
-
-
-
-
-
-
-
-
-            // 3. The replica checkpoints.
-            // 4. The read dequeues, but the requested block is no longer allocated.
-            // TODO(State Transfer):
-            // 1. If a local read results in a fault, then the replica should attempt a
-            //    remote read.
-            // 2. If a remote replica has the block then it responds (and the local read
-            //    completes), otherwise it nacks.
-            // 3. If we receive too many nacks or if we get the feeling that we are too far
-            //    behind (perhaps the primary nacks), then complete the read callback but now
-            //    with a null result, so that it unwinds the stack all the way back to VSR,
-            //    which then initiates state transfer. At present, we expect that reads always
-            //    return a block, so to support this bubbling up, we'll need to make the block
-            //    result optional.
-            unreachable;
-        }
-
-        // Check if a read is already in progress for the target address.
-        {
-            var it = grid.read_iops.iterate();
-            while (it.next()) |iop| {
-                const iop_read = iop.reads.peek() orelse continue;
-                if (iop_read.address == read.address) {
-                    assert(iop_read.checksum == read.checksum);
-                    iop.reads.push(read);
+            // Check if a read is already processing/recovering and merge with it.
+            for ([_]*const FIFO(Read){
+                &grid.read_queue,
+                &grid.read_recovery_queue,
+            }) |queue| {
+                var it = queue.peek();
+                while (it) |queued_read| : (it = queued_read.next) {
+                    if (address == queued_read.address) {
+                        assert(checksum == queued_read.checksum);
+                        assert(block_type == queued_read.block_type);
+                        queued_read.resolves.push(&read.pending);
                         return;
                     }
                 }
             }
 
-            //
-            //
-
-
-
-
+            // Become the "root" read thats fetching the block for the given address.
+            // The fetch happens asynchronously to avoid stack-overflow and nested cache invalidation.
+            grid.read_queue.push(read);
+            grid.on_next_tick(read_block_tick_callback, &read.next_tick);
+        }
+
+        fn read_block_tick_callback(next_tick: *Storage.NextTick) void {
+            const read = @fieldParentPtr(Grid.Read, "next_tick", next_tick);
+            const grid = read.grid;
+
+            // Try to resolve the read from the cache.
+            if (grid.cache.get_index(read.address)) |cache_index| {
+                const cache_block = grid.cache_blocks[cache_index];
+                if (constants.verify) grid.verify_cached_read(read.address, cache_block);
+                grid.read_block_resolve(read, cache_block);
                 return;
             }
 
+            // Grab an IOP to resolve the block from storage.
+            // Failure to do so means the read is queued to receive an IOP when one finishes.
             const iop = grid.read_iops.acquire() orelse {
-                grid.
+                grid.read_pending_queue.push(&read.pending);
                 return;
             };
 
-
-
-
-
-
-            );
+            grid.read_block_with(iop, read);
+        }
+
+        fn read_block_with(grid: *Grid, iop: *Grid.ReadIOP, read: *Grid.Read) void {
+            const address = read.address;
+            assert(address > 0);
+
+            // We can only update the cache if the Grid is not resolving callbacks with a cache block.
+            assert(!grid.read_resolving);
 
             iop.* = .{
-                .grid = grid,
                 .completion = undefined,
-                .
+                .read = read,
             };
-
-            // Collect the current Read and any other pending Reads for the same address to this IOP.
-            // If we didn't gather them here, they would eventually be processed at the end of
-            // read_block_callback(), but that would issue a new call to read_sectors().
-            iop.reads.push(read);
-            {
-                // Make a copy here to avoid an infinite loop from pending_reads being
-                // re-added to read_queue after not matching the current read.
-                var copy = grid.read_queue;
-                grid.read_queue = .{};
-                while (copy.pop()) |pending_read| {
-                    if (pending_read.address == read.address) {
-                        assert(pending_read.checksum == read.checksum);
-                        iop.reads.push(pending_read);
-                    } else {
-                        grid.read_queue.push(pending_read);
-                    }
-                }
-            }
+            const iop_block = grid.read_iop_blocks[grid.read_iops.index(iop)];
 
             grid.superblock.storage.read_sectors(
                 read_block_callback,
                 &iop.completion,
-
+                iop_block,
                 .grid,
-                block_offset(
+                block_offset(address),
             );
         }
 
-
-
-
-
+        fn read_block_callback(completion: *Storage.Read) void {
+            const iop = @fieldParentPtr(ReadIOP, "completion", completion);
+            const read = iop.read;
+            const grid = read.grid;
+            const iop_block = &grid.read_iop_blocks[grid.read_iops.index(iop)];
+
+            // Insert the block into the cache, and give the evicted block to `iop`.
+            const cache_index = grid.cache.insert_index(&read.address);
+            const cache_block = &grid.cache_blocks[cache_index];
+            std.mem.swap(BlockPtr, iop_block, cache_block);
+            if (constants.verify) {
+                std.mem.set(u8, iop_block.*, undefined);
+            }
+
+            // Handoff the iop to a pending read or release it before resolving the callbacks below.
+            if (grid.read_pending_queue.pop()) |pending| {
+                const queued_read = @fieldParentPtr(Read, "pending", pending);
+                grid.read_block_with(iop, queued_read);
+            } else {
+                grid.read_iops.release(iop);
             }
-
+
+            // A valid block filled by storage means the reads for the address can be resolved
+            if (read_block_valid(read, cache_block.*)) {
+                grid.read_block_resolve(read, cache_block.*);
+                return;
+            }
+
+            // On the result of an invalid block, move the "root" read (and all others it resolves)
+            // to recovery queue. Future reads on the same address will see the "root" read in the
+            // recovery queue and enqueue to it.
+            grid.read_queue.remove(read);
+            grid.read_recovery_queue.push(read);
         }
 
-        fn
-            const
-            const
+        fn read_block_valid(read: *Grid.Read, block: BlockPtrConst) bool {
+            const address = read.address;
+            const checksum = read.checksum;
+            const block_type = read.block_type;
 
-            const header_bytes =
+            const header_bytes = block[0..@sizeOf(vsr.Header)];
             const header = mem.bytesAsValue(vsr.Header, header_bytes);
 
-
-
-
-
-            const checksum_valid = header.valid_checksum();
-            const checksum_body_valid = checksum_valid and
-                header.valid_checksum_body(iop.block[@sizeOf(vsr.Header)..header.size]);
-            const checksum_match = header.checksum == checksum;
-
-            if (checksum_valid and checksum_body_valid and checksum_match) {
-                assert(header.op == address);
-                assert(header.operation == block_type.operation());
-
-                // NOTE: read callbacks resolved here could queue up reads into this very iop.
-                // This extends this while loop, but that's fine as it keeps the callbacks
-                // asynchronous to themselves (preventing something like a stack-overflow).
-                while (iop.reads.pop()) |read| {
-                    assert(read.address == address);
-                    assert(read.checksum == checksum);
-                    assert(read.block_type == BlockType.from(header.operation));
-                    read.callback(read, iop.block);
-                }
-            } else {
-                if (!checksum_valid) {
-                    log.err("invalid checksum at address {}", .{address});
-                } else if (!checksum_body_valid) {
-                    log.err("invalid checksum body at address {}", .{address});
-                } else if (!checksum_match) {
-                    log.err(
-                        "expected address={} checksum={} block_type={}, " ++
-                            "found address={} checksum={} block_type={}",
-                        .{
-                            address,
-                            checksum,
-                            block_type,
-                            header.op,
-                            header.checksum,
-                            @enumToInt(header.operation),
-                        },
-                    );
-                } else {
-                    unreachable;
-                }
+            if (!header.valid_checksum()) {
+                log.err("invalid checksum at address {}", .{address});
+                return false;
+            }
 
-
-
-
-                iop.grid.read_recovery_queue.push(read);
-            }
+            if (!header.valid_checksum_body(block[@sizeOf(vsr.Header)..header.size])) {
+                log.err("invalid checksum body at address {}", .{address});
+                return false;
             }
 
-
+            if (header.checksum != checksum) {
+                log.err(
+                    "expected address={} checksum={} block_type={}, " ++
+                        "found address={} checksum={} block_type={}",
+                    .{
+                        address,
+                        checksum,
+                        block_type,
+                        header.op,
+                        header.checksum,
+                        @enumToInt(header.operation),
+                    },
+                );
+                return false;
+            }
+
+            assert(header.op == address);
+            assert(header.operation == block_type.operation());
+            return true;
+        }
 
-
-        //
-
-
-
-
-
-            grid.start_read(read);
+        fn read_block_resolve(grid: *Grid, read: *Grid.Read, block: BlockPtrConst) void {
+            // Guard to make sure the cache cannot be updated by any read.callbacks() below.
+            assert(!grid.read_resolving);
+            grid.read_resolving = true;
+            defer {
+                assert(grid.read_resolving);
+                grid.read_resolving = false;
             }
+
+            // Remove the "root" read so that the address is no longer actively reading / locked.
+            grid.read_queue.remove(read);
+
+            // Resolve all reads queued to the address with the block.
+            while (read.resolves.pop()) |pending| {
+                const pending_read = @fieldParentPtr(Read, "pending", pending);
+                pending_read.callback(pending_read, block);
+            }
+
+            // Then invoke the callback with the cache block (which should be valid for the duration
+            // of the callback as any nested Grid calls cannot synchronously update the cache).
+            read.callback(read, block);
        }
 
         fn block_offset(address: u64) u64 {
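The read path above coalesces duplicate reads of one address through intrusive `ReadPending` nodes: a duplicate parks its node on the "root" read's `resolves` list, and resolution walks the list, recovering each `Read` with `@fieldParentPtr`. A self-contained sketch of that pattern (Zig 0.9-era builtins to match the diff; a singly linked list stands in for `FIFO(ReadPending)`, and all names here are illustrative):

```zig
const std = @import("std");
const assert = std.debug.assert;

const ReadPending = struct {
    next: ?*ReadPending = null,
};

const Read = struct {
    address: u64,
    callback: fn (*Read) void,
    pending: ReadPending = .{},
    resolves: ?*ReadPending = null,
};

// A duplicate read of the same address parks itself on the root read.
fn merge(root: *Read, duplicate: *Read) void {
    assert(root.address == duplicate.address);
    duplicate.pending.next = root.resolves;
    root.resolves = &duplicate.pending;
}

// Once the block arrives, resolve every parked read, then the root itself.
fn resolve(root: *Read) void {
    while (root.resolves) |pending| {
        root.resolves = pending.next;
        const parked = @fieldParentPtr(Read, "pending", pending);
        parked.callback(parked);
    }
    root.callback(root);
}
```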
@@ -563,5 +554,14 @@ pub fn GridType(comptime Storage: type) type {
 
             return (address - 1) * block_size;
         }
+
+        fn verify_cached_read(grid: *Grid, address: u64, cached_block: BlockPtrConst) void {
+            if (Storage != @import("../test/storage.zig").Storage)
+                // Too complicated to do async verification
+                return;
+
+            const actual_block = grid.superblock.storage.grid_block(address);
+            assert(std.mem.eql(u8, cached_block, actual_block));
+        }
     };
 }