tigerbeetle-node 0.11.6 → 0.11.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/dist/.client.node.sha256 +1 -1
  2. package/package.json +1 -1
  3. package/src/tigerbeetle/scripts/benchmark.bat +1 -2
  4. package/src/tigerbeetle/scripts/benchmark.sh +1 -2
  5. package/src/tigerbeetle/scripts/install.bat +7 -0
  6. package/src/tigerbeetle/scripts/install.sh +2 -3
  7. package/src/tigerbeetle/src/benchmark.zig +3 -3
  8. package/src/tigerbeetle/src/ewah.zig +6 -5
  9. package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
  10. package/src/tigerbeetle/src/io/darwin.zig +19 -0
  11. package/src/tigerbeetle/src/io/linux.zig +8 -0
  12. package/src/tigerbeetle/src/io/windows.zig +20 -2
  13. package/src/tigerbeetle/src/iops.zig +7 -1
  14. package/src/tigerbeetle/src/lsm/compaction.zig +18 -29
  15. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +9 -5
  16. package/src/tigerbeetle/src/lsm/grid.zig +267 -267
  17. package/src/tigerbeetle/src/lsm/level_iterator.zig +18 -1
  18. package/src/tigerbeetle/src/lsm/manifest.zig +29 -1
  19. package/src/tigerbeetle/src/lsm/manifest_log.zig +5 -5
  20. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +2 -2
  21. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +26 -70
  22. package/src/tigerbeetle/src/lsm/table.zig +42 -0
  23. package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -0
  24. package/src/tigerbeetle/src/lsm/table_mutable.zig +1 -1
  25. package/src/tigerbeetle/src/lsm/test.zig +2 -3
  26. package/src/tigerbeetle/src/lsm/tree.zig +27 -6
  27. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +1 -1
  28. package/src/tigerbeetle/src/simulator.zig +0 -5
  29. package/src/tigerbeetle/src/storage.zig +58 -6
  30. package/src/tigerbeetle/src/test/cluster.zig +3 -0
  31. package/src/tigerbeetle/src/test/state_checker.zig +1 -1
  32. package/src/tigerbeetle/src/test/storage.zig +22 -1
  33. package/src/tigerbeetle/src/tracer.zig +50 -28
  34. package/src/tigerbeetle/src/unit_tests.zig +9 -4
  35. package/src/tigerbeetle/src/vopr.zig +4 -4
  36. package/src/tigerbeetle/src/vsr/client.zig +11 -7
  37. package/src/tigerbeetle/src/vsr/journal.zig +153 -93
  38. package/src/tigerbeetle/src/vsr/replica.zig +10 -20
  39. package/src/tigerbeetle/src/vsr/superblock.zig +19 -16
  40. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +114 -93
  41. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
  42. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +1 -3
  43. package/src/tigerbeetle/src/vsr.zig +55 -8
  44. package/src/tigerbeetle/src/c/tb_client/context.zig +0 -304
  45. package/src/tigerbeetle/src/c/tb_client/echo_client.zig +0 -108
  46. package/src/tigerbeetle/src/c/tb_client/packet.zig +0 -80
  47. package/src/tigerbeetle/src/c/tb_client/signal.zig +0 -286
  48. package/src/tigerbeetle/src/c/tb_client/thread.zig +0 -88
  49. package/src/tigerbeetle/src/c/tb_client.h +0 -220
  50. package/src/tigerbeetle/src/c/tb_client.zig +0 -177
  51. package/src/tigerbeetle/src/c/tb_client_header.zig +0 -218
  52. package/src/tigerbeetle/src/c/tb_client_header_test.zig +0 -135
  53. package/src/tigerbeetle/src/c/test.zig +0 -371
  54. package/src/tigerbeetle/src/cli.zig +0 -399
  55. package/src/tigerbeetle/src/main.zig +0 -242
@@ -44,62 +44,26 @@ pub fn GridType(comptime Storage: type) type {
  const block_size = constants.block_size;
  const SuperBlock = SuperBlockType(Storage);
 
- const cache_interface = struct {
- inline fn address_from_block(block: *const [block_size]u8) u64 {
- const header_bytes = block[0..@sizeOf(vsr.Header)];
- const header = mem.bytesAsValue(vsr.Header, header_bytes);
- const address = header.op;
- assert(address > 0);
- return address;
- }
-
- inline fn hash_address(address: u64) u64 {
- assert(address > 0);
- return std.hash.Wyhash.hash(0, mem.asBytes(&address));
- }
-
- inline fn equal_addresses(a: u64, b: u64) bool {
- return a == b;
- }
- };
-
- const set_associative_cache_ways = 16;
- const Cache = SetAssociativeCache(
- u64,
- [block_size]u8,
- cache_interface.address_from_block,
- cache_interface.hash_address,
- cache_interface.equal_addresses,
- .{
- .ways = set_associative_cache_ways,
- .value_alignment = constants.sector_size,
- },
- );
-
  return struct {
  const Grid = @This();
 
- pub const read_iops_max = 15;
- comptime {
- // This + 1 ensures that it is always possible for writes to add the written block
- // to the cache on completion, even if the maximum number of concurrent reads are in
- // progress and have locked all but one way in the target set.
- assert(read_iops_max + 1 <= set_associative_cache_ways);
- }
-
- // TODO put more thought into how low/high this limit should be.
+ // TODO put more thought into how low/high these limits should be.
+ pub const read_iops_max = 16;
  pub const write_iops_max = 16;
 
  pub const BlockPtr = *align(constants.sector_size) [block_size]u8;
  pub const BlockPtrConst = *align(constants.sector_size) const [block_size]u8;
  pub const Reservation = free_set.Reservation;
 
+ // Grid just reuses the Storage's NextTick abstraction for simplicity.
+ pub const NextTick = Storage.NextTick;
+
  pub const Write = struct {
  callback: fn (*Grid.Write) void,
  address: u64,
- block: BlockPtrConst,
+ block: *BlockPtr,
 
- /// Link for the write_queue linked list.
+ /// Link for the Grid.write_queue linked list.
  next: ?*Write = null,
  };
 
@@ -115,81 +79,129 @@ pub fn GridType(comptime Storage: type) type {
  checksum: u128,
  block_type: BlockType,
 
- /// Link for read_queue/read_recovery_queue/ReadIOP.reads linked lists.
+ pending: ReadPending = .{},
+ resolves: FIFO(ReadPending) = .{},
+
+ grid: *Grid,
+ next_tick: Grid.NextTick = undefined,
+
+ /// Link for Grid.read_queue/Grid.read_recovery_queue linked lists.
  next: ?*Read = null,
  };
 
+ const ReadPending = struct {
+ /// Link for Read.resolves linked lists.
+ next: ?*ReadPending = null,
+ };
+
  const ReadIOP = struct {
- grid: *Grid,
  completion: Storage.Read,
- reads: FIFO(Read) = .{},
- /// This is a pointer to a value in the block cache.
- block: BlockPtr,
+ read: *Read,
  };
 
+ const cache_interface = struct {
+ inline fn address_from_address(address: *const u64) u64 {
+ return address.*;
+ }
+
+ inline fn hash_address(address: u64) u64 {
+ assert(address > 0);
+ return std.hash.Wyhash.hash(0, mem.asBytes(&address));
+ }
+
+ inline fn equal_addresses(a: u64, b: u64) bool {
+ return a == b;
+ }
+ };
+
+ const set_associative_cache_ways = 16;
+
+ const Cache = SetAssociativeCache(
+ u64,
+ u64,
+ cache_interface.address_from_address,
+ cache_interface.hash_address,
+ cache_interface.equal_addresses,
+ .{
+ .ways = set_associative_cache_ways,
+ .value_alignment = @alignOf(u64),
+ },
+ );
+
  superblock: *SuperBlock,
+
+ // Each entry in cache has a corresponding block.
+ cache_blocks: []BlockPtr,
  cache: Cache,
 
  write_iops: IOPS(WriteIOP, write_iops_max) = .{},
  write_queue: FIFO(Write) = .{},
 
- /// `read_iops` maintains a list of ReadIOPs currently performing storage.read_sector() on
- /// a unique address.
- ///
- /// Invariants:
- /// * An address is listed in `read_iops` at most once. Multiple reads of the same address
- /// (past or present) are coalesced.
+ // Each read_iops has a corresponding block.
+ read_iop_blocks: [read_iops_max]BlockPtr,
  read_iops: IOPS(ReadIOP, read_iops_max) = .{},
  read_queue: FIFO(Read) = .{},
 
- /// Reads that were found to be in the cache on start_read() and queued to be resolved on
- /// the next tick(). This keeps read_block() always asynchronous to the caller.
- read_cached_queue: FIFO(Read) = .{},
+ // List if Read.pending's which are in `read_queue` but also waiting for a free `read_iops`.
+ read_pending_queue: FIFO(ReadPending) = .{},
  // TODO interrogate this list and do recovery in Replica.tick().
  read_recovery_queue: FIFO(Read) = .{},
+ // True if there's a read thats resolving callbacks. If so, the read cache must not be invalidated.
+ read_resolving: bool = false,
 
  pub fn init(allocator: mem.Allocator, superblock: *SuperBlock) !Grid {
  // TODO Determine this at runtime based on runtime configured maximum
  // memory usage of tigerbeetle.
- const blocks_in_cache = 2048;
+ const cache_blocks_count = 2048;
+
+ const cache_blocks = try allocator.alloc(BlockPtr, cache_blocks_count);
+ errdefer allocator.free(cache_blocks);
+
+ for (cache_blocks) |*cache_block, i| {
+ errdefer for (cache_blocks[0..i]) |block| allocator.free(block);
+ cache_block.* = try alloc_block(allocator);
+ }
 
- var cache = try Cache.init(allocator, blocks_in_cache);
+ var cache = try Cache.init(allocator, cache_blocks_count);
  errdefer cache.deinit(allocator);
 
+ var read_iop_blocks: [read_iops_max]BlockPtr = undefined;
+
+ for (&read_iop_blocks) |*read_iop_block, i| {
+ errdefer for (read_iop_blocks[0..i]) |block| allocator.free(block);
+ read_iop_block.* = try alloc_block(allocator);
+ }
+
  return Grid{
  .superblock = superblock,
+ .cache_blocks = cache_blocks,
  .cache = cache,
+ .read_iop_blocks = read_iop_blocks,
  };
  }
 
+ pub fn alloc_block(allocator: mem.Allocator) !BlockPtr {
+ const block = try allocator.alignedAlloc(u8, constants.sector_size, block_size);
+ return block[0..block_size];
+ }
+
  pub fn deinit(grid: *Grid, allocator: mem.Allocator) void {
+ for (&grid.read_iop_blocks) |block| allocator.free(block);
+
  grid.cache.deinit(allocator);
 
+ for (grid.cache_blocks) |block| allocator.free(block);
+ allocator.free(grid.cache_blocks);
+
  grid.* = undefined;
  }
 
- pub fn tick(grid: *Grid) void {
- // Resolve reads that were seen in the cache during start_read()
- // but deferred to be asynchronously resolved on the next tick.
- //
- // Drain directly from the queue so that new cache reads (added upon completion of old
- // cache reads) that can be serviced immediately aren't deferred until the next tick
- // (which may be milliseconds later due to IO.run_for_ns). This is necessary to ensure
- // that groove prefetch completes promptly.
- //
- // Even still, we cap the reads processed to prevent going over
- // any implicit time slice expected of Grid.tick(). This limit is fairly arbitrary.
- var retry_max: u32 = 100_000;
- while (grid.read_cached_queue.pop()) |read| {
- if (grid.cache.get(read.address)) |block| {
- read.callback(read, block);
- } else {
- grid.start_read(read);
- }
-
- retry_max -= 1;
- if (retry_max == 0) break;
- }
+ pub fn on_next_tick(
+ grid: *Grid,
+ callback: fn (*Grid.NextTick) void,
+ next_tick: *Grid.NextTick,
+ ) void {
+ grid.superblock.storage.on_next_tick(callback, next_tick);
  }
 
  /// Returning null indicates that there are not enough free blocks to fill the reservation.
@@ -231,16 +243,16 @@ pub fn GridType(comptime Storage: type) type {
  assert(address > 0);
  {
  var it = grid.write_queue.peek();
- while (it) |pending_write| : (it = pending_write.next) {
- assert(address != pending_write.address);
- assert(block != pending_write.block);
+ while (it) |queued_write| : (it = queued_write.next) {
+ assert(address != queued_write.address);
+ assert(block != queued_write.block.*);
  }
  }
  {
  var it = grid.write_iops.iterate();
  while (it.next()) |iop| {
  assert(address != iop.write.address);
- assert(block != iop.write.block);
+ assert(block != iop.write.block.*);
  }
  }
  }
@@ -249,38 +261,39 @@ pub fn GridType(comptime Storage: type) type {
  /// Assert that the block pointer is not being used for any read if non-null.
  fn assert_not_reading(grid: *Grid, address: u64, block: ?BlockPtrConst) void {
  assert(address > 0);
- for ([_]FIFO(Read){
- grid.read_queue,
- grid.read_cached_queue,
- grid.read_recovery_queue,
+ for ([_]*const FIFO(Read){
+ &grid.read_queue,
+ &grid.read_recovery_queue,
  }) |queue| {
  var it = queue.peek();
- while (it) |pending_read| : (it = pending_read.next) {
- assert(address != pending_read.address);
+ while (it) |queued_read| : (it = queued_read.next) {
+ assert(address != queued_read.address);
  }
  }
  {
  var it = grid.read_iops.iterate();
  while (it.next()) |iop| {
- const iop_read = iop.reads.peek() orelse continue;
- assert(address != iop_read.address);
- assert(block != iop.block);
+ assert(address != iop.read.address);
+ const iop_block = grid.read_iop_blocks[grid.read_iops.index(iop)];
+ assert(block != iop_block);
  }
  }
  }
 
+ /// NOTE: This will consume `block` and replace it with a fresh block.
  pub fn write_block(
  grid: *Grid,
  callback: fn (*Grid.Write) void,
  write: *Grid.Write,
- block: BlockPtrConst,
+ block: *BlockPtr,
  address: u64,
  ) void {
- assert(grid.superblock.opened);
  assert(address > 0);
+ grid.assert_not_writing(address, block.*);
+ grid.assert_not_reading(address, block.*);
+
+ assert(grid.superblock.opened);
  assert(!grid.superblock.free_set.is_free(address));
- grid.assert_not_writing(address, block);
- grid.assert_not_reading(address, block);
 
  write.* = .{
  .callback = callback,
@@ -288,27 +301,15 @@ pub fn GridType(comptime Storage: type) type {
  .block = block,
  };
 
- const initial_iops_available = grid.write_iops.available();
- if (initial_iops_available > 0) {
- assert(grid.write_queue.empty());
- }
-
- grid.start_write(write);
-
- if (initial_iops_available > 0) {
- assert(grid.write_iops.available() == initial_iops_available - 1);
- }
- }
-
- fn start_write(grid: *Grid, write: *Write) void {
- grid.assert_not_writing(write.address, write.block);
- grid.assert_not_reading(write.address, write.block);
-
  const iop = grid.write_iops.acquire() orelse {
  grid.write_queue.push(write);
  return;
  };
 
+ grid.write_block_with(iop, write);
+ }
+
+ fn write_block_with(grid: *Grid, iop: *WriteIOP, write: *Write) void {
  iop.* = .{
  .grid = grid,
  .completion = undefined,
@@ -318,7 +319,7 @@ pub fn GridType(comptime Storage: type) type {
  grid.superblock.storage.write_sectors(
  write_block_callback,
  &iop.completion,
- write.block,
+ write.block.*,
  .grid,
  block_offset(write.address),
  );
@@ -332,27 +333,27 @@ pub fn GridType(comptime Storage: type) type {
  const grid = iop.grid;
  const completed_write = iop.write;
 
- const cached_block = grid.cache.insert_preserve_locked(
- *Grid,
- block_locked,
- grid,
- completed_write.address,
- );
- util.copy_disjoint(.exact, u8, cached_block, completed_write.block);
+ // We can only update the cache if the Grid is not resolving callbacks with a cache block.
+ assert(!grid.read_resolving);
 
- grid.write_iops.release(iop);
+ // Insert the write block into the cache, and give the evicted block to the writer.
+ const cache_index = grid.cache.insert_index(&completed_write.address);
+ const cache_block = &grid.cache_blocks[cache_index];
+ std.mem.swap(BlockPtr, cache_block, completed_write.block);
+ if (constants.verify) {
+ std.mem.set(u8, completed_write.block.*, undefined);
+ }
 
  // Start a queued write if possible *before* calling the completed
  // write's callback. This ensures that if the callback calls
  // Grid.write_block() it doesn't preempt the queue.
  if (grid.write_queue.pop()) |queued_write| {
- const initial_iops_available = grid.write_iops.available();
- assert(initial_iops_available > 0);
- grid.start_write(queued_write);
- assert(grid.write_iops.available() == initial_iops_available - 1);
+ grid.write_block_with(iop, queued_write);
+ } else {
+ grid.write_iops.release(iop);
  }
 
- // This call must come after releasing the IOP. Otherwise we risk tripping
+ // This call must come after (logicall) releasing the IOP. Otherwise we risk tripping
  // assertions forbidding concurrent writes using the same block/address
  // if the callback calls write_block().
  completed_write.callback(completed_write);
@@ -370,11 +371,11 @@ pub fn GridType(comptime Storage: type) type {
  checksum: u128,
  block_type: BlockType,
  ) void {
- assert(grid.superblock.opened);
  assert(address > 0);
  assert(block_type != .reserved);
-
  grid.assert_not_writing(address, null);
+
+ assert(grid.superblock.opened);
  assert(!grid.superblock.free_set.is_free(address));
 
  read.* = .{
@@ -382,180 +383,170 @@ pub fn GridType(comptime Storage: type) type {
  .address = address,
  .checksum = checksum,
  .block_type = block_type,
+ .grid = grid,
  };
 
- grid.start_read(read);
- }
-
- fn start_read(grid: *Grid, read: *Grid.Read) void {
- grid.assert_not_writing(read.address, null);
-
- if (grid.superblock.free_set.is_free(read.address)) {
- // We cannot assert `free_set.is_free()` because of the following case:
- // 1. The replica receives a request_block from a repairing replica.
- // The block is allocated but not cached — but is due to be freed at checkpoint.
- // 2. All of the Grid's Read IOPS are occupied, so queue the read.
- // 3. The replica checkpoints.
- // 4. The read dequeues, but the requested block is no longer allocated.
- // TODO(State Transfer):
- // 1. If a local read results in a fault, then the replica should attempt a
- // remote read.
- // 2. If a remote replica has the block then it responds (and the local read
- // completes), otherwise it nacks.
- // 3. If we receive too many nacks or if we get the feeling that we are too far
- // behind (perhaps the primary nacks), then complete the read callback but now
- // with a null result, so that it unwinds the stack all the way back to VSR,
- // which then initiates state transfer. At present, we expect that reads always
- // return a block, so to support this bubbling up, we'll need to make the block
- // result optional.
- unreachable;
- }
-
- // Check if a read is already in progress for the target address.
- {
- var it = grid.read_iops.iterate();
- while (it.next()) |iop| {
- const iop_read = iop.reads.peek() orelse continue;
- if (iop_read.address == read.address) {
- assert(iop_read.checksum == read.checksum);
- iop.reads.push(read);
+ // Check if a read is already processing/recovering and merge with it.
+ for ([_]*const FIFO(Read){
+ &grid.read_queue,
+ &grid.read_recovery_queue,
+ }) |queue| {
+ var it = queue.peek();
+ while (it) |queued_read| : (it = queued_read.next) {
+ if (address == queued_read.address) {
+ assert(checksum == queued_read.checksum);
+ assert(block_type == queued_read.block_type);
+ queued_read.resolves.push(&read.pending);
  return;
  }
  }
  }
 
- // If the block is already in the cache, queue up the read to be resolved
- // from the cache on the next tick. This keeps start_read() asynchronous.
- // Note that this must be called after we have checked for an in
- // progress read targeting the same address.
- if (grid.cache.exists(read.address)) {
- grid.read_cached_queue.push(read);
+ // Become the "root" read thats fetching the block for the given address.
+ // The fetch happens asynchronously to avoid stack-overflow and nested cache invalidation.
+ grid.read_queue.push(read);
+ grid.on_next_tick(read_block_tick_callback, &read.next_tick);
+ }
+
+ fn read_block_tick_callback(next_tick: *Storage.NextTick) void {
+ const read = @fieldParentPtr(Grid.Read, "next_tick", next_tick);
+ const grid = read.grid;
+
+ // Try to resolve the read from the cache.
+ if (grid.cache.get_index(read.address)) |cache_index| {
+ const cache_block = grid.cache_blocks[cache_index];
+ if (constants.verify) grid.verify_cached_read(read.address, cache_block);
+ grid.read_block_resolve(read, cache_block);
  return;
  }
 
+ // Grab an IOP to resolve the block from storage.
+ // Failure to do so means the read is queued to receive an IOP when one finishes.
  const iop = grid.read_iops.acquire() orelse {
- grid.read_queue.push(read);
+ grid.read_pending_queue.push(&read.pending);
  return;
  };
 
- const block = grid.cache.insert_preserve_locked(
- *Grid,
- block_locked,
- grid,
- read.address,
- );
+ grid.read_block_with(iop, read);
+ }
+
+ fn read_block_with(grid: *Grid, iop: *Grid.ReadIOP, read: *Grid.Read) void {
+ const address = read.address;
+ assert(address > 0);
+
+ // We can only update the cache if the Grid is not resolving callbacks with a cache block.
+ assert(!grid.read_resolving);
 
  iop.* = .{
- .grid = grid,
  .completion = undefined,
- .block = block,
+ .read = read,
  };
-
- // Collect the current Read and any other pending Reads for the same address to this IOP.
- // If we didn't gather them here, they would eventually be processed at the end of
- // read_block_callback(), but that would issue a new call to read_sectors().
- iop.reads.push(read);
- {
- // Make a copy here to avoid an infinite loop from pending_reads being
- // re-added to read_queue after not matching the current read.
- var copy = grid.read_queue;
- grid.read_queue = .{};
- while (copy.pop()) |pending_read| {
- if (pending_read.address == read.address) {
- assert(pending_read.checksum == read.checksum);
- iop.reads.push(pending_read);
- } else {
- grid.read_queue.push(pending_read);
- }
- }
- }
+ const iop_block = grid.read_iop_blocks[grid.read_iops.index(iop)];
 
  grid.superblock.storage.read_sectors(
  read_block_callback,
  &iop.completion,
- iop.block,
+ iop_block,
  .grid,
- block_offset(read.address),
+ block_offset(address),
  );
  }
 
- inline fn block_locked(grid: *Grid, block: BlockPtrConst) bool {
- var it = grid.read_iops.iterate();
- while (it.next()) |iop| {
- if (block == iop.block) return true;
+ fn read_block_callback(completion: *Storage.Read) void {
+ const iop = @fieldParentPtr(ReadIOP, "completion", completion);
+ const read = iop.read;
+ const grid = read.grid;
+ const iop_block = &grid.read_iop_blocks[grid.read_iops.index(iop)];
+
+ // Insert the block into the cache, and give the evicted block to `iop`.
+ const cache_index = grid.cache.insert_index(&read.address);
+ const cache_block = &grid.cache_blocks[cache_index];
+ std.mem.swap(BlockPtr, iop_block, cache_block);
+ if (constants.verify) {
+ std.mem.set(u8, iop_block.*, undefined);
+ }
+
+ // Handoff the iop to a pending read or release it before resolving the callbacks below.
+ if (grid.read_pending_queue.pop()) |pending| {
+ const queued_read = @fieldParentPtr(Read, "pending", pending);
+ grid.read_block_with(iop, queued_read);
+ } else {
+ grid.read_iops.release(iop);
  }
- return false;
+
+ // A valid block filled by storage means the reads for the address can be resolved
+ if (read_block_valid(read, cache_block.*)) {
+ grid.read_block_resolve(read, cache_block.*);
+ return;
+ }
+
+ // On the result of an invalid block, move the "root" read (and all others it resolves)
+ // to recovery queue. Future reads on the same address will see the "root" read in the
+ // recovery queue and enqueue to it.
+ grid.read_queue.remove(read);
+ grid.read_recovery_queue.push(read);
  }
 
- fn read_block_callback(completion: *Storage.Read) void {
- const iop = @fieldParentPtr(ReadIOP, "completion", completion);
- const grid = iop.grid;
+ fn read_block_valid(read: *Grid.Read, block: BlockPtrConst) bool {
+ const address = read.address;
+ const checksum = read.checksum;
+ const block_type = read.block_type;
 
- const header_bytes = iop.block[0..@sizeOf(vsr.Header)];
+ const header_bytes = block[0..@sizeOf(vsr.Header)];
  const header = mem.bytesAsValue(vsr.Header, header_bytes);
 
- const address = iop.reads.peek().?.address;
- const checksum = iop.reads.peek().?.checksum;
- const block_type = iop.reads.peek().?.block_type;
-
- const checksum_valid = header.valid_checksum();
- const checksum_body_valid = checksum_valid and
- header.valid_checksum_body(iop.block[@sizeOf(vsr.Header)..header.size]);
- const checksum_match = header.checksum == checksum;
-
- if (checksum_valid and checksum_body_valid and checksum_match) {
- assert(header.op == address);
- assert(header.operation == block_type.operation());
-
- // NOTE: read callbacks resolved here could queue up reads into this very iop.
- // This extends this while loop, but that's fine as it keeps the callbacks
- // asynchronous to themselves (preventing something like a stack-overflow).
- while (iop.reads.pop()) |read| {
- assert(read.address == address);
- assert(read.checksum == checksum);
- assert(read.block_type == BlockType.from(header.operation));
- read.callback(read, iop.block);
- }
- } else {
- if (!checksum_valid) {
- log.err("invalid checksum at address {}", .{address});
- } else if (!checksum_body_valid) {
- log.err("invalid checksum body at address {}", .{address});
- } else if (!checksum_match) {
- log.err(
- "expected address={} checksum={} block_type={}, " ++
- "found address={} checksum={} block_type={}",
- .{
- address,
- checksum,
- block_type,
- header.op,
- header.checksum,
- @enumToInt(header.operation),
- },
- );
- } else {
- unreachable;
- }
+ if (!header.valid_checksum()) {
+ log.err("invalid checksum at address {}", .{address});
+ return false;
+ }
 
- // IOP reads that fail checksum validation get punted to a recovery queue.
- // TODO: Have the replica do something with the pending reads here.
- while (iop.reads.pop()) |read| {
- iop.grid.read_recovery_queue.push(read);
- }
+ if (!header.valid_checksum_body(block[@sizeOf(vsr.Header)..header.size])) {
+ log.err("invalid checksum body at address {}", .{address});
+ return false;
  }
 
- grid.read_iops.release(iop);
+ if (header.checksum != checksum) {
+ log.err(
+ "expected address={} checksum={} block_type={}, " ++
+ "found address={} checksum={} block_type={}",
+ .{
+ address,
+ checksum,
+ block_type,
+ header.op,
+ header.checksum,
+ @enumToInt(header.operation),
+ },
+ );
+ return false;
+ }
+
+ assert(header.op == address);
+ assert(header.operation == block_type.operation());
+ return true;
+ }
 
- // Always iterate through the full list of pending reads instead of just one to ensure
- // that those serviced from the cache don't prevent others waiting for an IOP from
- // seeing the IOP that was just released.
- var copy = grid.read_queue;
- grid.read_queue = .{};
- while (copy.pop()) |read| {
- assert(read.address != address);
- grid.start_read(read);
+ fn read_block_resolve(grid: *Grid, read: *Grid.Read, block: BlockPtrConst) void {
+ // Guard to make sure the cache cannot be updated by any read.callbacks() below.
+ assert(!grid.read_resolving);
+ grid.read_resolving = true;
+ defer {
+ assert(grid.read_resolving);
+ grid.read_resolving = false;
  }
+
+ // Remove the "root" read so that the address is no longer actively reading / locked.
+ grid.read_queue.remove(read);
+
+ // Resolve all reads queued to the address with the block.
+ while (read.resolves.pop()) |pending| {
+ const pending_read = @fieldParentPtr(Read, "pending", pending);
+ pending_read.callback(pending_read, block);
+ }
+
+ // Then invoke the callback with the cache block (which should be valid for the duration
+ // of the callback as any nested Grid calls cannot synchronously update the cache).
+ read.callback(read, block);
  }
 
  fn block_offset(address: u64) u64 {
@@ -563,5 +554,14 @@ pub fn GridType(comptime Storage: type) type {
 
  return (address - 1) * block_size;
  }
+
+ fn verify_cached_read(grid: *Grid, address: u64, cached_block: BlockPtrConst) void {
+ if (Storage != @import("../test/storage.zig").Storage)
+ // Too complicated to do async verification
+ return;
+
+ const actual_block = grid.superblock.storage.grid_block(address);
+ assert(std.mem.eql(u8, cached_block, actual_block));
+ }
  };
  }
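
For readers skimming the grid.zig hunk above: the new write_block_callback and read_block_callback no longer copy bytes into the cache. The SetAssociativeCache now stores only u64 addresses, each cache entry owns a block in cache_blocks, and completion swaps block pointers so the cache slot takes ownership of the freshly filled block while the evicted block is handed back for reuse. A minimal, standalone Zig sketch of that pointer-swap idea (illustrative only; the names and tiny fixed-size blocks below are not TigerBeetle's API):

    const std = @import("std");

    const BlockPtr = *[8]u8; // stand-in for a sector-aligned block pointer

    pub fn main() void {
        var cache_storage = [_]u8{0} ** 8; // block currently owned by a cache slot
        var fresh_storage = [_]u8{7} ** 8; // block just filled by a write (or a read IOP)

        var cache_block: BlockPtr = &cache_storage;
        var filled_block: BlockPtr = &fresh_storage;

        // Swap ownership instead of copying block_size bytes: the cache slot now
        // points at the freshly filled block, and the caller gets the evicted
        // block back to reuse for its next write or read.
        std.mem.swap(BlockPtr, &cache_block, &filled_block);

        std.debug.print("cache sees {d}, caller reuses {d}\n", .{ cache_block[0], filled_block[0] });
    }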