tigerbeetle-node 0.11.6 → 0.11.7

Files changed (54)
  1. package/dist/.client.node.sha256 +1 -1
  2. package/package.json +1 -1
  3. package/src/tigerbeetle/scripts/benchmark.bat +1 -2
  4. package/src/tigerbeetle/scripts/benchmark.sh +1 -2
  5. package/src/tigerbeetle/scripts/install.bat +7 -0
  6. package/src/tigerbeetle/scripts/install.sh +2 -3
  7. package/src/tigerbeetle/src/benchmark.zig +3 -3
  8. package/src/tigerbeetle/src/ewah.zig +6 -5
  9. package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
  10. package/src/tigerbeetle/src/io/darwin.zig +19 -0
  11. package/src/tigerbeetle/src/io/linux.zig +8 -0
  12. package/src/tigerbeetle/src/io/windows.zig +20 -2
  13. package/src/tigerbeetle/src/iops.zig +7 -1
  14. package/src/tigerbeetle/src/lsm/compaction.zig +18 -29
  15. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +9 -5
  16. package/src/tigerbeetle/src/lsm/grid.zig +267 -267
  17. package/src/tigerbeetle/src/lsm/level_iterator.zig +18 -1
  18. package/src/tigerbeetle/src/lsm/manifest.zig +29 -1
  19. package/src/tigerbeetle/src/lsm/manifest_log.zig +5 -5
  20. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +2 -2
  21. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +26 -70
  22. package/src/tigerbeetle/src/lsm/table.zig +42 -0
  23. package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -0
  24. package/src/tigerbeetle/src/lsm/table_mutable.zig +1 -1
  25. package/src/tigerbeetle/src/lsm/test.zig +2 -3
  26. package/src/tigerbeetle/src/lsm/tree.zig +27 -6
  27. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +1 -1
  28. package/src/tigerbeetle/src/simulator.zig +0 -5
  29. package/src/tigerbeetle/src/storage.zig +58 -6
  30. package/src/tigerbeetle/src/test/cluster.zig +3 -0
  31. package/src/tigerbeetle/src/test/state_checker.zig +1 -1
  32. package/src/tigerbeetle/src/test/storage.zig +22 -1
  33. package/src/tigerbeetle/src/tracer.zig +50 -28
  34. package/src/tigerbeetle/src/unit_tests.zig +9 -4
  35. package/src/tigerbeetle/src/vopr.zig +4 -4
  36. package/src/tigerbeetle/src/vsr/journal.zig +153 -93
  37. package/src/tigerbeetle/src/vsr/replica.zig +10 -20
  38. package/src/tigerbeetle/src/vsr/superblock.zig +19 -16
  39. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +114 -93
  40. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
  41. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +1 -3
  42. package/src/tigerbeetle/src/vsr.zig +55 -8
  43. package/src/tigerbeetle/src/c/tb_client/context.zig +0 -304
  44. package/src/tigerbeetle/src/c/tb_client/echo_client.zig +0 -108
  45. package/src/tigerbeetle/src/c/tb_client/packet.zig +0 -80
  46. package/src/tigerbeetle/src/c/tb_client/signal.zig +0 -286
  47. package/src/tigerbeetle/src/c/tb_client/thread.zig +0 -88
  48. package/src/tigerbeetle/src/c/tb_client.h +0 -220
  49. package/src/tigerbeetle/src/c/tb_client.zig +0 -177
  50. package/src/tigerbeetle/src/c/tb_client_header.zig +0 -218
  51. package/src/tigerbeetle/src/c/tb_client_header_test.zig +0 -135
  52. package/src/tigerbeetle/src/c/test.zig +0 -371
  53. package/src/tigerbeetle/src/cli.zig +0 -399
  54. package/src/tigerbeetle/src/main.zig +0 -242

package/src/tigerbeetle/src/lsm/level_iterator.zig

@@ -68,6 +68,9 @@ pub fn LevelIteratorType(comptime Table: type, comptime Storage: type) type {
         values: ValuesRingBuffer,
         tables: TablesRingBuffer,

+        // Used for verifying key order when constants.verify == true.
+        key_prev: ?Key,
+
         pub fn init(allocator: mem.Allocator) !LevelIterator {
             var values = try ValuesRingBuffer.init(allocator);
             errdefer values.deinit(allocator);
@@ -96,6 +99,7 @@ pub fn LevelIteratorType(comptime Table: type, comptime Storage: type) type {
                         .{ .table_iterator = table_b },
                     },
                 },
+                .key_prev = null,
             };
         }

@@ -138,6 +142,7 @@ pub fn LevelIteratorType(comptime Table: type, comptime Storage: type) type {
                 .direction = context.direction,
                 .values = .{ .buffer = it.values.buffer },
                 .tables = .{ .buffer = it.tables.buffer },
+                .key_prev = null,
             };

             assert(it.key_exclusive == null);
@@ -281,7 +286,7 @@ pub fn LevelIteratorType(comptime Table: type, comptime Storage: type) type {
         /// Returns either:
         /// - the next Key, if available.
         /// - error.Empty when there are no values remaining to iterate.
-        /// - error.Drained when the iterator isn't empty, but the values
+        /// - error.Drained when the iterator isn't empty, but the values
         /// still need to be buffered into memory via tick().
         pub fn peek(it: LevelIterator) error{ Empty, Drained }!Key {
             if (it.values.head_ptr_const()) |value| return key_from_value(value);
@@ -299,6 +304,18 @@ pub fn LevelIteratorType(comptime Table: type, comptime Storage: type) type {

         /// This may only be called after peek() returns a Key (and not Empty or Drained)
         pub fn pop(it: *LevelIterator) Value {
+            const value = it.pop_internal();
+
+            if (constants.verify) {
+                const key = Table.key_from_value(&value);
+                if (it.key_prev) |k| assert(Table.compare_keys(k, key) == .lt);
+                it.key_prev = key;
+            }
+
+            return value;
+        }
+
+        fn pop_internal(it: *LevelIterator) Value {
             if (it.values.pop()) |value| return value;

             const table_iterator = &it.tables.head_ptr().?.table_iterator;
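
The pattern above: pop() becomes a thin wrapper around the old body (renamed pop_internal()), and when constants.verify is enabled it asserts that keys leave the iterator in strictly ascending order. A minimal self-contained sketch of the same wrapper, with illustrative names rather than the real LevelIterator:

    const std = @import("std");
    const assert = std.debug.assert;

    const verify = true; // stands in for constants.verify

    const CheckedIterator = struct {
        values: []const u32,
        index: usize = 0,
        key_prev: ?u32 = null,

        fn pop(it: *CheckedIterator) u32 {
            const value = it.pop_internal();
            if (verify) {
                // Strictly ascending: a duplicate key would also trip this assert.
                if (it.key_prev) |prev| assert(prev < value);
                it.key_prev = value;
            }
            return value;
        }

        fn pop_internal(it: *CheckedIterator) u32 {
            const value = it.values[it.index];
            it.index += 1;
            return value;
        }
    };

    test "pop asserts ascending key order" {
        var it = CheckedIterator{ .values = &[_]u32{ 1, 2, 5 } };
        assert(it.pop() == 1);
        assert(it.pop() == 2);
        assert(it.pop() == 5);
    }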

package/src/tigerbeetle/src/lsm/manifest.zig

@@ -531,7 +531,7 @@ pub fn ManifestType(comptime Table: type, comptime Storage: type) type {
                 .ascending,
                 KeyRange{ .key_min = range.key_min, .key_max = range.key_max },
             );
-            if (it.next() == null) {
+            if (it.next() != null) {
                 // If the range is being compacted into the last level then this is unreachable,
                 // as the last level has no subsequent levels and must always drop tombstones.
                 assert(level_b != constants.lsm_levels - 1);
@@ -585,5 +585,33 @@ pub fn ManifestType(comptime Table: type, comptime Storage: type) type {
             manifest.checkpoint_callback = null;
             callback(manifest);
         }
+
+        pub fn verify(manifest: *Manifest, snapshot: u64) void {
+            for (manifest.levels) |*level| {
+                var key_max_prev: ?Key = null;
+                var table_info_iter = level.iterator(
+                    .visible,
+                    &.{snapshot},
+                    .ascending,
+                    null,
+                );
+                while (table_info_iter.next()) |table_info| {
+                    if (key_max_prev) |k| {
+                        assert(compare_keys(k, table_info.key_min) == .lt);
+                    }
+                    // We could have key_min == key_max if there is only one value.
+                    assert(compare_keys(table_info.key_min, table_info.key_max) != .gt);
+                    key_max_prev = table_info.key_max;
+
+                    Table.verify(
+                        Storage,
+                        manifest.manifest_log.grid.superblock.storage,
+                        table_info.address,
+                        table_info.key_min,
+                        table_info.key_max,
+                    );
+                }
+            }
+        }
     };
 }
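
The new Manifest.verify() asserts two invariants over each level's visible tables: adjacent tables are sorted and non-overlapping (the previous key_max strictly below the next key_min), and each table's own bounds are ordered, with key_min == key_max permitted for a single-value table. A standalone sketch of that per-level check, assuming u64 keys:

    const std = @import("std");
    const assert = std.debug.assert;

    const TableInfo = struct {
        key_min: u64,
        key_max: u64,
    };

    fn verify_level(tables: []const TableInfo) void {
        var key_max_prev: ?u64 = null;
        for (tables) |table| {
            // Tables must be disjoint and sorted within the level.
            if (key_max_prev) |prev| assert(prev < table.key_min);
            // key_min == key_max is legal: the table may hold a single value.
            assert(table.key_min <= table.key_max);
            key_max_prev = table.key_max;
        }
    }

    test "visible tables are disjoint and ordered" {
        verify_level(&[_]TableInfo{
            .{ .key_min = 0, .key_max = 9 },
            .{ .key_min = 10, .key_max = 10 },
            .{ .key_min = 12, .key_max = 40 },
        });
    }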

package/src/tigerbeetle/src/lsm/manifest_log.zig

@@ -404,14 +404,14 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
                 return;
             }

-            const block = manifest_log.blocks.head().?;
-            verify_block(block, null, null);
+            const block = manifest_log.blocks.head_ptr().?;
+            verify_block(block.*, null, null);

-            const header = mem.bytesAsValue(vsr.Header, block[0..@sizeOf(vsr.Header)]);
-            const address = Block.address(block);
+            const header = mem.bytesAsValue(vsr.Header, block.*[0..@sizeOf(vsr.Header)]);
+            const address = Block.address(block.*);
             assert(address > 0);

-            const entry_count = Block.entry_count(block);
+            const entry_count = Block.entry_count(block.*);

             if (manifest_log.blocks_closed == 1 and manifest_log.blocks.count == 1) {
                 // This might be the last block of a checkpoint, which can be a partial block.

package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig

@@ -319,7 +319,7 @@ const Environment = struct {

    fn wait(env: *Environment, manifest_log: *ManifestLog) void {
        while (env.pending > 0) {
-            manifest_log.grid.tick();
+            // manifest_log.grid.tick();
            manifest_log.superblock.storage.tick();
        }
    }
@@ -635,7 +635,7 @@ fn verify_manifest_compaction_set(
    var compact_blocks_checked: u32 = 0;

    // This test doesn't include any actual table blocks, so all blocks are manifest blocks.
-    var blocks = superblock.free_set.blocks.iterator(.{ .kind = .unset });
+    var blocks = superblock.free_set.blocks.iterator(.{ .kind = .set });
    while (blocks.next()) |block_index| {
        const block_address = block_index + 1;
        const block = superblock.storage.grid_block(block_address);

package/src/tigerbeetle/src/lsm/set_associative_cache.zig

@@ -188,21 +188,26 @@ pub fn SetAssociativeCache(
             mem.set(u64, self.clocks.words, 0);
         }

-        /// Returns whether an entry with the given key is cached,
+        /// Returns whether an entry with the given key is cached,
         /// without modifying the entry's counter.
         pub fn exists(self: *Self, key: Key) bool {
             const set = self.associate(key);
             return self.search(set, key) != null;
         }

-        pub fn get(self: *Self, key: Key) ?*align(value_alignment) Value {
+        pub fn get_index(self: *Self, key: Key) ?usize {
             const set = self.associate(key);
             const way = self.search(set, key) orelse return null;

             const count = self.counts.get(set.offset + way);
             self.counts.set(set.offset + way, count +| 1);

-            return @alignCast(value_alignment, &set.values[way]);
+            return set.offset + way;
+        }
+
+        pub fn get(self: *Self, key: Key) ?*align(value_alignment) Value {
+            const index = self.get_index(key) orelse return null;
+            return @alignCast(value_alignment, &self.values[index]);
         }

         /// Remove a key from the set associative cache if present.
@@ -211,6 +216,7 @@ pub fn SetAssociativeCache(
             const way = self.search(set, key) orelse return;

             self.counts.set(set.offset + way, 0);
+            set.values[way] = undefined;
         }

         /// Hint that the key is less likely to be accessed in the future, without actually removing
@@ -248,39 +254,21 @@ pub fn SetAssociativeCache(
             return @ptrCast(*const Ways, &result).*;
         }

-        pub fn insert(self: *Self, key: Key) *align(value_alignment) Value {
-            return self.insert_preserve_locked(
-                void,
-                struct {
-                    inline fn locked(_: void, _: *const Value) bool {
-                        return false;
-                    }
-                }.locked,
-                {},
-                key,
-            );
+        /// Insert a value, evicting an older entry if needed.
+        pub fn insert(self: *Self, value: *const Value) void {
+            _ = self.insert_index(value);
         }

-        /// Add a key, evicting an older entry if needed, and return a pointer to the value.
-        /// The key must not already be in the cache.
-        /// Never evicts keys for which locked() returns true.
-        /// The caller must guarantee that locked() returns true for less than layout.ways keys.
-        pub fn insert_preserve_locked(
-            self: *Self,
-            comptime Context: type,
-            comptime locked: fn (
-                Context,
-                *align(value_alignment) const Value,
-            ) callconv(.Inline) bool,
-            context: Context,
-            key: Key,
-        ) *align(value_alignment) Value {
+        /// Insert a value, evicting an older entry if needed.
+        /// Return the index at which the value was inserted.
+        pub fn insert_index(self: *Self, value: *const Value) usize {
+            const key = key_from_value(value);
             const set = self.associate(key);
             if (self.search(set, key)) |way| {
-                // Remove the old entry for this key.
-                // It should be a different value, but since we are returning a value pointer we
-                // can't check against the new one.
-                self.counts.set(set.offset + way, 0);
+                // Overwrite the old entry for this key.
+                self.counts.set(set.offset + way, 1);
+                set.values[way] = value.*;
+                return set.offset + way;
             }

             const clock_index = @divExact(set.offset, layout.ways);
@@ -299,11 +287,6 @@ pub fn SetAssociativeCache(
                 safety_count += 1;
                 way +%= 1;
             }) {
-                // We pass a value pointer to the callback here so that a cache miss
-                // can be avoided if the caller is able to determine if the value is
-                // locked by comparing pointers directly.
-                if (locked(context, @alignCast(value_alignment, &set.values[way]))) continue;
-
                 var count = self.counts.get(set.offset + way);
                 if (count == 0) break; // Way is already free.

@@ -316,10 +299,11 @@ pub fn SetAssociativeCache(
             assert(self.counts.get(set.offset + way) == 0);

             set.tags[way] = set.tag;
+            set.values[way] = value.*;
             self.counts.set(set.offset + way, 1);
             self.clocks.set(clock_index, way +% 1);

-            return @alignCast(value_alignment, &set.values[way]);
+            return set.offset + way;
         }

         const Set = struct {
@@ -430,7 +414,7 @@ fn set_associative_cache_test(
        try expectEqual(i, sac.clocks.get(0));

        const key = i * sac.sets;
-        sac.insert(key).* = key;
+        sac.insert(&key);
        try expect(sac.counts.get(i) == 1);
        try expectEqual(key, sac.get(key).?.*);
        try expect(sac.counts.get(i) == 2);
@@ -443,7 +427,7 @@ fn set_associative_cache_test(
    // Insert another element into the first set, causing key 0 to be evicted.
    {
        const key = layout.ways * sac.sets;
-        sac.insert(key).* = key;
+        sac.insert(&key);
        try expect(sac.counts.get(0) == 1);
        try expectEqual(key, sac.get(key).?.*);
        try expect(sac.counts.get(0) == 2);
@@ -460,34 +444,6 @@ fn set_associative_cache_test(

    if (log) sac.associate(0).inspect(sac);

-    // Lock all other slots, causing key layout.ways * sac.sets to be evicted despite having the
-    // highest count.
-    {
-        {
-            assert(sac.counts.get(0) == 2);
-            var i: usize = 1;
-            while (i < layout.ways) : (i += 1) assert(sac.counts.get(i) == 1);
-        }
-
-        const key = (layout.ways + 1) * sac.sets;
-        const expect_evicted = layout.ways * sac.sets;
-
-        sac.insert_preserve_locked(
-            u64,
-            struct {
-                inline fn locked(only_unlocked: u64, value: *const Value) bool {
-                    return value.* != only_unlocked;
-                }
-            }.locked,
-            expect_evicted,
-            key,
-        ).* = key;
-
-        try expectEqual(@as(?*Value, null), sac.get(expect_evicted));
-    }
-
-    if (log) sac.associate(0).inspect(sac);
-
    // Ensure removal works.
    {
        const key = 5 * sac.sets;
@@ -512,7 +468,7 @@ fn set_associative_cache_test(
        try expectEqual(i, sac.clocks.get(0));

        const key = i * sac.sets;
-        sac.insert(key).* = key;
+        sac.insert(&key);
        try expect(sac.counts.get(i) == 1);
        var j: usize = 2;
        while (j <= math.maxInt(SAC.Count)) : (j += 1) {
@@ -530,7 +486,7 @@ fn set_associative_cache_test(
    // Insert another element into the first set, causing key 0 to be evicted.
    {
        const key = layout.ways * sac.sets;
-        sac.insert(key).* = key;
+        sac.insert(&key);
        try expect(sac.counts.get(0) == 1);
        try expectEqual(key, sac.get(key).?.*);
        try expect(sac.counts.get(0) == 2);
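
This release reshapes the cache API: insert() now takes the value itself and copies it in, deriving the key via key_from_value(), instead of taking a key and returning a value pointer for the caller to fill; insert_index() and get_index() expose the slot index instead. The insert_preserve_locked() variant and its locked() callback are gone. A toy direct-mapped sketch of the new API shape (TinyCache is illustrative; the real cache also handles ways, tags, and clock eviction):

    const std = @import("std");
    const assert = std.debug.assert;

    // In this sketch the value is its own key, as in the tests above.
    fn key_from_value(value: *const u64) u64 {
        return value.*;
    }

    const TinyCache = struct {
        values: [4]u64 = .{ 0, 0, 0, 0 },
        used: [4]bool = .{ false, false, false, false },

        fn insert_index(self: *TinyCache, value: *const u64) usize {
            const key = key_from_value(value);
            const index = @intCast(usize, key % self.values.len); // direct-mapped: no ways
            self.values[index] = value.*; // the cache copies the value in
            self.used[index] = true;
            return index;
        }

        fn insert(self: *TinyCache, value: *const u64) void {
            _ = self.insert_index(value);
        }

        fn get_index(self: *TinyCache, key: u64) ?usize {
            const index = @intCast(usize, key % self.values.len);
            if (self.used[index] and self.values[index] == key) return index;
            return null;
        }

        fn get(self: *TinyCache, key: u64) ?*u64 {
            const index = self.get_index(key) orelse return null;
            return &self.values[index];
        }
    };

    test "insert-by-value API shape" {
        var cache = TinyCache{};
        var value: u64 = 42;
        cache.insert(&value); // 0.11.6 would have been: cache.insert(42).* = 42;
        assert(cache.get(42).?.* == 42);
        assert(cache.get(7) == null);
    }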

package/src/tigerbeetle/src/lsm/table.zig

@@ -629,6 +629,11 @@ pub fn TableType(
                 assert(compare_keys(builder.key_min, builder.key_max) == .lt);
             }

+            if (current > 0) {
+                const key_max_prev = index_data_keys(builder.index_block)[current - 1];
+                assert(compare_keys(key_max_prev, key_from_value(&values[0])) == .lt);
+            }
+
             builder.data_block_count += 1;
             builder.value = 0;

@@ -979,6 +984,43 @@ pub fn TableType(

             return null;
         }
+
+        pub fn verify(
+            comptime Storage: type,
+            storage: *Storage,
+            index_address: u64,
+            key_min: ?Key,
+            key_max: ?Key,
+        ) void {
+            if (Storage != @import("../test/storage.zig").Storage)
+                // Too complicated to do async verification
+                return;
+
+            const index_block = storage.grid_block(index_address);
+            const addresses = index_data_addresses(index_block);
+            const data_blocks_used = index_data_blocks_used(index_block);
+            var data_block_index: usize = 0;
+            while (data_block_index < data_blocks_used) : (data_block_index += 1) {
+                const address = addresses[data_block_index];
+                const data_block = storage.grid_block(address);
+                const values = data_block_values_used(data_block);
+                if (values.len > 0) {
+                    if (data_block_index == 0) {
+                        assert(key_min == null or
+                            compare_keys(key_min.?, key_from_value(&values[0])) == .eq);
+                    }
+                    if (data_block_index == data_blocks_used - 1) {
+                        assert(key_max == null or
+                            compare_keys(key_from_value(&values[values.len - 1]), key_max.?) == .eq);
+                    }
+                    var a = &values[0];
+                    for (values[1..]) |*b| {
+                        assert(compare_keys(key_from_value(a), key_from_value(b)) == .lt);
+                        a = b;
+                    }
+                }
+            }
+        }
     };
 }


package/src/tigerbeetle/src/lsm/table_iterator.zig

@@ -52,6 +52,9 @@ pub fn TableIteratorType(comptime Table: type, comptime Storage: type) type {
         /// This field is only used for safety checks, it does not affect the behavior.
         read_pending: bool = false,

+        // Used for verifying key order when constants.verify == true.
+        key_prev: ?Table.Key,
+
         pub fn init(allocator: mem.Allocator) !TableIterator {
             const index_block = try allocator.alignedAlloc(
                 u8,
@@ -95,6 +98,7 @@ pub fn TableIteratorType(comptime Table: type, comptime Storage: type) type {
                     },
                 },
                 .value = undefined,
+                .key_prev = null,
             };
         }

@@ -132,10 +136,21 @@ pub fn TableIteratorType(comptime Table: type, comptime Storage: type) type {
                 .values = .{ .buffer = it.values.buffer },
                 .data_blocks = .{ .buffer = it.data_blocks.buffer },
                 .value = 0,
+                .key_prev = null,
             };

             assert(it.values.empty());
             assert(it.data_blocks.empty());
+
+            if (constants.verify) {
+                Table.verify(
+                    Storage,
+                    context.grid.superblock.storage,
+                    context.address,
+                    null,
+                    null,
+                );
+            }
         }

         /// Try to buffer at least a full block of values to be peek()'d.
@@ -291,6 +306,18 @@ pub fn TableIteratorType(comptime Table: type, comptime Storage: type) type {

         /// This may only be called after peek() returns a Key (and not Empty or Drained)
         pub fn pop(it: *TableIterator) Table.Value {
+            const value = it.pop_internal();
+
+            if (constants.verify) {
+                const key = Table.key_from_value(&value);
+                if (it.key_prev) |k| assert(Table.compare_keys(k, key) == .lt);
+                it.key_prev = key;
+            }
+
+            return value;
+        }
+
+        fn pop_internal(it: *TableIterator) Table.Value {
             assert(!it.read_pending);
             assert(!it.read_table_index);

package/src/tigerbeetle/src/lsm/table_mutable.zig

@@ -186,7 +186,7 @@ pub fn TableMutableType(comptime Table: type) type {
                 if (tombstone(value)) {
                     cache.remove(key_from_value(value));
                 } else {
-                    cache.insert(key_from_value(value)).* = value.*;
+                    cache.insert(value);
                 }
             }
         }

package/src/tigerbeetle/src/lsm/test.zig

@@ -35,8 +35,7 @@ const Environment = struct {
    const node_count = 1024;
    const cache_entries_max = 2 * 1024 * 1024;
    const forest_options = StateMachine.forest_options(.{
-        // Ignored by StateMachine.forest_options().
-        .lsm_forest_node_count = undefined,
+        .lsm_forest_node_count = node_count,
        .cache_entries_accounts = cache_entries_max,
        .cache_entries_transfers = cache_entries_max,
        .cache_entries_posted = cache_entries_max,
@@ -121,7 +120,7 @@ const Environment = struct {
    }

    fn tick(env: *Environment) !void {
-        env.grid.tick();
+        // env.grid.tick();
        try env.io.tick();
    }

package/src/tigerbeetle/src/lsm/tree.zig

@@ -212,13 +212,26 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
         var manifest = try Manifest.init(allocator, node_pool, grid, tree_hash);
         errdefer manifest.deinit(allocator);

-        var compaction_table_immutable = try CompactionTableImmutable.init(allocator, tree_name);
+        var compaction_table_immutable = try CompactionTableImmutable.init(
+            allocator,
+            std.fmt.comptimePrint("{s}(immutable->0)", .{tree_name}),
+        );
         errdefer compaction_table_immutable.deinit(allocator);

         var compaction_table: [@divFloor(constants.lsm_levels, 2)]CompactionTable = undefined;
-        for (compaction_table) |*compaction, i| {
-            errdefer for (compaction_table[0..i]) |*c| c.deinit(allocator);
-            compaction.* = try CompactionTable.init(allocator, tree_name);
+        {
+            comptime var i: usize = 0;
+            inline while (i < compaction_table.len) : (i += 1) {
+                errdefer for (compaction_table[0..i]) |*c| c.deinit(allocator);
+                const compaction_name = std.fmt.comptimePrint("{s}({}->{}/{}->{})", .{
+                    tree_name,
+                    2 * i,
+                    2 * i + 1,
+                    2 * i + 1,
+                    2 * i + 2,
+                });
+                compaction_table[i] = try CompactionTable.init(allocator, compaction_name);
+            }
         }
         errdefer for (compaction_table) |*c| c.deinit(allocator);

@@ -580,10 +593,14 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
         assert(tree.compaction_io_pending == 0);
         assert(tree.compaction_callback == null);

+        if (constants.verify) {
+            tree.manifest.verify(tree.compaction_op);
+        }
+
         tracer.start(
             &tree.tracer_slot,
             .{ .tree = .{ .tree_name = tree_name } },
-            .{ .tree_compaction_beat = .{ .tree_name = tree_name } },
+            .tree_compaction_beat,
             @src(),
         );

@@ -959,9 +976,13 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
         tracer.end(
             &tree.tracer_slot,
             .{ .tree = .{ .tree_name = tree_name } },
-            .{ .tree_compaction_beat = .{ .tree_name = tree_name } },
+            .tree_compaction_beat,
         );

+        if (constants.verify) {
+            tree.manifest.verify(tree.compaction_op);
+        }
+
         // Invoke the compact() callback after the manifest compacts at the end of the beat.
         const callback = tree.compaction_callback.?;
         tree.compaction_callback = null;
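
Compaction tracer names are now built with std.fmt.comptimePrint, which formats at compile time and yields a static null-terminated string, so each compaction gets a distinct name such as (0->1/1->2). A quick demonstration ("Account.id" is just an illustrative tree name):

    const std = @import("std");

    test "comptimePrint builds compaction names at compile time" {
        const name = std.fmt.comptimePrint("{s}({}->{}/{}->{})", .{ "Account.id", 0, 1, 1, 2 });
        try std.testing.expectEqualStrings("Account.id(0->1/1->2)", name);
    }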

package/src/tigerbeetle/src/lsm/tree_fuzz.zig

@@ -181,7 +181,7 @@ fn EnvironmentType(comptime table_usage: TableUsage) type {
    }

    fn tick(env: *Environment) void {
-        env.grid.tick();
+        // env.grid.tick();
        env.storage.tick();
    }

package/src/tigerbeetle/src/simulator.zig

@@ -269,11 +269,6 @@ pub fn main() !void {
        }
    };

-    // Disable most faults at startup, so that the replicas don't get stuck in recovery mode.
-    for (cluster.storages) |*storage, i| {
-        storage.faulty = replica_normal_min <= i;
-    }
-
    var tick: u64 = 0;
    while (tick < ticks_max) : (tick += 1) {
        const health_options = &cluster.options.health_options;

package/src/tigerbeetle/src/storage.zig

@@ -5,8 +5,8 @@ const assert = std.debug.assert;
 const log = std.log.scoped(.storage);

 const IO = @import("io.zig").IO;
+const FIFO = @import("fifo.zig").FIFO;
 const constants = @import("constants.zig");
-const fatal = @import("cli.zig").fatal;
 const vsr = @import("vsr.zig");

 pub const Storage = struct {
@@ -68,9 +68,17 @@ pub const Storage = struct {
         offset: u64,
     };

+    pub const NextTick = struct {
+        next: ?*NextTick = null,
+        callback: fn (next_tick: *NextTick) void,
+    };
+
     io: *IO,
     fd: os.fd_t,

+    next_tick_queue: FIFO(NextTick) = .{},
+    next_tick_completion: IO.Completion = undefined,
+
     pub fn init(io: *IO, fd: os.fd_t) !Storage {
         return Storage{
             .io = io,
@@ -79,6 +87,7 @@ pub const Storage = struct {
     }

     pub fn deinit(storage: *Storage) void {
+        assert(storage.next_tick_queue.empty());
         assert(storage.fd != IO.INVALID_FILE);
         storage.fd = IO.INVALID_FILE;
     }
@@ -90,6 +99,43 @@ pub const Storage = struct {
         };
     }

+    pub fn on_next_tick(
+        storage: *Storage,
+        callback: fn (next_tick: *Storage.NextTick) void,
+        next_tick: *Storage.NextTick,
+    ) void {
+        next_tick.* = .{ .callback = callback };
+
+        const was_empty = storage.next_tick_queue.empty();
+        storage.next_tick_queue.push(next_tick);
+
+        if (was_empty) {
+            storage.io.timeout(
+                *Storage,
+                storage,
+                timeout_callback,
+                &storage.next_tick_completion,
+                0, // 0ns timeout means to resolve as soon as possible - like a yield
+            );
+        }
+    }
+
+    fn timeout_callback(
+        storage: *Storage,
+        completion: *IO.Completion,
+        result: IO.TimeoutError!void,
+    ) void {
+        assert(completion == &storage.next_tick_completion);
+        _ = result catch |e| switch (e) {
+            error.Canceled => unreachable,
+            error.Unexpected => unreachable,
+        };
+
+        var queue = storage.next_tick_queue;
+        storage.next_tick_queue = .{};
+        while (queue.pop()) |next_tick| next_tick.callback(next_tick);
+    }
+
     pub fn read_sectors(
         self: *Storage,
         callback: fn (read: *Storage.Read) void,
@@ -113,18 +159,24 @@ pub const Storage = struct {
             .target_max = buffer.len,
         };

-        self.start_read(read, 0);
+        self.start_read(read, null);
         assert(read.target().len > 0);
     }

-    fn start_read(self: *Storage, read: *Storage.Read, bytes_read: usize) void {
-        assert(bytes_read <= read.target().len);
+    fn start_read(self: *Storage, read: *Storage.Read, bytes_read: ?usize) void {
+        const bytes = bytes_read orelse 0;
+        assert(bytes <= read.target().len);

-        read.offset += bytes_read;
-        read.buffer = read.buffer[bytes_read..];
+        read.offset += bytes;
+        read.buffer = read.buffer[bytes..];

         const target = read.target();
         if (target.len == 0) {
+            // Resolving the read inline means start_read() must not have been called from
+            // read_sectors(). If it was, this is a synchronous callback resolution and should
+            // be reported.
+            assert(bytes_read != null);
+
             read.callback(read);
             return;
         }
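
on_next_tick() defers a callback to the next pass of the event loop: the first enqueue arms a single 0ns IO timeout (effectively a yield), and when it fires the queue is swapped out before draining, so a callback that re-enqueues itself runs on the following tick rather than extending the current drain. A simplified single-threaded sketch of that queue-and-drain discipline (run_tick() stands in for the real timeout firing; this is not the real IO plumbing):

    const std = @import("std");
    const assert = std.debug.assert;

    const NextTick = struct {
        next: ?*NextTick = null,
        callback: fn (next_tick: *NextTick) void,
    };

    // Intrusive singly-linked FIFO: push at the tail, pop from the head.
    const Queue = struct {
        head: ?*NextTick = null,
        tail: ?*NextTick = null,

        fn push(queue: *Queue, next_tick: *NextTick) void {
            assert(next_tick.next == null);
            if (queue.tail) |tail| tail.next = next_tick else queue.head = next_tick;
            queue.tail = next_tick;
        }

        fn pop(queue: *Queue) ?*NextTick {
            const next_tick = queue.head orelse return null;
            queue.head = next_tick.next;
            if (queue.head == null) queue.tail = null;
            next_tick.next = null;
            return next_tick;
        }
    };

    var next_tick_queue: Queue = .{};

    fn on_next_tick(callback: fn (*NextTick) void, next_tick: *NextTick) void {
        next_tick.* = .{ .callback = callback };
        // The real Storage arms a single 0ns storage.io.timeout() here when the
        // queue transitions from empty; firing that timeout runs the drain below.
        next_tick_queue.push(next_tick);
    }

    fn run_tick() void {
        // Swap the queue out before draining: callbacks that re-enqueue themselves
        // land on the fresh queue and run on the next tick, not this one.
        var queue = next_tick_queue;
        next_tick_queue = .{};
        while (queue.pop()) |next_tick| next_tick.callback(next_tick);
    }

    var done = false;

    fn set_done(next_tick: *NextTick) void {
        _ = next_tick;
        done = true;
    }

    test "on_next_tick defers the callback to the next tick" {
        var next_tick: NextTick = undefined;
        on_next_tick(set_done, &next_tick);
        assert(!done); // Never resolved synchronously.
        run_tick();
        assert(done);
    }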

package/src/tigerbeetle/src/test/cluster.zig

@@ -137,6 +137,8 @@ pub const Cluster = struct {
            storage_options.replica_index = @intCast(u8, replica_index);
            storage_options.faulty_wal_areas = faulty_wal_areas[replica_index];
            storage.* = try Storage.init(allocator, options.storage_size_limit, storage_options);
+            // Disable most faults at startup, so that the replicas don't get stuck in recovery mode.
+            storage.faulty = replica_index >= vsr.quorums(options.replica_count).view_change;
        }
        errdefer for (cluster.storages) |*storage| storage.deinit(allocator);

@@ -330,6 +332,7 @@ pub const Cluster = struct {
            .{
                .replica_count = @intCast(u8, cluster.replicas.len),
                .storage = &cluster.storages[replica_index],
+                // TODO Test restarting with a higher storage limit.
                .storage_size_limit = cluster.options.storage_size_limit,
                .message_pool = &cluster.pools[replica_index],
                .time = time,
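
The startup fault-injection logic deleted from simulator.zig above reappears here in Cluster.init, with the threshold now derived from the view-change quorum. Assuming majority quorums (which is what vsr.quorums() yields for small clusters), replica_count = 3 gives a view-change quorum of 2, so storages 0 and 1 start fault-free and only replica 2 may be faulty. A sketch of that arithmetic:

    const std = @import("std");

    test "which storages start faulty" {
        const replica_count: u8 = 3;
        // Assumption: simple majority; vsr.quorums() computes this for small clusters.
        const quorum_view_change: u8 = replica_count / 2 + 1; // 2 of 3
        var replica_index: u8 = 0;
        while (replica_index < replica_count) : (replica_index += 1) {
            const faulty = replica_index >= quorum_view_change;
            // Replicas 0 and 1 get fault-free storage; only replica 2 may be faulty.
            std.debug.assert(faulty == (replica_index == 2));
        }
    }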