tigerbeetle-node 0.11.0 → 0.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. package/dist/.client.node.sha256 +1 -0
  2. package/package.json +5 -3
  3. package/src/tigerbeetle/scripts/fuzz_loop.sh +1 -1
  4. package/src/tigerbeetle/scripts/pre-commit.sh +2 -2
  5. package/src/tigerbeetle/scripts/validate_docs.sh +17 -0
  6. package/src/tigerbeetle/src/benchmark.zig +25 -11
  7. package/src/tigerbeetle/src/c/tb_client/context.zig +248 -47
  8. package/src/tigerbeetle/src/c/tb_client/echo_client.zig +108 -0
  9. package/src/tigerbeetle/src/c/tb_client/packet.zig +2 -2
  10. package/src/tigerbeetle/src/c/tb_client/signal.zig +2 -4
  11. package/src/tigerbeetle/src/c/tb_client/thread.zig +17 -256
  12. package/src/tigerbeetle/src/c/tb_client.h +18 -4
  13. package/src/tigerbeetle/src/c/tb_client.zig +88 -26
  14. package/src/tigerbeetle/src/c/tb_client_header_test.zig +135 -0
  15. package/src/tigerbeetle/src/c/test.zig +371 -1
  16. package/src/tigerbeetle/src/cli.zig +90 -18
  17. package/src/tigerbeetle/src/config.zig +12 -4
  18. package/src/tigerbeetle/src/demo.zig +2 -1
  19. package/src/tigerbeetle/src/demo_01_create_accounts.zig +1 -1
  20. package/src/tigerbeetle/src/demo_03_create_transfers.zig +13 -0
  21. package/src/tigerbeetle/src/ewah.zig +11 -33
  22. package/src/tigerbeetle/src/ewah_benchmark.zig +8 -9
  23. package/src/tigerbeetle/src/lsm/README.md +97 -3
  24. package/src/tigerbeetle/src/lsm/compaction.zig +32 -7
  25. package/src/tigerbeetle/src/{eytzinger_benchmark.zig → lsm/eytzinger_benchmark.zig} +34 -21
  26. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +34 -32
  27. package/src/tigerbeetle/src/lsm/grid.zig +39 -21
  28. package/src/tigerbeetle/src/lsm/groove.zig +1 -0
  29. package/src/tigerbeetle/src/lsm/k_way_merge.zig +3 -3
  30. package/src/tigerbeetle/src/lsm/level_iterator.zig +1 -1
  31. package/src/tigerbeetle/src/lsm/manifest.zig +13 -0
  32. package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -49
  33. package/src/tigerbeetle/src/lsm/manifest_log.zig +173 -335
  34. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +665 -0
  35. package/src/tigerbeetle/src/lsm/node_pool.zig +4 -0
  36. package/src/tigerbeetle/src/lsm/posted_groove.zig +1 -0
  37. package/src/tigerbeetle/src/lsm/segmented_array.zig +24 -15
  38. package/src/tigerbeetle/src/lsm/table.zig +32 -20
  39. package/src/tigerbeetle/src/lsm/table_immutable.zig +1 -1
  40. package/src/tigerbeetle/src/lsm/table_iterator.zig +4 -5
  41. package/src/tigerbeetle/src/lsm/test.zig +13 -2
  42. package/src/tigerbeetle/src/lsm/tree.zig +45 -7
  43. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +36 -32
  44. package/src/tigerbeetle/src/main.zig +69 -13
  45. package/src/tigerbeetle/src/message_bus.zig +18 -7
  46. package/src/tigerbeetle/src/message_pool.zig +8 -2
  47. package/src/tigerbeetle/src/ring_buffer.zig +7 -3
  48. package/src/tigerbeetle/src/simulator.zig +38 -11
  49. package/src/tigerbeetle/src/state_machine.zig +48 -23
  50. package/src/tigerbeetle/src/test/accounting/workload.zig +9 -5
  51. package/src/tigerbeetle/src/test/cluster.zig +15 -33
  52. package/src/tigerbeetle/src/test/conductor.zig +2 -1
  53. package/src/tigerbeetle/src/test/network.zig +45 -19
  54. package/src/tigerbeetle/src/test/packet_simulator.zig +40 -29
  55. package/src/tigerbeetle/src/test/state_checker.zig +5 -7
  56. package/src/tigerbeetle/src/test/storage.zig +453 -110
  57. package/src/tigerbeetle/src/test/storage_checker.zig +204 -0
  58. package/src/tigerbeetle/src/tigerbeetle.zig +1 -0
  59. package/src/tigerbeetle/src/unit_tests.zig +7 -1
  60. package/src/tigerbeetle/src/util.zig +97 -11
  61. package/src/tigerbeetle/src/vopr.zig +2 -1
  62. package/src/tigerbeetle/src/vsr/client.zig +8 -3
  63. package/src/tigerbeetle/src/vsr/journal.zig +280 -202
  64. package/src/tigerbeetle/src/vsr/replica.zig +169 -31
  65. package/src/tigerbeetle/src/vsr/superblock.zig +356 -629
  66. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -6
  67. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +414 -151
  68. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +332 -0
  69. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +349 -0
  70. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +44 -9
  71. package/src/tigerbeetle/src/vsr/superblock_quorums.zig +394 -0
  72. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +312 -0
  73. package/src/tigerbeetle/src/vsr.zig +19 -5
  74. package/src/tigerbeetle/src/benchmark_array_search.zig +0 -317
  75. package/src/tigerbeetle/src/benchmarks/perf.zig +0 -299
  76. package/src/tigerbeetle/src/vopr_hub/README.md +0 -58
  77. package/src/tigerbeetle/src/vopr_hub/SETUP.md +0 -199
  78. package/src/tigerbeetle/src/vopr_hub/go.mod +0 -3
  79. package/src/tigerbeetle/src/vopr_hub/main.go +0 -1022
  80. package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +0 -3
  81. package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +0 -403
@@ -10,7 +10,8 @@ const MessagePool = @import("../message_pool.zig").MessagePool;
10
10
  const Message = @import("../message_pool.zig").MessagePool.Message;
11
11
  const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
12
12
  const ClientTable = @import("superblock_client_table.zig").ClientTable;
13
- const format_journal = @import("./journal.zig").format_journal;
13
+ const format_wal_headers = @import("./journal.zig").format_wal_headers;
14
+ const format_wal_prepares = @import("./journal.zig").format_wal_prepares;
14
15
 
15
16
  const vsr = @import("../vsr.zig");
16
17
  const Header = vsr.Header;
@@ -148,8 +149,12 @@ pub fn ReplicaType(
148
149
  /// Invariants (not applicable during status=recovering):
149
150
  /// * `replica.op` exists in the Journal.
150
151
  /// * `replica.op ≥ replica.commit_min`.
151
- /// * `replica.op ≤ replica.op_checkpoint_trigger`: don't wrap the WAL until we are sure
152
- /// that the overwritten entry will not be required for recovery.
152
+ /// * `replica.op - replica.commit_min ≤ journal_slot_count`
153
+ /// * `replica.op - replica.op_checkpoint ≤ journal_slot_count`
154
+ /// It is safe to overwrite `op_checkpoint` itself.
155
+ /// * `replica.op ≤ replica.op_checkpoint_trigger`:
156
+ /// Don't wrap the WAL until we are sure that the overwritten entry will not be required
157
+ /// for recovery.
153
158
  // TODO: When recovery protocol is removed, load the `op` from the WAL, and verify that it is ≥op_checkpoint.
154
159
  // Also verify that a corresponding header exists in the WAL.
155
160
  op: u64,
@@ -162,10 +167,10 @@ pub fn ReplicaType(
162
167
  /// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
163
168
  ///
164
169
  /// Invariants (not applicable during status=recovering):
165
- /// * `replica.commit_min` exists in the Journal.
170
+ /// * `replica.commit_min` exists in the Journal OR `replica.commit_min == op_checkpoint`.
166
171
  /// * `replica.commit_min ≤ replica.op`
167
172
  /// * `replica.commit_min ≥ replica.op_checkpoint`.
168
- /// * never decreases
173
+ /// * never decreases while the replica is alive
169
174
  commit_min: u64,
170
175
 
171
176
  /// The op number of the latest committed operation (according to the cluster):
@@ -263,7 +268,17 @@ pub fn ReplicaType(
263
268
  /// Seeded with the replica's index number.
264
269
  prng: std.rand.DefaultPrng,
265
270
 
266
- on_change_state: ?fn (replica: *Self) void = null,
271
+ /// Simulator hooks.
272
+ on_change_state: ?fn (replica: *const Self) void = null,
273
+ /// Called immediately after a compaction.
274
+ on_compact: ?fn (replica: *const Self) void = null,
275
+ /// Called immediately after a checkpoint.
276
+ /// Note: The replica may checkpoint without calling this function:
277
+ /// 1. Begin checkpoint.
278
+ /// 2. Write 2/4 SuperBlock copies.
279
+ /// 3. Crash.
280
+ /// 4. Recover in the new checkpoint (but on_checkpoint wasn't called).
281
+ on_checkpoint: ?fn (replica: *const Self) void = null,
267
282
 
268
283
  /// Called when `commit_prepare` finishes committing.
269
284
  commit_callback: ?fn (*Self) void = null,
@@ -984,7 +999,11 @@ pub fn ReplicaType(
984
999
  // const threshold = self.quorum_replication;
985
1000
  // TODO: When Block recover & state transfer are implemented, this can be removed.
986
1001
  const threshold =
987
- if (prepare.message.header.op == self.op_checkpoint_trigger()) self.replica_count else self.quorum_replication;
1002
+ if (prepare.message.header.op == self.op_checkpoint_trigger() or
1003
+ prepare.message.header.op == self.op_checkpoint + config.lsm_batch_multiple + 1)
1004
+ self.replica_count
1005
+ else
1006
+ self.quorum_replication;
988
1007
 
989
1008
  const count = self.count_message_and_receive_quorum_exactly_once(
990
1009
  &prepare.ok_from_all_replicas,
@@ -2560,6 +2579,9 @@ pub fn ReplicaType(
2560
2579
 
2561
2580
  const op = self.commit_prepare.?.header.op;
2562
2581
  assert(op == self.commit_min);
2582
+ assert(op <= self.op_checkpoint_trigger());
2583
+
2584
+ if (self.on_compact) |on_compact| on_compact(self);
2563
2585
 
2564
2586
  if (op == self.op_checkpoint_trigger()) {
2565
2587
  assert(op == self.op);
@@ -2573,7 +2595,6 @@ pub fn ReplicaType(
2573
2595
  });
2574
2596
  self.state_machine.checkpoint(commit_op_checkpoint_state_machine_callback);
2575
2597
  } else {
2576
- assert(op < self.op_checkpoint_trigger());
2577
2598
  self.commit_op_done();
2578
2599
  }
2579
2600
  }
@@ -2596,18 +2617,20 @@ pub fn ReplicaType(
2596
2617
  // They will only be compacted to disk in the next bar.
2597
2618
  // Therefore, only ops "A..D" are committed to disk.
2598
2619
  // Thus, the SuperBlock's `commit_min` is set to 7-2=5.
2620
+ const vsr_state_commit_min = self.op_checkpoint_next();
2599
2621
  const vsr_state_new = .{
2600
- .commit_min = self.op_checkpoint_next(),
2622
+ .commit_min_checksum = self.journal.header_with_op(vsr_state_commit_min).?.checksum,
2623
+ .commit_min = vsr_state_commit_min,
2601
2624
  .commit_max = self.commit_max,
2602
2625
  .view_normal = self.view_normal,
2603
2626
  .view = self.view,
2604
2627
  };
2605
- assert(VSRState.monotonic(self.superblock.working.vsr_state, vsr_state_new));
2628
+ assert(self.superblock.working.vsr_state.monotonic(vsr_state_new));
2606
2629
 
2607
- self.superblock.staging.vsr_state = vsr_state_new;
2608
2630
  self.superblock.checkpoint(
2609
2631
  commit_op_checkpoint_superblock_callback,
2610
2632
  &self.superblock_context,
2633
+ vsr_state_new,
2611
2634
  );
2612
2635
  }
2613
2636
 
@@ -2629,6 +2652,7 @@ pub fn ReplicaType(
2629
2652
  self.op_checkpoint,
2630
2653
  });
2631
2654
 
2655
+ if (self.on_checkpoint) |on_checkpoint| on_checkpoint(self);
2632
2656
  self.commit_op_done();
2633
2657
  }
2634
2658
 
@@ -2662,10 +2686,17 @@ pub fn ReplicaType(
2662
2686
  // this commit.
2663
2687
 
2664
2688
  assert(self.journal.has(prepare.header));
2665
- assert(self.journal.header_with_op(self.commit_min).?.checksum ==
2666
- prepare.header.parent);
2689
+ if (self.op_checkpoint == self.commit_min) {
2690
+ // op_checkpoint's slot may have been overwritten in the WAL — but we can
2691
+ // always use the VSRState to anchor the hash chain.
2692
+ assert(prepare.header.parent ==
2693
+ self.superblock.working.vsr_state.commit_min_checksum);
2694
+ } else {
2695
+ assert(prepare.header.parent ==
2696
+ self.journal.header_with_op(self.commit_min).?.checksum);
2697
+ }
2667
2698
 
2668
- log.debug("{}: commit_op: executing view={} {} op={} checksum={} ({s})", .{
2699
+ log.debug("{}: commit_op: executing view={} leader={} op={} checksum={} ({s})", .{
2669
2700
  self.replica,
2670
2701
  self.view,
2671
2702
  self.leader_index(self.view) == self.replica,
@@ -3282,7 +3313,7 @@ pub fn ReplicaType(
3282
3313
  // Verify that the new request will fit in the WAL.
3283
3314
  // The message's op hasn't been assigned yet, but it will be `self.op + 1`.
3284
3315
  if (self.op == self.op_checkpoint_trigger()) {
3285
- log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint={})", .{
3316
+ log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint_trigger={})", .{
3286
3317
  self.replica,
3287
3318
  self.op + 1,
3288
3319
  self.op_checkpoint,
@@ -3852,8 +3883,6 @@ pub fn ReplicaType(
3852
3883
  assert(self.op_checkpoint <= self.commit_min);
3853
3884
  assert(self.commit_min <= self.op);
3854
3885
  assert(self.commit_min <= self.commit_max);
3855
-
3856
- assert(self.journal.header_with_op(self.commit_min) != null);
3857
3886
  assert(self.journal.header_with_op(self.op) != null);
3858
3887
 
3859
3888
  // The replica repairs backwards from `commit_max`. But if `commit_max` is too high
@@ -3926,6 +3955,8 @@ pub fn ReplicaType(
3926
3955
  }
3927
3956
 
3928
3957
  // Assert that all headers are now present and connected with a perfect hash chain:
3958
+ // TODO(State Transfer): This may fail if the commit max is too far ahead and we
3959
+ // couldn't repair it without jumping ahead on the WAL.
3929
3960
  assert(self.op >= self.commit_max);
3930
3961
  assert(self.valid_hash_chain_between(self.commit_min, self.op));
3931
3962
 
@@ -4014,8 +4045,9 @@ pub fn ReplicaType(
4014
4045
  if (header.op == 0 and self.op_checkpoint == 0) {
4015
4046
  // Repairing the root op is allowed until the first checkpoint.
4016
4047
  } else {
4017
- // Otherwise don't repair checkpointed ops, since their slots now belong to
4018
- // the next wrap of the WAL.
4048
+ // It is critical that we do not repair checkpointed ops; their slots now belong
4049
+ // to the next wrap of the log, and overwriting a new op with an old op is a
4050
+ // correctness violation.
4019
4051
  log.debug("{}: repair_header: false (precedes self.op_checkpoint={})", .{
4020
4052
  self.replica,
4021
4053
  self.op_checkpoint,
@@ -4311,6 +4343,8 @@ pub fn ReplicaType(
4311
4343
  assert(self.status == .normal or self.status == .view_change);
4312
4344
  assert(self.repairs_allowed());
4313
4345
  assert(self.journal.dirty.count > 0);
4346
+ assert(self.op >= self.commit_min);
4347
+ assert(self.op - self.commit_min + 1 <= config.journal_slot_count);
4314
4348
 
4315
4349
  // Request enough prepares to utilize our max IO depth:
4316
4350
  var budget = self.journal.writes.available();
@@ -4322,7 +4356,7 @@ pub fn ReplicaType(
4322
4356
  if (self.op < config.journal_slot_count) {
4323
4357
  // The op is known, and this is the first WAL cycle.
4324
4358
  // Therefore, any faulty ops to the right of `replica.op` are corrupt reserved
4325
- // entries from the initial format.
4359
+ // entries from the initial format, or corrupt prepares which were since truncated.
4326
4360
  var op: usize = self.op + 1;
4327
4361
  while (op < config.journal_slot_count) : (op += 1) {
4328
4362
  const slot = self.journal.slot_for_op(op);
@@ -4330,23 +4364,56 @@ pub fn ReplicaType(
4330
4364
 
4331
4365
  if (self.journal.faulty.bit(slot)) {
4332
4366
  assert(self.journal.headers[op].command == .reserved);
4367
+ assert(self.journal.headers_redundant[op].command == .reserved);
4333
4368
  self.journal.dirty.clear(slot);
4334
4369
  self.journal.faulty.clear(slot);
4335
- log.debug("{}: repair_prepares: op={} (op known, first cycle)", .{
4370
+ log.debug("{}: repair_prepares: remove slot={} " ++
4371
+ "(faulty, op known, first cycle)", .{
4336
4372
  self.replica,
4337
- op,
4373
+ slot.index,
4338
4374
  });
4339
4375
  }
4340
4376
  }
4341
4377
  }
4342
4378
 
4343
4379
  var op = self.op + 1;
4380
+ // To maximize durability, repair all prepares for which we have a header (not only
4381
+ // uncommitted headers). This in turn enables the replica to help repair other replicas.
4344
4382
  const op_min = op -| config.journal_slot_count;
4345
4383
  while (op > op_min) {
4346
4384
  op -= 1;
4347
4385
 
4348
4386
  const slot = self.journal.slot_for_op(op);
4349
4387
  if (self.journal.dirty.bit(slot)) {
4388
+ if (self.journal.slot_with_op(op) == null) {
4389
+ // If this op was between `commit_min` and `replica.op`, we would have
4390
+ // requested and repaired these headers.
4391
+ // If this op was between `op_checkpoint` and `commit_min`, we would have
4392
+ // a header (since we couldn't have committed otherwise).
4393
+ //
4394
+ // Therefore, this op must be either:
4395
+ // - less-than-or-equal-to `op_checkpoint` — we committed before
4396
+ // checkpointing, but the entry in our WAL was found corrupt after
4397
+ // recovering from a crash.
4398
+ // - or (indistinguishably) this might originally have been an op greater
4399
+ // than replica.op, which was truncated, but is now corrupt.
4400
+ //
4401
+ // We don't try to repair this op because the slot belongs (or will soon
4402
+ // belong) to a newer op, from the new WAL wrap. Additionally, we may not
4403
+ // still have access to its surrounding commits to verify the hash chain.
4404
+ assert(op <= self.commit_min);
4405
+ assert(op <= self.op_checkpoint);
4406
+ assert(self.journal.faulty.bit(slot));
4407
+
4408
+ log.debug("{}: repair_prepares: remove slot={} " ++
4409
+ "(faulty, precedes checkpoint)", .{
4410
+ self.replica,
4411
+ slot.index,
4412
+ });
4413
+ self.journal.remove_entry(slot);
4414
+ continue;
4415
+ }
4416
+
4350
4417
  // If this is an uncommitted op, and we are the leader in `view_change` status,
4351
4418
  // then we will `request_prepare` from the cluster, set `nack_prepare_op`,
4352
4419
  // and stop repairing any further prepares:
@@ -4941,51 +5008,67 @@ pub fn ReplicaType(
4941
5008
  assert(!self.do_view_change_quorum);
4942
5009
  assert(message.header.view == self.view);
4943
5010
  assert(message.header.replica == self.replica);
5011
+ assert(message.header.replica != replica);
4944
5012
  },
4945
5013
  .view_change => {
4946
5014
  assert(self.start_view_change_quorum);
4947
5015
  assert(self.do_view_change_quorum);
4948
5016
  assert(message.header.view == self.view);
4949
5017
  assert(message.header.replica == self.replica);
5018
+ assert(message.header.replica != replica);
4950
5019
  },
4951
5020
  else => unreachable,
4952
5021
  },
4953
5022
  .recovery => {
4954
5023
  assert(self.status == .recovering);
4955
5024
  assert(message.header.replica == self.replica);
5025
+ assert(message.header.replica != replica);
4956
5026
  assert(message.header.context == self.recovery_nonce);
4957
5027
  },
4958
5028
  .recovery_response => {
4959
5029
  assert(self.status == .normal);
4960
5030
  assert(message.header.view == self.view);
4961
5031
  assert(message.header.replica == self.replica);
5032
+ assert(message.header.replica != replica);
4962
5033
  },
4963
5034
  .headers => {
4964
5035
  assert(self.status == .normal or self.status == .view_change);
4965
5036
  assert(message.header.view == self.view);
4966
5037
  assert(message.header.replica == self.replica);
5038
+ assert(message.header.replica != replica);
4967
5039
  },
4968
5040
  .ping, .pong => {
4969
5041
  assert(message.header.view == self.view);
4970
5042
  assert(message.header.replica == self.replica);
5043
+ assert(message.header.replica != replica);
4971
5044
  },
4972
5045
  .commit => {
4973
5046
  assert(self.status == .normal);
4974
5047
  assert(self.leader());
4975
5048
  assert(message.header.view == self.view);
4976
5049
  assert(message.header.replica == self.replica);
5050
+ assert(message.header.replica != replica);
5051
+ },
5052
+ .request_start_view => {
5053
+ assert(message.header.view >= self.view);
5054
+ assert(message.header.replica == self.replica);
5055
+ assert(message.header.replica != replica);
5056
+ assert(self.leader_index(message.header.view) == replica);
4977
5057
  },
4978
5058
  .request_headers => {
4979
5059
  assert(message.header.view == self.view);
4980
5060
  assert(message.header.replica == self.replica);
5061
+ assert(message.header.replica != replica);
4981
5062
  },
4982
5063
  .request_prepare => {
4983
5064
  assert(message.header.view == self.view);
4984
5065
  assert(message.header.replica == self.replica);
5066
+ assert(message.header.replica != replica);
4985
5067
  },
4986
5068
  .nack_prepare => {
4987
5069
  assert(message.header.view == self.view);
4988
5070
  assert(message.header.replica == self.replica);
5071
+ assert(message.header.replica != replica);
4989
5072
  assert(self.leader_index(self.view) == replica);
4990
5073
  },
4991
5074
  else => {
@@ -5022,6 +5105,7 @@ pub fn ReplicaType(
5022
5105
  // `commit_max` and not `self.op`. However, committed ops (`commit_max`) must survive:
5023
5106
  assert(op >= self.commit_max);
5024
5107
  assert(op >= commit_max);
5108
+ // TODO: This assertion may fail until recovery protocol is removed.
5025
5109
  assert(op <= self.op_checkpoint_trigger());
5026
5110
 
5027
5111
  // We expect that our commit numbers may also be greater even than `commit_max` because
@@ -5667,8 +5751,10 @@ pub fn ReplicaType(
5667
5751
 
5668
5752
  /// Returns true if all operations are present, correctly ordered and connected by hash
5669
5753
  /// chain, between `op_min` and `op_max` (both inclusive).
5670
- fn valid_hash_chain_between(self: *Self, op_min: u64, op_max: u64) bool {
5754
+ fn valid_hash_chain_between(self: *const Self, op_min: u64, op_max: u64) bool {
5671
5755
  assert(op_min <= op_max);
5756
+ // Headers with ops preceding the checkpoint may be unavailable due to a WAL wrap.
5757
+ assert(op_min >= self.op_checkpoint);
5672
5758
 
5673
5759
  // If we use anything less than self.op then we may commit ops for a forked hash chain
5674
5760
  // that have since been reordered by a new leader.
@@ -5679,6 +5765,27 @@ pub fn ReplicaType(
5679
5765
  while (op > op_min) {
5680
5766
  op -= 1;
5681
5767
 
5768
+ if (self.op_checkpoint == op) {
5769
+ // op_checkpoint's slot may have been overwritten in the WAL — but we can
5770
+ // always use the VSRState to anchor the hash chain.
5771
+ assert(op == op_min);
5772
+ assert(op == self.superblock.working.vsr_state.commit_min);
5773
+ if (self.superblock.working.vsr_state.commit_min_checksum == b.parent) {
5774
+ return true;
5775
+ } else {
5776
+ log.debug("{}: valid_hash_chain_between: break A: {} (checkpoint={})", .{
5777
+ self.replica,
5778
+ self.superblock.working.vsr_state.commit_min_checksum,
5779
+ self.op_checkpoint,
5780
+ });
5781
+ log.debug("{}: valid_hash_chain_between: break B: {}", .{
5782
+ self.replica,
5783
+ b,
5784
+ });
5785
+ return false;
5786
+ }
5787
+ }
5788
+
5682
5789
  if (self.journal.header_with_op(op)) |a| {
5683
5790
  assert(a.op + 1 == b.op);
5684
5791
  if (a.checksum == b.parent) {
@@ -5912,18 +6019,20 @@ fn ReplicaFormatType(comptime Storage: type) type {
5912
6019
  );
5913
6020
  errdefer allocator.free(wal_buffer);
5914
6021
 
5915
- // The logical offset *within the WAL*.
6022
+ // The logical offset *within the Zone*.
6023
+ // Even though the prepare zone follows the redundant header zone, write the prepares
6024
+ // first. This allows the test Storage to check the invariant "never write the redundant
6025
+ // header before the prepare".
5916
6026
  var wal_offset: u64 = 0;
5917
- while (wal_offset < config.journal_size_max) {
5918
- const size = format_journal(cluster, wal_offset, wal_buffer);
5919
- assert(size % config.sector_size == 0);
6027
+ while (wal_offset < config.journal_size_prepares) {
6028
+ const size = format_wal_prepares(cluster, wal_offset, wal_buffer);
5920
6029
  assert(size > 0);
5921
6030
 
5922
6031
  for (std.mem.bytesAsSlice(Header, wal_buffer[0..size])) |*header| {
5923
6032
  if (std.mem.eql(u8, std.mem.asBytes(header), &header_zeroes)) {
5924
6033
  // This is the (empty) body of a reserved or root Prepare.
5925
6034
  } else {
5926
- // This is either a Prepare's header or a redundant header.
6035
+ // This is a Prepare's header.
5927
6036
  assert(header.valid_checksum());
5928
6037
  if (header.op == 0) {
5929
6038
  assert(header.command == .prepare);
@@ -5939,16 +6048,45 @@ fn ReplicaFormatType(comptime Storage: type) type {
5939
6048
  format_wal_sectors_callback,
5940
6049
  &self.wal_write,
5941
6050
  wal_buffer[0..size],
5942
- .wal,
6051
+ .wal_prepares,
5943
6052
  wal_offset,
5944
6053
  );
5945
6054
  self.formatting = true;
5946
6055
  while (self.formatting) storage.tick();
5947
6056
  wal_offset += size;
5948
6057
  }
6058
+ // There are no prepares left to write.
6059
+ assert(format_wal_prepares(cluster, wal_offset, wal_buffer) == 0);
6060
+
6061
+ wal_offset = 0;
6062
+ while (wal_offset < config.journal_size_headers) {
6063
+ const size = format_wal_headers(cluster, wal_offset, wal_buffer);
6064
+ assert(size > 0);
6065
+
6066
+ for (std.mem.bytesAsSlice(Header, wal_buffer[0..size])) |*header| {
6067
+ assert(header.valid_checksum());
6068
+ if (header.op == 0) {
6069
+ assert(header.command == .prepare);
6070
+ assert(header.operation == .root);
6071
+ } else {
6072
+ assert(header.command == .reserved);
6073
+ assert(header.operation == .reserved);
6074
+ }
6075
+ }
5949
6076
 
5950
- // There is nothing left to write.
5951
- assert(format_journal(cluster, wal_offset, wal_buffer) == 0);
6077
+ storage.write_sectors(
6078
+ format_wal_sectors_callback,
6079
+ &self.wal_write,
6080
+ wal_buffer[0..size],
6081
+ .wal_headers,
6082
+ wal_offset,
6083
+ );
6084
+ self.formatting = true;
6085
+ while (self.formatting) storage.tick();
6086
+ wal_offset += size;
6087
+ }
6088
+ // There are no headers left to write.
6089
+ assert(format_wal_headers(cluster, wal_offset, wal_buffer) == 0);
5952
6090
  }
5953
6091
 
5954
6092
  fn format_wal_sectors_callback(write: *Storage.Write) void {