tigerbeetle-node 0.5.2 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/README.md +97 -78
  2. package/dist/benchmark.js +96 -94
  3. package/dist/benchmark.js.map +1 -1
  4. package/dist/index.d.ts +82 -82
  5. package/dist/index.js +74 -93
  6. package/dist/index.js.map +1 -1
  7. package/dist/test.js +134 -111
  8. package/dist/test.js.map +1 -1
  9. package/package.json +3 -2
  10. package/scripts/download_node_headers.sh +3 -1
  11. package/src/benchmark.ts +114 -118
  12. package/src/index.ts +102 -111
  13. package/src/node.zig +55 -63
  14. package/src/test.ts +146 -125
  15. package/src/tigerbeetle/scripts/benchmark.bat +46 -0
  16. package/src/tigerbeetle/scripts/benchmark.sh +5 -0
  17. package/src/tigerbeetle/scripts/install_zig.bat +109 -109
  18. package/src/tigerbeetle/scripts/install_zig.sh +8 -4
  19. package/src/tigerbeetle/scripts/vopr.bat +47 -47
  20. package/src/tigerbeetle/scripts/vopr.sh +2 -2
  21. package/src/tigerbeetle/src/benchmark.zig +65 -102
  22. package/src/tigerbeetle/src/cli.zig +39 -18
  23. package/src/tigerbeetle/src/config.zig +44 -25
  24. package/src/tigerbeetle/src/demo.zig +2 -15
  25. package/src/tigerbeetle/src/demo_01_create_accounts.zig +10 -10
  26. package/src/tigerbeetle/src/demo_03_create_transfers.zig +5 -3
  27. package/src/tigerbeetle/src/{demo_04_create_transfers_two_phase_commit.zig → demo_04_create_pending_transfers.zig} +18 -12
  28. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +37 -0
  29. package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +24 -0
  30. package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +1 -1
  31. package/src/tigerbeetle/src/io/benchmark.zig +24 -49
  32. package/src/tigerbeetle/src/io/darwin.zig +175 -44
  33. package/src/tigerbeetle/src/io/linux.zig +177 -72
  34. package/src/tigerbeetle/src/io/test.zig +61 -39
  35. package/src/tigerbeetle/src/io/windows.zig +1161 -0
  36. package/src/tigerbeetle/src/io.zig +2 -0
  37. package/src/tigerbeetle/src/main.zig +31 -10
  38. package/src/tigerbeetle/src/message_bus.zig +49 -61
  39. package/src/tigerbeetle/src/message_pool.zig +66 -57
  40. package/src/tigerbeetle/src/ring_buffer.zig +55 -3
  41. package/src/tigerbeetle/src/simulator.zig +108 -12
  42. package/src/tigerbeetle/src/state_machine.zig +1813 -816
  43. package/src/tigerbeetle/src/storage.zig +0 -230
  44. package/src/tigerbeetle/src/test/cluster.zig +168 -38
  45. package/src/tigerbeetle/src/test/message_bus.zig +4 -3
  46. package/src/tigerbeetle/src/test/network.zig +13 -16
  47. package/src/tigerbeetle/src/test/packet_simulator.zig +14 -1
  48. package/src/tigerbeetle/src/test/state_checker.zig +6 -3
  49. package/src/tigerbeetle/src/test/state_machine.zig +8 -7
  50. package/src/tigerbeetle/src/test/storage.zig +99 -40
  51. package/src/tigerbeetle/src/tigerbeetle.zig +108 -101
  52. package/src/tigerbeetle/src/time.zig +58 -11
  53. package/src/tigerbeetle/src/vsr/client.zig +18 -32
  54. package/src/tigerbeetle/src/vsr/clock.zig +1 -1
  55. package/src/tigerbeetle/src/vsr/journal.zig +1388 -464
  56. package/src/tigerbeetle/src/vsr/replica.zig +1340 -576
  57. package/src/tigerbeetle/src/vsr.zig +452 -40
  58. package/src/translate.zig +10 -0
  59. package/src/tigerbeetle/src/demo_05_accept_transfers.zig +0 -23
  60. package/src/tigerbeetle/src/demo_06_reject_transfers.zig +0 -17
  61. package/src/tigerbeetle/src/format_test.zig +0 -69
@@ -18,6 +18,24 @@ const log = std.log.scoped(.replica);
18
18
  pub const Status = enum {
19
19
  normal,
20
20
  view_change,
21
+ // Recovery (for replica_count > 1):
22
+ //
23
+ // 1. At replica start: `status=recovering` and `journal.recovered=false`
24
+ // 2. Load the WAL. Mark questionable entries as faulty.
25
+ // 3. If the WAL has no entries (besides the initial commit), skip to step 5 with view 0.
26
+ // 4. Run VSR recovery protocol:
27
+ // a. Send a `recovery` message to every replica (except self).
28
+ // b. Wait for f+1 `recovery_response` messages from replicas in `normal` status.
29
+ // Each `recovery_response` includes the current view number.
30
+ // Each `recovery_response` must include a nonce matching the `recovery` message.
31
+ // c. Wait for a `recovery_response` from the leader of the highest known view.
32
+ // 5. Transition to `status=normal` with the discovered view number:
33
+ // * Set `op` to the highest op in the leader's recovery response.
34
+ // * Repair faulty messages.
35
+ // * Commit through to the discovered `commit_max`.
36
+ // * Set `state_machine.prepare_timestamp` to the current op's timestamp.
37
+ //
38
+ // TODO document snapshot recovery in this progression
21
39
  recovering,
22
40
  };
23
41
 
@@ -47,19 +65,24 @@ const ClientTableEntry = struct {
47
65
  reply: *Message,
48
66
  };
49
67
 
68
+ const Nonce = u128;
69
+
50
70
  const Prepare = struct {
51
71
  /// The current prepare message (used to cross-check prepare_ok messages, and for resending).
52
72
  message: *Message,
53
73
 
54
74
  /// Unique prepare_ok messages for the same view, op number and checksum from ALL replicas.
55
- ok_from_all_replicas: QuorumMessages = QuorumMessagesNull,
75
+ ok_from_all_replicas: QuorumCounter = quorum_counter_null,
56
76
 
57
77
  /// Whether a quorum of prepare_ok messages has been received for this prepare.
58
78
  ok_quorum_received: bool = false,
59
79
  };
60
80
 
61
81
  const QuorumMessages = [config.replicas_max]?*Message;
62
- const QuorumMessagesNull = [_]?*Message{null} ** config.replicas_max;
82
+ const quorum_messages_null = [_]?*Message{null} ** config.replicas_max;
83
+
84
+ const QuorumCounter = std.StaticBitSet(config.replicas_max);
85
+ const quorum_counter_null = QuorumCounter.initEmpty();
63
86
 
64
87
  pub fn Replica(
65
88
  comptime StateMachine: type,
@@ -111,12 +134,17 @@ pub fn Replica(
111
134
  view_normal: u32,
112
135
 
113
136
  /// The current status, either normal, view_change, or recovering:
114
- /// TODO Don't default to normal, set the starting status according to the journal's health.
115
- status: Status = .normal,
137
+ status: Status = .recovering,
116
138
 
117
139
  /// The op number assigned to the most recently prepared operation:
118
140
  op: u64,
119
141
 
142
+ /// The op of the highest checkpointed message.
143
+ // TODO Update this to use LSM storage.
144
+ // TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
145
+ // TODO Enforce invariant op≥op_checkpoint.
146
+ op_checkpoint: u64 = 0,
147
+
120
148
  /// The op number of the latest committed and executed operation (according to the replica):
121
149
  /// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
122
150
  commit_min: u64,
@@ -133,7 +161,10 @@ pub fn Replica(
133
161
 
134
162
  /// The leader's pipeline of inflight prepares waiting to commit in FIFO order.
135
163
  /// This allows us to pipeline without the complexity of out-of-order commits.
136
- pipeline: RingBuffer(Prepare, config.pipelining_max) = .{},
164
+ ///
165
+ /// After a view change, the old leader's pipeline is left untouched so that it is able to
166
+ /// help the new leader repair, even in the face of local storage faults.
167
+ pipeline: RingBuffer(Prepare, config.pipeline_max) = .{},
137
168
 
138
169
  /// In some cases, a replica may send a message to itself. We do not submit these messages
139
170
  /// to the message bus but rather queue them here for guaranteed immediate delivery, which
@@ -141,13 +172,16 @@ pub fn Replica(
141
172
  loopback_queue: ?*Message = null,
142
173
 
143
174
  /// Unique start_view_change messages for the same view from OTHER replicas (excluding ourself).
144
- start_view_change_from_other_replicas: QuorumMessages = QuorumMessagesNull,
175
+ start_view_change_from_other_replicas: QuorumCounter = quorum_counter_null,
145
176
 
146
177
  /// Unique do_view_change messages for the same view from ALL replicas (including ourself).
147
- do_view_change_from_all_replicas: QuorumMessages = QuorumMessagesNull,
178
+ do_view_change_from_all_replicas: QuorumMessages = quorum_messages_null,
148
179
 
149
180
  /// Unique nack_prepare messages for the same view from OTHER replicas (excluding ourself).
150
- nack_prepare_from_other_replicas: QuorumMessages = QuorumMessagesNull,
181
+ nack_prepare_from_other_replicas: QuorumCounter = quorum_counter_null,
182
+
183
+ /// Unique recovery_response messages from OTHER replicas (excluding ourself).
184
+ recovery_response_from_other_replicas: QuorumMessages = quorum_messages_null,
151
185
 
152
186
  /// Whether a replica has received a quorum of start_view_change messages for the view change:
153
187
  start_view_change_quorum: bool = false,
@@ -186,6 +220,12 @@ pub fn Replica(
186
220
  /// The number of ticks before repairing missing/disconnected headers and/or dirty entries:
187
221
  repair_timeout: Timeout,
188
222
 
223
+ /// The number of ticks before attempting to send another set of `recovery` messages.
224
+ recovery_timeout: Timeout,
225
+
226
+ /// The nonce of the `recovery` messages.
227
+ recovery_nonce: Nonce,
228
+
189
229
  /// Used to provide deterministic entropy to `choose_any_other_replica()`.
190
230
  /// Incremented whenever `choose_any_other_replica()` is called.
191
231
  choose_any_other_replica_ticks: u64 = 0,
@@ -242,25 +282,27 @@ pub fn Replica(
242
282
  try client_table.ensureTotalCapacity(allocator, @intCast(u32, config.clients_max));
243
283
  assert(client_table.capacity() >= config.clients_max);
244
284
 
245
- var init_prepare = Header{
246
- .parent = 0,
247
- .client = 0,
248
- .context = 0,
249
- .request = 0,
250
- .cluster = cluster,
251
- .epoch = 0,
252
- .view = 0,
253
- .op = 0,
254
- .commit = 0,
255
- .offset = 0,
256
- .size = @sizeOf(Header),
257
- .replica = 0,
258
- .command = .prepare,
259
- .operation = .init,
260
- .version = Version,
285
+ const root_prepare = Header.root_prepare(cluster);
286
+
287
+ var clock = try Clock.init(
288
+ allocator,
289
+ replica_count,
290
+ replica,
291
+ time,
292
+ );
293
+ errdefer clock.deinit(allocator);
294
+
295
+ const journal = try Journal.init(allocator, storage, replica);
296
+ errdefer journal.deinit(allocator);
297
+
298
+ const recovery_nonce = blk: {
299
+ var nonce: [@sizeOf(Nonce)]u8 = undefined;
300
+ var hash = std.crypto.hash.Blake3.init(.{});
301
+ hash.update(std.mem.asBytes(&clock.monotonic()));
302
+ hash.update(&[_]u8{replica});
303
+ hash.final(&nonce);
304
+ break :blk @bitCast(Nonce, nonce);
261
305
  };
262
- init_prepare.set_checksum_body(&[0]u8{});
263
- init_prepare.set_checksum();
264
306
 
265
307
  var self = Self{
266
308
  .cluster = cluster,
@@ -268,28 +310,16 @@ pub fn Replica(
268
310
  .replica = replica,
269
311
  .quorum_replication = quorum_replication,
270
312
  .quorum_view_change = quorum_view_change,
271
- .clock = try Clock.init(
272
- allocator,
273
- replica_count,
274
- replica,
275
- time,
276
- ),
277
- .journal = try Journal.init(
278
- allocator,
279
- storage,
280
- replica,
281
- config.journal_size_max,
282
- config.journal_headers_max,
283
- &init_prepare,
284
- ),
313
+ .clock = clock,
314
+ .journal = journal,
285
315
  .message_bus = message_bus,
286
316
  .state_machine = state_machine,
287
317
  .client_table = client_table,
288
- .view = init_prepare.view,
289
- .view_normal = init_prepare.view,
290
- .op = init_prepare.op,
291
- .commit_min = init_prepare.commit,
292
- .commit_max = init_prepare.commit,
318
+ .view = root_prepare.view,
319
+ .view_normal = root_prepare.view,
320
+ .op = root_prepare.op,
321
+ .commit_min = root_prepare.commit,
322
+ .commit_max = root_prepare.commit,
293
323
  .ping_timeout = Timeout{
294
324
  .name = "ping_timeout",
295
325
  .id = replica,
@@ -325,6 +355,12 @@ pub fn Replica(
325
355
  .id = replica,
326
356
  .after = 50,
327
357
  },
358
+ .recovery_timeout = Timeout{
359
+ .name = "recovery_timeout",
360
+ .id = replica,
361
+ .after = 200,
362
+ },
363
+ .recovery_nonce = recovery_nonce,
328
364
  .prng = std.rand.DefaultPrng.init(replica),
329
365
  };
330
366
 
@@ -343,20 +379,7 @@ pub fn Replica(
343
379
  config.clients_max,
344
380
  });
345
381
 
346
- // We must initialize timeouts here, not in tick() on the first tick, because on_message()
347
- // can race with tick()... before timeouts have been initialized:
348
- assert(self.status == .normal);
349
- if (self.leader()) {
350
- log.debug("{}: init: leader", .{self.replica});
351
- self.ping_timeout.start();
352
- self.commit_timeout.start();
353
- self.repair_timeout.start();
354
- } else {
355
- log.debug("{}: init: follower", .{self.replica});
356
- self.ping_timeout.start();
357
- self.normal_status_timeout.start();
358
- self.repair_timeout.start();
359
- }
382
+ assert(self.status == .recovering);
360
383
 
361
384
  return self;
362
385
  }
@@ -375,15 +398,7 @@ pub fn Replica(
375
398
  self.client_table.deinit(allocator);
376
399
  }
377
400
 
378
- {
379
- var it = self.pipeline.iterator();
380
- while (it.next()) |prepare| {
381
- self.message_bus.unref(prepare.message);
382
- for (prepare.ok_from_all_replicas) |message| {
383
- if (message) |m| self.message_bus.unref(m);
384
- }
385
- }
386
- }
401
+ while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
387
402
 
388
403
  if (self.loopback_queue) |loopback_message| {
389
404
  assert(loopback_message.next == null);
@@ -391,13 +406,11 @@ pub fn Replica(
391
406
  self.loopback_queue = null;
392
407
  }
393
408
 
394
- for (self.start_view_change_from_other_replicas) |message| {
395
- if (message) |m| self.message_bus.unref(m);
396
- }
397
409
  for (self.do_view_change_from_all_replicas) |message| {
398
410
  if (message) |m| self.message_bus.unref(m);
399
411
  }
400
- for (self.nack_prepare_from_other_replicas) |message| {
412
+
413
+ for (self.recovery_response_from_other_replicas) |message| {
401
414
  if (message) |m| self.message_bus.unref(m);
402
415
  }
403
416
  }
@@ -414,12 +427,40 @@ pub fn Replica(
414
427
  self.clock.tick();
415
428
 
416
429
  if (!self.journal.recovered) {
417
- self.journal.recover();
430
+ if (!self.journal.recovering) self.journal.recover();
418
431
  return;
419
432
  } else {
420
433
  assert(!self.journal.recovering);
421
434
  }
422
435
 
436
+ if (self.status == .recovering) {
437
+ if (self.recovery_timeout.ticking) {
438
+ // Continue running the VSR recovery protocol.
439
+ self.recovery_timeout.tick();
440
+ if (self.recovery_timeout.fired()) self.on_recovery_timeout();
441
+ } else if (self.journal.is_empty()) {
442
+ // The data file is brand new — no messages have ever been written.
443
+ // Transition to normal status; no need to run the VSR recovery protocol.
444
+ assert(self.journal.faulty.count == 0);
445
+ self.transition_to_normal_from_recovering_status(0);
446
+ assert(self.status == .normal);
447
+ } else if (self.replica_count == 1) {
448
+ // A cluster-of-one does not run the VSR recovery protocol.
449
+ if (self.journal.faulty.count != 0) @panic("journal is corrupt");
450
+ if (self.committing) return;
451
+ assert(self.op == 0);
452
+ self.op = self.journal.op_maximum();
453
+ self.commit_ops(self.op);
454
+ // The recovering→normal transition is deferred until all ops are committed.
455
+ } else {
456
+ // The journal just finished recovery.
457
+ // Now try to learn the current view via the VSR recovery protocol.
458
+ self.recovery_timeout.start();
459
+ self.recover();
460
+ }
461
+ return;
462
+ }
463
+
423
464
  self.ping_timeout.tick();
424
465
  self.prepare_timeout.tick();
425
466
  self.commit_timeout.tick();
@@ -443,11 +484,12 @@ pub fn Replica(
443
484
  /// Called by the MessageBus to deliver a message to the replica.
444
485
  pub fn on_message(self: *Self, message: *Message) void {
445
486
  assert(self.loopback_queue == null);
487
+ assert(message.references > 0);
446
488
 
447
- log.debug("{}: on_message: view={} status={s} {}", .{
489
+ log.debug("{}: on_message: view={} status={} {}", .{
448
490
  self.replica,
449
491
  self.view,
450
- @tagName(self.status),
492
+ self.status,
451
493
  message.header,
452
494
  });
453
495
 
@@ -469,7 +511,6 @@ pub fn Replica(
469
511
  }
470
512
 
471
513
  if (!self.journal.recovered) {
472
- self.journal.recover();
473
514
  log.debug("{}: on_message: waiting for journal to recover", .{self.replica});
474
515
  return;
475
516
  } else {
@@ -488,7 +529,7 @@ pub fn Replica(
488
529
  .do_view_change => self.on_do_view_change(message),
489
530
  .start_view => self.on_start_view(message),
490
531
  .recovery => self.on_recovery(message),
491
- .recovery_response => return, // TODO
532
+ .recovery_response => self.on_recovery_response(message),
492
533
  .request_start_view => self.on_request_start_view(message),
493
534
  .request_prepare => self.on_request_prepare(message),
494
535
  .request_headers => self.on_request_headers(message),
@@ -548,7 +589,7 @@ pub fn Replica(
548
589
  } else {
549
590
  // Copy the ping's monotonic timestamp to our pong and add our wall clock sample:
550
591
  pong.op = message.header.op;
551
- pong.offset = @bitCast(u64, self.clock.realtime());
592
+ pong.timestamp = @bitCast(u64, self.clock.realtime());
552
593
  self.send_header_to_replica(message.header.replica, pong);
553
594
  }
554
595
  }
@@ -558,7 +599,7 @@ pub fn Replica(
558
599
  if (message.header.replica == self.replica) return;
559
600
 
560
601
  const m0 = message.header.op;
561
- const t1 = @bitCast(i64, message.header.offset);
602
+ const t1 = @bitCast(i64, message.header.timestamp);
562
603
  const m2 = self.clock.monotonic();
563
604
 
564
605
  self.clock.learn(message.header.replica, m0, t1, m2);
@@ -566,9 +607,9 @@ pub fn Replica(
566
607
 
567
608
  /// The primary advances op-number, adds the request to the end of the log, and updates the
568
609
  /// information for this client in the client-table to contain the new request number, s.
569
- /// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the current
570
- /// view-number, m is the message it received from the client, n is the op-number it assigned to
571
- /// the request, and k is the commit-number.
610
+ /// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the
611
+ /// current view-number, m is the message it received from the client, n is the op-number
612
+ /// it assigned to the request, and k is the commit-number.
572
613
  fn on_request(self: *Self, message: *Message) void {
573
614
  if (self.ignore_request_message(message)) return;
574
615
 
@@ -587,19 +628,30 @@ pub fn Replica(
587
628
 
588
629
  log.debug("{}: on_request: request {}", .{ self.replica, message.header.checksum });
589
630
 
590
- self.state_machine.prepare(
591
- realtime,
631
+ // Guard against the wall clock going backwards by taking the max with timestamps issued:
632
+ self.state_machine.prepare_timestamp = std.math.max(
633
+ // The cluster `commit_timestamp` may be ahead of our `prepare_timestamp` because this
634
+ // may be our first prepare as a recently elected leader:
635
+ std.math.max(
636
+ self.state_machine.prepare_timestamp,
637
+ self.state_machine.commit_timestamp,
638
+ ) + 1,
639
+ @intCast(u64, realtime),
640
+ );
641
+ assert(self.state_machine.prepare_timestamp > self.state_machine.commit_timestamp);
642
+
643
+ const prepare_timestamp = self.state_machine.prepare(
592
644
  message.header.operation.cast(StateMachine),
593
645
  message.body(),
594
646
  );
595
647
 
596
- var latest_entry = self.journal.entry_for_op_exact(self.op).?;
648
+ const latest_entry = self.journal.header_with_op(self.op).?;
597
649
  message.header.parent = latest_entry.checksum;
598
650
  message.header.context = message.header.checksum;
599
651
  message.header.view = self.view;
600
652
  message.header.op = self.op + 1;
601
653
  message.header.commit = self.commit_max;
602
- message.header.offset = Journal.next_offset(latest_entry);
654
+ message.header.timestamp = prepare_timestamp;
603
655
  message.header.replica = self.replica;
604
656
  message.header.command = .prepare;
605
657
 
@@ -608,7 +660,7 @@ pub fn Replica(
608
660
 
609
661
  log.debug("{}: on_request: prepare {}", .{ self.replica, message.header.checksum });
610
662
 
611
- self.pipeline.push(.{ .message = message.ref() }) catch unreachable;
663
+ self.pipeline.push_assume_capacity(.{ .message = message.ref() });
612
664
  assert(self.pipeline.count >= 1);
613
665
 
614
666
  if (self.pipeline.count == 1) {
@@ -618,6 +670,8 @@ pub fn Replica(
618
670
  } else {
619
671
  // Do not restart the prepare timeout as it is already ticking for another prepare.
620
672
  assert(self.prepare_timeout.ticking);
673
+ const previous = self.pipeline.get_ptr(self.pipeline.count - 2).?;
674
+ assert(previous.message.header.checksum == message.header.parent);
621
675
  }
622
676
 
623
677
  self.on_prepare(message);
@@ -631,22 +685,23 @@ pub fn Replica(
631
685
  ///
632
686
  /// The leader starts by sending a prepare message to itself.
633
687
  ///
634
- /// Each replica (including the leader) then forwards this prepare message to the next replica
635
- /// in the configuration, in parallel to writing to its own journal, closing the circle until
636
- /// the next replica is back to the leader, in which case the replica does not forward.
688
+ /// Each replica (including the leader) then forwards this prepare message to the next
689
+ /// replica in the configuration, in parallel to writing to its own journal, closing the
690
+ /// circle until the next replica is back to the leader, in which case the replica does not
691
+ /// forward.
637
692
  ///
638
693
  /// This keeps the leader's outgoing bandwidth limited (one-for-one) to incoming bandwidth,
639
- /// since the leader need only replicate to the next replica. Otherwise, the leader would need
640
- /// to replicate to multiple followers, dividing available bandwidth.
694
+ /// since the leader need only replicate to the next replica. Otherwise, the leader would
695
+ /// need to replicate to multiple followers, dividing available bandwidth.
641
696
  ///
642
- /// This does not impact latency, since with Flexible Paxos we need only one remote prepare_ok.
643
- /// It is ideal if this synchronous replication to one remote replica is to the next replica,
644
- /// since that is the replica next in line to be leader, which will need to be up-to-date before
645
- /// it can start the next view.
697
+ /// This does not impact latency, since with Flexible Paxos we need only one remote
698
+ /// prepare_ok. It is ideal if this synchronous replication to one remote replica is to the
699
+ /// next replica, since that is the replica next in line to be leader, which will need to
700
+ /// be up-to-date before it can start the next view.
646
701
  ///
647
- /// At the same time, asynchronous replication keeps going, so that if our local disk is slow,
648
- /// then any latency spike will be masked by more remote prepare_ok messages as they come in.
649
- /// This gives automatic tail latency tolerance for storage latency spikes.
702
+ /// At the same time, asynchronous replication keeps going, so that if our local disk is
703
+ /// slow, then any latency spike will be masked by more remote prepare_ok messages as they
704
+ /// come in. This gives automatic tail latency tolerance for storage latency spikes.
650
705
  ///
651
706
  /// The remaining problem then is tail latency tolerance for network latency spikes.
652
707
  /// If the next replica is down or partitioned, then the leader's prepare timeout will fire,
@@ -675,12 +730,26 @@ pub fn Replica(
675
730
  return;
676
731
  }
677
732
 
733
+ // Verify that the new request will fit in the WAL.
734
+ if (message.header.op >= self.op_checkpoint + config.journal_slot_count) {
735
+ log.debug("{}: on_prepare: ignoring op={} (too far ahead, checkpoint={})", .{
736
+ self.replica,
737
+ message.header.op,
738
+ self.op_checkpoint,
739
+ });
740
+ // When we are the leader, `on_request` enforces this invariant.
741
+ assert(self.follower());
742
+ return;
743
+ }
744
+
678
745
  assert(self.status == .normal);
679
746
  assert(message.header.view == self.view);
680
747
  assert(self.leader() or self.follower());
681
748
  assert(message.header.replica == self.leader_index(message.header.view));
749
+ assert(message.header.op > self.op_checkpoint);
682
750
  assert(message.header.op > self.op);
683
751
  assert(message.header.op > self.commit_min);
752
+ assert(message.header.op < self.op_checkpoint + config.journal_slot_count);
684
753
 
685
754
  if (self.follower()) self.normal_status_timeout.reset();
686
755
 
@@ -691,7 +760,7 @@ pub fn Replica(
691
760
 
692
761
  if (self.journal.previous_entry(message.header)) |previous| {
693
762
  // Any previous entry may be a whole journal's worth of ops behind due to wrapping.
694
- // We therefore do not do any further op, offset or checksum assertions beyond this:
763
+ // We therefore do not do any further op or checksum assertions beyond this:
695
764
  self.panic_if_hash_chain_would_break_in_the_same_view(previous, message.header);
696
765
  }
697
766
 
@@ -706,7 +775,7 @@ pub fn Replica(
706
775
  });
707
776
  assert(message.header.op == self.op + 1);
708
777
  self.op = message.header.op;
709
- self.journal.set_entry_as_dirty(message.header);
778
+ self.journal.set_header_as_dirty(message.header);
710
779
 
711
780
  self.replicate(message);
712
781
  self.append(message);
@@ -735,7 +804,7 @@ pub fn Replica(
735
804
  // Wait until we have `f + 1` prepare_ok messages (including ourself) for quorum:
736
805
  const threshold = self.quorum_replication;
737
806
 
738
- const count = self.add_message_and_receive_quorum_exactly_once(
807
+ const count = self.count_message_and_receive_quorum_exactly_once(
739
808
  &prepare.ok_from_all_replicas,
740
809
  message,
741
810
  threshold,
@@ -786,7 +855,7 @@ pub fn Replica(
786
855
  assert(message.header.replica == self.leader_index(message.header.view));
787
856
 
788
857
  // We may not always have the latest commit entry but if we do our checksum must match:
789
- if (self.journal.entry_for_op_exact(message.header.commit)) |commit_entry| {
858
+ if (self.journal.header_with_op(message.header.commit)) |commit_entry| {
790
859
  if (commit_entry.checksum == message.header.context) {
791
860
  log.debug("{}: on_commit: checksum verified", .{self.replica});
792
861
  } else if (self.valid_hash_chain("on_commit")) {
@@ -798,7 +867,6 @@ pub fn Replica(
798
867
  }
799
868
 
800
869
  self.normal_status_timeout.reset();
801
-
802
870
  self.commit_ops(message.header.commit);
803
871
  }
804
872
 
@@ -878,32 +946,18 @@ pub fn Replica(
878
946
  assert(self.status == .view_change);
879
947
  assert(message.header.view == self.view);
880
948
 
881
- if (self.leader_index(self.view) == self.replica) {
882
- // If we are the leader of the new view, then wait until we have a message to send a
883
- // do_view_change message to ourself. The on_do_view_change() handler will panic if
884
- // we received a start_view_change quorum without a do_view_change to ourself.
885
- if (self.message_bus.get_message()) |available| {
886
- self.message_bus.unref(available);
887
- } else {
888
- log.err("{}: on_start_view_change: waiting for message for do_view_change", .{
889
- self.replica,
890
- });
891
- return;
892
- }
893
- }
894
-
895
949
  // Wait until we have `f` messages (excluding ourself) for quorum:
896
950
  assert(self.replica_count > 1);
897
951
  const threshold = self.quorum_view_change - 1;
898
952
 
899
- const count = self.add_message_and_receive_quorum_exactly_once(
953
+ const count = self.count_message_and_receive_quorum_exactly_once(
900
954
  &self.start_view_change_from_other_replicas,
901
955
  message,
902
956
  threshold,
903
957
  ) orelse return;
904
958
 
905
959
  assert(count == threshold);
906
- assert(self.start_view_change_from_other_replicas[self.replica] == null);
960
+ assert(!self.start_view_change_from_other_replicas.isSet(self.replica));
907
961
  log.debug("{}: on_start_view_change: view={} quorum received", .{
908
962
  self.replica,
909
963
  self.view,
@@ -956,7 +1010,7 @@ pub fn Replica(
956
1010
  assert(self.replica_count > 1);
957
1011
  const threshold = self.quorum_view_change;
958
1012
 
959
- const count = self.add_message_and_receive_quorum_exactly_once(
1013
+ const count = self.reference_message_and_receive_quorum_exactly_once(
960
1014
  &self.do_view_change_from_all_replicas,
961
1015
  message,
962
1016
  threshold,
@@ -971,7 +1025,7 @@ pub fn Replica(
971
1025
 
972
1026
  var v: ?u32 = null;
973
1027
  var k: ?u64 = null;
974
- var latest = Header.reserved();
1028
+ var latest = Header.reserved(self.cluster, 0);
975
1029
 
976
1030
  for (self.do_view_change_from_all_replicas) |received, replica| {
977
1031
  if (received) |m| {
@@ -982,10 +1036,10 @@ pub fn Replica(
982
1036
 
983
1037
  // The latest normal view experienced by this replica:
984
1038
  // This may be higher than the view in any of the prepare headers.
985
- var replica_view_normal = @intCast(u32, m.header.offset);
1039
+ var replica_view_normal = @intCast(u32, m.header.timestamp);
986
1040
  assert(replica_view_normal < m.header.view);
987
1041
 
988
- var replica_latest = Header.reserved();
1042
+ var replica_latest = Header.reserved(self.cluster, 0);
989
1043
  set_latest_op(self.message_body_as_headers(m), &replica_latest);
990
1044
  assert(replica_latest.op == m.header.op);
991
1045
 
@@ -1025,7 +1079,7 @@ pub fn Replica(
1025
1079
  }
1026
1080
 
1027
1081
  // Verify that the repairs above have not replaced or advanced the latest op:
1028
- assert(self.journal.entry_for_op_exact(self.op).?.checksum == latest.checksum);
1082
+ assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
1029
1083
 
1030
1084
  assert(self.start_view_change_quorum);
1031
1085
  assert(!self.do_view_change_quorum);
@@ -1033,7 +1087,11 @@ pub fn Replica(
1033
1087
 
1034
1088
  self.discard_uncommitted_headers();
1035
1089
  assert(self.op >= self.commit_max);
1036
- assert(self.journal.entry_for_op_exact(self.op) != null);
1090
+
1091
+ const prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
1092
+ if (self.state_machine.prepare_timestamp < prepare_timestamp) {
1093
+ self.state_machine.prepare_timestamp = prepare_timestamp;
1094
+ }
1037
1095
 
1038
1096
  // Start repairs according to the CTRL protocol:
1039
1097
  assert(!self.repair_timeout.ticking);
@@ -1061,7 +1119,7 @@ pub fn Replica(
1061
1119
  assert(self.status == .view_change);
1062
1120
  assert(message.header.view == self.view);
1063
1121
 
1064
- var latest = Header.reserved();
1122
+ var latest = Header.reserved(self.cluster, 0);
1065
1123
  set_latest_op(self.message_body_as_headers(message), &latest);
1066
1124
  assert(latest.op == message.header.op);
1067
1125
 
@@ -1073,10 +1131,10 @@ pub fn Replica(
1073
1131
  }
1074
1132
 
1075
1133
  // Verify that the repairs above have not replaced or advanced the latest op:
1076
- assert(self.journal.entry_for_op_exact(self.op).?.checksum == latest.checksum);
1134
+ assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
1077
1135
 
1078
1136
  if (self.status == .view_change) {
1079
- self.transition_to_normal_status(message.header.view);
1137
+ self.transition_to_normal_from_view_change_status(message.header.view);
1080
1138
  self.send_prepare_oks_after_view_change();
1081
1139
  }
1082
1140
 
@@ -1097,12 +1155,7 @@ pub fn Replica(
1097
1155
  assert(message.header.replica != self.replica);
1098
1156
  assert(self.leader());
1099
1157
 
1100
- const start_view = self.create_view_change_message(.start_view) orelse {
1101
- log.err("{}: on_request_start_view: dropping start_view, no message available", .{
1102
- self.replica,
1103
- });
1104
- return;
1105
- };
1158
+ const start_view = self.create_view_change_message(.start_view);
1106
1159
  defer self.message_bus.unref(start_view);
1107
1160
 
1108
1161
  assert(start_view.references == 1);
@@ -1114,8 +1167,9 @@ pub fn Replica(
1114
1167
  self.send_message_to_replica(message.header.replica, start_view);
1115
1168
  }
1116
1169
 
1117
- /// TODO This is a work in progress (out of scope for the bounty)
1118
1170
  fn on_recovery(self: *Self, message: *const Message) void {
1171
+ assert(self.replica_count > 1);
1172
+
1119
1173
  if (self.status != .normal) {
1120
1174
  log.debug("{}: on_recovery: ignoring ({})", .{ self.replica, self.status });
1121
1175
  return;
@@ -1126,40 +1180,31 @@ pub fn Replica(
1126
1180
  return;
1127
1181
  }
1128
1182
 
1129
- const response = self.message_bus.get_message() orelse {
1130
- log.err("{}: on_recovery: ignoring (waiting for message)", .{self.replica});
1131
- return;
1132
- };
1183
+ const response = self.message_bus.get_message();
1133
1184
  defer self.message_bus.unref(response);
1134
1185
 
1186
+ log.debug("{}: on_recovery: view={} op={} commit={} nonce={}", .{
1187
+ self.replica,
1188
+ self.view,
1189
+ self.op,
1190
+ self.commit_max,
1191
+ message.header.context,
1192
+ });
1193
+
1135
1194
  response.header.* = .{
1136
1195
  .command = .recovery_response,
1137
1196
  .cluster = self.cluster,
1138
- .context = message.header.context,
1197
+ .context = message.header.context, // Echo the request's nonce.
1139
1198
  .replica = self.replica,
1140
1199
  .view = self.view,
1141
1200
  .op = self.op,
1142
1201
  .commit = self.commit_max,
1143
1202
  };
1144
1203
 
1145
- const count_max = 8; // The number of prepare headers to include in the body.
1146
-
1147
- const size_max = @sizeOf(Header) * std.math.min(
1148
- std.math.max(@divFloor(response.buffer.len, @sizeOf(Header)), 2),
1149
- 1 + count_max,
1150
- );
1151
- assert(size_max > @sizeOf(Header));
1152
-
1153
- const count = self.journal.copy_latest_headers_between(
1154
- 0,
1155
- self.op,
1156
- std.mem.bytesAsSlice(Header, response.buffer[@sizeOf(Header)..size_max]),
1157
- );
1158
-
1159
- // We expect that self.op always exists.
1160
- assert(count > 0);
1161
-
1162
- response.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
1204
+ const count_max = 8; // The maximum number of prepare headers to include in the body.
1205
+ const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, response);
1206
+ assert(count > 0); // We expect that self.op always exists.
1207
+ assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
1163
1208
 
1164
1209
  response.header.set_checksum_body(response.body());
1165
1210
  response.header.set_checksum();
@@ -1171,68 +1216,339 @@ pub fn Replica(
1171
1216
  self.send_message_to_replica(message.header.replica, response);
1172
1217
  }
1173
1218
 
1174
- /// TODO This is a work in progress (out of scope for the bounty)
1175
1219
  fn on_recovery_response(self: *Self, message: *Message) void {
1176
- _ = self;
1177
- _ = message;
1220
+ assert(self.replica_count > 1);
1221
+
1222
+ if (self.status != .recovering) {
1223
+ log.debug("{}: on_recovery_response: ignoring ({})", .{
1224
+ self.replica,
1225
+ self.status,
1226
+ });
1227
+ return;
1228
+ }
1229
+
1230
+ if (message.header.replica == self.replica) {
1231
+ log.warn("{}: on_recovery_response: ignoring (self)", .{self.replica});
1232
+ return;
1233
+ }
1234
+
1235
+ if (message.header.context != self.recovery_nonce) {
1236
+ log.warn("{}: on_recovery_response: ignoring (different nonce)", .{self.replica});
1237
+ return;
1238
+ }
1239
+
1240
+ // Recovery messages with our nonce are not sent until after the journal is recovered.
1241
+ assert(self.journal.recovered);
1242
+
1243
+ var responses: *QuorumMessages = &self.recovery_response_from_other_replicas;
1244
+ if (responses[message.header.replica]) |existing| {
1245
+ assert(message.header.replica == existing.header.replica);
1246
+
1247
+ if (message.header.checksum == existing.header.checksum) {
1248
+ // The response was replayed by the network; ignore it.
1249
+ log.debug("{}: on_recovery_response: ignoring (duplicate message)", .{
1250
+ self.replica,
1251
+ });
1252
+ return;
1253
+ }
1254
+
1255
+ // We received a second (distinct) response from a replica. Possible causes:
1256
+ // * We retried the `recovery` message, because we had not yet received a quorum.
1257
+ // * The `recovery` message was duplicated/misdirected by the network, and the
1258
+ // receiver's state changed in the mean time.
1259
+
1260
+ log.debug(
1261
+ "{}: on_recovery_response: replica={} view={}..{} op={}..{} commit={}..{}",
1262
+ .{
1263
+ self.replica,
1264
+ existing.header.replica,
1265
+ existing.header.view,
1266
+ message.header.view,
1267
+ existing.header.op,
1268
+ message.header.op,
1269
+ existing.header.commit,
1270
+ message.header.commit,
1271
+ },
1272
+ );
1273
+
1274
+ if (message.header.view < existing.header.view or
1275
+ (message.header.view == existing.header.view and
1276
+ message.header.op < existing.header.op) or
1277
+ (message.header.view == existing.header.view and
1278
+ message.header.op == existing.header.op and
1279
+ message.header.commit < existing.header.commit))
1280
+ {
1281
+ // The second message is older than the first one (reordered packets).
1282
+ log.debug("{}: on_recovery_response: ignoring (older)", .{self.replica});
1283
+ return;
1284
+ }
1285
+
1286
+ // The second message is newer than the first one.
1287
+ assert(message.header.view >= existing.header.view);
1288
+ // The op number may regress if an uncommitted op was discarded in a higher view.
1289
+ assert(message.header.op >= existing.header.op or
1290
+ message.header.view > existing.header.view);
1291
+ assert(message.header.commit >= existing.header.commit);
1292
+
1293
+ self.message_bus.unref(existing);
1294
+ responses[message.header.replica] = null;
1295
+ } else {
1296
+ log.debug(
1297
+ "{}: on_recovery_response: replica={} view={} op={} commit={}",
1298
+ .{
1299
+ self.replica,
1300
+ message.header.replica,
1301
+ message.header.view,
1302
+ message.header.op,
1303
+ message.header.commit,
1304
+ },
1305
+ );
1306
+ }
1307
+
1308
+ assert(responses[message.header.replica] == null);
1309
+ responses[message.header.replica] = message.ref();
1310
+
1311
+ // Wait until we have:
1312
+ // * at least `f + 1` messages for quorum (not including ourself), and
1313
+ // * a response from the leader of the highest discovered view.
1314
+ const count = self.count_quorum(responses, .recovery_response, self.recovery_nonce);
1315
+ assert(count <= self.replica_count - 1);
1316
+
1317
+ const threshold = self.quorum_view_change;
1318
+ if (count < threshold) {
1319
+ log.debug("{}: on_recovery_response: waiting for quorum ({}/{})", .{
1320
+ self.replica,
1321
+ count,
1322
+ threshold,
1323
+ });
1324
+ return;
1325
+ }
1326
+
1327
+ const view = blk: { // The latest known view.
1328
+ var view: u32 = 0;
1329
+ for (self.recovery_response_from_other_replicas) |received, replica| {
1330
+ if (received) |response| {
1331
+ assert(replica != self.replica);
1332
+ assert(response.header.replica == replica);
1333
+ assert(response.header.context == self.recovery_nonce);
1334
+
1335
+ view = std.math.max(view, response.header.view);
1336
+ }
1337
+ }
1338
+ break :blk view;
1339
+ };
1340
+
1341
+ const leader_response = responses[self.leader_index(view)];
1342
+ if (leader_response == null) {
1343
+ log.debug(
1344
+ "{}: on_recovery_response: ignoring (awaiting response from leader of view={})",
1345
+ .{
1346
+ self.replica,
1347
+ view,
1348
+ },
1349
+ );
1350
+ return;
1351
+ }
1352
+
1353
+ if (leader_response.?.header.view != view) {
1354
+ // The leader (according to the view quorum) isn't the leader (according to itself).
1355
+ // The `recovery_timeout` will retry shortly with another round.
1356
+ log.debug(
1357
+ "{}: on_recovery_response: ignoring (leader view={} != quorum view={})",
1358
+ .{
1359
+ self.replica,
1360
+ leader_response.?.header.view,
1361
+ view,
1362
+ },
1363
+ );
1364
+ return;
1365
+ }
1366
+
1367
+ // This recovering→normal status transition occurs exactly once.
1368
+ // All further `recovery_response` messages are ignored.
1369
+
1370
+ // TODO When the view is recovered from the superblock (instead of via the VSR recovery
1371
+ // protocol), if the view number indicates that this replica is a leader, it must
1372
+ // transition to status=view_change instead of status=normal.
1373
+
1374
+ const leader_headers = self.message_body_as_headers(leader_response.?);
1375
+ assert(leader_headers.len > 0);
1376
+
1377
+ const commit = leader_response.?.header.commit;
1378
+ {
1379
+ var latest = Header.reserved(self.cluster, 0);
1380
+ set_latest_op(leader_headers, &latest);
1381
+ assert(latest.op == leader_response.?.header.op);
1382
+
1383
+ self.set_latest_op_and_k(&latest, commit, "on_recovery_response");
1384
+ assert(self.op == latest.op);
1385
+ assert(self.journal.header_with_op(self.op) != null);
1386
+ }
1387
+
1388
+ assert(self.status == .recovering);
1389
+ self.transition_to_normal_from_recovering_status(view);
1390
+ assert(self.status == .normal);
1391
+ assert(self.follower());
1392
+
1393
+ // TODO If the view's primary is >1 WAL ahead of us, these headers could cause
1394
+ // problems. We don't want to jump this far ahead to repair, but we still need to use
1395
+ // the hash chain to figure out which headers to request. Maybe include our
1396
+ // `op_checkpoint` in the recovery (request) message so that the response can give more
1397
+ // useful (i.e. older) headers.
1398
+ for (leader_headers) |*header| {
1399
+ _ = self.repair_header(header);
1400
+ }
1401
+
1402
+ if (self.op < config.journal_slot_count) {
1403
+ if (self.journal.header_with_op(0)) |header| {
1404
+ assert(header.command == .prepare);
1405
+ assert(header.operation == .root);
1406
+ } else {
1407
+ // This is the first wrap of the log, and the root prepare is corrupt.
1408
+ // Repair the root repair. This is necessary to maintain the invariant that the
1409
+ // op=commit_min exists in-memory.
1410
+ const header = Header.root_prepare(self.cluster);
1411
+ self.journal.set_header_as_dirty(&header);
1412
+ log.debug("{}: on_recovery_response: repair root op", .{self.replica});
1413
+ }
1414
+ }
1415
+
1416
+ log.debug("{}: on_recovery_response: responses={} view={} headers={}..{}" ++
1417
+ " commit={} dirty={} faulty={}", .{
1418
+ self.replica,
1419
+ count,
1420
+ view,
1421
+ leader_headers[leader_headers.len - 1].op,
1422
+ leader_headers[0].op,
1423
+ commit,
1424
+ self.journal.dirty.count,
1425
+ self.journal.faulty.count,
1426
+ });
1427
+
1428
+ self.state_machine.prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
1429
+ // `state_machine.commit_timestamp` is updated as messages are committed.
1430
+
1431
+ self.reset_quorum_recovery_response();
1432
+ self.commit_ops(commit);
1433
+ self.repair();
1178
1434
  }
1179
1435
 
1436
+ /// If the requested prepare has been guaranteed by this replica:
1437
+ /// * Read the prepare from storage, and forward it to the replica that requested it.
1438
+ /// * Otherwise send no reply — it isn't safe to nack.
1439
+ /// If the requested prepare has *not* been guaranteed by this replica, then send a nack.
1440
+ ///
1441
+ /// A prepare is considered "guaranteed" by a replica if that replica has acknowledged it
1442
+ /// to the cluster. The cluster sees the replica as an underwriter of a guaranteed
1443
+ /// prepare. If a guaranteed prepare is found to by faulty, the replica must repair it
1444
+ /// to restore durability.
1180
1445
  fn on_request_prepare(self: *Self, message: *const Message) void {
1181
1446
  if (self.ignore_repair_message(message)) return;
1182
1447
 
1448
+ assert(self.replica_count > 1);
1183
1449
  assert(self.status == .normal or self.status == .view_change);
1184
1450
  assert(message.header.view == self.view);
1185
1451
  assert(message.header.replica != self.replica);
1186
1452
 
1187
1453
  const op = message.header.op;
1188
- var checksum: ?u128 = message.header.context;
1189
- if (self.leader_index(self.view) == self.replica and checksum.? == 0) checksum = null;
1454
+ const slot = self.journal.slot_for_op(op);
1455
+ const checksum: ?u128 = switch (message.header.timestamp) {
1456
+ 0 => null,
1457
+ 1 => message.header.context,
1458
+ else => unreachable,
1459
+ };
1190
1460
 
1191
- if (self.journal.entry_for_op_exact_with_checksum(op, checksum)) |entry| {
1192
- assert(entry.op == op);
1193
- assert(checksum == null or entry.checksum == checksum.?);
1461
+ // Only the leader may respond to `request_prepare` messages without a checksum.
1462
+ assert(checksum != null or self.leader_index(self.view) == self.replica);
1194
1463
 
1195
- if (!self.journal.dirty.bit(op)) {
1196
- assert(!self.journal.faulty.bit(op));
1464
+ // Try to serve the message directly from the pipeline.
1465
+ // This saves us from going to disk. And we don't need to worry that the WAL's copy
1466
+ // of an uncommitted prepare is lost/corrupted.
1467
+ if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
1468
+ log.debug("{}: on_request_prepare: op={} checksum={} reply from pipeline", .{
1469
+ self.replica,
1470
+ op,
1471
+ checksum,
1472
+ });
1473
+ self.send_message_to_replica(message.header.replica, prepare.message);
1474
+ return;
1475
+ }
1197
1476
 
1477
+ if (self.journal.prepare_inhabited[slot.index]) {
1478
+ const prepare_checksum = self.journal.prepare_checksums[slot.index];
1479
+ // Consult `journal.prepare_checksums` (rather than `journal.headers`):
1480
+ // the former may have the prepare we want — even if journal recovery marked the
1481
+ // slot as faulty and left the in-memory header as reserved.
1482
+ if (checksum == null or checksum.? == prepare_checksum) {
1198
1483
  log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
1199
1484
  self.replica,
1200
1485
  op,
1201
1486
  checksum,
1202
1487
  });
1203
1488
 
1204
- // TODO Do not reissue the read if we are already reading in order to send to
1205
- // this particular destination replica.
1206
-
1207
- self.journal.read_prepare(
1208
- on_request_prepare_read,
1209
- op,
1210
- entry.checksum,
1211
- message.header.replica,
1212
- );
1213
-
1214
- // We have guaranteed the prepare and our copy is clean (not safe to nack).
1215
- return;
1216
- } else if (self.journal.faulty.bit(op)) {
1217
- log.debug("{}: on_request_prepare: op={} checksum={} faulty", .{
1218
- self.replica,
1219
- op,
1220
- checksum,
1221
- });
1489
+ if (self.journal.header_with_op_and_checksum(op, checksum)) |_| {
1490
+ // The header for the target prepare is already in-memory.
1491
+ // This is preferable to the `else` case since we have the prepare's
1492
+ // `header.size` in-memory, so the read can be (potentially) shorter.
1493
+ // TODO Do not reissue the read if we are already reading in order to send
1494
+ // to this particular destination replica.
1495
+ self.journal.read_prepare(
1496
+ on_request_prepare_read,
1497
+ op,
1498
+ prepare_checksum,
1499
+ message.header.replica,
1500
+ );
1501
+ } else {
1502
+ // TODO Do not reissue the read if we are already reading in order to send to
1503
+ // this particular destination replica.
1504
+ self.journal.read_prepare_with_op_and_checksum(
1505
+ on_request_prepare_read,
1506
+ op,
1507
+ prepare_checksum,
1508
+ message.header.replica,
1509
+ );
1510
+ }
1222
1511
 
1223
- // We have gauranteed the prepare but our copy is faulty (not safe to nack).
1512
+ // We have guaranteed the prepare (not safe to nack).
1513
+ // Our copy may or may not be valid, but we will try to read & forward it.
1224
1514
  return;
1225
1515
  }
1516
+ }
1226
1517
 
1227
- // We know of the prepare but we have yet to write or guarantee it (safe to nack).
1228
- // Continue through below...
1518
+ {
1519
+ // We may have guaranteed the prepare but our copy is faulty (not safe to nack).
1520
+ if (self.journal.faulty.bit(slot)) return;
1521
+ if (self.journal.header_with_op_and_checksum(message.header.op, checksum)) |_| {
1522
+ if (self.journal.dirty.bit(slot)) {
1523
+ // We know of the prepare but have yet to write it (safe to nack).
1524
+ // Continue through below...
1525
+ } else {
1526
+ // We have guaranteed the prepare and our copy is clean (not safe to nack).
1527
+ return;
1528
+ }
1529
+ }
1229
1530
  }
1230
1531
 
1532
+ // Protocol-Aware Recovery's CTRL protocol only runs during the view change, when the
1533
+ // new primary needs to repair its own WAL before starting the new view.
1534
+ //
1535
+ // This branch is only where the backup doesn't have the prepare and could possibly
1536
+ // send a nack as part of the CTRL protocol. Nacks only get sent during a view change
1537
+ // to help the new primary trim uncommitted ops that couldn't otherwise be repaired.
1538
+ // Without doing this, the cluster would become permanently unavailable. So backups
1539
+ // shouldn't respond to the `request_prepare` if the new view has already started,
1540
+ // they should also be in view change status, waiting for the new primary to start
1541
+ // the view.
1231
1542
  if (self.status == .view_change) {
1232
1543
  assert(message.header.replica == self.leader_index(self.view));
1233
1544
  assert(checksum != null);
1234
- if (self.journal.entry_for_op_exact_with_checksum(op, checksum) != null) {
1235
- assert(self.journal.dirty.bit(op) and !self.journal.faulty.bit(op));
1545
+
1546
+ if (self.journal.header_with_op_and_checksum(op, checksum)) |_| {
1547
+ assert(self.journal.dirty.bit(slot) and !self.journal.faulty.bit(slot));
1548
+ }
1549
+
1550
+ if (self.journal.prepare_inhabited[slot.index]) {
1551
+ assert(self.journal.prepare_checksums[slot.index] != checksum.?);
1236
1552
  }
1237
1553
 
1238
1554
  log.debug("{}: on_request_prepare: op={} checksum={} nacking", .{
@@ -1276,14 +1592,7 @@ pub fn Replica(
1276
1592
  assert(message.header.view == self.view);
1277
1593
  assert(message.header.replica != self.replica);
1278
1594
 
1279
- const response = self.message_bus.get_message() orelse {
1280
- log.err("{}: on_request_headers: ignoring (op={}..{}, no message available)", .{
1281
- self.replica,
1282
- message.header.commit,
1283
- message.header.op,
1284
- });
1285
- return;
1286
- };
1595
+ const response = self.message_bus.get_message();
1287
1596
  defer self.message_bus.unref(response);
1288
1597
 
1289
1598
  response.header.* = .{
@@ -1299,21 +1608,9 @@ pub fn Replica(
1299
1608
  const op_max = message.header.op;
1300
1609
  assert(op_max >= op_min);
1301
1610
 
1302
- // We must add 1 because op_max and op_min are both inclusive:
1303
- const count_max = @intCast(u32, std.math.min(64, op_max - op_min + 1));
1304
- assert(count_max > 0);
1305
-
1306
- const size_max = @sizeOf(Header) * std.math.min(
1307
- std.math.max(@divFloor(message.buffer.len, @sizeOf(Header)), 2),
1308
- 1 + count_max,
1309
- );
1310
- assert(size_max > @sizeOf(Header));
1311
-
1312
- const count = self.journal.copy_latest_headers_between(
1313
- op_min,
1314
- op_max,
1315
- std.mem.bytesAsSlice(Header, response.buffer[@sizeOf(Header)..size_max]),
1316
- );
1611
+ const count = self.copy_latest_headers_and_set_size(op_min, op_max, null, response);
1612
+ assert(count >= 0);
1613
+ assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
1317
1614
 
1318
1615
  if (count == 0) {
1319
1616
  log.debug("{}: on_request_headers: ignoring (op={}..{}, no headers)", .{
@@ -1324,8 +1621,6 @@ pub fn Replica(
1324
1621
  return;
1325
1622
  }
1326
1623
 
1327
- response.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
1328
-
1329
1624
  response.header.set_checksum_body(response.body());
1330
1625
  response.header.set_checksum();
1331
1626
 
@@ -1348,7 +1643,8 @@ pub fn Replica(
1348
1643
  }
1349
1644
 
1350
1645
  const op = self.nack_prepare_op.?;
1351
- const checksum = self.journal.entry_for_op_exact(op).?.checksum;
1646
+ const checksum = self.journal.header_with_op(op).?.checksum;
1647
+ const slot = self.journal.slot_with_op(op).?;
1352
1648
 
1353
1649
  if (message.header.op != op) {
1354
1650
  log.debug("{}: on_nack_prepare: ignoring (repairing another op)", .{self.replica});
@@ -1383,14 +1679,14 @@ pub fn Replica(
1383
1679
  // Otherwise, if we know we do not have the op, then we can exclude ourselves.
1384
1680
  assert(self.replica_count > 1);
1385
1681
 
1386
- const threshold = if (self.journal.faulty.bit(op))
1682
+ const threshold = if (self.journal.faulty.bit(slot))
1387
1683
  self.replica_count - self.quorum_replication + 1
1388
1684
  else
1389
1685
  self.replica_count - self.quorum_replication;
1390
1686
 
1391
1687
  if (threshold == 0) {
1392
1688
  assert(self.replica_count == 2);
1393
- assert(!self.journal.faulty.bit(op));
1689
+ assert(!self.journal.faulty.bit(slot));
1394
1690
 
1395
1691
  // This is a special case for a cluster-of-two, handled in `repair_prepare()`.
1396
1692
  log.debug("{}: on_nack_prepare: ignoring (cluster-of-two, not faulty)", .{
@@ -1399,10 +1695,11 @@ pub fn Replica(
1399
1695
  return;
1400
1696
  }
1401
1697
 
1402
- log.debug("{}: on_nack_prepare: quorum_replication={} threshold={}", .{
1698
+ log.debug("{}: on_nack_prepare: quorum_replication={} threshold={} op={}", .{
1403
1699
  self.replica,
1404
1700
  self.quorum_replication,
1405
1701
  threshold,
1702
+ op,
1406
1703
  });
1407
1704
 
1408
1705
  // We should never expect to receive a nack from ourselves:
@@ -1410,15 +1707,15 @@ pub fn Replica(
1410
1707
  assert(threshold < self.replica_count);
1411
1708
 
1412
1709
  // Wait until we have `threshold` messages for quorum:
1413
- const count = self.add_message_and_receive_quorum_exactly_once(
1710
+ const count = self.count_message_and_receive_quorum_exactly_once(
1414
1711
  &self.nack_prepare_from_other_replicas,
1415
1712
  message,
1416
1713
  threshold,
1417
1714
  ) orelse return;
1418
1715
 
1419
1716
  assert(count == threshold);
1420
- assert(self.nack_prepare_from_other_replicas[self.replica] == null);
1421
- log.debug("{}: on_nack_prepare: quorum received", .{self.replica});
1717
+ assert(!self.nack_prepare_from_other_replicas.isSet(self.replica));
1718
+ log.debug("{}: on_nack_prepare: quorum received op={}", .{ self.replica, op });
1422
1719
 
1423
1720
  self.discard_uncommitted_ops_from(op, checksum);
1424
1721
  self.reset_quorum_nack_prepare();
@@ -1487,23 +1784,37 @@ pub fn Replica(
1487
1784
  // The list of remote replicas yet to send a prepare_ok:
1488
1785
  var waiting: [config.replicas_max]u8 = undefined;
1489
1786
  var waiting_len: usize = 0;
1490
- for (prepare.ok_from_all_replicas[0..self.replica_count]) |received, replica| {
1491
- if (received == null and replica != self.replica) {
1787
+ var ok_from_all_replicas_iterator = prepare.ok_from_all_replicas.iterator(.{
1788
+ .kind = .unset,
1789
+ });
1790
+ while (ok_from_all_replicas_iterator.next()) |replica| {
1791
+ // Ensure we don't wait for replicas that don't exist.
1792
+ // The bits between `replica_count` and `replicas_max` are always unset,
1793
+ // since they don't actually represent replicas.
1794
+ if (replica == self.replica_count) {
1795
+ assert(self.replica_count < config.replicas_max);
1796
+ break;
1797
+ }
1798
+ assert(replica < self.replica_count);
1799
+
1800
+ if (replica != self.replica) {
1492
1801
  waiting[waiting_len] = @intCast(u8, replica);
1493
1802
  waiting_len += 1;
1494
1803
  }
1804
+ } else {
1805
+ assert(self.replica_count == config.replicas_max);
1495
1806
  }
1496
1807
 
1497
1808
  if (waiting_len == 0) {
1498
1809
  self.prepare_timeout.reset();
1499
1810
 
1500
1811
  log.debug("{}: on_prepare_timeout: waiting for journal", .{self.replica});
1501
- assert(prepare.ok_from_all_replicas[self.replica] == null);
1812
+ assert(!prepare.ok_from_all_replicas.isSet(self.replica));
1502
1813
 
1503
1814
  // We may be slow and waiting for the write to complete.
1504
1815
  //
1505
1816
  // We may even have maxed out our IO depth and been unable to initiate the write,
1506
- // which can happen if `config.pipelining_max` exceeds `config.io_depth_write`.
1817
+ // which can happen if `config.pipeline_max` exceeds `config.io_depth_write`.
1507
1818
  // This can lead to deadlock for a cluster of one or two (if we do not retry here),
1508
1819
  // since there is no other way for the leader to repair the dirty op because no
1509
1820
  // other replica has it.
@@ -1533,7 +1844,10 @@ pub fn Replica(
1533
1844
  const replica = waiting[self.prepare_timeout.attempts % waiting_len];
1534
1845
  assert(replica != self.replica);
1535
1846
 
1536
- log.debug("{}: on_prepare_timeout: replicating to replica {}", .{ self.replica, replica });
1847
+ log.debug("{}: on_prepare_timeout: replicating to replica {}", .{
1848
+ self.replica,
1849
+ replica,
1850
+ });
1537
1851
  self.send_message_to_replica(replica, prepare.message);
1538
1852
  }
1539
1853
 
@@ -1545,7 +1859,7 @@ pub fn Replica(
1545
1859
  assert(self.commit_min == self.commit_max);
1546
1860
 
1547
1861
  // TODO Snapshots: Use snapshot checksum if commit is no longer in journal.
1548
- const latest_committed_entry = self.journal.entry_for_op_exact(self.commit_max).?;
1862
+ const latest_committed_entry = self.journal.header_with_op(self.commit_max).?;
1549
1863
 
1550
1864
  self.send_header_to_other_replicas(.{
1551
1865
  .command = .commit,
@@ -1590,7 +1904,14 @@ pub fn Replica(
1590
1904
  self.repair();
1591
1905
  }
1592
1906
 
1593
- fn add_message_and_receive_quorum_exactly_once(
1907
+ fn on_recovery_timeout(self: *Self) void {
1908
+ assert(self.status == .recovering);
1909
+ assert(self.replica_count > 1);
1910
+ self.recovery_timeout.reset();
1911
+ self.recover();
1912
+ }
1913
+
1914
+ fn reference_message_and_receive_quorum_exactly_once(
1594
1915
  self: *Self,
1595
1916
  messages: *QuorumMessages,
1596
1917
  message: *Message,
@@ -1604,18 +1925,6 @@ pub fn Replica(
1604
1925
  assert(message.header.replica < self.replica_count);
1605
1926
  assert(message.header.view == self.view);
1606
1927
  switch (message.header.command) {
1607
- .prepare_ok => {
1608
- if (self.replica_count <= 2) assert(threshold == self.replica_count);
1609
-
1610
- assert(self.status == .normal);
1611
- assert(self.leader());
1612
- },
1613
- .start_view_change => {
1614
- assert(self.replica_count > 1);
1615
- if (self.replica_count == 2) assert(threshold == 1);
1616
-
1617
- assert(self.status == .view_change);
1618
- },
1619
1928
  .do_view_change => {
1620
1929
  assert(self.replica_count > 1);
1621
1930
  if (self.replica_count == 2) assert(threshold == 2);
@@ -1623,13 +1932,6 @@ pub fn Replica(
1623
1932
  assert(self.status == .view_change);
1624
1933
  assert(self.leader_index(self.view) == self.replica);
1625
1934
  },
1626
- .nack_prepare => {
1627
- assert(self.replica_count > 1);
1628
- if (self.replica_count == 2) assert(threshold >= 1);
1629
-
1630
- assert(self.status == .view_change);
1631
- assert(self.leader_index(self.view) == self.replica);
1632
- },
1633
1935
  else => unreachable,
1634
1936
  }
1635
1937
 
@@ -1665,8 +1967,88 @@ pub fn Replica(
1665
1967
 
1666
1968
  // This is not the first time we have had quorum, the state transition has already happened:
1667
1969
  if (count > threshold) {
1668
- log.debug("{}: on_{s}: ignoring (quorum received already)", .{ self.replica, command });
1669
- return null;
1970
+ log.debug("{}: on_{s}: ignoring (quorum received already)", .{
1971
+ self.replica,
1972
+ command,
1973
+ });
1974
+ return null;
1975
+ }
1976
+
1977
+ assert(count == threshold);
1978
+ return count;
1979
+ }
1980
+
1981
+ fn count_message_and_receive_quorum_exactly_once(
1982
+ self: *Self,
1983
+ counter: *QuorumCounter,
1984
+ message: *Message,
1985
+ threshold: u32,
1986
+ ) ?usize {
1987
+ assert(threshold >= 1);
1988
+ assert(threshold <= self.replica_count);
1989
+
1990
+ assert(QuorumCounter.bit_length == config.replicas_max);
1991
+ assert(message.header.cluster == self.cluster);
1992
+ assert(message.header.replica < self.replica_count);
1993
+ assert(message.header.view == self.view);
1994
+
1995
+ switch (message.header.command) {
1996
+ .prepare_ok => {
1997
+ if (self.replica_count <= 2) assert(threshold == self.replica_count);
1998
+
1999
+ assert(self.status == .normal);
2000
+ assert(self.leader());
2001
+ },
2002
+ .start_view_change => {
2003
+ assert(self.replica_count > 1);
2004
+ if (self.replica_count == 2) assert(threshold == 1);
2005
+
2006
+ assert(self.status == .view_change);
2007
+ },
2008
+ .nack_prepare => {
2009
+ assert(self.replica_count > 1);
2010
+ if (self.replica_count == 2) assert(threshold >= 1);
2011
+
2012
+ assert(self.status == .view_change);
2013
+ assert(self.leader_index(self.view) == self.replica);
2014
+ },
2015
+ else => unreachable,
2016
+ }
2017
+
2018
+ const command: []const u8 = @tagName(message.header.command);
2019
+
2020
+ // Do not allow duplicate messages to trigger multiple passes through a state transition:
2021
+ if (counter.isSet(message.header.replica)) {
2022
+ log.debug("{}: on_{s}: ignoring (duplicate message replica={})", .{
2023
+ self.replica,
2024
+ command,
2025
+ message.header.replica,
2026
+ });
2027
+ return null;
2028
+ }
2029
+
2030
+ // Record the first receipt of this message:
2031
+ counter.set(message.header.replica);
2032
+ assert(counter.isSet(message.header.replica));
2033
+
2034
+ // Count the number of unique messages now received:
2035
+ const count = counter.count();
2036
+ log.debug("{}: on_{s}: {} message(s)", .{ self.replica, command, count });
2037
+ assert(count <= self.replica_count);
2038
+
2039
+ // Wait until we have exactly `threshold` messages for quorum:
2040
+ if (count < threshold) {
2041
+ log.debug("{}: on_{s}: waiting for quorum", .{ self.replica, command });
2042
+ return null;
2043
+ }
2044
+
2045
+ // This is not the first time we have had quorum, the state transition has already happened:
2046
+ if (count > threshold) {
2047
+ log.debug("{}: on_{s}: ignoring (quorum received already)", .{
2048
+ self.replica,
2049
+ command,
2050
+ });
2051
+ return null;
1670
2052
  }
1671
2053
 
1672
2054
  assert(count == threshold);
@@ -1679,8 +2061,15 @@ pub fn Replica(
1679
2061
  assert(message.header.view == self.view);
1680
2062
  assert(message.header.op == self.op);
1681
2063
 
1682
- log.debug("{}: append: appending to journal", .{self.replica});
1683
- self.write_prepare(message, .append);
2064
+ if (self.replica_count == 1 and self.pipeline.count > 1) {
2065
+ // In a cluster-of-one, the prepares must always be written to the WAL sequentially
2066
+ // (never concurrently). This ensures that there will be no gaps in the WAL during
2067
+ // crash recovery.
2068
+ log.debug("{}: append: serializing append op={}", .{ self.replica, message.header.op });
2069
+ } else {
2070
+ log.debug("{}: append: appending to journal", .{self.replica});
2071
+ self.write_prepare(message, .append);
2072
+ }
1684
2073
  }
1685
2074
 
1686
2075
  /// Returns whether `b` succeeds `a` by having a newer view or same view and newer op.
@@ -1731,7 +2120,8 @@ pub fn Replica(
1731
2120
  fn commit_ops(self: *Self, commit: u64) void {
1732
2121
  // TODO Restrict `view_change` status only to the leader purely as defense-in-depth.
1733
2122
  // Be careful of concurrency when doing this, as successive view changes can happen quickly.
1734
- assert(self.status == .normal or self.status == .view_change);
2123
+ assert(self.status == .normal or self.status == .view_change or
2124
+ (self.status == .recovering and self.replica_count == 1));
1735
2125
  assert(self.commit_min <= self.commit_max);
1736
2126
  assert(self.commit_min <= self.op);
1737
2127
  assert(self.commit_max <= self.op or self.commit_max > self.op);
@@ -1775,12 +2165,14 @@ pub fn Replica(
1775
2165
 
1776
2166
  fn commit_ops_read(self: *Self) void {
1777
2167
  assert(self.committing);
1778
- assert(self.status == .normal or self.status == .view_change);
2168
+ assert(self.status == .normal or self.status == .view_change or
2169
+ (self.status == .recovering and self.replica_count == 1));
1779
2170
  assert(self.commit_min <= self.commit_max);
1780
2171
  assert(self.commit_min <= self.op);
1781
2172
 
1782
2173
  if (!self.valid_hash_chain("commit_ops_read")) {
1783
2174
  self.committing = false;
2175
+ assert(self.replica_count > 1);
1784
2176
  return;
1785
2177
  }
1786
2178
  assert(self.op >= self.commit_max);
@@ -1789,12 +2181,22 @@ pub fn Replica(
1789
2181
  // Even a naive state transfer may fail to correct for this.
1790
2182
  if (self.commit_min < self.commit_max and self.commit_min < self.op) {
1791
2183
  const op = self.commit_min + 1;
1792
- const checksum = self.journal.entry_for_op_exact(op).?.checksum;
2184
+ const checksum = self.journal.header_with_op(op).?.checksum;
1793
2185
  self.journal.read_prepare(commit_ops_commit, op, checksum, null);
1794
2186
  } else {
1795
2187
  self.committing = false;
1796
2188
  // This is an optimization to expedite the view change before the `repair_timeout`:
1797
2189
  if (self.status == .view_change and self.repairs_allowed()) self.repair();
2190
+
2191
+ if (self.status == .recovering) {
2192
+ assert(self.replica_count == 1);
2193
+ assert(self.commit_min == self.commit_max);
2194
+ assert(self.commit_min == self.op);
2195
+ self.transition_to_normal_from_recovering_status(0);
2196
+ } else {
2197
+ // We expect that a cluster-of-one only calls commit_ops() in recovering status.
2198
+ assert(self.replica_count > 1);
2199
+ }
1798
2200
  }
1799
2201
  }
1800
2202
 
@@ -1806,42 +2208,43 @@ pub fn Replica(
1806
2208
 
1807
2209
  if (prepare == null) {
1808
2210
  log.debug("{}: commit_ops_commit: prepare == null", .{self.replica});
2211
+ if (self.replica_count == 1) @panic("cannot recover corrupt prepare");
1809
2212
  return;
1810
2213
  }
1811
2214
 
1812
- if (self.status == .view_change) {
1813
- if (self.leader_index(self.view) != self.replica) {
1814
- log.debug("{}: commit_ops_commit: no longer leader", .{self.replica});
1815
- return;
1816
- }
2215
+ switch (self.status) {
2216
+ .normal => {},
2217
+ .view_change => {
2218
+ if (self.leader_index(self.view) != self.replica) {
2219
+ log.debug("{}: commit_ops_commit: no longer leader", .{self.replica});
2220
+ assert(self.replica_count > 1);
2221
+ return;
2222
+ }
1817
2223
 
1818
- // Only the leader may commit during a view change before starting the new view.
1819
- // Fall through if this is indeed the case.
1820
- } else if (self.status != .normal) {
1821
- log.debug("{}: commit_ops_commit: no longer in normal status", .{self.replica});
1822
- return;
2224
+ // Only the leader may commit during a view change before starting the new view.
2225
+ // Fall through if this is indeed the case.
2226
+ },
2227
+ .recovering => {
2228
+ assert(self.replica_count == 1);
2229
+ assert(self.leader_index(self.view) == self.replica);
2230
+ },
1823
2231
  }
1824
2232
 
1825
2233
  const op = self.commit_min + 1;
1826
2234
 
1827
2235
  if (prepare.?.header.op != op) {
1828
2236
  log.debug("{}: commit_ops_commit: op changed", .{self.replica});
2237
+ assert(self.replica_count > 1);
1829
2238
  return;
1830
2239
  }
1831
2240
 
1832
- if (prepare.?.header.checksum != self.journal.entry_for_op_exact(op).?.checksum) {
2241
+ if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
1833
2242
  log.debug("{}: commit_ops_commit: checksum changed", .{self.replica});
2243
+ assert(self.replica_count > 1);
1834
2244
  return;
1835
2245
  }
1836
2246
 
1837
- // TODO We can optimize this to commit into the client table reply if it exists.
1838
- const reply = self.message_bus.get_message() orelse {
1839
- log.err("{}: commit_ops_commit: waiting for message", .{self.replica});
1840
- return;
1841
- };
1842
- defer self.message_bus.unref(reply);
1843
-
1844
- self.commit_op(prepare.?, reply);
2247
+ self.commit_op(prepare.?);
1845
2248
 
1846
2249
  assert(self.commit_min == op);
1847
2250
  assert(self.commit_min <= self.commit_max);
@@ -1851,11 +2254,12 @@ pub fn Replica(
1851
2254
  self.commit_ops_read();
1852
2255
  }
1853
2256
 
1854
- fn commit_op(self: *Self, prepare: *const Message, reply: *Message) void {
2257
+ fn commit_op(self: *Self, prepare: *const Message) void {
1855
2258
  // TODO Can we add more checks around allowing commit_op() during a view change?
1856
- assert(self.status == .normal or self.status == .view_change);
2259
+ assert(self.status == .normal or self.status == .view_change or
2260
+ (self.status == .recovering and self.replica_count == 1));
1857
2261
  assert(prepare.header.command == .prepare);
1858
- assert(prepare.header.operation != .init);
2262
+ assert(prepare.header.operation != .root);
1859
2263
  assert(prepare.header.op == self.commit_min + 1);
1860
2264
  assert(prepare.header.op <= self.op);
1861
2265
 
@@ -1863,7 +2267,7 @@ pub fn Replica(
1863
2267
  // happened since we last checked in `commit_ops_read()`. However, this would relate to
1864
2268
  // subsequent ops, since by now we have already verified the hash chain for this commit.
1865
2269
 
1866
- assert(self.journal.entry_for_op_exact(self.commit_min).?.checksum ==
2270
+ assert(self.journal.header_with_op(self.commit_min).?.checksum ==
1867
2271
  prepare.header.parent);
1868
2272
 
1869
2273
  log.debug("{}: commit_op: executing view={} {} op={} checksum={} ({s})", .{
@@ -1875,6 +2279,11 @@ pub fn Replica(
1875
2279
  @tagName(prepare.header.operation.cast(StateMachine)),
1876
2280
  });
1877
2281
 
2282
+ const reply = self.message_bus.get_message();
2283
+ defer self.message_bus.unref(reply);
2284
+
2285
+ assert(self.state_machine.commit_timestamp < prepare.header.timestamp);
2286
+
1878
2287
  const reply_body_size = @intCast(u32, self.state_machine.commit(
1879
2288
  prepare.header.client,
1880
2289
  prepare.header.operation.cast(StateMachine),
@@ -1882,6 +2291,9 @@ pub fn Replica(
1882
2291
  reply.buffer[@sizeOf(Header)..],
1883
2292
  ));
1884
2293
 
2294
+ assert(self.state_machine.commit_timestamp <= prepare.header.timestamp);
2295
+ self.state_machine.commit_timestamp = prepare.header.timestamp;
2296
+
1885
2297
  self.commit_min += 1;
1886
2298
  assert(self.commit_min == prepare.header.op);
1887
2299
  if (self.commit_min > self.commit_max) self.commit_max = self.commit_min;
@@ -1901,10 +2313,10 @@ pub fn Replica(
1901
2313
  .commit = prepare.header.op,
1902
2314
  .size = @sizeOf(Header) + reply_body_size,
1903
2315
  };
1904
- assert(reply.header.offset == 0);
2316
+ assert(reply.header.timestamp == 0);
1905
2317
  assert(reply.header.epoch == 0);
1906
2318
 
1907
- reply.header.set_checksum_body(reply.buffer[@sizeOf(Header)..reply.header.size]);
2319
+ reply.header.set_checksum_body(reply.body());
1908
2320
  reply.header.set_checksum();
1909
2321
 
1910
2322
  if (reply.header.operation == .register) {
@@ -1939,28 +2351,25 @@ pub fn Replica(
1939
2351
  return;
1940
2352
  }
1941
2353
 
1942
- const count = self.count_quorum(
1943
- &prepare.ok_from_all_replicas,
1944
- .prepare_ok,
1945
- prepare.message.header.checksum,
1946
- );
2354
+ const count = prepare.ok_from_all_replicas.count();
1947
2355
  assert(count >= self.quorum_replication);
2356
+ assert(count <= self.replica_count);
1948
2357
 
1949
- // TODO We can optimize this to commit into the client table reply if it exists.
1950
- const reply = self.message_bus.get_message() orelse {
1951
- // Eventually handled by on_prepare_timeout().
1952
- log.err("{}: commit_pipeline: waiting for message", .{self.replica});
1953
- return;
1954
- };
1955
- defer self.message_bus.unref(reply);
1956
-
1957
- self.commit_op(prepare.message, reply);
2358
+ self.commit_op(prepare.message);
1958
2359
 
1959
2360
  assert(self.commit_min == self.commit_max);
1960
2361
  assert(self.commit_max == prepare.message.header.op);
1961
2362
 
1962
- self.unref_prepare_message_and_quorum_messages(prepare);
1963
- assert(self.pipeline.pop() != null);
2363
+ self.message_bus.unref(self.pipeline.pop().?.message);
2364
+
2365
+ if (self.replica_count == 1) {
2366
+ if (self.pipeline.head_ptr()) |head| {
2367
+ // Write the next message in the queue.
2368
+ // A cluster-of-one writes prepares sequentially to avoid gaps in the WAL.
2369
+ self.write_prepare(head.message, .append);
2370
+ // The loop will wrap around and exit when `!ok_quorum_received`.
2371
+ }
2372
+ }
1964
2373
  }
1965
2374
 
1966
2375
  assert(self.prepare_timeout.ticking);
@@ -1968,6 +2377,39 @@ pub fn Replica(
1968
2377
  if (self.pipeline.count == 0) self.prepare_timeout.stop();
1969
2378
  }
1970
2379
 
2380
+ fn copy_latest_headers_and_set_size(
2381
+ self: *const Self,
2382
+ op_min: u64,
2383
+ op_max: u64,
2384
+ count_max: ?usize,
2385
+ message: *Message,
2386
+ ) usize {
2387
+ assert(op_max >= op_min);
2388
+ assert(count_max == null or count_max.? > 0);
2389
+ assert(message.header.command == .do_view_change or
2390
+ message.header.command == .start_view or
2391
+ message.header.command == .headers or
2392
+ message.header.command == .recovery_response);
2393
+
2394
+ const body_size_max = @sizeOf(Header) * std.math.min(
2395
+ @divExact(message.buffer.len - @sizeOf(Header), @sizeOf(Header)),
2396
+ // We must add 1 because op_max and op_min are both inclusive:
2397
+ count_max orelse std.math.min(64, op_max - op_min + 1),
2398
+ );
2399
+ assert(body_size_max >= @sizeOf(Header));
2400
+ assert(count_max == null or body_size_max == count_max.? * @sizeOf(Header));
2401
+
2402
+ const count = self.journal.copy_latest_headers_between(
2403
+ op_min,
2404
+ op_max,
2405
+ std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..][0..body_size_max]),
2406
+ );
2407
+
2408
+ message.header.size = @intCast(u32, @sizeOf(Header) * (1 + count));
2409
+
2410
+ return count;
2411
+ }
2412
+
1971
2413
  fn count_quorum(
1972
2414
  self: *Self,
1973
2415
  messages: *QuorumMessages,
@@ -1984,20 +2426,12 @@ pub fn Replica(
1984
2426
  assert(m.header.context == context);
1985
2427
  assert(m.header.replica == replica);
1986
2428
  switch (command) {
1987
- .prepare_ok => {
1988
- if (self.status == .normal) {
1989
- assert(self.leader());
1990
- assert(m.header.view == self.view);
1991
- } else {
1992
- assert(self.status == .view_change);
1993
- assert(m.header.view < self.view);
1994
- }
1995
- },
1996
2429
  .start_view_change => {
1997
2430
  assert(m.header.replica != self.replica);
1998
2431
  assert(m.header.view == self.view);
1999
2432
  },
2000
2433
  .do_view_change => assert(m.header.view == self.view),
2434
+ .recovery_response => assert(m.header.replica != self.replica),
2001
2435
  .nack_prepare => {
2002
2436
  // TODO See if we can restrict this branch further.
2003
2437
  assert(m.header.replica != self.replica);
@@ -2026,7 +2460,8 @@ pub fn Replica(
2026
2460
  const session = reply.header.commit; // The commit number becomes the session number.
2027
2461
  const request = reply.header.request;
2028
2462
 
2029
- assert(session > 0); // We reserved the `0` commit number for the cluster `.init` operation.
2463
+ // We reserved the `0` commit number for the cluster `.root` operation.
2464
+ assert(session > 0);
2030
2465
  assert(request == 0);
2031
2466
 
2032
2467
  // For correctness, it's critical that all replicas evict deterministically:
@@ -2090,13 +2525,13 @@ pub fn Replica(
2090
2525
  }
2091
2526
 
2092
2527
  /// The caller owns the returned message, if any, which has exactly 1 reference.
2093
- fn create_view_change_message(self: *Self, command: Command) ?*Message {
2528
+ fn create_view_change_message(self: *Self, command: Command) *Message {
2094
2529
  assert(command == .do_view_change or command == .start_view);
2095
2530
 
2096
2531
  // We may send a start_view message in normal status to resolve a follower's view jump:
2097
2532
  assert(self.status == .normal or self.status == .view_change);
2098
2533
 
2099
- const message = self.message_bus.get_message() orelse return null;
2534
+ const message = self.message_bus.get_message();
2100
2535
  defer self.message_bus.unref(message);
2101
2536
 
2102
2537
  message.header.* = .{
@@ -2107,8 +2542,8 @@ pub fn Replica(
2107
2542
  // The latest normal view (as specified in the 2012 paper) is different to the view
2108
2543
  // number contained in the prepare headers we include in the body. The former shows
2109
2544
  // how recent a view change the replica participated in, which may be much higher.
2110
- // We use the `offset` field to send this in addition to the current view number:
2111
- .offset = if (command == .do_view_change) self.view_normal else 0,
2545
+ // We use the `timestamp` field to send this in addition to the current view number:
2546
+ .timestamp = if (command == .do_view_change) self.view_normal else 0,
2112
2547
  .op = self.op,
2113
2548
  .commit = self.commit_max,
2114
2549
  };
@@ -2119,25 +2554,12 @@ pub fn Replica(
2119
2554
  // that cannot be repaired because they are gaps, and this must be relative to the
2120
2555
  // cluster as a whole (not relative to the difference between our op and commit number)
2121
2556
  // as otherwise we would break correctness.
2122
- const count_max = config.pipelining_max;
2557
+ const count_max = config.pipeline_max;
2123
2558
  assert(count_max > 0);
2124
2559
 
2125
- const size_max = @sizeOf(Header) * std.math.min(
2126
- std.math.max(@divFloor(message.buffer.len, @sizeOf(Header)), 2),
2127
- 1 + count_max,
2128
- );
2129
- assert(size_max > @sizeOf(Header));
2130
-
2131
- const count = self.journal.copy_latest_headers_between(
2132
- 0,
2133
- self.op,
2134
- std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..size_max]),
2135
- );
2136
-
2137
- // We expect that self.op always exists.
2138
- assert(count > 0);
2139
-
2140
- message.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
2560
+ const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, message);
2561
+ assert(count > 0); // We expect that self.op always exists.
2562
+ assert(@divExact(message.header.size, @sizeOf(Header)) == 1 + count);
2141
2563
 
2142
2564
  message.header.set_checksum_body(message.body());
2143
2565
  message.header.set_checksum();
@@ -2146,12 +2568,14 @@ pub fn Replica(
2146
2568
  }
2147
2569
 
2148
2570
  /// The caller owns the returned message, if any, which has exactly 1 reference.
2149
- fn create_message_from_header(self: *Self, header: Header) ?*Message {
2571
+ fn create_message_from_header(self: *Self, header: Header) *Message {
2150
2572
  assert(header.replica == self.replica);
2151
- assert(header.view == self.view or header.command == .request_start_view);
2573
+ assert(header.view == self.view or
2574
+ header.command == .request_start_view or
2575
+ header.command == .recovery);
2152
2576
  assert(header.size == @sizeOf(Header));
2153
2577
 
2154
- const message = self.message_bus.pool.get_header_only_message() orelse return null;
2578
+ const message = self.message_bus.pool.get_message();
2155
2579
  defer self.message_bus.unref(message);
2156
2580
 
2157
2581
  message.header.* = header;
@@ -2175,6 +2599,12 @@ pub fn Replica(
2175
2599
  /// uncommitted header gaps and compare them with the quorum of do_view_change messages
2176
2600
  /// received from other replicas, before starting the new view, to discard any that may be
2177
2601
  /// impossible to repair.
2602
+ ///
2603
+ /// For example, if the old primary replicates ops=7,8,9 (all uncommitted) but only op=9 is
2604
+ /// prepared on another replica before the old primary crashes, then this function finds a
2605
+ /// gap for ops=7,8 and will attempt to discard ops 7,8,9.
2606
+ // TODO To improve availability, potentially call this before the local headers are
2607
+ // repaired during the view change, so that we can participate in nacking headers.
2178
2608
  fn discard_uncommitted_headers(self: *Self) void {
2179
2609
  assert(self.status == .view_change);
2180
2610
  assert(self.leader_index(self.view) == self.replica);
@@ -2182,6 +2612,7 @@ pub fn Replica(
2182
2612
  assert(!self.repair_timeout.ticking);
2183
2613
  assert(self.op >= self.commit_max);
2184
2614
  assert(self.replica_count > 1);
2615
+ assert(self.op - self.commit_max <= config.journal_slot_count);
2185
2616
 
2186
2617
  const threshold = self.replica_count - self.quorum_replication;
2187
2618
  if (threshold == 0) {
@@ -2189,9 +2620,13 @@ pub fn Replica(
2189
2620
  return;
2190
2621
  }
2191
2622
 
2623
+ // Iterating > commit_max does not in itself guarantee that the header is uncommitted.
2624
+ // We must also count nacks from the quorum, since the old primary may have committed
2625
+ // another op just before crashing, if there was sufficient quorum. Counting nacks
2626
+ // ensures that the old primary could not possibly have committed the header.
2192
2627
  var op = self.op;
2193
2628
  while (op > self.commit_max) : (op -= 1) {
2194
- if (self.journal.entry_for_op_exact(op) != null) continue;
2629
+ if (self.journal.header_with_op(op) != null) continue;
2195
2630
 
2196
2631
  log.debug("{}: discard_uncommitted_headers: op={} gap", .{ self.replica, op });
2197
2632
 
@@ -2202,14 +2637,30 @@ pub fn Replica(
2202
2637
  assert(m.header.cluster == self.cluster);
2203
2638
  assert(m.header.replica == replica);
2204
2639
  assert(m.header.view == self.view);
2640
+ assert(m.header.commit <= self.commit_max);
2205
2641
 
2206
2642
  if (replica != self.replica) {
2207
- if (m.header.op < op) nacks += 1;
2208
-
2209
- log.debug("{}: discard_uncommitted_headers: replica={} op={}", .{
2643
+ // Check for a gap in the uncommitted headers from this replica.
2644
+ const received_headers = self.message_body_as_headers(m);
2645
+ assert(received_headers.len >= 1);
2646
+
2647
+ const received_op_min = received_headers[received_headers.len - 1].op;
2648
+ const received_op_max = received_headers[0].op;
2649
+ assert(received_op_max >= received_op_min);
2650
+
2651
+ const nack = for (received_headers) |*h| {
2652
+ if (h.op == op) break false;
2653
+ } else nack: {
2654
+ // Don't nack ops that didn't fit in the message's attached headers.
2655
+ break :nack op >= received_op_min;
2656
+ };
2657
+
2658
+ if (nack) nacks += 1;
2659
+ log.debug("{}: discard_uncommitted_headers: replica={} op={} nack={}", .{
2210
2660
  self.replica,
2211
2661
  m.header.replica,
2212
- m.header.op,
2662
+ op,
2663
+ nack,
2213
2664
  });
2214
2665
  }
2215
2666
  }
@@ -2223,12 +2674,15 @@ pub fn Replica(
2223
2674
  });
2224
2675
 
2225
2676
  if (nacks >= threshold) {
2677
+ assert(op > self.commit_max);
2678
+
2226
2679
  self.journal.remove_entries_from(op);
2227
2680
  self.op = op - 1;
2228
2681
 
2229
- assert(self.journal.entry_for_op(op) == null);
2230
- assert(!self.journal.dirty.bit(op));
2231
- assert(!self.journal.faulty.bit(op));
2682
+ const slot = self.journal.slot_for_op(op);
2683
+ assert(self.journal.header_for_op(op) == null);
2684
+ assert(!self.journal.dirty.bit(slot));
2685
+ assert(!self.journal.faulty.bit(slot));
2232
2686
  }
2233
2687
  }
2234
2688
  }
@@ -2243,10 +2697,11 @@ pub fn Replica(
2243
2697
 
2244
2698
  assert(self.valid_hash_chain("discard_uncommitted_ops_from"));
2245
2699
 
2700
+ const slot = self.journal.slot_with_op(op).?;
2246
2701
  assert(op > self.commit_max);
2247
2702
  assert(op <= self.op);
2248
- assert(self.journal.entry_for_op_exact_with_checksum(op, checksum) != null);
2249
- assert(self.journal.dirty.bit(op));
2703
+ assert(self.journal.header_with_op_and_checksum(op, checksum) != null);
2704
+ assert(self.journal.dirty.bit(slot));
2250
2705
 
2251
2706
  log.debug("{}: discard_uncommitted_ops_from: ops={}..{} view={}", .{
2252
2707
  self.replica,
@@ -2258,13 +2713,13 @@ pub fn Replica(
2258
2713
  self.journal.remove_entries_from(op);
2259
2714
  self.op = op - 1;
2260
2715
 
2261
- assert(self.journal.entry_for_op(op) == null);
2262
- assert(!self.journal.dirty.bit(op));
2263
- assert(!self.journal.faulty.bit(op));
2716
+ assert(self.journal.header_for_op(op) == null);
2717
+ assert(!self.journal.dirty.bit(slot));
2718
+ assert(!self.journal.faulty.bit(slot));
2264
2719
 
2265
2720
  // We require that `self.op` always exists. Rewinding `self.op` could change that.
2266
2721
  // However, we do this only as the leader within a view change, with all headers intact.
2267
- assert(self.journal.entry_for_op_exact(self.op) != null);
2722
+ assert(self.journal.header_with_op(self.op) != null);
2268
2723
  }
2269
2724
 
2270
2725
  /// Returns whether the replica is a follower for the current view.
@@ -2364,7 +2819,7 @@ pub fn Replica(
2364
2819
  return true;
2365
2820
  },
2366
2821
  // Only the leader may answer a request for a prepare without a context:
2367
- .request_prepare => if (message.header.context == 0) {
2822
+ .request_prepare => if (message.header.timestamp == 0) {
2368
2823
  log.warn("{}: on_{s}: ignoring (no context)", .{ self.replica, command });
2369
2824
  return true;
2370
2825
  },
@@ -2433,6 +2888,18 @@ pub fn Replica(
2433
2888
  if (self.ignore_request_message_follower(message)) return true;
2434
2889
  if (self.ignore_request_message_duplicate(message)) return true;
2435
2890
  if (self.ignore_request_message_preparing(message)) return true;
2891
+
2892
+ // Verify that the new request will fit in the WAL.
2893
+ // The message's op hasn't been assigned yet, but it will be `self.op + 1`.
2894
+ if (self.op + 1 >= self.op_checkpoint + config.journal_slot_count) {
2895
+ log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint={})", .{
2896
+ self.replica,
2897
+ message.header.op,
2898
+ self.op_checkpoint,
2899
+ });
2900
+ return true;
2901
+ }
2902
+
2436
2903
  return false;
2437
2904
  }
2438
2905
 
@@ -2485,7 +2952,9 @@ pub fn Replica(
2485
2952
  return false;
2486
2953
  } else {
2487
2954
  // The client may have only one request inflight at a time.
2488
- log.err("{}: on_request: ignoring new request (client bug)", .{self.replica});
2955
+ log.err("{}: on_request: ignoring new request (client bug)", .{
2956
+ self.replica,
2957
+ });
2489
2958
  return true;
2490
2959
  }
2491
2960
  } else {
@@ -2636,7 +3105,71 @@ pub fn Replica(
2636
3105
  return false;
2637
3106
  }
2638
3107
 
2639
- fn is_repair(self: *Self, message: *const Message) bool {
3108
+ /// Returns whether the highest known op is certain.
3109
+ ///
3110
+ /// After recovering the WAL, there are 2 possible outcomes:
3111
+ /// * All entries valid. The highest op is certain, and safe to set as `replica.op`.
3112
+ /// * One or more entries are faulty. The highest op isn't certain — it may be one of the
3113
+ /// broken entries.
3114
+ ///
3115
+ /// The replica must refrain from repairing any faulty slots until the highest op is known.
3116
+ /// Otherwise, if we were to repair a slot while uncertain of `replica.op`:
3117
+ ///
3118
+ /// * we may nack an op that we shouldn't, or
3119
+ /// * we may replace a prepared op that we were guaranteeing for the primary, potentially
3120
+ /// forking the log.
3121
+ ///
3122
+ ///
3123
+ /// Test for a fault the right of the current op. The fault might be our true op, and
3124
+ /// sharing our current `replica.op` might cause the cluster's op to likewise regress.
3125
+ ///
3126
+ /// Note that for our purposes here, we only care about entries that were faulty during
3127
+ /// WAL recovery, not ones that were found to be faulty after the fact (e.g. due to
3128
+ /// `request_prepare`).
3129
+ ///
3130
+ /// Cases (`✓`: `replica.op_checkpoint`, `✗`: faulty, `o`: `replica.op`):
3131
+ /// * ` ✓ o ✗ `: View change is unsafe.
3132
+ /// * ` ✗ ✓ o `: View change is unsafe.
3133
+ /// * ` ✓ ✗ o `: View change is safe.
3134
+ /// * ` ✓ = o `: View change is unsafe if any slots are faulty.
3135
+ /// (`replica.op_checkpoint` == `replica.op`).
3136
+ // TODO Use this function once we switch from recovery protocol to the superblock.
3137
+ // If there is an "unsafe" fault, we will need to request a start_view from the leader to
3138
+ // learn the op.
3139
+ fn op_certain(self: *const Self) bool {
3140
+ assert(self.status == .recovering);
3141
+ assert(self.journal.recovered);
3142
+ assert(self.op_checkpoint <= self.op);
3143
+
3144
+ const slot_op_checkpoint = self.journal.slot_for_op(self.op_checkpoint).index;
3145
+ const slot_op = self.journal.slot_with_op(self.op).?.index;
3146
+ const slot_known_range = vsr.SlotRange{
3147
+ .head = slot_op_checkpoint,
3148
+ .tail = slot_op,
3149
+ };
3150
+
3151
+ var iterator = self.journal.faulty.bits.iterator(.{ .kind = .set });
3152
+ while (iterator.next()) |slot| {
3153
+ // The command is `reserved` when the entry was found faulty during WAL recovery.
3154
+ // Faults found after WAL recovery are not relevant, because we know their op.
3155
+ if (self.journal.headers[slot.index].command == .reserved) {
3156
+ if (slot_op_checkpoint == slot_op or
3157
+ !slot_known_range.contains(slot))
3158
+ {
3159
+ log.warn("{}: op_certain: op not known (faulty_slot={}, op={}, op_checkpoint={})", .{
3160
+ self.replica,
3161
+ slot.index,
3162
+ self.op,
3163
+ self.op_checkpoint,
3164
+ });
3165
+ return false;
3166
+ }
3167
+ }
3168
+ }
3169
+ return true;
3170
+ }
3171
+
3172
+ fn is_repair(self: *const Self, message: *const Message) bool {
2640
3173
  assert(message.header.command == .prepare);
2641
3174
 
2642
3175
  if (self.status == .normal) {
@@ -2668,15 +3201,17 @@ pub fn Replica(
2668
3201
  assert(self.follower());
2669
3202
  assert(header.view == self.view);
2670
3203
  assert(header.op > self.op + 1);
2671
- // We may have learned of a higher `commit_max` through a commit message before jumping to a
2672
- // newer op that is less than `commit_max` but greater than `commit_min`:
3204
+ // We may have learned of a higher `commit_max` through a commit message before jumping
3205
+ // to a newer op that is less than `commit_max` but greater than `commit_min`:
2673
3206
  assert(header.op > self.commit_min);
3207
+ // Never overwrite an op that still needs to be checkpointed.
3208
+ assert(header.op - self.op_checkpoint < config.journal_slot_count);
2674
3209
 
2675
3210
  log.debug("{}: jump_to_newer_op: advancing: op={}..{} checksum={}..{}", .{
2676
3211
  self.replica,
2677
3212
  self.op,
2678
3213
  header.op - 1,
2679
- self.journal.entry_for_op_exact(self.op).?.checksum,
3214
+ self.journal.header_with_op(self.op).?.checksum,
2680
3215
  header.parent,
2681
3216
  });
2682
3217
 
@@ -2688,7 +3223,10 @@ pub fn Replica(
2688
3223
  fn message_body_as_headers(_: *Self, message: *const Message) []Header {
2689
3224
  // TODO Assert message commands that we expect this to be called for.
2690
3225
  assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
2691
- return std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..message.header.size]);
3226
+ return std.mem.bytesAsSlice(
3227
+ Header,
3228
+ message.buffer[@sizeOf(Header)..message.header.size],
3229
+ );
2692
3230
  }
2693
3231
 
2694
3232
  /// Panics if immediate neighbors in the same view would have a broken hash chain.
@@ -2710,6 +3248,29 @@ pub fn Replica(
2710
3248
  }
2711
3249
  }
2712
3250
 
3251
+ /// Searches the pipeline for a prepare for a given op and checksum.
3252
+ /// When `checksum` is `null`, match any checksum.
3253
+ fn pipeline_prepare_for_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Prepare {
3254
+ assert(self.status == .normal or self.status == .view_change);
3255
+
3256
+ // To optimize the search, we can leverage the fact that the pipeline is ordered and
3257
+ // continuous.
3258
+ if (self.pipeline.count == 0) return null;
3259
+ const head_op = self.pipeline.head_ptr().?.message.header.op;
3260
+ const tail_op = self.pipeline.tail_ptr().?.message.header.op;
3261
+ if (op < head_op) return null;
3262
+ if (op > tail_op) return null;
3263
+
3264
+ const pipeline_prepare = self.pipeline.get_ptr(op - head_op).?;
3265
+ assert(pipeline_prepare.message.header.op == op);
3266
+
3267
+ if (checksum == null or pipeline_prepare.message.header.checksum == checksum.?) {
3268
+ return pipeline_prepare;
3269
+ } else {
3270
+ return null;
3271
+ }
3272
+ }
3273
+
2713
3274
  /// Searches the pipeline for a prepare for a given client.
2714
3275
  fn pipeline_prepare_for_client(self: *Self, client: u128) ?*Prepare {
2715
3276
  assert(self.status == .normal);
@@ -2717,7 +3278,7 @@ pub fn Replica(
2717
3278
  assert(self.commit_min == self.commit_max);
2718
3279
 
2719
3280
  var op = self.commit_max + 1;
2720
- var parent = self.journal.entry_for_op_exact(self.commit_max).?.checksum;
3281
+ var parent = self.journal.header_with_op(self.commit_max).?.checksum;
2721
3282
  var iterator = self.pipeline.iterator();
2722
3283
  while (iterator.next_ptr()) |prepare| {
2723
3284
  assert(prepare.message.header.command == .prepare);
@@ -2732,7 +3293,7 @@ pub fn Replica(
2732
3293
  op += 1;
2733
3294
  }
2734
3295
 
2735
- assert(self.pipeline.count <= config.pipelining_max);
3296
+ assert(self.pipeline.count <= config.pipeline_max);
2736
3297
  assert(self.commit_max + self.pipeline.count == op - 1);
2737
3298
  assert(self.commit_max + self.pipeline.count == self.op);
2738
3299
 
@@ -2770,15 +3331,33 @@ pub fn Replica(
2770
3331
  assert(prepare.message.header.view <= ok.header.view);
2771
3332
  assert(prepare.message.header.op == ok.header.op);
2772
3333
  assert(prepare.message.header.commit == ok.header.commit);
2773
- assert(prepare.message.header.offset == ok.header.offset);
3334
+ assert(prepare.message.header.timestamp == ok.header.timestamp);
2774
3335
  assert(prepare.message.header.operation == ok.header.operation);
2775
3336
 
2776
3337
  return prepare;
2777
3338
  }
2778
3339
 
3340
+ fn recover(self: *Self) void {
3341
+ assert(self.status == .recovering);
3342
+ assert(self.replica_count > 1);
3343
+ assert(self.journal.recovered);
3344
+
3345
+ log.debug("{}: recover: sending recovery messages nonce={}", .{
3346
+ self.replica,
3347
+ self.recovery_nonce,
3348
+ });
3349
+
3350
+ self.send_header_to_other_replicas(.{
3351
+ .command = .recovery,
3352
+ .cluster = self.cluster,
3353
+ .context = self.recovery_nonce,
3354
+ .replica = self.replica,
3355
+ });
3356
+ }
3357
+
2779
3358
  /// Starting from the latest journal entry, backfill any missing or disconnected headers.
2780
- /// A header is disconnected if it breaks the hash chain with its newer neighbor to the right.
2781
- /// Since we work backwards from the latest entry, we should always be able to fix the chain.
3359
+ /// A header is disconnected if it breaks the chain with its newer neighbor to the right.
3360
+ /// Since we work back from the latest entry, we should always be able to fix the chain.
2782
3361
  /// Once headers are connected, backfill any dirty or faulty prepares.
2783
3362
  fn repair(self: *Self) void {
2784
3363
  if (!self.repair_timeout.ticking) {
@@ -2790,38 +3369,50 @@ pub fn Replica(
2790
3369
 
2791
3370
  assert(self.status == .normal or self.status == .view_change);
2792
3371
  assert(self.repairs_allowed());
3372
+
3373
+ assert(self.op_checkpoint <= self.op);
3374
+ assert(self.op_checkpoint <= self.commit_min);
2793
3375
  assert(self.commit_min <= self.op);
2794
3376
  assert(self.commit_min <= self.commit_max);
2795
3377
 
2796
- // We expect these always to exist:
2797
- assert(self.journal.entry_for_op_exact(self.commit_min) != null);
2798
- assert(self.journal.entry_for_op_exact(self.op) != null);
3378
+ assert(self.journal.header_with_op(self.commit_min) != null);
3379
+ assert(self.journal.header_with_op(self.op) != null);
3380
+
3381
+ // The replica repairs backwards from `commit_max`. But if `commit_max` is too high
3382
+ // (>1 WAL ahead), then bound it such that uncommitted WAL entries are not overwritten.
3383
+ const commit_max_limit = std.math.min(
3384
+ self.commit_max,
3385
+ self.op_checkpoint + config.journal_slot_count,
3386
+ );
2799
3387
 
2800
3388
  // Request outstanding committed prepares to advance our op number:
2801
3389
  // This handles the case of an idle cluster, where a follower will not otherwise advance.
2802
3390
  // This is not required for correctness, but for durability.
2803
- if (self.op < self.commit_max) {
3391
+ if (self.op < commit_max_limit) {
2804
3392
  // If the leader repairs during a view change, it will have already advanced
2805
3393
  // `self.op` to the latest op according to the quorum of `do_view_change` messages
2806
3394
  // received, so we must therefore be a follower in normal status:
2807
3395
  assert(self.status == .normal);
2808
3396
  assert(self.follower());
2809
- log.debug("{}: repair: op={} < commit_max={}", .{
3397
+ log.debug("{}: repair: op={} < commit_max_limit={}, commit_max={}", .{
2810
3398
  self.replica,
2811
3399
  self.op,
3400
+ commit_max_limit,
2812
3401
  self.commit_max,
2813
3402
  });
2814
3403
  // We need to advance our op number and therefore have to `request_prepare`,
2815
3404
  // since only `on_prepare()` can do this, not `repair_header()` in `on_headers()`.
2816
3405
  self.send_header_to_replica(self.leader_index(self.view), .{
2817
3406
  .command = .request_prepare,
2818
- // We cannot yet know the checksum of the prepare so we set the context to 0:
2819
- // Context is optional when requesting from the leader but required otherwise.
3407
+ // We cannot yet know the checksum of the prepare so we set the context and
3408
+ // timestamp to 0: Context is optional when requesting from the leader but
3409
+ // required otherwise.
2820
3410
  .context = 0,
3411
+ .timestamp = 0,
2821
3412
  .cluster = self.cluster,
2822
3413
  .replica = self.replica,
2823
3414
  .view = self.view,
2824
- .op = self.commit_max,
3415
+ .op = commit_max_limit,
2825
3416
  });
2826
3417
  return;
2827
3418
  }
@@ -2842,9 +3433,10 @@ pub fn Replica(
2842
3433
  assert(range.op_min > self.commit_min);
2843
3434
  assert(range.op_max < self.op);
2844
3435
  // A range of `op_min=0` or `op_max=0` should be impossible as a header break:
2845
- // This is the init op that is prepared when the cluster is initialized.
3436
+ // This is the root op that is prepared when the cluster is initialized.
2846
3437
  assert(range.op_min > 0);
2847
3438
  assert(range.op_max > 0);
3439
+
2848
3440
  if (self.choose_any_other_replica()) |replica| {
2849
3441
  self.send_header_to_replica(replica, .{
2850
3442
  .command = .request_headers,
@@ -2863,10 +3455,14 @@ pub fn Replica(
2863
3455
  assert(self.valid_hash_chain_between(self.commit_min, self.op));
2864
3456
 
2865
3457
  // Request and repair any dirty or faulty prepares:
2866
- if (self.journal.dirty.len > 0) return self.repair_prepares();
3458
+ if (self.journal.dirty.count > 0) return self.repair_prepares();
2867
3459
 
2868
3460
  // Commit ops, which may in turn discover faulty prepares and drive more repairs:
2869
- if (self.commit_min < self.commit_max) return self.commit_ops(self.commit_max);
3461
+ if (self.commit_min < self.commit_max) {
3462
+ assert(self.replica_count > 1);
3463
+ self.commit_ops(self.commit_max);
3464
+ return;
3465
+ }
2870
3466
 
2871
3467
  if (self.status == .view_change and self.leader_index(self.view) == self.replica) {
2872
3468
  if (self.repair_pipeline_op() != null) return self.repair_pipeline();
@@ -2921,10 +3517,13 @@ pub fn Replica(
2921
3517
  }
2922
3518
 
2923
3519
  if (header.op > self.op) {
2924
- log.debug("{}: repair_header: false (advances self.op)", .{self.replica});
3520
+ log.debug("{}: repair_header: false (advances self.op={})", .{
3521
+ self.replica,
3522
+ self.op,
3523
+ });
2925
3524
  return false;
2926
3525
  } else if (header.op == self.op) {
2927
- if (self.journal.entry_for_op_exact_with_checksum(self.op, header.checksum)) |_| {
3526
+ if (self.journal.header_with_op_and_checksum(self.op, header.checksum)) |_| {
2928
3527
  // Fall through below to check if self.op is uncommitted AND reordered,
2929
3528
  // which we would see by the presence of an earlier op with higher view number,
2930
3529
  // that breaks the chain with self.op. In this case, we must skip the repair to
@@ -2938,27 +3537,42 @@ pub fn Replica(
2938
3537
  }
2939
3538
  }
2940
3539
 
2941
- if (self.journal.entry(header)) |existing| {
3540
+ if (self.journal.header_for_entry(header)) |existing| {
3541
+ assert(existing.op == header.op);
3542
+
2942
3543
  // Do not replace any existing op lightly as doing so may impair durability and even
2943
3544
  // violate correctness by undoing a prepare already acknowledged to the leader:
2944
3545
  if (existing.checksum == header.checksum) {
2945
- if (!self.journal.dirty.bit(header.op)) {
2946
- log.debug("{}: repair_header: false (checksum clean)", .{self.replica});
3546
+ const slot = self.journal.slot_with_header(header).?;
3547
+ if (!self.journal.dirty.bit(slot)) {
3548
+ log.debug("{}: repair_header: op={} false (checksum clean)", .{
3549
+ self.replica,
3550
+ header.op,
3551
+ });
2947
3552
  return false;
2948
3553
  }
2949
3554
 
2950
- log.debug("{}: repair_header: exists, checksum dirty", .{self.replica});
3555
+ log.debug("{}: repair_header: op={} exists, checksum dirty", .{
3556
+ self.replica,
3557
+ header.op,
3558
+ });
2951
3559
  } else if (existing.view == header.view) {
2952
3560
  // The journal must have wrapped:
2953
3561
  // We expect that the same view and op will have the same checksum.
2954
3562
  assert(existing.op != header.op);
2955
3563
 
2956
3564
  if (existing.op > header.op) {
2957
- log.debug("{}: repair_header: false (view has newer op)", .{self.replica});
3565
+ log.debug("{}: repair_header: op={} false (view has newer op)", .{
3566
+ self.replica,
3567
+ header.op,
3568
+ });
2958
3569
  return false;
2959
3570
  }
2960
3571
 
2961
- log.debug("{}: repair_header: exists, view has older op", .{self.replica});
3572
+ log.debug("{}: repair_header: op={} exists, view has older op", .{
3573
+ self.replica,
3574
+ header.op,
3575
+ });
2962
3576
  } else {
2963
3577
  assert(existing.view != header.view);
2964
3578
  assert(existing.op == header.op or existing.op != header.op);
@@ -2966,38 +3580,37 @@ pub fn Replica(
2966
3580
  if (!self.repair_header_would_connect_hash_chain(header)) {
2967
3581
  // We cannot replace this op until we are sure that doing so would not
2968
3582
  // violate any prior commitments made to the leader.
2969
- log.debug("{}: repair_header: false (exists)", .{self.replica});
3583
+ log.debug("{}: repair_header: op={} false (exists)", .{
3584
+ self.replica,
3585
+ header.op,
3586
+ });
2970
3587
  return false;
2971
3588
  }
2972
3589
 
2973
- log.debug("{}: repair_header: exists, connects hash chain", .{self.replica});
3590
+ log.debug("{}: repair_header: op={} exists, connects hash chain", .{
3591
+ self.replica,
3592
+ header.op,
3593
+ });
2974
3594
  }
2975
3595
  } else {
2976
- log.debug("{}: repair_header: gap", .{self.replica});
3596
+ log.debug("{}: repair_header: op={} gap", .{ self.replica, header.op });
2977
3597
  }
2978
3598
 
2979
3599
  // Caveat: Do not repair an existing op or gap if doing so would break the hash chain:
2980
3600
  if (self.repair_header_would_break_hash_chain_with_next_entry(header)) {
2981
- log.debug("{}: repair_header: false (breaks hash chain)", .{self.replica});
3601
+ log.debug("{}: repair_header: op={} false (breaks hash chain)", .{
3602
+ self.replica,
3603
+ header.op,
3604
+ });
2982
3605
  return false;
2983
3606
  }
2984
3607
 
2985
- // Caveat: Do not repair an existing op or gap if doing so would overlap another:
2986
- if (self.repair_header_would_overlap_another(header)) {
2987
- if (!self.repair_header_would_connect_hash_chain(header)) {
2988
- log.debug("{}: repair_header: false (overlap)", .{self.replica});
2989
- return false;
2990
- }
2991
- // We may have to overlap previous entries in order to connect the hash chain:
2992
- log.debug("{}: repair_header: overlap, connects hash chain", .{self.replica});
2993
- }
2994
-
2995
3608
  // TODO Snapshots: Skip if this header is already snapshotted.
2996
3609
 
2997
3610
  assert(header.op < self.op or
2998
- self.journal.entry_for_op_exact(self.op).?.checksum == header.checksum);
3611
+ self.journal.header_with_op(self.op).?.checksum == header.checksum);
2999
3612
 
3000
- self.journal.set_entry_as_dirty(header);
3613
+ self.journal.set_header_as_dirty(header);
3001
3614
  return true;
3002
3615
  }
3003
3616
 
@@ -3018,10 +3631,12 @@ pub fn Replica(
3018
3631
  if (header.checksum == next.parent) {
3019
3632
  assert(header.view <= next.view);
3020
3633
  assert(header.op + 1 == next.op);
3021
- // We don't break with `next` but this is no guarantee that `next` does not break.
3634
+ // We don't break with `next` but this is no guarantee that `next` does not
3635
+ // break.
3022
3636
  return false;
3023
3637
  } else {
3024
- // If the journal has wrapped, then err in favor of a break regardless of op order:
3638
+ // If the journal has wrapped, then err in favor of a break regardless of op
3639
+ // order:
3025
3640
  return true;
3026
3641
  }
3027
3642
  }
@@ -3030,14 +3645,17 @@ pub fn Replica(
3030
3645
  return false;
3031
3646
  }
3032
3647
 
3033
- /// If we repair this header, then would this connect the hash chain through to the latest op?
3034
- /// This offers a strong guarantee that may be used to replace or overlap an existing op.
3648
+ /// If we repair this header, then would this connect the hash chain through to the latest
3649
+ /// op? This offers a strong guarantee that may be used to replace or overlap an existing
3650
+ /// op.
3035
3651
  ///
3036
3652
  /// Here is an example of what could go wrong if we did not check for complete connection:
3037
3653
  ///
3038
3654
  /// 1. We do a prepare that's going to be committed.
3039
- /// 2. We do a stale prepare to the right of this, ignoring the hash chain break to the left.
3040
- /// 3. We do another stale prepare that replaces the first op because it connects to the second.
3655
+ /// 2. We do a stale prepare to the right of this, ignoring the hash chain break to the
3656
+ /// left.
3657
+ /// 3. We do another stale prepare that replaces the first op because it connects to the
3658
+ /// second.
3041
3659
  ///
3042
3660
  /// This would violate our quorum replication commitment to the leader.
3043
3661
  /// The mistake in this example was not that we ignored the break to the left, which we must
@@ -3060,43 +3678,16 @@ pub fn Replica(
3060
3678
  }
3061
3679
 
3062
3680
  assert(entry.op == self.op);
3063
- assert(entry.checksum == self.journal.entry_for_op_exact(self.op).?.checksum);
3681
+ assert(entry.checksum == self.journal.header_with_op(self.op).?.checksum);
3064
3682
  return true;
3065
3683
  }
3066
3684
 
3067
- /// If we repair this header, then would this overlap and overwrite part of another batch?
3068
- /// Journal entries have variable-sized batches that may overlap if entries are disconnected.
3069
- fn repair_header_would_overlap_another(self: *Self, header: *const Header) bool {
3070
- // TODO Snapshots: Handle journal wrap around.
3071
- {
3072
- // Look behind this entry for any preceeding entry that this would overlap:
3073
- var op: u64 = header.op;
3074
- while (op > 0) {
3075
- op -= 1;
3076
- if (self.journal.entry_for_op(op)) |neighbor| {
3077
- if (Journal.next_offset(neighbor) > header.offset) return true;
3078
- break;
3079
- }
3080
- }
3081
- }
3082
- {
3083
- // Look beyond this entry for any succeeding entry that this would overlap:
3084
- var op: u64 = header.op + 1;
3085
- while (op <= self.op) : (op += 1) {
3086
- if (self.journal.entry_for_op(op)) |neighbor| {
3087
- if (Journal.next_offset(header) > neighbor.offset) return true;
3088
- break;
3089
- }
3090
- }
3091
- }
3092
- return false;
3093
- }
3094
-
3095
3685
  /// Reads prepares into the pipeline (before we start the view as the new leader).
3096
3686
  fn repair_pipeline(self: *Self) void {
3097
3687
  assert(self.status == .view_change);
3098
3688
  assert(self.leader_index(self.view) == self.replica);
3099
3689
  assert(self.commit_max < self.op);
3690
+ assert(self.journal.dirty.count == 0);
3100
3691
 
3101
3692
  if (self.repairing_pipeline) {
3102
3693
  log.debug("{}: repair_pipeline: already repairing...", .{self.replica});
@@ -3111,11 +3702,57 @@ pub fn Replica(
3111
3702
  self.repair_pipeline_read();
3112
3703
  }
3113
3704
 
3705
+ /// Discard messages from the prepare pipeline.
3706
+ /// Retain uncommitted messages that belong in the current view to maximize durability.
3707
+ fn repair_pipeline_diff(self: *Self) void {
3708
+ assert(self.status == .view_change);
3709
+ assert(self.leader_index(self.view) == self.replica);
3710
+
3711
+ // Discard messages from the front of the pipeline that committed since we were leader.
3712
+ while (self.pipeline.head_ptr()) |prepare| {
3713
+ if (prepare.message.header.op > self.commit_max) break;
3714
+
3715
+ self.message_bus.unref(self.pipeline.pop().?.message);
3716
+ }
3717
+
3718
+ // Discard the whole pipeline if it is now disconnected from the WAL's hash chain.
3719
+ if (self.pipeline.head_ptr()) |pipeline_head| {
3720
+ const parent = self.journal.header_with_op_and_checksum(
3721
+ pipeline_head.message.header.op - 1,
3722
+ pipeline_head.message.header.parent,
3723
+ );
3724
+ if (parent == null) {
3725
+ while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
3726
+ assert(self.pipeline.count == 0);
3727
+ }
3728
+ }
3729
+
3730
+ // Discard messages from the back of the pipeline that are not part of this view.
3731
+ while (self.pipeline.tail_ptr()) |prepare| {
3732
+ if (self.journal.has(prepare.message.header)) break;
3733
+
3734
+ self.message_bus.unref(self.pipeline.pop_tail().?.message);
3735
+ }
3736
+
3737
+ log.debug("{}: repair_pipeline_diff: {} prepare(s)", .{
3738
+ self.replica,
3739
+ self.pipeline.count,
3740
+ });
3741
+
3742
+ self.verify_pipeline();
3743
+
3744
+ // Do not reset `repairing_pipeline` here as this must be reset by the read callback.
3745
+ // Otherwise, we would be making `repair_pipeline()` reentrant.
3746
+ }
3747
+
3114
3748
  /// Returns the next `op` number that needs to be read into the pipeline.
3115
3749
  fn repair_pipeline_op(self: *Self) ?u64 {
3116
3750
  assert(self.status == .view_change);
3117
3751
  assert(self.leader_index(self.view) == self.replica);
3118
3752
 
3753
+ // We cannot rely on `pipeline.count` below unless the pipeline has first been diffed.
3754
+ self.repair_pipeline_diff();
3755
+
3119
3756
  const op = self.commit_max + self.pipeline.count + 1;
3120
3757
  if (op <= self.op) return op;
3121
3758
 
@@ -3133,7 +3770,7 @@ pub fn Replica(
3133
3770
  assert(op <= self.op);
3134
3771
  assert(self.commit_max + self.pipeline.count + 1 == op);
3135
3772
 
3136
- const checksum = self.journal.entry_for_op_exact(op).?.checksum;
3773
+ const checksum = self.journal.header_with_op(op).?.checksum;
3137
3774
 
3138
3775
  log.debug("{}: repair_pipeline_read: op={} checksum={}", .{
3139
3776
  self.replica,
@@ -3192,7 +3829,7 @@ pub fn Replica(
3192
3829
  return;
3193
3830
  }
3194
3831
 
3195
- if (prepare.?.header.checksum != self.journal.entry_for_op_exact(op).?.checksum) {
3832
+ if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
3196
3833
  log.debug("{}: repair_pipeline_push: checksum changed", .{self.replica});
3197
3834
  return;
3198
3835
  }
@@ -3206,7 +3843,11 @@ pub fn Replica(
3206
3843
  prepare.?.header.checksum,
3207
3844
  });
3208
3845
 
3209
- self.pipeline.push(.{ .message = prepare.?.ref() }) catch unreachable;
3846
+ if (self.pipeline.tail_ptr()) |parent| {
3847
+ assert(prepare.?.header.parent == parent.message.header.checksum);
3848
+ }
3849
+
3850
+ self.pipeline.push_assume_capacity(.{ .message = prepare.?.ref() });
3210
3851
  assert(self.pipeline.count >= 1);
3211
3852
 
3212
3853
  self.repairing_pipeline = true;
@@ -3216,7 +3857,7 @@ pub fn Replica(
3216
3857
  fn repair_prepares(self: *Self) void {
3217
3858
  assert(self.status == .normal or self.status == .view_change);
3218
3859
  assert(self.repairs_allowed());
3219
- assert(self.journal.dirty.len > 0);
3860
+ assert(self.journal.dirty.count > 0);
3220
3861
 
3221
3862
  // Request enough prepares to utilize our max IO depth:
3222
3863
  var budget = self.journal.writes.available();
@@ -3225,11 +3866,34 @@ pub fn Replica(
3225
3866
  return;
3226
3867
  }
3227
3868
 
3869
+ if (self.op < config.journal_slot_count) {
3870
+ // The op is known, and this is the first WAL cycle.
3871
+ // Therefore, any faulty ops to the right of `replica.op` are corrupt reserved
3872
+ // entries from the initial format.
3873
+ var op: usize = self.op + 1;
3874
+ while (op < config.journal_slot_count) : (op += 1) {
3875
+ const slot = self.journal.slot_for_op(op);
3876
+ assert(slot.index == op);
3877
+
3878
+ if (self.journal.faulty.bit(slot)) {
3879
+ assert(self.journal.headers[op].command == .reserved);
3880
+ self.journal.dirty.clear(slot);
3881
+ self.journal.faulty.clear(slot);
3882
+ log.debug("{}: repair_prepares: op={} (op known, first cycle)", .{
3883
+ self.replica,
3884
+ op,
3885
+ });
3886
+ }
3887
+ }
3888
+ }
3889
+
3228
3890
  var op = self.op + 1;
3229
- while (op > 0) {
3891
+ const op_min = op -| config.journal_slot_count;
3892
+ while (op > op_min) {
3230
3893
  op -= 1;
3231
3894
 
3232
- if (self.journal.dirty.bit(op)) {
3895
+ const slot = self.journal.slot_for_op(op);
3896
+ if (self.journal.dirty.bit(slot)) {
3233
3897
  // If this is an uncommitted op, and we are the leader in `view_change` status,
3234
3898
  // then we will `request_prepare` from the cluster, set `nack_prepare_op`,
3235
3899
  // and stop repairing any further prepares:
@@ -3251,7 +3915,7 @@ pub fn Replica(
3251
3915
  }
3252
3916
  }
3253
3917
  } else {
3254
- assert(!self.journal.faulty.bit(op));
3918
+ assert(!self.journal.faulty.bit(slot));
3255
3919
  }
3256
3920
  }
3257
3921
  }
@@ -3273,16 +3937,17 @@ pub fn Replica(
3273
3937
  /// This is effectively "many-to-one" repair, where a single replica recovers using the
3274
3938
  /// resources of many replicas, for faster recovery.
3275
3939
  fn repair_prepare(self: *Self, op: u64) bool {
3940
+ const slot = self.journal.slot_with_op(op).?;
3941
+ const checksum = self.journal.header_with_op(op).?.checksum;
3942
+
3276
3943
  assert(self.status == .normal or self.status == .view_change);
3277
3944
  assert(self.repairs_allowed());
3278
- assert(self.journal.dirty.bit(op));
3279
-
3280
- const checksum = self.journal.entry_for_op_exact(op).?.checksum;
3945
+ assert(self.journal.dirty.bit(slot));
3281
3946
 
3282
3947
  // We may be appending to or repairing the journal concurrently.
3283
3948
  // We do not want to re-request any of these prepares unnecessarily.
3284
3949
  if (self.journal.writing(op, checksum)) {
3285
- log.debug("{}: repair_prepare: already writing op={} checksum={}", .{
3950
+ log.debug("{}: repair_prepare: op={} checksum={} (already writing)", .{
3286
3951
  self.replica,
3287
3952
  op,
3288
3953
  checksum,
@@ -3290,11 +3955,46 @@ pub fn Replica(
3290
3955
  return false;
3291
3956
  }
3292
3957
 
3958
+ // The message may be available in the local pipeline.
3959
+ // For example (replica_count=3):
3960
+ // 1. View=1: Replica 1 is leader, and prepares op 5. The local write fails.
3961
+ // 2. Time passes. The view changes (e.g. due to a timeout)…
3962
+ // 3. View=4: Replica 1 is leader again, and is repairing op 5
3963
+ // (which is still in the pipeline).
3964
+ //
3965
+ // Using the pipeline to repair is faster than a `request_prepare`.
3966
+ // Also, messages in the pipeline are never corrupt.
3967
+ if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
3968
+ assert(prepare.message.header.op == op);
3969
+ assert(prepare.message.header.checksum == checksum);
3970
+
3971
+ if (self.replica_count == 1) {
3972
+ // This op won't start writing until all ops in the pipeline preceding it have
3973
+ // been written.
3974
+ log.debug("{}: repair_prepare: op={} checksum={} (serializing append)", .{
3975
+ self.replica,
3976
+ op,
3977
+ checksum,
3978
+ });
3979
+ assert(op > self.pipeline.head_ptr().?.message.header.op);
3980
+ return false;
3981
+ }
3982
+
3983
+ log.debug("{}: repair_prepare: op={} checksum={} (from pipeline)", .{
3984
+ self.replica,
3985
+ op,
3986
+ checksum,
3987
+ });
3988
+ self.write_prepare(prepare.message, .pipeline);
3989
+ return true;
3990
+ }
3991
+
3293
3992
  const request_prepare = Header{
3294
3993
  .command = .request_prepare,
3295
- // If we request a prepare from a follower, as below, it is critical to pass a checksum:
3296
- // Otherwise we could receive different prepares for the same op number.
3994
+ // If we request a prepare from a follower, as below, it is critical to pass a
3995
+ // checksum: Otherwise we could receive different prepares for the same op number.
3297
3996
  .context = checksum,
3997
+ .timestamp = 1, // The checksum is included in context.
3298
3998
  .cluster = self.cluster,
3299
3999
  .replica = self.replica,
3300
4000
  .view = self.view,
@@ -3305,7 +4005,7 @@ pub fn Replica(
3305
4005
  // Only the leader is allowed to do repairs in a view change:
3306
4006
  assert(self.leader_index(self.view) == self.replica);
3307
4007
 
3308
- const reason = if (self.journal.faulty.bit(op)) "faulty" else "dirty";
4008
+ const reason = if (self.journal.faulty.bit(slot)) "faulty" else "dirty";
3309
4009
  log.debug(
3310
4010
  "{}: repair_prepare: op={} checksum={} (uncommitted, {s}, view_change)",
3311
4011
  .{
@@ -3316,7 +4016,7 @@ pub fn Replica(
3316
4016
  },
3317
4017
  );
3318
4018
 
3319
- if (self.replica_count == 2 and !self.journal.faulty.bit(op)) {
4019
+ if (self.replica_count == 2 and !self.journal.faulty.bit(slot)) {
3320
4020
  // This is required to avoid a liveness issue for a cluster-of-two where a new
3321
4021
  // leader learns of an op during a view change but where the op is faulty on
3322
4022
  // the old leader. We must immediately roll back the op since it could not have
@@ -3336,17 +4036,11 @@ pub fn Replica(
3336
4036
  assert(nack_prepare_op <= op);
3337
4037
  if (nack_prepare_op != op) {
3338
4038
  self.nack_prepare_op = op;
3339
- self.reset_quorum_messages(
3340
- &self.nack_prepare_from_other_replicas,
3341
- .nack_prepare,
3342
- );
4039
+ self.reset_quorum_counter(&self.nack_prepare_from_other_replicas);
3343
4040
  }
3344
4041
  } else {
3345
4042
  self.nack_prepare_op = op;
3346
- self.reset_quorum_messages(
3347
- &self.nack_prepare_from_other_replicas,
3348
- .nack_prepare,
3349
- );
4043
+ self.reset_quorum_counter(&self.nack_prepare_from_other_replicas);
3350
4044
  }
3351
4045
 
3352
4046
  assert(self.nack_prepare_op.? == op);
@@ -3354,7 +4048,7 @@ pub fn Replica(
3354
4048
  self.send_header_to_other_replicas(request_prepare);
3355
4049
  } else {
3356
4050
  const nature = if (op > self.commit_max) "uncommitted" else "committed";
3357
- const reason = if (self.journal.faulty.bit(op)) "faulty" else "dirty";
4051
+ const reason = if (self.journal.faulty.bit(slot)) "faulty" else "dirty";
3358
4052
  log.debug("{}: repair_prepare: op={} checksum={} ({s}, {s})", .{
3359
4053
  self.replica,
3360
4054
  op,
@@ -3417,22 +4111,6 @@ pub fn Replica(
3417
4111
  self.send_message_to_replica(next, message);
3418
4112
  }
3419
4113
 
3420
- /// Empties the prepare pipeline, unreffing all prepare and prepare_ok messages.
3421
- /// Stops the prepare timeout and resets the timeouts counter.
3422
- fn reset_pipeline(self: *Self) void {
3423
- while (self.pipeline.pop()) |prepare| {
3424
- self.unref_prepare_message_and_quorum_messages(&prepare);
3425
- }
3426
-
3427
- self.prepare_timeout.stop();
3428
-
3429
- assert(self.pipeline.count == 0);
3430
- assert(self.prepare_timeout.ticking == false);
3431
-
3432
- // Do not reset `repairing_pipeline` here as this must be reset by the read callback.
3433
- // Otherwise, we would be making `repair_pipeline()` reentrant.
3434
- }
3435
-
3436
4114
  fn reset_quorum_messages(self: *Self, messages: *QuorumMessages, command: Command) void {
3437
4115
  assert(messages.len == config.replicas_max);
3438
4116
  var view: ?u32 = null;
@@ -3457,7 +4135,27 @@ pub fn Replica(
3457
4135
  received.* = null;
3458
4136
  }
3459
4137
  assert(count <= self.replica_count);
3460
- log.debug("{}: reset {} {s} message(s)", .{ self.replica, count, @tagName(command) });
4138
+ log.debug("{}: reset {} {s} message(s) from view={}", .{
4139
+ self.replica,
4140
+ count,
4141
+ @tagName(command),
4142
+ view,
4143
+ });
4144
+ }
4145
+
4146
+ fn reset_quorum_counter(self: *Self, counter: *QuorumCounter) void {
4147
+ var counter_iterator = counter.iterator(.{});
4148
+ while (counter_iterator.next()) |replica| {
4149
+ assert(replica < self.replica_count);
4150
+ }
4151
+
4152
+ counter.setIntersection(quorum_counter_null);
4153
+ assert(counter.count() == 0);
4154
+
4155
+ var replica: usize = 0;
4156
+ while (replica < self.replica_count) : (replica += 1) {
4157
+ assert(!counter.isSet(replica));
4158
+ }
3461
4159
  }
3462
4160
 
3463
4161
  fn reset_quorum_do_view_change(self: *Self) void {
@@ -3466,15 +4164,25 @@ pub fn Replica(
3466
4164
  }
3467
4165
 
3468
4166
  fn reset_quorum_nack_prepare(self: *Self) void {
3469
- self.reset_quorum_messages(&self.nack_prepare_from_other_replicas, .nack_prepare);
4167
+ self.reset_quorum_counter(&self.nack_prepare_from_other_replicas);
3470
4168
  self.nack_prepare_op = null;
3471
4169
  }
3472
4170
 
3473
4171
  fn reset_quorum_start_view_change(self: *Self) void {
3474
- self.reset_quorum_messages(&self.start_view_change_from_other_replicas, .start_view_change);
4172
+ self.reset_quorum_counter(&self.start_view_change_from_other_replicas);
3475
4173
  self.start_view_change_quorum = false;
3476
4174
  }
3477
4175
 
4176
+ fn reset_quorum_recovery_response(self: *Self) void {
4177
+ for (self.recovery_response_from_other_replicas) |*received, replica| {
4178
+ if (received.*) |message| {
4179
+ assert(replica != self.replica);
4180
+ self.message_bus.unref(message);
4181
+ received.* = null;
4182
+ }
4183
+ }
4184
+ }
4185
+
3478
4186
  fn send_prepare_ok(self: *Self, header: *const Header) void {
3479
4187
  assert(header.command == .prepare);
3480
4188
  assert(header.cluster == self.cluster);
@@ -3534,7 +4242,7 @@ pub fn Replica(
3534
4242
  .view = self.view,
3535
4243
  .op = header.op,
3536
4244
  .commit = header.commit,
3537
- .offset = header.offset,
4245
+ .timestamp = header.timestamp,
3538
4246
  .operation = header.operation,
3539
4247
  });
3540
4248
  } else {
@@ -3552,7 +4260,7 @@ pub fn Replica(
3552
4260
  // * being able to send what we have will allow the pipeline to commit earlier, and
3553
4261
  // * the leader will drop any prepare_ok for a prepare not in the pipeline.
3554
4262
  // This is safe only because the leader can verify against the prepare checksum.
3555
- if (self.journal.entry_for_op_exact(op)) |header| {
4263
+ if (self.journal.header_with_op(op)) |header| {
3556
4264
  self.send_prepare_ok(header);
3557
4265
  defer self.flush_loopback_queue();
3558
4266
  }
@@ -3576,25 +4284,20 @@ pub fn Replica(
3576
4284
  assert(self.status == .view_change);
3577
4285
  assert(self.start_view_change_quorum);
3578
4286
  assert(!self.do_view_change_quorum);
3579
- const count_start_view_change = self.count_quorum(
3580
- &self.start_view_change_from_other_replicas,
3581
- .start_view_change,
3582
- 0,
3583
- );
4287
+
4288
+ const count_start_view_change = self.start_view_change_from_other_replicas.count();
3584
4289
  assert(count_start_view_change >= self.quorum_view_change - 1);
4290
+ assert(count_start_view_change <= self.replica_count - 1);
3585
4291
 
3586
- const message = self.create_view_change_message(.do_view_change) orelse {
3587
- log.err("{}: send_do_view_change: waiting for message", .{self.replica});
3588
- return;
3589
- };
4292
+ const message = self.create_view_change_message(.do_view_change);
3590
4293
  defer self.message_bus.unref(message);
3591
4294
 
3592
4295
  assert(message.references == 1);
3593
4296
  assert(message.header.command == .do_view_change);
3594
4297
  assert(message.header.view == self.view);
3595
4298
  assert(message.header.op == self.op);
4299
+ assert(message.header.op == self.message_body_as_headers(message)[0].op);
3596
4300
  assert(message.header.commit == self.commit_max);
3597
- // TODO Assert that latest header in message body matches self.op.
3598
4301
 
3599
4302
  self.send_message_to_replica(self.leader_index(self.view), message);
3600
4303
  }
@@ -3618,25 +4321,14 @@ pub fn Replica(
3618
4321
  }
3619
4322
 
3620
4323
  fn send_header_to_client(self: *Self, client: u128, header: Header) void {
3621
- const message = self.create_message_from_header(header) orelse {
3622
- log.err("{}: no header-only message available, dropping message to client {}", .{
3623
- self.replica,
3624
- client,
3625
- });
3626
- return;
3627
- };
4324
+ const message = self.create_message_from_header(header);
3628
4325
  defer self.message_bus.unref(message);
3629
4326
 
3630
4327
  self.message_bus.send_message_to_client(client, message);
3631
4328
  }
3632
4329
 
3633
4330
  fn send_header_to_other_replicas(self: *Self, header: Header) void {
3634
- const message = self.create_message_from_header(header) orelse {
3635
- log.err("{}: no header-only message available, dropping message to replicas", .{
3636
- self.replica,
3637
- });
3638
- return;
3639
- };
4331
+ const message = self.create_message_from_header(header);
3640
4332
  defer self.message_bus.unref(message);
3641
4333
 
3642
4334
  var replica: u8 = 0;
@@ -3648,13 +4340,7 @@ pub fn Replica(
3648
4340
  }
3649
4341
 
3650
4342
  fn send_header_to_replica(self: *Self, replica: u8, header: Header) void {
3651
- const message = self.create_message_from_header(header) orelse {
3652
- log.err("{}: no header-only message available, dropping message to replica {}", .{
3653
- self.replica,
3654
- replica,
3655
- });
3656
- return;
3657
- };
4343
+ const message = self.create_message_from_header(header);
3658
4344
  defer self.message_bus.unref(message);
3659
4345
 
3660
4346
  self.send_message_to_replica(replica, message);
@@ -3686,6 +4372,7 @@ pub fn Replica(
3686
4372
 
3687
4373
  // TODO According to message.header.command, assert on the destination replica.
3688
4374
  switch (message.header.command) {
4375
+ .reserved => unreachable,
3689
4376
  .request => {
3690
4377
  // Do not assert message.header.replica because we forward .request messages.
3691
4378
  assert(self.status == .normal);
@@ -3738,6 +4425,16 @@ pub fn Replica(
3738
4425
  },
3739
4426
  else => unreachable,
3740
4427
  },
4428
+ .recovery => {
4429
+ assert(self.status == .recovering);
4430
+ assert(message.header.replica == self.replica);
4431
+ assert(message.header.context == self.recovery_nonce);
4432
+ },
4433
+ .recovery_response => {
4434
+ assert(self.status == .normal);
4435
+ assert(message.header.view == self.view);
4436
+ assert(message.header.replica == self.replica);
4437
+ },
3741
4438
  .headers => {
3742
4439
  assert(self.status == .normal or self.status == .view_change);
3743
4440
  assert(message.header.view == self.view);
@@ -3764,7 +4461,7 @@ pub fn Replica(
3764
4461
  .nack_prepare => {
3765
4462
  assert(message.header.view == self.view);
3766
4463
  assert(message.header.replica == self.replica);
3767
- assert(replica == self.leader_index(self.view));
4464
+ assert(self.leader_index(self.view) == replica);
3768
4465
  },
3769
4466
  else => {
3770
4467
  log.info("{}: send_message_to_replica: TODO {s}", .{
@@ -3783,8 +4480,8 @@ pub fn Replica(
3783
4480
  }
3784
4481
 
3785
4482
  /// Finds the header with the highest op number in a slice of headers from a replica.
3786
- /// Searches only by op number to find the highest `self.op for the replica.
3787
- fn set_latest_op(headers: []Header, latest: *Header) void {
4483
+ /// Searches only by op number to find the highest `self.op` for the replica.
4484
+ fn set_latest_op(headers: []const Header, latest: *Header) void {
3788
4485
  switch (latest.command) {
3789
4486
  .reserved, .prepare => assert(latest.valid_checksum()),
3790
4487
  else => unreachable,
@@ -3809,17 +4506,27 @@ pub fn Replica(
3809
4506
  k: u64,
3810
4507
  method: []const u8,
3811
4508
  ) void {
3812
- assert(self.status == .view_change);
3813
-
4509
+ assert(self.status == .view_change or self.status == .recovering);
4510
+ assert(self.journal.recovered);
3814
4511
  assert(latest.valid_checksum());
3815
4512
  assert(latest.invalid() == null);
3816
4513
  assert(latest.command == .prepare);
3817
4514
  assert(latest.cluster == self.cluster);
3818
4515
 
3819
- // The view may have started already, so we can have a prepare in the same view:
3820
- assert(latest.view <= self.view);
4516
+ switch (self.status) {
4517
+ .normal => unreachable,
4518
+ .view_change => {
4519
+ // The view may have started already, so we can have a prepare in the same view:
4520
+ assert(latest.view <= self.view);
4521
+ },
4522
+ .recovering => {
4523
+ // The replica's view hasn't been set yet.
4524
+ // It will be set shortly, when we transition to normal status.
4525
+ assert(self.view == 0);
4526
+ },
4527
+ }
3821
4528
 
3822
- log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={} offset={}", .{
4529
+ log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={}", .{
3823
4530
  self.replica,
3824
4531
  method,
3825
4532
  self.view,
@@ -3828,7 +4535,6 @@ pub fn Replica(
3828
4535
  self.commit_max,
3829
4536
  k,
3830
4537
  latest.checksum,
3831
- latest.offset,
3832
4538
  });
3833
4539
 
3834
4540
  // Uncommitted ops may not survive a view change so we must assert `latest.op` against
@@ -3854,7 +4560,7 @@ pub fn Replica(
3854
4560
  });
3855
4561
  }
3856
4562
  assert(k >= latest.commit);
3857
- assert(k >= self.commit_max - std.math.min(config.pipelining_max, self.commit_max));
4563
+ assert(k >= self.commit_max - std.math.min(config.pipeline_max, self.commit_max));
3858
4564
 
3859
4565
  assert(self.commit_min <= self.commit_max);
3860
4566
  assert(self.op >= self.commit_max or self.op < self.commit_max);
@@ -3870,15 +4576,15 @@ pub fn Replica(
3870
4576
  // Do not set the latest op as dirty if we already have it exactly:
3871
4577
  // Otherwise, this would trigger a repair and delay the view change, or worse, it would
3872
4578
  // prevent us from assisting another replica to recover when we do in fact have the op.
3873
- if (self.journal.entry_for_op_exact_with_checksum(latest.op, latest.checksum)) |_| {
4579
+ if (self.journal.has(latest)) {
3874
4580
  log.debug("{}: {s}: latest op exists exactly", .{ self.replica, method });
3875
4581
  } else {
3876
- self.journal.set_entry_as_dirty(latest);
4582
+ self.journal.set_header_as_dirty(latest);
3877
4583
  }
3878
4584
 
3879
4585
  assert(self.op == latest.op);
3880
4586
  self.journal.remove_entries_from(self.op + 1);
3881
- assert(self.journal.entry_for_op_exact(self.op).?.checksum == latest.checksum);
4587
+ assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
3882
4588
  }
3883
4589
 
3884
4590
  fn start_view_as_the_new_leader(self: *Self) void {
@@ -3891,34 +4597,18 @@ pub fn Replica(
3891
4597
 
3892
4598
  assert(self.commit_min == self.commit_max);
3893
4599
  assert(self.repair_pipeline_op() == null);
4600
+ self.verify_pipeline();
3894
4601
  assert(self.commit_max + self.pipeline.count == self.op);
3895
4602
  assert(self.valid_hash_chain_between(self.commit_min, self.op));
3896
4603
 
3897
- var pipeline_op = self.commit_max + 1;
3898
- var pipeline_parent = self.journal.entry_for_op_exact(self.commit_max).?.checksum;
3899
- var iterator = self.pipeline.iterator();
3900
- while (iterator.next_ptr()) |prepare| {
3901
- assert(prepare.message.header.command == .prepare);
3902
- assert(prepare.message.header.op == pipeline_op);
3903
- assert(prepare.message.header.parent == pipeline_parent);
3904
-
3905
- pipeline_parent = prepare.message.header.checksum;
3906
- pipeline_op += 1;
3907
- }
3908
- assert(self.pipeline.count <= config.pipelining_max);
3909
- assert(self.commit_max + self.pipeline.count == pipeline_op - 1);
3910
-
3911
- assert(self.journal.dirty.len == 0);
3912
- assert(self.journal.faulty.len == 0);
4604
+ assert(self.journal.dirty.count == 0);
4605
+ assert(self.journal.faulty.count == 0);
3913
4606
  assert(self.nack_prepare_op == null);
3914
4607
 
3915
- const start_view = self.create_view_change_message(.start_view) orelse {
3916
- log.err("{}: start_view_as_the_new_leader: waiting for message", .{self.replica});
3917
- return;
3918
- };
4608
+ const start_view = self.create_view_change_message(.start_view);
3919
4609
  defer self.message_bus.unref(start_view);
3920
4610
 
3921
- self.transition_to_normal_status(self.view);
4611
+ self.transition_to_normal_from_view_change_status(self.view);
3922
4612
  // Detect if the transition to normal status above accidentally resets the pipeline:
3923
4613
  assert(self.commit_max + self.pipeline.count == self.op);
3924
4614
 
@@ -3937,17 +4627,73 @@ pub fn Replica(
3937
4627
  self.send_message_to_other_replicas(start_view);
3938
4628
  }
3939
4629
 
3940
- fn transition_to_normal_status(self: *Self, new_view: u32) void {
3941
- log.debug("{}: transition_to_normal_status: view={}", .{ self.replica, new_view });
4630
+ fn transition_to_normal_from_recovering_status(self: *Self, new_view: u32) void {
4631
+ assert(self.status == .recovering);
4632
+ assert(self.view == 0);
4633
+ self.view = new_view;
4634
+ self.view_normal = new_view;
4635
+ self.status = .normal;
4636
+
4637
+ if (self.leader()) {
4638
+ log.debug(
4639
+ "{}: transition_to_normal_from_recovering_status: view={} leader",
4640
+ .{
4641
+ self.replica,
4642
+ self.view,
4643
+ },
4644
+ );
4645
+
4646
+ assert(self.journal.is_empty() or self.replica_count == 1);
4647
+ assert(!self.prepare_timeout.ticking);
4648
+ assert(!self.normal_status_timeout.ticking);
4649
+ assert(!self.view_change_status_timeout.ticking);
4650
+ assert(!self.view_change_message_timeout.ticking);
4651
+
4652
+ self.ping_timeout.start();
4653
+ self.commit_timeout.start();
4654
+ self.repair_timeout.start();
4655
+ self.recovery_timeout.stop();
4656
+ } else {
4657
+ log.debug(
4658
+ "{}: transition_to_normal_from_recovering_status: view={} follower",
4659
+ .{
4660
+ self.replica,
4661
+ self.view,
4662
+ },
4663
+ );
4664
+
4665
+ assert(!self.prepare_timeout.ticking);
4666
+ assert(!self.commit_timeout.ticking);
4667
+ assert(!self.view_change_status_timeout.ticking);
4668
+ assert(!self.view_change_message_timeout.ticking);
4669
+
4670
+ self.ping_timeout.start();
4671
+ self.normal_status_timeout.start();
4672
+ self.repair_timeout.start();
4673
+ self.recovery_timeout.stop();
4674
+ }
4675
+ }
4676
+
4677
+ fn transition_to_normal_from_view_change_status(self: *Self, new_view: u32) void {
3942
4678
  // In the VRR paper it's possible to transition from normal to normal for the same view.
3943
4679
  // For example, this could happen after a state transfer triggered by an op jump.
4680
+ assert(self.status == .view_change);
3944
4681
  assert(new_view >= self.view);
3945
4682
  self.view = new_view;
3946
4683
  self.view_normal = new_view;
3947
4684
  self.status = .normal;
3948
4685
 
3949
4686
  if (self.leader()) {
3950
- log.debug("{}: transition_to_normal_status: leader", .{self.replica});
4687
+ log.debug(
4688
+ "{}: transition_to_normal_from_view_change_status: view={} leader",
4689
+ .{
4690
+ self.replica,
4691
+ self.view,
4692
+ },
4693
+ );
4694
+
4695
+ assert(!self.prepare_timeout.ticking);
4696
+ assert(!self.recovery_timeout.ticking);
3951
4697
 
3952
4698
  self.ping_timeout.start();
3953
4699
  self.commit_timeout.start();
@@ -3957,12 +4703,15 @@ pub fn Replica(
3957
4703
  self.repair_timeout.start();
3958
4704
 
3959
4705
  // Do not reset the pipeline as there may be uncommitted ops to drive to completion.
3960
- if (self.pipeline.count > 0) {
3961
- assert(!self.prepare_timeout.ticking);
3962
- self.prepare_timeout.start();
3963
- }
4706
+ if (self.pipeline.count > 0) self.prepare_timeout.start();
3964
4707
  } else {
3965
- log.debug("{}: transition_to_normal_status: follower", .{self.replica});
4708
+ log.debug("{}: transition_to_normal_from_view_change_status: view={} follower", .{
4709
+ self.replica,
4710
+ self.view,
4711
+ });
4712
+
4713
+ assert(!self.prepare_timeout.ticking);
4714
+ assert(!self.recovery_timeout.ticking);
3966
4715
 
3967
4716
  self.ping_timeout.start();
3968
4717
  self.commit_timeout.stop();
@@ -3970,8 +4719,6 @@ pub fn Replica(
3970
4719
  self.view_change_status_timeout.stop();
3971
4720
  self.view_change_message_timeout.stop();
3972
4721
  self.repair_timeout.start();
3973
-
3974
- self.reset_pipeline();
3975
4722
  }
3976
4723
 
3977
4724
  self.reset_quorum_start_view_change();
@@ -3983,17 +4730,18 @@ pub fn Replica(
3983
4730
  assert(self.nack_prepare_op == null);
3984
4731
  }
3985
4732
 
3986
- /// A replica i that notices the need for a view change advances its view, sets its status to
3987
- /// view_change, and sends a ⟨start_view_change v, i⟩ message to all the other replicas,
3988
- /// where v identifies the new view. A replica notices the need for a view change either based
3989
- /// on its own timer, or because it receives a start_view_change or do_view_change message for
3990
- /// a view with a larger number than its own view.
4733
+ /// A replica i that notices the need for a view change advances its view, sets its status
4734
+ /// to view_change, and sends a ⟨start_view_change v, i⟩ message to all the other replicas,
4735
+ /// where v identifies the new view. A replica notices the need for a view change either
4736
+ /// based on its own timer, or because it receives a start_view_change or do_view_change
4737
+ /// message for a view with a larger number than its own view.
3991
4738
  fn transition_to_view_change_status(self: *Self, new_view: u32) void {
3992
4739
  log.debug("{}: transition_to_view_change_status: view={}..{}", .{
3993
4740
  self.replica,
3994
4741
  self.view,
3995
4742
  new_view,
3996
4743
  });
4744
+ assert(self.status == .normal or self.status == .view_change);
3997
4745
  assert(new_view > self.view);
3998
4746
  self.view = new_view;
3999
4747
  self.status = .view_change;
@@ -4004,13 +4752,14 @@ pub fn Replica(
4004
4752
  self.view_change_status_timeout.start();
4005
4753
  self.view_change_message_timeout.start();
4006
4754
  self.repair_timeout.stop();
4755
+ self.prepare_timeout.stop();
4756
+ assert(!self.recovery_timeout.ticking);
4007
4757
 
4008
4758
  // Do not reset quorum counters only on entering a view, assuming that the view will be
4009
4759
  // followed only by a single subsequent view change to the next view, because multiple
4010
4760
  // successive view changes can fail, e.g. after a view change timeout.
4011
- // We must therefore reset our counters here to avoid counting messages from an older view,
4012
- // which would violate the quorum intersection property essential for correctness.
4013
- self.reset_pipeline();
4761
+ // We must therefore reset our counters here to avoid counting messages from an older
4762
+ // view, which would violate the quorum intersection property essential for correctness.
4014
4763
  self.reset_quorum_start_view_change();
4015
4764
  self.reset_quorum_do_view_change();
4016
4765
  self.reset_quorum_nack_prepare();
@@ -4022,19 +4771,6 @@ pub fn Replica(
4022
4771
  self.send_start_view_change();
4023
4772
  }
4024
4773
 
4025
- fn unref_prepare_message_and_quorum_messages(
4026
- self: *Self,
4027
- prepare: *const Prepare,
4028
- ) void {
4029
- self.message_bus.unref(prepare.message);
4030
- for (prepare.ok_from_all_replicas) |received, replica| {
4031
- if (received) |prepare_ok| {
4032
- assert(replica < self.replica_count);
4033
- self.message_bus.unref(prepare_ok);
4034
- }
4035
- }
4036
- }
4037
-
4038
4774
  fn update_client_table_entry(self: *Self, reply: *Message) void {
4039
4775
  assert(reply.header.command == .reply);
4040
4776
  assert(reply.header.operation != .register);
@@ -4098,21 +4834,21 @@ pub fn Replica(
4098
4834
  return true;
4099
4835
  }
4100
4836
 
4101
- /// Returns true if all operations are present, correctly ordered and connected by hash chain,
4102
- /// between `op_min` and `op_max` (both inclusive).
4837
+ /// Returns true if all operations are present, correctly ordered and connected by hash
4838
+ /// chain, between `op_min` and `op_max` (both inclusive).
4103
4839
  fn valid_hash_chain_between(self: *Self, op_min: u64, op_max: u64) bool {
4104
4840
  assert(op_min <= op_max);
4105
4841
 
4106
- // If we use anything less than self.op then we may commit ops for a forked hash chain that
4107
- // have since been reordered by a new leader.
4842
+ // If we use anything less than self.op then we may commit ops for a forked hash chain
4843
+ // that have since been reordered by a new leader.
4108
4844
  assert(op_max == self.op);
4109
- var b = self.journal.entry_for_op_exact(op_max).?;
4845
+ var b = self.journal.header_with_op(op_max).?;
4110
4846
 
4111
4847
  var op = op_max;
4112
4848
  while (op > op_min) {
4113
4849
  op -= 1;
4114
4850
 
4115
- if (self.journal.entry_for_op_exact(op)) |a| {
4851
+ if (self.journal.header_with_op(op)) |a| {
4116
4852
  assert(a.op + 1 == b.op);
4117
4853
  if (a.checksum == b.parent) {
4118
4854
  assert(ascending_viewstamps(a, b));
@@ -4131,6 +4867,33 @@ pub fn Replica(
4131
4867
  return true;
4132
4868
  }
4133
4869
 
4870
+ fn verify_pipeline(self: *Self) void {
4871
+ var op = self.commit_max + 1;
4872
+ var parent = self.journal.header_with_op(self.commit_max).?.checksum;
4873
+
4874
+ var iterator = self.pipeline.iterator();
4875
+ while (iterator.next_ptr()) |prepare| {
4876
+ assert(prepare.message.header.command == .prepare);
4877
+
4878
+ log.debug("{}: verify_pipeline: op={} checksum={x} parent={x}", .{
4879
+ self.replica,
4880
+ prepare.message.header.op,
4881
+ prepare.message.header.checksum,
4882
+ prepare.message.header.parent,
4883
+ });
4884
+
4885
+ assert(self.journal.has(prepare.message.header));
4886
+ assert(prepare.message.header.op == op);
4887
+ assert(prepare.message.header.op <= self.op);
4888
+ assert(prepare.message.header.parent == parent);
4889
+
4890
+ parent = prepare.message.header.checksum;
4891
+ op += 1;
4892
+ }
4893
+ assert(self.pipeline.count <= config.pipeline_max);
4894
+ assert(self.commit_max + self.pipeline.count == op - 1);
4895
+ }
4896
+
4134
4897
  fn view_jump(self: *Self, header: *const Header) void {
4135
4898
  const to: Status = switch (header.command) {
4136
4899
  .prepare, .commit => .normal,
@@ -4226,10 +4989,10 @@ pub fn Replica(
4226
4989
  return;
4227
4990
  }
4228
4991
 
4229
- self.journal.write_prepare(write_prepare_on_write, message, trigger);
4992
+ self.journal.write_prepare(write_prepare_callback, message, trigger);
4230
4993
  }
4231
4994
 
4232
- fn write_prepare_on_write(
4995
+ fn write_prepare_callback(
4233
4996
  self: *Self,
4234
4997
  wrote: ?*Message,
4235
4998
  trigger: Journal.Write.Trigger,
@@ -4245,6 +5008,7 @@ pub fn Replica(
4245
5008
  // If this was a repair, continue immediately to repair the next prepare:
4246
5009
  // This is an optimization to eliminate waiting until the next repair timeout.
4247
5010
  .repair => self.repair(),
5011
+ .pipeline => self.repair(),
4248
5012
  }
4249
5013
  }
4250
5014
  };