tigerbeetle-node 0.4.1 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +19 -5
  2. package/dist/benchmark.js.map +1 -1
  3. package/dist/index.d.ts +18 -16
  4. package/dist/index.js +35 -13
  5. package/dist/index.js.map +1 -1
  6. package/dist/test.js +13 -1
  7. package/dist/test.js.map +1 -1
  8. package/package.json +12 -12
  9. package/scripts/postinstall.sh +2 -2
  10. package/src/benchmark.ts +2 -2
  11. package/src/index.ts +29 -4
  12. package/src/node.zig +124 -21
  13. package/src/test.ts +18 -4
  14. package/src/tigerbeetle/scripts/install.sh +2 -2
  15. package/src/tigerbeetle/scripts/install_zig.bat +109 -0
  16. package/src/tigerbeetle/scripts/install_zig.sh +22 -3
  17. package/src/tigerbeetle/scripts/lint.zig +8 -2
  18. package/src/tigerbeetle/scripts/vopr.bat +48 -0
  19. package/src/tigerbeetle/scripts/vopr.sh +24 -4
  20. package/src/tigerbeetle/src/benchmark.zig +18 -14
  21. package/src/tigerbeetle/src/cli.zig +8 -6
  22. package/src/tigerbeetle/src/config.zig +10 -18
  23. package/src/tigerbeetle/src/demo.zig +122 -92
  24. package/src/tigerbeetle/src/demo_01_create_accounts.zig +5 -3
  25. package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +2 -3
  26. package/src/tigerbeetle/src/demo_03_create_transfers.zig +5 -3
  27. package/src/tigerbeetle/src/demo_04_create_transfers_two_phase_commit.zig +5 -3
  28. package/src/tigerbeetle/src/demo_05_accept_transfers.zig +5 -3
  29. package/src/tigerbeetle/src/demo_06_reject_transfers.zig +5 -3
  30. package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +7 -0
  31. package/src/tigerbeetle/src/fifo.zig +14 -14
  32. package/src/tigerbeetle/src/io/benchmark.zig +238 -0
  33. package/src/tigerbeetle/src/{io_darwin.zig → io/darwin.zig} +88 -121
  34. package/src/tigerbeetle/src/io/linux.zig +933 -0
  35. package/src/tigerbeetle/src/io/test.zig +621 -0
  36. package/src/tigerbeetle/src/io.zig +7 -1322
  37. package/src/tigerbeetle/src/main.zig +22 -13
  38. package/src/tigerbeetle/src/message_bus.zig +50 -61
  39. package/src/tigerbeetle/src/message_pool.zig +6 -5
  40. package/src/tigerbeetle/src/ring_buffer.zig +135 -68
  41. package/src/tigerbeetle/src/simulator.zig +120 -47
  42. package/src/tigerbeetle/src/state_machine.zig +853 -27
  43. package/src/tigerbeetle/src/storage.zig +51 -48
  44. package/src/tigerbeetle/src/test/cluster.zig +90 -14
  45. package/src/tigerbeetle/src/test/message_bus.zig +7 -10
  46. package/src/tigerbeetle/src/test/network.zig +5 -5
  47. package/src/tigerbeetle/src/test/packet_simulator.zig +188 -32
  48. package/src/tigerbeetle/src/test/state_checker.zig +3 -3
  49. package/src/tigerbeetle/src/test/state_machine.zig +6 -4
  50. package/src/tigerbeetle/src/test/storage.zig +322 -26
  51. package/src/tigerbeetle/src/test/time.zig +2 -2
  52. package/src/tigerbeetle/src/tigerbeetle.zig +6 -129
  53. package/src/tigerbeetle/src/time.zig +6 -5
  54. package/src/tigerbeetle/src/unit_tests.zig +14 -0
  55. package/src/tigerbeetle/src/{vr → vsr}/client.zig +21 -21
  56. package/src/tigerbeetle/src/{vr → vsr}/clock.zig +34 -48
  57. package/src/tigerbeetle/src/{vr → vsr}/journal.zig +259 -61
  58. package/src/tigerbeetle/src/{marzullo.zig → vsr/marzullo.zig} +6 -3
  59. package/src/tigerbeetle/src/{vr → vsr}/replica.zig +711 -349
  60. package/src/tigerbeetle/src/{vr.zig → vsr.zig} +32 -25
  61. package/src/translate.zig +55 -55
  62. package/src/tigerbeetle/src/fixed_array_list.zig +0 -53
  63. package/src/tigerbeetle/src/io_async.zig +0 -600
  64. package/src/tigerbeetle/src/test_client.zig +0 -41
@@ -7,11 +7,11 @@ const config = @import("../config.zig");
7
7
  const Message = @import("../message_pool.zig").MessagePool.Message;
8
8
  const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
9
9
 
10
- const vr = @import("../vr.zig");
11
- const Header = vr.Header;
12
- const Timeout = vr.Timeout;
13
- const Command = vr.Command;
14
- const Version = vr.Version;
10
+ const vsr = @import("../vsr.zig");
11
+ const Header = vsr.Header;
12
+ const Timeout = vsr.Timeout;
13
+ const Command = vsr.Command;
14
+ const Version = vsr.Version;
15
15
 
16
16
  const log = std.log.scoped(.replica);
17
17
 
@@ -21,7 +21,7 @@ pub const Status = enum {
21
21
  recovering,
22
22
  };
23
23
 
24
- const ClientTable = std.AutoHashMap(u128, ClientTableEntry);
24
+ const ClientTable = std.AutoHashMapUnmanaged(u128, ClientTableEntry);
25
25
 
26
26
  /// We found two bugs in the VRR paper relating to the client table:
27
27
  ///
@@ -70,10 +70,8 @@ pub fn Replica(
70
70
  return struct {
71
71
  const Self = @This();
72
72
 
73
- const Journal = vr.Journal(Self, Storage);
74
- const Clock = vr.Clock(Time);
75
-
76
- allocator: *Allocator,
73
+ const Journal = vsr.Journal(Self, Storage);
74
+ const Clock = vsr.Clock(Time);
77
75
 
78
76
  /// The number of the cluster to which this replica belongs:
79
77
  cluster: u32,
@@ -109,13 +107,8 @@ pub fn Replica(
109
107
  /// The current view, initially 0:
110
108
  view: u32,
111
109
 
112
- /// Whether we have experienced a view jump:
113
- /// If this is true then we must request a start_view message from the leader before
114
- /// committing to avoid committing ops that may have been changed through a view change.
115
- /// This is the most crucial aspect of the protocol to get right, especially because it can
116
- /// slip past any protection provided by the hash chain. For example, we may have a fully
117
- /// connected hash chain but with uncommitted ops that never survived into the newer view.
118
- view_jump_barrier: bool = false,
110
+ /// The latest view, in which the replica's status was normal.
111
+ view_normal: u32,
119
112
 
120
113
  /// The current status, either normal, view_change, or recovering:
121
114
  /// TODO Don't default to normal, set the starting status according to the journal's health.
@@ -181,11 +174,11 @@ pub fn Replica(
181
174
 
182
175
  /// The number of ticks without hearing from the leader before starting a view change.
183
176
  /// This transitions from .normal status to .view_change status.
184
- election_timeout: Timeout,
177
+ normal_status_timeout: Timeout,
185
178
 
186
179
  /// The number of ticks before a view change is timed out:
187
180
  /// This transitions from `view_change` status to `view_change` status but for a newer view.
188
- view_change_timeout: Timeout,
181
+ view_change_status_timeout: Timeout,
189
182
 
190
183
  /// The number of ticks before resending a `start_view_change` or `do_view_change` message:
191
184
  view_change_message_timeout: Timeout,
@@ -204,11 +197,11 @@ pub fn Replica(
204
197
  on_change_state: ?fn (replica: *Self) void = null,
205
198
 
206
199
  pub fn init(
207
- allocator: *Allocator,
200
+ allocator: Allocator,
208
201
  cluster: u32,
209
202
  replica_count: u8,
210
203
  replica: u8,
211
- time: Time,
204
+ time: *Time,
212
205
  storage: *Storage,
213
206
  message_bus: *MessageBus,
214
207
  state_machine: *StateMachine,
@@ -236,14 +229,17 @@ pub fn Replica(
236
229
  if (replica_count <= 2) {
237
230
  assert(quorum_replication == replica_count);
238
231
  assert(quorum_view_change == replica_count);
232
+ } else {
233
+ assert(quorum_replication < replica_count);
234
+ assert(quorum_view_change < replica_count);
239
235
  }
240
236
 
241
237
  // Flexible quorums are safe if these two quorums intersect so that this relation holds:
242
238
  assert(quorum_replication + quorum_view_change > replica_count);
243
239
 
244
- var client_table = ClientTable.init(allocator);
245
- errdefer client_table.deinit();
246
- try client_table.ensureCapacity(@intCast(u32, config.clients_max));
240
+ var client_table: ClientTable = .{};
241
+ errdefer client_table.deinit(allocator);
242
+ try client_table.ensureTotalCapacity(allocator, @intCast(u32, config.clients_max));
247
243
  assert(client_table.capacity() >= config.clients_max);
248
244
 
249
245
  var init_prepare = Header{
@@ -267,7 +263,6 @@ pub fn Replica(
267
263
  init_prepare.set_checksum();
268
264
 
269
265
  var self = Self{
270
- .allocator = allocator,
271
266
  .cluster = cluster,
272
267
  .replica_count = replica_count,
273
268
  .replica = replica,
@@ -291,6 +286,7 @@ pub fn Replica(
291
286
  .state_machine = state_machine,
292
287
  .client_table = client_table,
293
288
  .view = init_prepare.view,
289
+ .view_normal = init_prepare.view,
294
290
  .op = init_prepare.op,
295
291
  .commit_min = init_prepare.commit,
296
292
  .commit_max = init_prepare.commit,
@@ -309,13 +305,13 @@ pub fn Replica(
309
305
  .id = replica,
310
306
  .after = 100,
311
307
  },
312
- .election_timeout = Timeout{
313
- .name = "election_timeout",
308
+ .normal_status_timeout = Timeout{
309
+ .name = "normal_status_timeout",
314
310
  .id = replica,
315
311
  .after = 500,
316
312
  },
317
- .view_change_timeout = Timeout{
318
- .name = "view_change_timeout",
313
+ .view_change_status_timeout = Timeout{
314
+ .name = "view_change_status_timeout",
319
315
  .id = replica,
320
316
  .after = 500,
321
317
  },
@@ -358,15 +354,52 @@ pub fn Replica(
358
354
  } else {
359
355
  log.debug("{}: init: follower", .{self.replica});
360
356
  self.ping_timeout.start();
361
- self.election_timeout.start();
357
+ self.normal_status_timeout.start();
362
358
  self.repair_timeout.start();
363
359
  }
364
360
 
365
361
  return self;
366
362
  }
367
363
 
368
- pub fn deinit(self: *Self) void {
369
- self.client_table.deinit();
364
+ /// Free all memory and unref all messages held by the replica
365
+ /// This does not deinitialize the StateMachine, MessageBus, Storage, or Time
366
+ pub fn deinit(self: *Self, allocator: Allocator) void {
367
+ self.journal.deinit(allocator);
368
+ self.clock.deinit(allocator);
369
+
370
+ {
371
+ var it = self.client_table.iterator();
372
+ while (it.next()) |entry| {
373
+ self.message_bus.unref(entry.value_ptr.reply);
374
+ }
375
+ self.client_table.deinit(allocator);
376
+ }
377
+
378
+ {
379
+ var it = self.pipeline.iterator();
380
+ while (it.next()) |prepare| {
381
+ self.message_bus.unref(prepare.message);
382
+ for (prepare.ok_from_all_replicas) |message| {
383
+ if (message) |m| self.message_bus.unref(m);
384
+ }
385
+ }
386
+ }
387
+
388
+ if (self.loopback_queue) |loopback_message| {
389
+ assert(loopback_message.next == null);
390
+ self.message_bus.unref(loopback_message);
391
+ self.loopback_queue = null;
392
+ }
393
+
394
+ for (self.start_view_change_from_other_replicas) |message| {
395
+ if (message) |m| self.message_bus.unref(m);
396
+ }
397
+ for (self.do_view_change_from_all_replicas) |message| {
398
+ if (message) |m| self.message_bus.unref(m);
399
+ }
400
+ for (self.nack_prepare_from_other_replicas) |message| {
401
+ if (message) |m| self.message_bus.unref(m);
402
+ }
370
403
  }
371
404
 
372
405
  /// Time is measured in logical ticks that are incremented on every call to tick().
@@ -380,19 +413,26 @@ pub fn Replica(
380
413
 
381
414
  self.clock.tick();
382
415
 
416
+ if (!self.journal.recovered) {
417
+ self.journal.recover();
418
+ return;
419
+ } else {
420
+ assert(!self.journal.recovering);
421
+ }
422
+
383
423
  self.ping_timeout.tick();
384
424
  self.prepare_timeout.tick();
385
425
  self.commit_timeout.tick();
386
- self.election_timeout.tick();
387
- self.view_change_timeout.tick();
426
+ self.normal_status_timeout.tick();
427
+ self.view_change_status_timeout.tick();
388
428
  self.view_change_message_timeout.tick();
389
429
  self.repair_timeout.tick();
390
430
 
391
431
  if (self.ping_timeout.fired()) self.on_ping_timeout();
392
432
  if (self.prepare_timeout.fired()) self.on_prepare_timeout();
393
433
  if (self.commit_timeout.fired()) self.on_commit_timeout();
394
- if (self.election_timeout.fired()) self.on_election_timeout();
395
- if (self.view_change_timeout.fired()) self.on_view_change_timeout();
434
+ if (self.normal_status_timeout.fired()) self.on_normal_status_timeout();
435
+ if (self.view_change_status_timeout.fired()) self.on_view_change_status_timeout();
396
436
  if (self.view_change_message_timeout.fired()) self.on_view_change_message_timeout();
397
437
  if (self.repair_timeout.fired()) self.on_repair_timeout();
398
438
 
@@ -412,10 +452,13 @@ pub fn Replica(
412
452
  });
413
453
 
414
454
  if (message.header.invalid()) |reason| {
415
- log.alert("{}: on_message: invalid ({s})", .{ self.replica, reason });
455
+ log.err("{}: on_message: invalid ({s})", .{ self.replica, reason });
416
456
  return;
417
457
  }
418
458
 
459
+ // No client or replica should ever send a .reserved message.
460
+ assert(message.header.command != .reserved);
461
+
419
462
  if (message.header.cluster != self.cluster) {
420
463
  log.warn("{}: on_message: wrong cluster (cluster must be {} not {})", .{
421
464
  self.replica,
@@ -425,6 +468,14 @@ pub fn Replica(
425
468
  return;
426
469
  }
427
470
 
471
+ if (!self.journal.recovered) {
472
+ self.journal.recover();
473
+ log.debug("{}: on_message: waiting for journal to recover", .{self.replica});
474
+ return;
475
+ } else {
476
+ assert(!self.journal.recovering);
477
+ }
478
+
428
479
  assert(message.header.replica < self.replica_count);
429
480
  switch (message.header.command) {
430
481
  .ping => self.on_ping(message),
@@ -436,16 +487,26 @@ pub fn Replica(
436
487
  .start_view_change => self.on_start_view_change(message),
437
488
  .do_view_change => self.on_do_view_change(message),
438
489
  .start_view => self.on_start_view(message),
490
+ .recovery => self.on_recovery(message),
491
+ .recovery_response => return, // TODO
439
492
  .request_start_view => self.on_request_start_view(message),
440
493
  .request_prepare => self.on_request_prepare(message),
441
494
  .request_headers => self.on_request_headers(message),
442
495
  .headers => self.on_headers(message),
443
496
  .nack_prepare => self.on_nack_prepare(message),
444
- else => unreachable,
497
+ // A replica should never handle misdirected messages intended for a client:
498
+ .eviction, .reply => {
499
+ log.warn("{}: on_message: ignoring misdirected {s} message", .{
500
+ self.replica,
501
+ @tagName(message.header.command),
502
+ });
503
+ return;
504
+ },
505
+ .reserved => unreachable,
445
506
  }
446
507
 
447
508
  if (self.loopback_queue) |loopback_message| {
448
- log.emerg("{}: on_message: on_{s}() queued a {s} loopback message with no flush", .{
509
+ log.err("{}: on_message: on_{s}() queued a {s} loopback message with no flush", .{
449
510
  self.replica,
450
511
  @tagName(message.header.command),
451
512
  @tagName(loopback_message.header.command),
@@ -473,7 +534,15 @@ pub fn Replica(
473
534
  if (message.header.client > 0) {
474
535
  assert(message.header.replica == 0);
475
536
 
476
- self.send_header_to_client(message.header.client, pong);
537
+ // We must only ever send our view number to a client via a pong message if we are
538
+ // in normal status. Otherwise, we may be partitioned from the cluster with a newer
539
+ // view number, leak this to the client, which would then pass this to the cluster
540
+ // in subsequent client requests, which would then ignore these client requests with
541
+ // a newer view number, locking out the client. The principle here is that we must
542
+ // never send view numbers for views that have not yet started.
543
+ if (self.status == .normal) {
544
+ self.send_header_to_client(message.header.client, pong);
545
+ }
477
546
  } else if (message.header.replica == self.replica) {
478
547
  log.warn("{}: on_ping: ignoring (self)", .{self.replica});
479
548
  } else {
@@ -492,8 +561,7 @@ pub fn Replica(
492
561
  const t1 = @bitCast(i64, message.header.offset);
493
562
  const m2 = self.clock.monotonic();
494
563
 
495
- // TODO Drop the @intCast when the client table branch lands.
496
- self.clock.learn(@intCast(u8, message.header.replica), m0, t1, m2);
564
+ self.clock.learn(message.header.replica, m0, t1, m2);
497
565
  }
498
566
 
499
567
  /// The primary advances op-number, adds the request to the end of the log, and updates the
@@ -513,7 +581,7 @@ pub fn Replica(
513
581
  assert(message.header.view <= self.view); // The client's view may be behind ours.
514
582
 
515
583
  const realtime = self.clock.realtime_synchronized() orelse {
516
- log.alert("{}: on_request: dropping (clock not synchronized)", .{self.replica});
584
+ log.err("{}: on_request: dropping (clock not synchronized)", .{self.replica});
517
585
  return;
518
586
  };
519
587
 
@@ -531,7 +599,7 @@ pub fn Replica(
531
599
  message.header.view = self.view;
532
600
  message.header.op = self.op + 1;
533
601
  message.header.commit = self.commit_max;
534
- message.header.offset = self.journal.next_offset(latest_entry);
602
+ message.header.offset = Journal.next_offset(latest_entry);
535
603
  message.header.replica = self.replica;
536
604
  message.header.command = .prepare;
537
605
 
@@ -597,6 +665,16 @@ pub fn Replica(
597
665
  return;
598
666
  }
599
667
 
668
+ if (message.header.view < self.view) {
669
+ log.debug("{}: on_prepare: ignoring (older view)", .{self.replica});
670
+ return;
671
+ }
672
+
673
+ if (message.header.view > self.view) {
674
+ log.debug("{}: on_prepare: ignoring (newer view)", .{self.replica});
675
+ return;
676
+ }
677
+
600
678
  assert(self.status == .normal);
601
679
  assert(message.header.view == self.view);
602
680
  assert(self.leader() or self.follower());
@@ -604,7 +682,7 @@ pub fn Replica(
604
682
  assert(message.header.op > self.op);
605
683
  assert(message.header.op > self.commit_min);
606
684
 
607
- if (self.follower()) self.election_timeout.reset();
685
+ if (self.follower()) self.normal_status_timeout.reset();
608
686
 
609
687
  if (message.header.op > self.op + 1) {
610
688
  log.debug("{}: on_prepare: newer op", .{self.replica});
@@ -630,12 +708,6 @@ pub fn Replica(
630
708
  self.op = message.header.op;
631
709
  self.journal.set_entry_as_dirty(message.header);
632
710
 
633
- // We have the latest op from the leader and have cleared the view jump barrier:
634
- if (self.view_jump_barrier) {
635
- self.view_jump_barrier = false;
636
- log.debug("{}: on_prepare: cleared view jump barrier", .{self.replica});
637
- }
638
-
639
711
  self.replicate(message);
640
712
  self.append(message);
641
713
 
@@ -681,6 +753,10 @@ pub fn Replica(
681
753
  self.commit_pipeline();
682
754
  }
683
755
 
756
+ /// Known issue:
757
+ /// TODO The leader should stand down if it sees too many retries in on_prepare_timeout().
758
+ /// It's possible for the network to be one-way partitioned so that followers don't see the
759
+ /// leader as down, but neither can the leader hear from the followers.
684
760
  fn on_commit(self: *Self, message: *const Message) void {
685
761
  self.view_jump(message.header);
686
762
 
@@ -694,6 +770,11 @@ pub fn Replica(
694
770
  return;
695
771
  }
696
772
 
773
+ if (message.header.view > self.view) {
774
+ log.debug("{}: on_commit: ignoring (newer view)", .{self.replica});
775
+ return;
776
+ }
777
+
697
778
  if (self.leader()) {
698
779
  log.warn("{}: on_commit: ignoring (leader)", .{self.replica});
699
780
  return;
@@ -711,13 +792,12 @@ pub fn Replica(
711
792
  } else if (self.valid_hash_chain("on_commit")) {
712
793
  @panic("commit checksum verification failed");
713
794
  } else {
714
- // We may have a view jump barrier in place, or we may be repairing after
715
- // resolving the view jump barrier.
795
+ // We may still be repairing after receiving the start_view message.
716
796
  log.debug("{}: on_commit: skipping checksum verification", .{self.replica});
717
797
  }
718
798
  }
719
799
 
720
- self.election_timeout.reset();
800
+ self.normal_status_timeout.reset();
721
801
 
722
802
  self.commit_ops(message.header.commit);
723
803
  }
@@ -768,11 +848,6 @@ pub fn Replica(
768
848
  return;
769
849
  }
770
850
 
771
- if (self.view_jump_barrier) {
772
- log.debug("{}: on_repair: ignoring (view jump barrier)", .{self.replica});
773
- return;
774
- }
775
-
776
851
  if (self.repair_header(message.header)) {
777
852
  assert(self.journal.has_dirty(message.header));
778
853
 
@@ -803,6 +878,20 @@ pub fn Replica(
803
878
  assert(self.status == .view_change);
804
879
  assert(message.header.view == self.view);
805
880
 
881
+ if (self.leader_index(self.view) == self.replica) {
882
+ // If we are the leader of the new view, then wait until we have a message to send a
883
+ // do_view_change message to ourself. The on_do_view_change() handler will panic if
884
+ // we received a start_view_change quorum without a do_view_change to ourself.
885
+ if (self.message_bus.get_message()) |available| {
886
+ self.message_bus.unref(available);
887
+ } else {
888
+ log.err("{}: on_start_view_change: waiting for message for do_view_change", .{
889
+ self.replica,
890
+ });
891
+ return;
892
+ }
893
+ }
894
+
806
895
  // Wait until we have `f` messages (excluding ourself) for quorum:
807
896
  assert(self.replica_count > 1);
808
897
  const threshold = self.quorum_view_change - 1;
@@ -815,7 +904,10 @@ pub fn Replica(
815
904
 
816
905
  assert(count == threshold);
817
906
  assert(self.start_view_change_from_other_replicas[self.replica] == null);
818
- log.debug("{}: on_start_view_change: quorum received", .{self.replica});
907
+ log.debug("{}: on_start_view_change: view={} quorum received", .{
908
+ self.replica,
909
+ self.view,
910
+ });
819
911
 
820
912
  assert(!self.start_view_change_quorum);
821
913
  assert(!self.do_view_change_quorum);
@@ -872,10 +964,14 @@ pub fn Replica(
872
964
 
873
965
  assert(count == threshold);
874
966
  assert(self.do_view_change_from_all_replicas[self.replica] != null);
875
- log.debug("{}: on_do_view_change: quorum received", .{self.replica});
967
+ log.debug("{}: on_do_view_change: view={} quorum received", .{
968
+ self.replica,
969
+ self.view,
970
+ });
876
971
 
877
- var latest = Header.reserved();
972
+ var v: ?u32 = null;
878
973
  var k: ?u64 = null;
974
+ var latest = Header.reserved();
879
975
 
880
976
  for (self.do_view_change_from_all_replicas) |received, replica| {
881
977
  if (received) |m| {
@@ -884,13 +980,40 @@ pub fn Replica(
884
980
  assert(m.header.replica == replica);
885
981
  assert(m.header.view == self.view);
886
982
 
983
+ // The latest normal view experienced by this replica:
984
+ // This may be higher than the view in any of the prepare headers.
985
+ var replica_view_normal = @intCast(u32, m.header.offset);
986
+ assert(replica_view_normal < m.header.view);
987
+
988
+ var replica_latest = Header.reserved();
989
+ set_latest_op(self.message_body_as_headers(m), &replica_latest);
990
+ assert(replica_latest.op == m.header.op);
991
+
992
+ log.debug(
993
+ "{}: on_do_view_change: replica={} v'={} op={} commit={} latest={}",
994
+ .{
995
+ self.replica,
996
+ m.header.replica,
997
+ replica_view_normal,
998
+ m.header.op,
999
+ m.header.commit,
1000
+ replica_latest,
1001
+ },
1002
+ );
1003
+
1004
+ if (v == null or replica_view_normal > v.?) {
1005
+ v = replica_view_normal;
1006
+ latest = replica_latest;
1007
+ } else if (replica_view_normal == v.? and replica_latest.op > latest.op) {
1008
+ v = replica_view_normal;
1009
+ latest = replica_latest;
1010
+ }
1011
+
887
1012
  if (k == null or m.header.commit > k.?) k = m.header.commit;
888
- self.set_latest_header(self.message_body_as_headers(m), &latest);
889
1013
  }
890
1014
  }
891
1015
 
892
1016
  self.set_latest_op_and_k(&latest, k.?, "on_do_view_change");
893
- assert(!self.view_jump_barrier);
894
1017
 
895
1018
  // Now that we have the latest op in place, repair any other headers:
896
1019
  for (self.do_view_change_from_all_replicas) |received| {
@@ -908,6 +1031,10 @@ pub fn Replica(
908
1031
  assert(!self.do_view_change_quorum);
909
1032
  self.do_view_change_quorum = true;
910
1033
 
1034
+ self.discard_uncommitted_headers();
1035
+ assert(self.op >= self.commit_max);
1036
+ assert(self.journal.entry_for_op_exact(self.op) != null);
1037
+
911
1038
  // Start repairs according to the CTRL protocol:
912
1039
  assert(!self.repair_timeout.ticking);
913
1040
  self.repair_timeout.start();
@@ -924,24 +1051,21 @@ pub fn Replica(
924
1051
  fn on_start_view(self: *Self, message: *const Message) void {
925
1052
  if (self.ignore_view_change_message(message)) return;
926
1053
 
927
- assert(self.status == .normal or self.status == .view_change);
1054
+ assert(self.status == .view_change or self.status == .normal);
928
1055
  assert(message.header.view >= self.view);
929
1056
  assert(message.header.replica != self.replica);
930
1057
  assert(message.header.replica == self.leader_index(message.header.view));
931
1058
 
932
1059
  self.view_jump(message.header);
933
1060
 
934
- // In ignore_view_change_message(), we will allow a start_view message through for the
935
- // same view in normal status if a view jump barrier exists that needs to be cleared:
936
- assert(self.status == .view_change or self.view_jump_barrier);
1061
+ assert(self.status == .view_change);
937
1062
  assert(message.header.view == self.view);
938
1063
 
939
1064
  var latest = Header.reserved();
940
- self.set_latest_header(self.message_body_as_headers(message), &latest);
1065
+ set_latest_op(self.message_body_as_headers(message), &latest);
941
1066
  assert(latest.op == message.header.op);
942
1067
 
943
1068
  self.set_latest_op_and_k(&latest, message.header.commit, "on_start_view");
944
- assert(!self.view_jump_barrier);
945
1069
 
946
1070
  // Now that we have the latest op in place, repair any other headers:
947
1071
  for (self.message_body_as_headers(message)) |*h| {
@@ -974,7 +1098,7 @@ pub fn Replica(
974
1098
  assert(self.leader());
975
1099
 
976
1100
  const start_view = self.create_view_change_message(.start_view) orelse {
977
- log.alert("{}: on_request_start_view: dropping start_view, no message available", .{
1101
+ log.err("{}: on_request_start_view: dropping start_view, no message available", .{
978
1102
  self.replica,
979
1103
  });
980
1104
  return;
@@ -990,6 +1114,69 @@ pub fn Replica(
990
1114
  self.send_message_to_replica(message.header.replica, start_view);
991
1115
  }
992
1116
 
1117
+ /// TODO This is a work in progress (out of scope for the bounty)
1118
+ fn on_recovery(self: *Self, message: *const Message) void {
1119
+ if (self.status != .normal) {
1120
+ log.debug("{}: on_recovery: ignoring ({})", .{ self.replica, self.status });
1121
+ return;
1122
+ }
1123
+
1124
+ if (message.header.replica == self.replica) {
1125
+ log.warn("{}: on_recovery: ignoring (self)", .{self.replica});
1126
+ return;
1127
+ }
1128
+
1129
+ const response = self.message_bus.get_message() orelse {
1130
+ log.err("{}: on_recovery: ignoring (waiting for message)", .{self.replica});
1131
+ return;
1132
+ };
1133
+ defer self.message_bus.unref(response);
1134
+
1135
+ response.header.* = .{
1136
+ .command = .recovery_response,
1137
+ .cluster = self.cluster,
1138
+ .context = message.header.context,
1139
+ .replica = self.replica,
1140
+ .view = self.view,
1141
+ .op = self.op,
1142
+ .commit = self.commit_max,
1143
+ };
1144
+
1145
+ const count_max = 8; // The number of prepare headers to include in the body.
1146
+
1147
+ const size_max = @sizeOf(Header) * std.math.min(
1148
+ std.math.max(@divFloor(response.buffer.len, @sizeOf(Header)), 2),
1149
+ 1 + count_max,
1150
+ );
1151
+ assert(size_max > @sizeOf(Header));
1152
+
1153
+ const count = self.journal.copy_latest_headers_between(
1154
+ 0,
1155
+ self.op,
1156
+ std.mem.bytesAsSlice(Header, response.buffer[@sizeOf(Header)..size_max]),
1157
+ );
1158
+
1159
+ // We expect that self.op always exists.
1160
+ assert(count > 0);
1161
+
1162
+ response.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
1163
+
1164
+ response.header.set_checksum_body(response.body());
1165
+ response.header.set_checksum();
1166
+
1167
+ assert(self.status == .normal);
1168
+ // The checksum for a recovery message is deterministic, and cannot be used as a nonce:
1169
+ assert(response.header.context != message.header.checksum);
1170
+
1171
+ self.send_message_to_replica(message.header.replica, response);
1172
+ }
1173
+
1174
+ /// TODO This is a work in progress (out of scope for the bounty)
1175
+ fn on_recovery_response(self: *Self, message: *Message) void {
1176
+ _ = self;
1177
+ _ = message;
1178
+ }
1179
+
993
1180
  fn on_request_prepare(self: *Self, message: *const Message) void {
994
1181
  if (self.ignore_repair_message(message)) return;
995
1182
 
@@ -1008,6 +1195,15 @@ pub fn Replica(
1008
1195
  if (!self.journal.dirty.bit(op)) {
1009
1196
  assert(!self.journal.faulty.bit(op));
1010
1197
 
1198
+ log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
1199
+ self.replica,
1200
+ op,
1201
+ checksum,
1202
+ });
1203
+
1204
+ // TODO Do not reissue the read if we are already reading in order to send to
1205
+ // this particular destination replica.
1206
+
1011
1207
  self.journal.read_prepare(
1012
1208
  on_request_prepare_read,
1013
1209
  op,
@@ -1018,6 +1214,12 @@ pub fn Replica(
1018
1214
  // We have guaranteed the prepare and our copy is clean (not safe to nack).
1019
1215
  return;
1020
1216
  } else if (self.journal.faulty.bit(op)) {
1217
+ log.debug("{}: on_request_prepare: op={} checksum={} faulty", .{
1218
+ self.replica,
1219
+ op,
1220
+ checksum,
1221
+ });
1222
+
1021
1223
  // We have guaranteed the prepare but our copy is faulty (not safe to nack).
1022
1224
  return;
1023
1225
  }
@@ -1032,6 +1234,13 @@ pub fn Replica(
1032
1234
  if (self.journal.entry_for_op_exact_with_checksum(op, checksum) != null) {
1033
1235
  assert(self.journal.dirty.bit(op) and !self.journal.faulty.bit(op));
1034
1236
  }
1237
+
1238
+ log.debug("{}: on_request_prepare: op={} checksum={} nacking", .{
1239
+ self.replica,
1240
+ op,
1241
+ checksum,
1242
+ });
1243
+
1035
1244
  self.send_header_to_replica(message.header.replica, .{
1036
1245
  .command = .nack_prepare,
1037
1246
  .context = checksum.?,
@@ -1044,7 +1253,17 @@ pub fn Replica(
1044
1253
  }
1045
1254
 
1046
1255
  fn on_request_prepare_read(self: *Self, prepare: ?*Message, destination_replica: ?u8) void {
1047
- const message = prepare orelse return;
1256
+ const message = prepare orelse {
1257
+ log.debug("{}: on_request_prepare_read: prepare=null", .{self.replica});
1258
+ return;
1259
+ };
1260
+
1261
+ log.debug("{}: on_request_prepare_read: op={} checksum={} sending to replica={}", .{
1262
+ self.replica,
1263
+ message.header.op,
1264
+ message.header.checksum,
1265
+ destination_replica.?,
1266
+ });
1048
1267
 
1049
1268
  assert(destination_replica.? != self.replica);
1050
1269
  self.send_message_to_replica(destination_replica.?, message);
@@ -1058,8 +1277,10 @@ pub fn Replica(
1058
1277
  assert(message.header.replica != self.replica);
1059
1278
 
1060
1279
  const response = self.message_bus.get_message() orelse {
1061
- log.alert("{}: on_request_headers: dropping response, no message available", .{
1280
+ log.err("{}: on_request_headers: ignoring (op={}..{}, no message available)", .{
1062
1281
  self.replica,
1282
+ message.header.commit,
1283
+ message.header.op,
1063
1284
  });
1064
1285
  return;
1065
1286
  };
@@ -1095,7 +1316,11 @@ pub fn Replica(
1095
1316
  );
1096
1317
 
1097
1318
  if (count == 0) {
1098
- log.debug("{}: on_request_headers: no headers found, dropping", .{self.replica});
1319
+ log.debug("{}: on_request_headers: ignoring (op={}..{}, no headers)", .{
1320
+ self.replica,
1321
+ op_min,
1322
+ op_max,
1323
+ });
1099
1324
  return;
1100
1325
  }
1101
1326
 
@@ -1130,17 +1355,59 @@ pub fn Replica(
1130
1355
  return;
1131
1356
  }
1132
1357
 
1358
+ if (message.header.context != checksum) {
1359
+ log.debug("{}: on_nack_prepare: ignoring (repairing another checksum)", .{
1360
+ self.replica,
1361
+ });
1362
+ return;
1363
+ }
1364
+
1133
1365
  // Followers may not send a `nack_prepare` for a different checksum:
1134
- // TODO However our op may change in between sending the request and getting the nack.
1366
+ // However our op may change in between sending the request and getting the nack.
1367
+ assert(message.header.op == op);
1135
1368
  assert(message.header.context == checksum);
1136
1369
 
1137
- // We require a `nack_prepare` from a majority of followers if our op is faulty:
1138
- // Otherwise, we know we do not have the op and need only `f` other nacks.
1370
+ // Here are what our nack quorums look like, if we know our op is faulty:
1371
+ // These are for various replication quorums under Flexible Paxos.
1372
+ // We need to have enough nacks to guarantee that `quorum_replication` was not reached,
1373
+ // because if the replication quorum was reached, then it may have been committed.
1374
+ // We add `1` in each case because our op is faulty and may have been counted.
1375
+ //
1376
+ // replica_count=2 - quorum_replication=2 + 1 = 0 + 1 = 1 nacks required
1377
+ // replica_count=3 - quorum_replication=2 + 1 = 1 + 1 = 2 nacks required
1378
+ // replica_count=4 - quorum_replication=2 + 1 = 2 + 1 = 3 nacks required
1379
+ // replica_count=4 - quorum_replication=3 + 1 = 1 + 1 = 2 nacks required
1380
+ // replica_count=5 - quorum_replication=2 + 1 = 3 + 1 = 4 nacks required
1381
+ // replica_count=5 - quorum_replication=3 + 1 = 2 + 1 = 3 nacks required
1382
+ //
1383
+ // Otherwise, if we know we do not have the op, then we can exclude ourselves.
1139
1384
  assert(self.replica_count > 1);
1385
+
1140
1386
  const threshold = if (self.journal.faulty.bit(op))
1141
- self.quorum_view_change
1387
+ self.replica_count - self.quorum_replication + 1
1142
1388
  else
1143
- self.quorum_view_change - 1;
1389
+ self.replica_count - self.quorum_replication;
1390
+
1391
+ if (threshold == 0) {
1392
+ assert(self.replica_count == 2);
1393
+ assert(!self.journal.faulty.bit(op));
1394
+
1395
+ // This is a special case for a cluster-of-two, handled in `repair_prepare()`.
1396
+ log.debug("{}: on_nack_prepare: ignoring (cluster-of-two, not faulty)", .{
1397
+ self.replica,
1398
+ });
1399
+ return;
1400
+ }
1401
+
1402
+ log.debug("{}: on_nack_prepare: quorum_replication={} threshold={}", .{
1403
+ self.replica,
1404
+ self.quorum_replication,
1405
+ threshold,
1406
+ });
1407
+
1408
+ // We should never expect to receive a nack from ourselves:
1409
+ // Detect if we ever set `threshold` to `quorum_view_change` for a cluster-of-two again.
1410
+ assert(threshold < self.replica_count);
1144
1411
 
1145
1412
  // Wait until we have `threshold` messages for quorum:
1146
1413
  const count = self.add_message_and_receive_quorum_exactly_once(
@@ -1153,32 +1420,8 @@ pub fn Replica(
1153
1420
  assert(self.nack_prepare_from_other_replicas[self.replica] == null);
1154
1421
  log.debug("{}: on_nack_prepare: quorum received", .{self.replica});
1155
1422
 
1156
- assert(self.valid_hash_chain("on_nack_prepare"));
1157
-
1158
- assert(op > self.commit_max);
1159
- assert(op <= self.op);
1160
- assert(self.journal.entry_for_op_exact_with_checksum(op, checksum) != null);
1161
- assert(self.journal.dirty.bit(op));
1162
-
1163
- log.debug("{}: on_nack_prepare: discarding uncommitted ops={}..{}", .{
1164
- self.replica,
1165
- op,
1166
- self.op,
1167
- });
1168
-
1169
- self.journal.remove_entries_from(op);
1170
- self.op = op - 1;
1171
-
1172
- assert(self.journal.entry_for_op(op) == null);
1173
- assert(!self.journal.dirty.bit(op));
1174
- assert(!self.journal.faulty.bit(op));
1175
-
1176
- // We require that `self.op` always exists. Rewinding `self.op` could change that.
1177
- // However, we do this only as the leader within a view change, with all headers intact.
1178
- assert(self.journal.entry_for_op_exact(self.op) != null);
1179
-
1423
+ self.discard_uncommitted_ops_from(op, checksum);
1180
1424
  self.reset_quorum_nack_prepare();
1181
-
1182
1425
  self.repair();
1183
1426
  }
1184
1427
 
@@ -1192,11 +1435,6 @@ pub fn Replica(
1192
1435
  // We expect at least one header in the body, or otherwise no response to our request.
1193
1436
  assert(message.header.size > @sizeOf(Header));
1194
1437
 
1195
- if (self.view_jump_barrier) {
1196
- log.debug("{}: on_headers: ignoring (view jump barrier)", .{self.replica});
1197
- return;
1198
- }
1199
-
1200
1438
  var op_min: ?u64 = null;
1201
1439
  var op_max: ?u64 = null;
1202
1440
  for (self.message_body_as_headers(message)) |*h| {
@@ -1232,7 +1470,7 @@ pub fn Replica(
1232
1470
  assert(self.status == .normal);
1233
1471
  assert(self.leader());
1234
1472
 
1235
- const prepare = self.pipeline.peek_ptr().?;
1473
+ const prepare = self.pipeline.head_ptr().?;
1236
1474
  assert(prepare.message.header.command == .prepare);
1237
1475
 
1238
1476
  if (prepare.ok_quorum_received) {
@@ -1261,10 +1499,24 @@ pub fn Replica(
1261
1499
 
1262
1500
  log.debug("{}: on_prepare_timeout: waiting for journal", .{self.replica});
1263
1501
  assert(prepare.ok_from_all_replicas[self.replica] == null);
1502
+
1503
+ // We may be slow and waiting for the write to complete.
1504
+ //
1505
+ // We may even have maxed out our IO depth and been unable to initiate the write,
1506
+ // which can happen if `config.pipelining_max` exceeds `config.io_depth_write`.
1507
+ // This can lead to deadlock for a cluster of one or two (if we do not retry here),
1508
+ // since there is no other way for the leader to repair the dirty op because no
1509
+ // other replica has it.
1510
+ //
1511
+ // Retry the write through `on_repair()` which will work out which is which.
1512
+ // We do expect that the op would have been run through `on_prepare()` already.
1513
+ assert(prepare.message.header.op <= self.op);
1514
+ self.on_repair(prepare.message);
1515
+
1264
1516
  return;
1265
1517
  }
1266
1518
 
1267
- self.prepare_timeout.backoff(&self.prng);
1519
+ self.prepare_timeout.backoff(self.prng.random());
1268
1520
 
1269
1521
  assert(waiting_len <= self.replica_count);
1270
1522
  for (waiting[0..waiting_len]) |replica| {
@@ -1277,7 +1529,7 @@ pub fn Replica(
1277
1529
  }
1278
1530
 
1279
1531
  // Cycle through the list to reach live replicas and get around partitions:
1280
- assert(self.prepare_timeout.attempts > 0);
1532
+ // We do not assert `prepare_timeout.attempts > 0` since the counter may wrap back to 0.
1281
1533
  const replica = waiting[self.prepare_timeout.attempts % waiting_len];
1282
1534
  assert(replica != self.replica);
1283
1535
 
@@ -1305,13 +1557,13 @@ pub fn Replica(
1305
1557
  });
1306
1558
  }
1307
1559
 
1308
- fn on_election_timeout(self: *Self) void {
1560
+ fn on_normal_status_timeout(self: *Self) void {
1309
1561
  assert(self.status == .normal);
1310
1562
  assert(self.follower());
1311
1563
  self.transition_to_view_change_status(self.view + 1);
1312
1564
  }
1313
1565
 
1314
- fn on_view_change_timeout(self: *Self) void {
1566
+ fn on_view_change_status_timeout(self: *Self) void {
1315
1567
  assert(self.status == .view_change);
1316
1568
  self.transition_to_view_change_status(self.view + 1);
1317
1569
  }
@@ -1433,7 +1685,6 @@ pub fn Replica(
1433
1685
 
1434
1686
  /// Returns whether `b` succeeds `a` by having a newer view or same view and newer op.
1435
1687
  fn ascending_viewstamps(
1436
- self: *Self,
1437
1688
  a: *const Header,
1438
1689
  b: *const Header,
1439
1690
  ) bool {
@@ -1475,6 +1726,8 @@ pub fn Replica(
1475
1726
  }
1476
1727
 
1477
1728
  /// Commit ops up to commit number `commit` (inclusive).
1729
+ /// A function which calls `commit_ops()` to set `commit_max` must first call `view_jump()`.
1730
+ /// Otherwise, we may fork the log.
1478
1731
  fn commit_ops(self: *Self, commit: u64) void {
1479
1732
  // TODO Restrict `view_change` status only to the leader purely as defense-in-depth.
1480
1733
  // Be careful of concurrency when doing this, as successive view changes can happen quickly.
@@ -1504,9 +1757,15 @@ pub fn Replica(
1504
1757
  return;
1505
1758
  }
1506
1759
 
1507
- if (!self.valid_hash_chain("commit_ops")) return;
1508
- assert(!self.view_jump_barrier);
1509
- assert(self.op >= self.commit_max);
1760
+ // We check the hash chain before we read each op, rather than once upfront, because
1761
+ // it's possible for `commit_max` to change while we read asynchronously, after we
1762
+ // validate the hash chain.
1763
+ //
1764
+ // We therefore cannot keep committing until we reach `commit_max`. We need to verify
1765
+ // the hash chain before each read. Once verified (before the read) we can commit in the
1766
+ // callback after the read, but if we see a change we need to stop committing any
1767
+ // further ops, because `commit_max` may have been bumped and may refer to a different
1768
+ // op.
1510
1769
 
1511
1770
  assert(!self.committing);
1512
1771
  self.committing = true;
@@ -1520,6 +1779,12 @@ pub fn Replica(
1520
1779
  assert(self.commit_min <= self.commit_max);
1521
1780
  assert(self.commit_min <= self.op);
1522
1781
 
1782
+ if (!self.valid_hash_chain("commit_ops_read")) {
1783
+ self.committing = false;
1784
+ return;
1785
+ }
1786
+ assert(self.op >= self.commit_max);
1787
+
1523
1788
  // We may receive commit numbers for ops we do not yet have (`commit_max > self.op`):
1524
1789
  // Even a naive state transfer may fail to correct for this.
1525
1790
  if (self.commit_min < self.commit_max and self.commit_min < self.op) {
@@ -1571,7 +1836,7 @@ pub fn Replica(
1571
1836
 
1572
1837
  // TODO We can optimize this to commit into the client table reply if it exists.
1573
1838
  const reply = self.message_bus.get_message() orelse {
1574
- log.alert("{}: commit_ops_commit: waiting for message", .{self.replica});
1839
+ log.err("{}: commit_ops_commit: waiting for message", .{self.replica});
1575
1840
  return;
1576
1841
  };
1577
1842
  defer self.message_bus.unref(reply);
@@ -1594,12 +1859,17 @@ pub fn Replica(
1594
1859
  assert(prepare.header.op == self.commit_min + 1);
1595
1860
  assert(prepare.header.op <= self.op);
1596
1861
 
1597
- if (!self.valid_hash_chain("commit_op")) return;
1598
- assert(!self.view_jump_barrier);
1599
- assert(self.op >= self.commit_max);
1862
+ // If we are a follower committing through `commit_ops()` then a view change may have
1863
+ // happened since we last checked in `commit_ops_read()`. However, this would relate to
1864
+ // subsequent ops, since by now we have already verified the hash chain for this commit.
1600
1865
 
1601
- log.debug("{}: commit_op: executing op={} checksum={} ({s})", .{
1866
+ assert(self.journal.entry_for_op_exact(self.commit_min).?.checksum ==
1867
+ prepare.header.parent);
1868
+
1869
+ log.debug("{}: commit_op: executing view={} {} op={} checksum={} ({s})", .{
1602
1870
  self.replica,
1871
+ self.view,
1872
+ self.leader_index(self.view) == self.replica,
1603
1873
  prepare.header.op,
1604
1874
  prepare.header.checksum,
1605
1875
  @tagName(prepare.header.operation.cast(StateMachine)),
@@ -1657,7 +1927,7 @@ pub fn Replica(
1657
1927
  assert(self.leader());
1658
1928
  assert(self.pipeline.count > 0);
1659
1929
 
1660
- while (self.pipeline.peek_ptr()) |prepare| {
1930
+ while (self.pipeline.head_ptr()) |prepare| {
1661
1931
  assert(self.pipeline.count > 0);
1662
1932
  assert(self.commit_min == self.commit_max);
1663
1933
  assert(self.commit_max + self.pipeline.count == self.op);
@@ -1679,7 +1949,7 @@ pub fn Replica(
1679
1949
  // TODO We can optimize this to commit into the client table reply if it exists.
1680
1950
  const reply = self.message_bus.get_message() orelse {
1681
1951
  // Eventually handled by on_prepare_timeout().
1682
- log.alert("{}: commit_pipeline: waiting for message", .{self.replica});
1952
+ log.err("{}: commit_pipeline: waiting for message", .{self.replica});
1683
1953
  return;
1684
1954
  };
1685
1955
  defer self.message_bus.unref(reply);
@@ -1760,13 +2030,14 @@ pub fn Replica(
1760
2030
  assert(request == 0);
1761
2031
 
1762
2032
  // For correctness, it's critical that all replicas evict deterministically:
1763
- // We cannot depend on `HashMap.capacity()` since `HashMap.ensureCapacity()` may change
1764
- // across different versions of the Zig std lib. We therefore rely on `config.clients_max`,
1765
- // which must be the same across all replicas, and must not change after initing a cluster.
1766
- // We also do not depend on `HashMap.valueIterator()` being deterministic here. However, we
1767
- // do require that all entries have different commit numbers and are at least iterated.
2033
+ // We cannot depend on `HashMap.capacity()` since `HashMap.ensureTotalCapacity()` may
2034
+ // change across versions of the Zig std lib. We therefore rely on `config.clients_max`,
2035
+ // which must be the same across all replicas, and must not change after initializing a
2036
+ // cluster.
2037
+ // We also do not depend on `HashMap.valueIterator()` being deterministic here. However,
2038
+ // we do require that all entries have different commit numbers and are iterated.
1768
2039
  // This ensures that we will always pick the entry with the oldest commit number.
1769
- // We also double-check that a client has only one entry in the hash map (or it's buggy).
2040
+ // We also check that a client has only one entry in the hash map (or it's buggy).
1770
2041
  const clients = self.client_table.count();
1771
2042
  assert(clients <= config.clients_max);
1772
2043
  if (clients == config.clients_max) {
@@ -1791,7 +2062,7 @@ pub fn Replica(
1791
2062
  }
1792
2063
  }
1793
2064
  assert(iterated == clients);
1794
- log.alert("{}: create_client_table_entry: clients={}/{} evicting client={}", .{
2065
+ log.err("{}: create_client_table_entry: clients={}/{} evicting client={}", .{
1795
2066
  self.replica,
1796
2067
  clients,
1797
2068
  config.clients_max,
@@ -1833,11 +2104,23 @@ pub fn Replica(
1833
2104
  .cluster = self.cluster,
1834
2105
  .replica = self.replica,
1835
2106
  .view = self.view,
2107
+ // The latest normal view (as specified in the 2012 paper) is different to the view
2108
+ // number contained in the prepare headers we include in the body. The former shows
2109
+ // how recent a view change the replica participated in, which may be much higher.
2110
+ // We use the `offset` field to send this in addition to the current view number:
2111
+ .offset = if (command == .do_view_change) self.view_normal else 0,
1836
2112
  .op = self.op,
1837
2113
  .commit = self.commit_max,
1838
2114
  };
1839
2115
 
1840
- const count_max = 8; // The number of prepare headers to include in the body.
2116
+ // CRITICAL: The number of prepare headers to include in the body:
2117
+ // We must provide enough headers to cover all uncommitted headers so that the new
2118
+ // leader (if we are in a view change) can decide whether to discard uncommitted headers
2119
+ // that cannot be repaired because they are gaps, and this must be relative to the
2120
+ // cluster as a whole (not relative to the difference between our op and commit number)
2121
+ // as otherwise we would break correctness.
2122
+ const count_max = config.pipelining_max;
2123
+ assert(count_max > 0);
1841
2124
 
1842
2125
  const size_max = @sizeOf(Header) * std.math.min(
1843
2126
  std.math.max(@divFloor(message.buffer.len, @sizeOf(Header)), 2),
@@ -1865,7 +2148,7 @@ pub fn Replica(
1865
2148
  /// The caller owns the returned message, if any, which has exactly 1 reference.
1866
2149
  fn create_message_from_header(self: *Self, header: Header) ?*Message {
1867
2150
  assert(header.replica == self.replica);
1868
- assert(header.view == self.view);
2151
+ assert(header.view == self.view or header.command == .request_start_view);
1869
2152
  assert(header.size == @sizeOf(Header));
1870
2153
 
1871
2154
  const message = self.message_bus.pool.get_header_only_message() orelse return null;
@@ -1878,6 +2161,112 @@ pub fn Replica(
1878
2161
  return message.ref();
1879
2162
  }
1880
2163
 
2164
+ /// Discards uncommitted headers during a view change before the new leader starts the view.
2165
+ /// This is required to maximize availability in the presence of storage faults.
2166
+ /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
2167
+ ///
2168
+ /// It's possible for the new leader to have done an op jump in a previous view, and so
2169
+ /// introduced a header gap for an op, which was then discarded by another leader during a
2170
+ /// newer view change, before surviving into this view as a gap because our latest op was
2171
+ /// set as the latest op for the quorum.
2172
+ ///
2173
+ /// In this case, it may be impossible for the new leader to repair the missing header since
2174
+ /// the rest of the cluster may have already discarded it. We therefore iterate over our
2175
+ /// uncommitted header gaps and compare them with the quorum of do_view_change messages
2176
+ /// received from other replicas, before starting the new view, to discard any that may be
2177
+ /// impossible to repair.
2178
+ fn discard_uncommitted_headers(self: *Self) void {
2179
+ assert(self.status == .view_change);
2180
+ assert(self.leader_index(self.view) == self.replica);
2181
+ assert(self.do_view_change_quorum);
2182
+ assert(!self.repair_timeout.ticking);
2183
+ assert(self.op >= self.commit_max);
2184
+ assert(self.replica_count > 1);
2185
+
2186
+ const threshold = self.replica_count - self.quorum_replication;
2187
+ if (threshold == 0) {
2188
+ assert(self.replica_count == 2);
2189
+ return;
2190
+ }
2191
+
2192
+ var op = self.op;
2193
+ while (op > self.commit_max) : (op -= 1) {
2194
+ if (self.journal.entry_for_op_exact(op) != null) continue;
2195
+
2196
+ log.debug("{}: discard_uncommitted_headers: op={} gap", .{ self.replica, op });
2197
+
2198
+ var nacks: usize = 0;
2199
+ for (self.do_view_change_from_all_replicas) |received, replica| {
2200
+ if (received) |m| {
2201
+ assert(m.header.command == .do_view_change);
2202
+ assert(m.header.cluster == self.cluster);
2203
+ assert(m.header.replica == replica);
2204
+ assert(m.header.view == self.view);
2205
+
2206
+ if (replica != self.replica) {
2207
+ if (m.header.op < op) nacks += 1;
2208
+
2209
+ log.debug("{}: discard_uncommitted_headers: replica={} op={}", .{
2210
+ self.replica,
2211
+ m.header.replica,
2212
+ m.header.op,
2213
+ });
2214
+ }
2215
+ }
2216
+ }
2217
+
2218
+ log.debug("{}: discard_uncommitted_headers: op={} nacks={} threshold={}", .{
2219
+ self.replica,
2220
+ op,
2221
+ nacks,
2222
+ threshold,
2223
+ });
2224
+
2225
+ if (nacks >= threshold) {
2226
+ self.journal.remove_entries_from(op);
2227
+ self.op = op - 1;
2228
+
2229
+ assert(self.journal.entry_for_op(op) == null);
2230
+ assert(!self.journal.dirty.bit(op));
2231
+ assert(!self.journal.faulty.bit(op));
2232
+ }
2233
+ }
2234
+ }
2235
+
2236
+ /// Discards uncommitted ops during a view change from after and including `op`.
2237
+ /// This is required to maximize availability in the presence of storage faults.
2238
+ /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
2239
+ fn discard_uncommitted_ops_from(self: *Self, op: u64, checksum: u128) void {
2240
+ assert(self.status == .view_change);
2241
+ assert(self.leader_index(self.view) == self.replica);
2242
+ assert(self.repairs_allowed());
2243
+
2244
+ assert(self.valid_hash_chain("discard_uncommitted_ops_from"));
2245
+
2246
+ assert(op > self.commit_max);
2247
+ assert(op <= self.op);
2248
+ assert(self.journal.entry_for_op_exact_with_checksum(op, checksum) != null);
2249
+ assert(self.journal.dirty.bit(op));
2250
+
2251
+ log.debug("{}: discard_uncommitted_ops_from: ops={}..{} view={}", .{
2252
+ self.replica,
2253
+ op,
2254
+ self.op,
2255
+ self.view,
2256
+ });
2257
+
2258
+ self.journal.remove_entries_from(op);
2259
+ self.op = op - 1;
2260
+
2261
+ assert(self.journal.entry_for_op(op) == null);
2262
+ assert(!self.journal.dirty.bit(op));
2263
+ assert(!self.journal.faulty.bit(op));
2264
+
2265
+ // We require that `self.op` always exists. Rewinding `self.op` could change that.
2266
+ // However, we do this only as the leader within a view change, with all headers intact.
2267
+ assert(self.journal.entry_for_op_exact(self.op) != null);
2268
+ }
2269
+
1881
2270
  /// Returns whether the replica is a follower for the current view.
1882
2271
  /// This may be used only when the replica status is normal.
1883
2272
  fn follower(self: *Self) bool {
@@ -1898,6 +2287,7 @@ pub fn Replica(
1898
2287
  if (self.loopback_queue) |message| {
1899
2288
  defer self.message_bus.unref(message);
1900
2289
 
2290
+ assert(message.next == null);
1901
2291
  self.loopback_queue = null;
1902
2292
  assert(message.header.replica == self.replica);
1903
2293
  self.on_message(message);
@@ -1954,8 +2344,6 @@ pub fn Replica(
1954
2344
  return true;
1955
2345
  }
1956
2346
 
1957
- // We should never view jump unless we know what our status should be after the jump:
1958
- // Otherwise we may be normal before the leader, or in a view change that has completed.
1959
2347
  if (message.header.view > self.view) {
1960
2348
  log.debug("{}: on_{s}: ignoring (newer view)", .{ self.replica, command });
1961
2349
  return true;
@@ -2068,11 +2456,11 @@ pub fn Replica(
2068
2456
  // Fall through below to check if we should resend the .register session reply.
2069
2457
  } else if (entry.session > message.header.context) {
2070
2458
  // The client must not reuse the ephemeral client ID when registering a new session.
2071
- log.alert("{}: on_request: ignoring older session (client bug)", .{self.replica});
2459
+ log.err("{}: on_request: ignoring older session (client bug)", .{self.replica});
2072
2460
  return true;
2073
2461
  } else if (entry.session < message.header.context) {
2074
2462
  // This cannot be because of a partition since we check the client's view number.
2075
- log.alert("{}: on_request: ignoring newer session (client bug)", .{self.replica});
2463
+ log.err("{}: on_request: ignoring newer session (client bug)", .{self.replica});
2076
2464
  return true;
2077
2465
  }
2078
2466
 
@@ -2087,7 +2475,7 @@ pub fn Replica(
2087
2475
  self.message_bus.send_message_to_client(message.header.client, entry.reply);
2088
2476
  return true;
2089
2477
  } else {
2090
- log.alert("{}: on_request: request collision (client bug)", .{self.replica});
2478
+ log.err("{}: on_request: request collision (client bug)", .{self.replica});
2091
2479
  return true;
2092
2480
  }
2093
2481
  } else if (entry.reply.header.request + 1 == message.header.request) {
@@ -2097,16 +2485,24 @@ pub fn Replica(
2097
2485
  return false;
2098
2486
  } else {
2099
2487
  // The client may have only one request inflight at a time.
2100
- log.alert("{}: on_request: ignoring new request (client bug)", .{self.replica});
2488
+ log.err("{}: on_request: ignoring new request (client bug)", .{self.replica});
2101
2489
  return true;
2102
2490
  }
2103
2491
  } else {
2104
- log.alert("{}: on_request: ignoring newer request (client bug)", .{self.replica});
2492
+ log.err("{}: on_request: ignoring newer request (client bug)", .{self.replica});
2105
2493
  return true;
2106
2494
  }
2107
2495
  } else if (message.header.operation == .register) {
2108
2496
  log.debug("{}: on_request: new session", .{self.replica});
2109
2497
  return false;
2498
+ } else if (self.pipeline_prepare_for_client(message.header.client)) |_| {
2499
+ // The client registered with the previous leader, which committed and replied back
2500
+ // to the client before the view change, after which the register operation was
2501
+ // reloaded into the pipeline to be driven to completion by the new leader, which
2502
+ // now receives a request from the client that appears to have no session.
2503
+ // However, the session is about to be registered, so we must wait for it to commit.
2504
+ log.debug("{}: on_request: waiting for session to commit", .{self.replica});
2505
+ return true;
2110
2506
  } else {
2111
2507
  // We must have all commits to know whether a session has been evicted. For example,
2112
2508
  // there is the risk of sending an eviction message (even as the leader) if we are
@@ -2180,7 +2576,7 @@ pub fn Replica(
2180
2576
  log.debug("{}: on_request: ignoring (already preparing)", .{self.replica});
2181
2577
  return true;
2182
2578
  } else {
2183
- log.alert("{}: on_request: ignoring (client forked)", .{self.replica});
2579
+ log.err("{}: on_request: ignoring (client forked)", .{self.replica});
2184
2580
  return true;
2185
2581
  }
2186
2582
  }
@@ -2216,12 +2612,8 @@ pub fn Replica(
2216
2612
  }
2217
2613
 
2218
2614
  if (message.header.view == self.view and self.status == .normal) {
2219
- if (message.header.command == .start_view and self.view_jump_barrier) {
2220
- // Fall through, we requested this start_view to clear our view jump barrier.
2221
- } else {
2222
- log.debug("{}: on_{s}: ignoring (view started)", .{ self.replica, command });
2223
- return true;
2224
- }
2615
+ log.debug("{}: on_{s}: ignoring (view started)", .{ self.replica, command });
2616
+ return true;
2225
2617
  }
2226
2618
 
2227
2619
  // These may be caused by faults in the network topology.
@@ -2293,7 +2685,7 @@ pub fn Replica(
2293
2685
  assert(self.op + 1 == header.op);
2294
2686
  }
2295
2687
 
2296
- fn message_body_as_headers(self: *Self, message: *const Message) []Header {
2688
+ fn message_body_as_headers(_: *Self, message: *const Message) []Header {
2297
2689
  // TODO Assert message commands that we expect this to be called for.
2298
2690
  assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
2299
2691
  return std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..message.header.size]);
@@ -2312,8 +2704,8 @@ pub fn Replica(
2312
2704
  if (a.view == b.view and a.op + 1 == b.op and a.checksum != b.parent) {
2313
2705
  assert(a.valid_checksum());
2314
2706
  assert(b.valid_checksum());
2315
- log.emerg("{}: panic_if_hash_chain_would_break: a: {}", .{ self.replica, a });
2316
- log.emerg("{}: panic_if_hash_chain_would_break: b: {}", .{ self.replica, b });
2707
+ log.err("{}: panic_if_hash_chain_would_break: a: {}", .{ self.replica, a });
2708
+ log.err("{}: panic_if_hash_chain_would_break: b: {}", .{ self.replica, b });
2317
2709
  @panic("hash chain would break");
2318
2710
  }
2319
2711
  }
@@ -2332,6 +2724,8 @@ pub fn Replica(
2332
2724
  assert(prepare.message.header.op == op);
2333
2725
  assert(prepare.message.header.parent == parent);
2334
2726
 
2727
+ // A client may have multiple requests in the pipeline if these were committed by
2728
+ // the previous leader and were reloaded into the pipeline after a view change.
2335
2729
  if (prepare.message.header.client == client) return prepare;
2336
2730
 
2337
2731
  parent = prepare.message.header.checksum;
@@ -2399,26 +2793,10 @@ pub fn Replica(
2399
2793
  assert(self.commit_min <= self.op);
2400
2794
  assert(self.commit_min <= self.commit_max);
2401
2795
 
2402
- // TODO Handle case where we are requesting reordered headers that no longer exist.
2403
-
2404
2796
  // We expect these always to exist:
2405
2797
  assert(self.journal.entry_for_op_exact(self.commit_min) != null);
2406
2798
  assert(self.journal.entry_for_op_exact(self.op) != null);
2407
2799
 
2408
- // Resolve any view jump by requesting the leader's latest op:
2409
- if (self.view_jump_barrier) {
2410
- assert(self.status == .normal);
2411
- assert(self.follower());
2412
- log.debug("{}: repair: resolving view jump barrier", .{self.replica});
2413
- self.send_header_to_replica(self.leader_index(self.view), .{
2414
- .command = .request_start_view,
2415
- .cluster = self.cluster,
2416
- .replica = self.replica,
2417
- .view = self.view,
2418
- });
2419
- return;
2420
- }
2421
-
2422
2800
  // Request outstanding committed prepares to advance our op number:
2423
2801
  // This handles the case of an idle cluster, where a follower will not otherwise advance.
2424
2802
  // This is not required for correctness, but for durability.
@@ -2450,14 +2828,15 @@ pub fn Replica(
2450
2828
 
2451
2829
  // Request any missing or disconnected headers:
2452
2830
  // TODO Snapshots: Ensure that self.commit_min op always exists in the journal.
2453
- assert(!self.view_jump_barrier);
2454
2831
  var broken = self.journal.find_latest_headers_break_between(self.commit_min, self.op);
2455
2832
  if (broken) |range| {
2456
- log.debug("{}: repair: latest break: op_min={} op_max={} (commit_min={} op={})", .{
2833
+ log.debug("{}: repair: break: view={} op_min={} op_max={} (commit={}..{} op={})", .{
2457
2834
  self.replica,
2835
+ self.view,
2458
2836
  range.op_min,
2459
2837
  range.op_max,
2460
2838
  self.commit_min,
2839
+ self.commit_max,
2461
2840
  self.op,
2462
2841
  });
2463
2842
  assert(range.op_min > self.commit_min);
@@ -2480,7 +2859,6 @@ pub fn Replica(
2480
2859
  }
2481
2860
 
2482
2861
  // Assert that all headers are now present and connected with a perfect hash chain:
2483
- assert(!self.view_jump_barrier);
2484
2862
  assert(self.op >= self.commit_max);
2485
2863
  assert(self.valid_hash_chain_between(self.commit_min, self.op));
2486
2864
 
@@ -2514,15 +2892,13 @@ pub fn Replica(
2514
2892
  /// * The latest op makes sense of everything else and must not be replaced with a different
2515
2893
  /// op or advanced except by the leader in the current view.
2516
2894
  ///
2517
- /// * Do not jump to a view in normal status without imposing a view jump barrier.
2518
- ///
2519
- /// * Do not commit before resolving the view jump barrier with the leader.
2895
+ /// * Do not jump to a view in normal status without receiving a start_view message.
2520
2896
  ///
2521
2897
  /// * Do not commit until the hash chain between `self.commit_min` and `self.op` is fully
2522
2898
  /// connected, to ensure that all the ops in this range are correct.
2523
2899
  ///
2524
2900
  /// * Ensure that `self.commit_max` is never advanced for a newer view without first
2525
- /// imposing a view jump barrier, otherwise `self.commit_max` may refer to different ops.
2901
+ /// receiving a start_view message, otherwise `self.commit_max` may refer to different ops.
2526
2902
  ///
2527
2903
  /// * Ensure that `self.op` is never advanced by a repair since repairs may occur in a view
2528
2904
  /// change where the view has not yet started.
@@ -2534,16 +2910,13 @@ pub fn Replica(
2534
2910
  /// http://web.stanford.edu/~ouster/cgi-bin/papers/OngaroPhD.pdf.
2535
2911
  ///
2536
2912
  fn repair_header(self: *Self, header: *const Header) bool {
2537
- // Do not try to do any repairs while we cannot trust `self.op`:
2538
- assert(!self.view_jump_barrier);
2539
-
2540
2913
  assert(header.valid_checksum());
2541
2914
  assert(header.invalid() == null);
2542
2915
  assert(header.command == .prepare);
2543
2916
 
2544
2917
  switch (self.status) {
2545
2918
  .normal => assert(header.view <= self.view),
2546
- .view_change => assert(header.view < self.view),
2919
+ .view_change => assert(header.view <= self.view),
2547
2920
  else => unreachable,
2548
2921
  }
2549
2922
 
@@ -2557,7 +2930,10 @@ pub fn Replica(
2557
2930
  // that breaks the chain with self.op. In this case, we must skip the repair to
2558
2931
  // avoid overwriting any overlapping op.
2559
2932
  } else {
2560
- log.debug("{}: repair_header: false (changes self.op)", .{self.replica});
2933
+ log.debug("{}: repair_header: false (changes self.op={})", .{
2934
+ self.replica,
2935
+ self.op,
2936
+ });
2561
2937
  return false;
2562
2938
  }
2563
2939
  }
@@ -2667,9 +3043,6 @@ pub fn Replica(
2667
3043
  /// The mistake in this example was not that we ignored the break to the left, which we must
2668
3044
  /// do to repair reordered ops, but that we did not check for connection to the right.
2669
3045
  fn repair_header_would_connect_hash_chain(self: *Self, header: *const Header) bool {
2670
- // We must be able to trust `self.op` if this function is to be reliable.
2671
- assert(!self.view_jump_barrier);
2672
-
2673
3046
  var entry = header;
2674
3047
 
2675
3048
  while (entry.op < self.op) {
@@ -2701,7 +3074,7 @@ pub fn Replica(
2701
3074
  while (op > 0) {
2702
3075
  op -= 1;
2703
3076
  if (self.journal.entry_for_op(op)) |neighbor| {
2704
- if (self.journal.next_offset(neighbor) > header.offset) return true;
3077
+ if (Journal.next_offset(neighbor) > header.offset) return true;
2705
3078
  break;
2706
3079
  }
2707
3080
  }
@@ -2711,7 +3084,7 @@ pub fn Replica(
2711
3084
  var op: u64 = header.op + 1;
2712
3085
  while (op <= self.op) : (op += 1) {
2713
3086
  if (self.journal.entry_for_op(op)) |neighbor| {
2714
- if (self.journal.next_offset(header) > neighbor.offset) return true;
3087
+ if (Journal.next_offset(header) > neighbor.offset) return true;
2715
3088
  break;
2716
3089
  }
2717
3090
  }
@@ -2845,22 +3218,12 @@ pub fn Replica(
2845
3218
  assert(self.repairs_allowed());
2846
3219
  assert(self.journal.dirty.len > 0);
2847
3220
 
2848
- if (self.journal.writes.available() == 0) {
2849
- log.debug("{}: repair_prepares: waiting for available IOP", .{self.replica});
2850
- return;
2851
- }
2852
-
2853
- // We may be appending to or repairing the journal concurrently.
2854
- // We do not want to re-request any of these prepares unnecessarily.
2855
- // TODO Add journal.writing bits to clear this up (and needed anyway - why?).
2856
- if (self.journal.writes.executing() > 0) {
2857
- log.debug("{}: repair_prepares: waiting for dirty bits to settle", .{self.replica});
2858
- return;
2859
- }
2860
-
2861
3221
  // Request enough prepares to utilize our max IO depth:
2862
3222
  var budget = self.journal.writes.available();
2863
- assert(budget > 0);
3223
+ if (budget == 0) {
3224
+ log.debug("{}: repair_prepares: waiting for IOP", .{self.replica});
3225
+ return;
3226
+ }
2864
3227
 
2865
3228
  var op = self.op + 1;
2866
3229
  while (op > 0) {
@@ -2871,20 +3234,21 @@ pub fn Replica(
2871
3234
  // then we will `request_prepare` from the cluster, set `nack_prepare_op`,
2872
3235
  // and stop repairing any further prepares:
2873
3236
  // This will also rebroadcast any `request_prepare` every `repair_timeout` tick.
2874
- self.repair_prepare(op);
2875
- if (self.nack_prepare_op) |nack_prepare_op| {
2876
- assert(nack_prepare_op == op);
2877
- assert(self.status == .view_change);
2878
- assert(self.leader_index(self.view) == self.replica);
2879
- assert(op > self.commit_max);
2880
- return;
2881
- }
3237
+ if (self.repair_prepare(op)) {
3238
+ if (self.nack_prepare_op) |nack_prepare_op| {
3239
+ assert(nack_prepare_op == op);
3240
+ assert(self.status == .view_change);
3241
+ assert(self.leader_index(self.view) == self.replica);
3242
+ assert(op > self.commit_max);
3243
+ return;
3244
+ }
2882
3245
 
2883
- // Otherwise, we continue to request prepares until our budget is used up:
2884
- budget -= 1;
2885
- if (budget == 0) {
2886
- log.debug("{}: repair_prepares: request budget used up", .{self.replica});
2887
- return;
3246
+ // Otherwise, we continue to request prepares until our budget is used:
3247
+ budget -= 1;
3248
+ if (budget == 0) {
3249
+ log.debug("{}: repair_prepares: request budget used", .{self.replica});
3250
+ return;
3251
+ }
2888
3252
  }
2889
3253
  } else {
2890
3254
  assert(!self.journal.faulty.bit(op));
@@ -2908,16 +3272,29 @@ pub fn Replica(
2908
3272
  ///
2909
3273
  /// This is effectively "many-to-one" repair, where a single replica recovers using the
2910
3274
  /// resources of many replicas, for faster recovery.
2911
- fn repair_prepare(self: *Self, op: u64) void {
3275
+ fn repair_prepare(self: *Self, op: u64) bool {
2912
3276
  assert(self.status == .normal or self.status == .view_change);
2913
3277
  assert(self.repairs_allowed());
2914
3278
  assert(self.journal.dirty.bit(op));
2915
3279
 
3280
+ const checksum = self.journal.entry_for_op_exact(op).?.checksum;
3281
+
3282
+ // We may be appending to or repairing the journal concurrently.
3283
+ // We do not want to re-request any of these prepares unnecessarily.
3284
+ if (self.journal.writing(op, checksum)) {
3285
+ log.debug("{}: repair_prepare: already writing op={} checksum={}", .{
3286
+ self.replica,
3287
+ op,
3288
+ checksum,
3289
+ });
3290
+ return false;
3291
+ }
3292
+
2916
3293
  const request_prepare = Header{
2917
3294
  .command = .request_prepare,
2918
3295
  // If we request a prepare from a follower, as below, it is critical to pass a checksum:
2919
3296
  // Otherwise we could receive different prepares for the same op number.
2920
- .context = self.journal.entry_for_op_exact(op).?.checksum,
3297
+ .context = checksum,
2921
3298
  .cluster = self.cluster,
2922
3299
  .replica = self.replica,
2923
3300
  .view = self.view,
@@ -2928,6 +3305,29 @@ pub fn Replica(
2928
3305
  // Only the leader is allowed to do repairs in a view change:
2929
3306
  assert(self.leader_index(self.view) == self.replica);
2930
3307
 
3308
+ const reason = if (self.journal.faulty.bit(op)) "faulty" else "dirty";
3309
+ log.debug(
3310
+ "{}: repair_prepare: op={} checksum={} (uncommitted, {s}, view_change)",
3311
+ .{
3312
+ self.replica,
3313
+ op,
3314
+ checksum,
3315
+ reason,
3316
+ },
3317
+ );
3318
+
3319
+ if (self.replica_count == 2 and !self.journal.faulty.bit(op)) {
3320
+ // This is required to avoid a liveness issue for a cluster-of-two where a new
3321
+ // leader learns of an op during a view change but where the op is faulty on
3322
+ // the old leader. We must immediately roll back the op since it could not have
3323
+ // been committed by the old leader if we know we do not have it, and because
3324
+ // the old leader cannot send a nack_prepare for its faulty copy.
3325
+ // For this to be correct, the recovery protocol must set all headers as faulty,
3326
+ // not only as dirty.
3327
+ self.discard_uncommitted_ops_from(op, checksum);
3328
+ return false;
3329
+ }
3330
+
2931
3331
  // Initialize the `nack_prepare` quorum counter for this uncommitted op:
2932
3332
  // It is also possible that we may start repairing a lower uncommitted op, having
2933
3333
  // initialized `nack_prepare_op` before we learn of a higher uncommitted dirty op,
@@ -2948,25 +3348,33 @@ pub fn Replica(
2948
3348
  .nack_prepare,
2949
3349
  );
2950
3350
  }
2951
- log.debug("{}: repair_prepare: requesting op={} (priority, from all replicas)", .{
2952
- self.replica,
2953
- op,
2954
- });
3351
+
2955
3352
  assert(self.nack_prepare_op.? == op);
2956
- assert(request_prepare.context != 0);
3353
+ assert(request_prepare.context == checksum);
2957
3354
  self.send_header_to_other_replicas(request_prepare);
2958
3355
  } else {
2959
- log.debug("{}: repair_prepare: requesting op={}", .{ self.replica, op });
3356
+ const nature = if (op > self.commit_max) "uncommitted" else "committed";
3357
+ const reason = if (self.journal.faulty.bit(op)) "faulty" else "dirty";
3358
+ log.debug("{}: repair_prepare: op={} checksum={} ({s}, {s})", .{
3359
+ self.replica,
3360
+ op,
3361
+ checksum,
3362
+ nature,
3363
+ reason,
3364
+ });
3365
+
2960
3366
  // We expect that `repair_prepare()` is called in reverse chronological order:
2961
3367
  // Any uncommitted ops should have already been dealt with.
2962
3368
  // We never roll back committed ops, and thus never regard `nack_prepare` responses.
2963
3369
  // Alternatively, we may not be the leader, in which case we do distinguish anyway.
2964
3370
  assert(self.nack_prepare_op == null);
2965
- assert(request_prepare.context != 0);
3371
+ assert(request_prepare.context == checksum);
2966
3372
  if (self.choose_any_other_replica()) |replica| {
2967
3373
  self.send_header_to_replica(replica, request_prepare);
2968
3374
  }
2969
3375
  }
3376
+
3377
+ return true;
2970
3378
  }
2971
3379
 
2972
3380
  fn repairs_allowed(self: *Self) bool {
@@ -2987,7 +3395,7 @@ pub fn Replica(
2987
3395
  /// Replicates to the next replica in the configuration (until we get back to the leader):
2988
3396
  /// Replication starts and ends with the leader, we never forward back to the leader.
2989
3397
  /// Does not flood the network with prepares that have already committed.
2990
- /// TODO Use recent heartbeat data for next replica to leapfrog if faulty.
3398
+ /// TODO Use recent heartbeat data for next replica to leapfrog if faulty (optimization).
2991
3399
  fn replicate(self: *Self, message: *Message) void {
2992
3400
  assert(self.status == .normal);
2993
3401
  assert(message.header.command == .prepare);
@@ -3176,7 +3584,7 @@ pub fn Replica(
3176
3584
  assert(count_start_view_change >= self.quorum_view_change - 1);
3177
3585
 
3178
3586
  const message = self.create_view_change_message(.do_view_change) orelse {
3179
- log.alert("{}: send_do_view_change: waiting for message", .{self.replica});
3587
+ log.err("{}: send_do_view_change: waiting for message", .{self.replica});
3180
3588
  return;
3181
3589
  };
3182
3590
  defer self.message_bus.unref(message);
@@ -3195,7 +3603,7 @@ pub fn Replica(
3195
3603
  assert(self.status == .normal);
3196
3604
  assert(self.leader());
3197
3605
 
3198
- log.alert("{}: too many sessions, sending eviction message to client={}", .{
3606
+ log.err("{}: too many sessions, sending eviction message to client={}", .{
3199
3607
  self.replica,
3200
3608
  client,
3201
3609
  });
@@ -3211,7 +3619,7 @@ pub fn Replica(
3211
3619
 
3212
3620
  fn send_header_to_client(self: *Self, client: u128, header: Header) void {
3213
3621
  const message = self.create_message_from_header(header) orelse {
3214
- log.alert("{}: no header-only message available, dropping message to client {}", .{
3622
+ log.err("{}: no header-only message available, dropping message to client {}", .{
3215
3623
  self.replica,
3216
3624
  client,
3217
3625
  });
@@ -3224,7 +3632,7 @@ pub fn Replica(
3224
3632
 
3225
3633
  fn send_header_to_other_replicas(self: *Self, header: Header) void {
3226
3634
  const message = self.create_message_from_header(header) orelse {
3227
- log.alert("{}: no header-only message available, dropping message to replicas", .{
3635
+ log.err("{}: no header-only message available, dropping message to replicas", .{
3228
3636
  self.replica,
3229
3637
  });
3230
3638
  return;
@@ -3241,7 +3649,7 @@ pub fn Replica(
3241
3649
 
3242
3650
  fn send_header_to_replica(self: *Self, replica: u8, header: Header) void {
3243
3651
  const message = self.create_message_from_header(header) orelse {
3244
- log.alert("{}: no header-only message available, dropping message to replica {}", .{
3652
+ log.err("{}: no header-only message available, dropping message to replica {}", .{
3245
3653
  self.replica,
3246
3654
  replica,
3247
3655
  });
@@ -3270,7 +3678,7 @@ pub fn Replica(
3270
3678
  });
3271
3679
 
3272
3680
  if (message.header.invalid()) |reason| {
3273
- log.emerg("{}: send_message_to_replica: invalid ({s})", .{ self.replica, reason });
3681
+ log.err("{}: send_message_to_replica: invalid ({s})", .{ self.replica, reason });
3274
3682
  @panic("send_message_to_replica: invalid message");
3275
3683
  }
3276
3684
 
@@ -3311,6 +3719,7 @@ pub fn Replica(
3311
3719
  assert(!self.do_view_change_quorum);
3312
3720
  assert(message.header.view == self.view);
3313
3721
  assert(message.header.replica == self.replica);
3722
+ assert(message.header.op == self.op);
3314
3723
  assert(replica == self.leader_index(self.view));
3315
3724
  },
3316
3725
  .start_view => switch (self.status) {
@@ -3358,7 +3767,7 @@ pub fn Replica(
3358
3767
  assert(replica == self.leader_index(self.view));
3359
3768
  },
3360
3769
  else => {
3361
- log.notice("{}: send_message_to_replica: TODO {s}", .{
3770
+ log.info("{}: send_message_to_replica: TODO {s}", .{
3362
3771
  self.replica,
3363
3772
  @tagName(message.header.command),
3364
3773
  });
@@ -3373,7 +3782,9 @@ pub fn Replica(
3373
3782
  }
3374
3783
  }
3375
3784
 
3376
- fn set_latest_header(self: *Self, headers: []Header, latest: *Header) void {
3785
+ /// Finds the header with the highest op number in a slice of headers from a replica.
3786
+ /// Searches only by op number to find the highest `self.op for the replica.
3787
+ fn set_latest_op(headers: []Header, latest: *Header) void {
3377
3788
  switch (latest.command) {
3378
3789
  .reserved, .prepare => assert(latest.valid_checksum()),
3379
3790
  else => unreachable,
@@ -3384,11 +3795,9 @@ pub fn Replica(
3384
3795
  assert(header.invalid() == null);
3385
3796
  assert(header.command == .prepare);
3386
3797
 
3387
- if (latest.command == .reserved) {
3388
- latest.* = header;
3389
- } else if (header.view > latest.view) {
3390
- latest.* = header;
3391
- } else if (header.view == latest.view and header.op > latest.op) {
3798
+ if (latest.command == .reserved or header.op > latest.op) {
3799
+ // We are simply trying to find the latest `self.op` in the replica's log.
3800
+ // We therefore do not compare views here.
3392
3801
  latest.* = header;
3393
3802
  }
3394
3803
  }
@@ -3400,18 +3809,15 @@ pub fn Replica(
3400
3809
  k: u64,
3401
3810
  method: []const u8,
3402
3811
  ) void {
3403
- assert(self.status == .view_change or self.status == .normal);
3812
+ assert(self.status == .view_change);
3404
3813
 
3405
3814
  assert(latest.valid_checksum());
3406
3815
  assert(latest.invalid() == null);
3407
3816
  assert(latest.command == .prepare);
3408
3817
  assert(latest.cluster == self.cluster);
3409
- if (self.status == .view_change) {
3410
- assert(latest.view < self.view); // Latest normal view before this view change.
3411
- } else {
3412
- assert(latest.view <= self.view);
3413
- assert(self.view_jump_barrier);
3414
- }
3818
+
3819
+ // The view may have started already, so we can have a prepare in the same view:
3820
+ assert(latest.view <= self.view);
3415
3821
 
3416
3822
  log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={} offset={}", .{
3417
3823
  self.replica,
@@ -3462,7 +3868,8 @@ pub fn Replica(
3462
3868
  assert(self.op >= self.commit_max);
3463
3869
 
3464
3870
  // Do not set the latest op as dirty if we already have it exactly:
3465
- // Otherwise, this would trigger a repair and delay the view change.
3871
+ // Otherwise, this would trigger a repair and delay the view change, or worse, it would
3872
+ // prevent us from assisting another replica to recover when we do in fact have the op.
3466
3873
  if (self.journal.entry_for_op_exact_with_checksum(latest.op, latest.checksum)) |_| {
3467
3874
  log.debug("{}: {s}: latest op exists exactly", .{ self.replica, method });
3468
3875
  } else {
@@ -3472,11 +3879,6 @@ pub fn Replica(
3472
3879
  assert(self.op == latest.op);
3473
3880
  self.journal.remove_entries_from(self.op + 1);
3474
3881
  assert(self.journal.entry_for_op_exact(self.op).?.checksum == latest.checksum);
3475
-
3476
- if (self.view_jump_barrier) {
3477
- self.view_jump_barrier = false;
3478
- log.debug("{}: {s}: resolved view jump barrier", .{ self.replica, method });
3479
- }
3480
3882
  }
3481
3883
 
3482
3884
  fn start_view_as_the_new_leader(self: *Self) void {
@@ -3484,9 +3886,6 @@ pub fn Replica(
3484
3886
  assert(self.leader_index(self.view) == self.replica);
3485
3887
  assert(self.do_view_change_quorum);
3486
3888
 
3487
- // TODO Do one last count of our do_view_change quorum messages.
3488
-
3489
- assert(!self.view_jump_barrier);
3490
3889
  assert(!self.committing);
3491
3890
  assert(!self.repairing_pipeline);
3492
3891
 
@@ -3514,7 +3913,7 @@ pub fn Replica(
3514
3913
  assert(self.nack_prepare_op == null);
3515
3914
 
3516
3915
  const start_view = self.create_view_change_message(.start_view) orelse {
3517
- log.alert("{}: start_view_as_the_new_leader: waiting for message", .{self.replica});
3916
+ log.err("{}: start_view_as_the_new_leader: waiting for message", .{self.replica});
3518
3917
  return;
3519
3918
  };
3520
3919
  defer self.message_bus.unref(start_view);
@@ -3544,6 +3943,7 @@ pub fn Replica(
3544
3943
  // For example, this could happen after a state transfer triggered by an op jump.
3545
3944
  assert(new_view >= self.view);
3546
3945
  self.view = new_view;
3946
+ self.view_normal = new_view;
3547
3947
  self.status = .normal;
3548
3948
 
3549
3949
  if (self.leader()) {
@@ -3551,8 +3951,8 @@ pub fn Replica(
3551
3951
 
3552
3952
  self.ping_timeout.start();
3553
3953
  self.commit_timeout.start();
3554
- self.election_timeout.stop();
3555
- self.view_change_timeout.stop();
3954
+ self.normal_status_timeout.stop();
3955
+ self.view_change_status_timeout.stop();
3556
3956
  self.view_change_message_timeout.stop();
3557
3957
  self.repair_timeout.start();
3558
3958
 
@@ -3566,8 +3966,8 @@ pub fn Replica(
3566
3966
 
3567
3967
  self.ping_timeout.start();
3568
3968
  self.commit_timeout.stop();
3569
- self.election_timeout.start();
3570
- self.view_change_timeout.stop();
3969
+ self.normal_status_timeout.start();
3970
+ self.view_change_status_timeout.stop();
3571
3971
  self.view_change_message_timeout.stop();
3572
3972
  self.repair_timeout.start();
3573
3973
 
@@ -3589,15 +3989,19 @@ pub fn Replica(
3589
3989
  /// on its own timer, or because it receives a start_view_change or do_view_change message for
3590
3990
  /// a view with a larger number than its own view.
3591
3991
  fn transition_to_view_change_status(self: *Self, new_view: u32) void {
3592
- log.debug("{}: transition_to_view_change_status: view={}", .{ self.replica, new_view });
3992
+ log.debug("{}: transition_to_view_change_status: view={}..{}", .{
3993
+ self.replica,
3994
+ self.view,
3995
+ new_view,
3996
+ });
3593
3997
  assert(new_view > self.view);
3594
3998
  self.view = new_view;
3595
3999
  self.status = .view_change;
3596
4000
 
3597
4001
  self.ping_timeout.stop();
3598
4002
  self.commit_timeout.stop();
3599
- self.election_timeout.stop();
3600
- self.view_change_timeout.start();
4003
+ self.normal_status_timeout.stop();
4004
+ self.view_change_status_timeout.start();
3601
4005
  self.view_change_message_timeout.start();
3602
4006
  self.repair_timeout.stop();
3603
4007
 
@@ -3673,16 +4077,6 @@ pub fn Replica(
3673
4077
  /// Returns true if the hash chain is valid and up to date for the current view.
3674
4078
  /// This is a stronger guarantee than `valid_hash_chain_between()` below.
3675
4079
  fn valid_hash_chain(self: *Self, method: []const u8) bool {
3676
- // If we know we have uncommitted ops that may have been reordered through a view change
3677
- // then wait until the latest of these has been resolved with the leader:
3678
- if (self.view_jump_barrier) {
3679
- log.debug("{}: {s}: waiting to resolve view jump barrier", .{
3680
- self.replica,
3681
- method,
3682
- });
3683
- return false;
3684
- }
3685
-
3686
4080
  // If we know we could validate the hash chain even further, then wait until we can:
3687
4081
  // This is partial defense-in-depth in case `self.op` is ever advanced by a reordered op.
3688
4082
  if (self.op < self.commit_max) {
@@ -3721,7 +4115,7 @@ pub fn Replica(
3721
4115
  if (self.journal.entry_for_op_exact(op)) |a| {
3722
4116
  assert(a.op + 1 == b.op);
3723
4117
  if (a.checksum == b.parent) {
3724
- assert(self.ascending_viewstamps(a, b));
4118
+ assert(ascending_viewstamps(a, b));
3725
4119
  b = a;
3726
4120
  } else {
3727
4121
  log.debug("{}: valid_hash_chain_between: break: A: {}", .{ self.replica, a });
@@ -3738,7 +4132,7 @@ pub fn Replica(
3738
4132
  }
3739
4133
 
3740
4134
  fn view_jump(self: *Self, header: *const Header) void {
3741
- const into: Status = switch (header.command) {
4135
+ const to: Status = switch (header.command) {
3742
4136
  .prepare, .commit => .normal,
3743
4137
  .start_view_change, .do_view_change, .start_view => .view_change,
3744
4138
  else => unreachable,
@@ -3750,19 +4144,19 @@ pub fn Replica(
3750
4144
 
3751
4145
  // Compare status transitions and decide whether to view jump or ignore:
3752
4146
  switch (self.status) {
3753
- .normal => switch (into) {
4147
+ .normal => switch (to) {
3754
4148
  // If the transition is to `.normal`, then ignore if for the same view:
3755
4149
  .normal => if (header.view == self.view) return,
3756
4150
  // If the transition is to `.view_change`, then ignore if the view has started:
3757
4151
  .view_change => if (header.view == self.view) return,
3758
4152
  else => unreachable,
3759
4153
  },
3760
- .view_change => switch (into) {
4154
+ .view_change => switch (to) {
3761
4155
  // This is an interesting special case:
3762
4156
  // If the transition is to `.normal` in the same view, then we missed the
3763
4157
  // `start_view` message and we must also consider this a view jump:
3764
- // If we don't view jump here, then our `view_change_timeout` will fire and we
3765
- // will disrupt the cluster by starting another view change for a newer view.
4158
+ // If we don't handle this below then our `view_change_status_timeout` will fire
4159
+ // and we will disrupt the cluster with another view change for a newer view.
3766
4160
  .normal => {},
3767
4161
  // If the transition is to `.view_change`, then ignore if for the same view:
3768
4162
  .view_change => if (header.view == self.view) return,
@@ -3771,84 +4165,39 @@ pub fn Replica(
3771
4165
  else => unreachable,
3772
4166
  }
3773
4167
 
3774
- if (into == .normal) {
3775
- if (header.view == self.view) {
3776
- assert(self.status == .view_change);
3777
-
3778
- log.debug("{}: view_jump: exiting view change", .{self.replica});
3779
- } else {
3780
- assert(header.view > self.view);
3781
- assert(self.status == .normal or self.status == .view_change);
3782
-
3783
- log.debug("{}: view_jump: jumping into newer view", .{self.replica});
3784
- }
3785
- } else if (into == .view_change) {
3786
- assert(header.view > self.view);
3787
- assert(self.status == .normal or self.status == .view_change);
3788
-
3789
- if (header.view == self.view + 1) {
3790
- log.debug("{}: view_jump: jumping into view change", .{self.replica});
3791
- } else {
3792
- log.debug("{}: view_jump: jumping into next view change", .{self.replica});
3793
- }
4168
+ switch (to) {
4169
+ .normal => {
4170
+ if (header.view == self.view) {
4171
+ assert(self.status == .view_change);
3794
4172
 
3795
- // This view change will terminate by setting the latest op in on_do_view_change()
3796
- // or in on_start_view(), or in a newer view change that will do the same, or in a
3797
- // newer view in normal status that MAY impose a view jump barrier but only if by
3798
- // then our op number still exceeds the cluster commit number (if it does now).
3799
- //
3800
- // However, that does not mean that we may clear any view jump barrier here, because
3801
- // it may not be reimposed if we double-jump into a normal view with our op number
3802
- // at that point older than the cluster commit number.
3803
- //
3804
- // Furthermore, even if we are transitioning from normal status into the very next
3805
- // view through a view change, we must still impose a view jump barrier, because we
3806
- // may never receive the start_view message, and because again, if we wait until the
3807
- // next view jump then our op number may no longer exceed the cluster commit number.
3808
- } else {
3809
- unreachable;
3810
- }
4173
+ log.debug("{}: view_jump: waiting to exit view change", .{self.replica});
4174
+ } else {
4175
+ assert(header.view > self.view);
4176
+ assert(self.status == .view_change or self.status == .normal);
3811
4177
 
3812
- if (self.op > self.commit_max) {
3813
- // We have uncommitted ops, and these may be removed or replaced by the new leader
3814
- // through any view change(s) in which we did/do not receive the start_view message.
3815
- //
3816
- // A commit number from the new leader may now refer to a different op than what we
3817
- // have in our log, even if our hash chain is fully intact.
3818
- //
3819
- // CRITICAL: If we were to commit despite this ambiguity, we would fork the log.
3820
- //
3821
- // In Section 5.2, the VRR paper deals with this scenario by simply removing
3822
- // the uncommitted ops and doing a state transfer.
3823
- //
3824
- // However, while strictly safe, this impairs safety in terms of durability, and
3825
- // adds unnecessary repair overhead if it turns out that the ops were in fact
3826
- // committed.
3827
- //
3828
- // We rather impose a view jump barrier to keep from committing, for as long as
3829
- // there is ambiguity around what specific op a commit number represents.
3830
- // This preserves and maximizes durability and minimizes repair traffic.
3831
- //
3832
- // This view jump barrier is cleared or resolved, respectively, as soon as:
3833
- // 1. we receive a new prepare from the leader that advances our latest op, or
3834
- // 2. we request and receive a `start_view` message from the leader of the view.
3835
- //
3836
- // This is safe because advancing our latest op in the current view or receiving
3837
- // it from the leader ensures that we have the latest hash chain head, from which we
3838
- // can work backwards to disambiguate any ops.
3839
- log.debug("{}: view_jump: imposing view jump barrier", .{self.replica});
3840
- self.view_jump_barrier = true;
3841
- } else {
3842
- assert(self.op <= self.commit_max);
4178
+ log.debug("{}: view_jump: waiting to jump to newer view", .{self.replica});
4179
+ }
3843
4180
 
3844
- // Crucially, we may still need to resolve any prior view jump barrier:
3845
- // For example, if we jump to view 3 and then view 7 both in normal status.
3846
- assert(self.view_jump_barrier == true or self.view_jump_barrier == false);
3847
- }
4181
+ // TODO Debounce and decouple this from `on_message()` by moving into `tick()`:
4182
+ log.debug("{}: view_jump: requesting start_view message", .{self.replica});
4183
+ self.send_header_to_replica(self.leader_index(header.view), .{
4184
+ .command = .request_start_view,
4185
+ .cluster = self.cluster,
4186
+ .replica = self.replica,
4187
+ .view = header.view,
4188
+ });
4189
+ },
4190
+ .view_change => {
4191
+ assert(header.view > self.view);
4192
+ assert(self.status == .view_change or self.status == .normal);
3848
4193
 
3849
- switch (into) {
3850
- .normal => self.transition_to_normal_status(header.view),
3851
- .view_change => self.transition_to_view_change_status(header.view),
4194
+ if (header.view == self.view + 1) {
4195
+ log.debug("{}: view_jump: jumping to view change", .{self.replica});
4196
+ } else {
4197
+ log.debug("{}: view_jump: jumping to next view change", .{self.replica});
4198
+ }
4199
+ self.transition_to_view_change_status(header.view);
4200
+ },
3852
4201
  else => unreachable,
3853
4202
  }
3854
4203
  }
@@ -3860,7 +4209,20 @@ pub fn Replica(
3860
4209
  assert(message.header.op <= self.op);
3861
4210
 
3862
4211
  if (!self.journal.has(message.header)) {
3863
- log.debug("{}: write_prepare: ignoring (header changed)", .{self.replica});
4212
+ log.debug("{}: write_prepare: ignoring op={} checksum={} (header changed)", .{
4213
+ self.replica,
4214
+ message.header.op,
4215
+ message.header.checksum,
4216
+ });
4217
+ return;
4218
+ }
4219
+
4220
+ if (self.journal.writing(message.header.op, message.header.checksum)) {
4221
+ log.debug("{}: write_prepare: ignoring op={} checksum={} (already writing)", .{
4222
+ self.replica,
4223
+ message.header.op,
4224
+ message.header.checksum,
4225
+ });
3864
4226
  return;
3865
4227
  }
3866
4228