tigerbeetle-node 0.11.11 → 0.11.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/README.md +212 -196
  2. package/dist/.client.node.sha256 +1 -1
  3. package/package.json +6 -8
  4. package/scripts/build_lib.sh +22 -2
  5. package/src/node.zig +1 -0
  6. package/src/tigerbeetle/scripts/benchmark.bat +9 -5
  7. package/src/tigerbeetle/scripts/benchmark.sh +1 -4
  8. package/src/tigerbeetle/scripts/fail_on_diff.sh +9 -0
  9. package/src/tigerbeetle/scripts/fuzz_loop_hash_log.sh +12 -0
  10. package/src/tigerbeetle/scripts/scripts/benchmark.bat +55 -0
  11. package/src/tigerbeetle/scripts/scripts/benchmark.sh +66 -0
  12. package/src/tigerbeetle/scripts/scripts/confirm_image.sh +44 -0
  13. package/src/tigerbeetle/scripts/scripts/fail_on_diff.sh +9 -0
  14. package/src/tigerbeetle/scripts/scripts/fuzz_loop.sh +15 -0
  15. package/src/tigerbeetle/scripts/scripts/fuzz_loop_hash_log.sh +12 -0
  16. package/src/tigerbeetle/scripts/scripts/fuzz_unique_errors.sh +7 -0
  17. package/src/tigerbeetle/scripts/scripts/install.bat +7 -0
  18. package/src/tigerbeetle/scripts/scripts/install.sh +21 -0
  19. package/src/tigerbeetle/scripts/scripts/install_zig.bat +113 -0
  20. package/src/tigerbeetle/scripts/scripts/install_zig.sh +90 -0
  21. package/src/tigerbeetle/scripts/scripts/lint.zig +199 -0
  22. package/src/tigerbeetle/scripts/scripts/pre-commit.sh +9 -0
  23. package/src/tigerbeetle/scripts/scripts/shellcheck.sh +5 -0
  24. package/src/tigerbeetle/scripts/scripts/tests_on_alpine.sh +10 -0
  25. package/src/tigerbeetle/scripts/scripts/tests_on_ubuntu.sh +14 -0
  26. package/src/tigerbeetle/scripts/scripts/upgrade_ubuntu_kernel.sh +48 -0
  27. package/src/tigerbeetle/scripts/scripts/validate_docs.sh +23 -0
  28. package/src/tigerbeetle/scripts/scripts/vr_state_enumerate +46 -0
  29. package/src/tigerbeetle/src/benchmark.zig +253 -231
  30. package/src/tigerbeetle/src/config.zig +2 -3
  31. package/src/tigerbeetle/src/constants.zig +2 -10
  32. package/src/tigerbeetle/src/io/linux.zig +15 -6
  33. package/src/tigerbeetle/src/lsm/forest.zig +1 -0
  34. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +63 -14
  35. package/src/tigerbeetle/src/lsm/groove.zig +134 -70
  36. package/src/tigerbeetle/src/lsm/level_iterator.zig +2 -2
  37. package/src/tigerbeetle/src/lsm/manifest_level.zig +1 -0
  38. package/src/tigerbeetle/src/lsm/posted_groove.zig +7 -4
  39. package/src/tigerbeetle/src/lsm/segmented_array.zig +1 -0
  40. package/src/tigerbeetle/src/lsm/table.zig +29 -51
  41. package/src/tigerbeetle/src/lsm/table_immutable.zig +6 -17
  42. package/src/tigerbeetle/src/lsm/table_iterator.zig +2 -2
  43. package/src/tigerbeetle/src/lsm/table_mutable.zig +9 -26
  44. package/src/tigerbeetle/src/lsm/test.zig +1 -0
  45. package/src/tigerbeetle/src/lsm/tree.zig +2 -26
  46. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +7 -2
  47. package/src/tigerbeetle/src/message_bus.zig +2 -1
  48. package/src/tigerbeetle/src/simulator.zig +14 -3
  49. package/src/tigerbeetle/src/state_machine/auditor.zig +1 -0
  50. package/src/tigerbeetle/src/state_machine.zig +402 -184
  51. package/src/tigerbeetle/src/stdx.zig +32 -0
  52. package/src/tigerbeetle/src/testing/cluster/network.zig +6 -7
  53. package/src/tigerbeetle/src/testing/cluster.zig +6 -5
  54. package/src/tigerbeetle/src/testing/packet_simulator.zig +19 -10
  55. package/src/tigerbeetle/src/testing/state_machine.zig +1 -0
  56. package/src/tigerbeetle/src/unit_tests.zig +20 -22
  57. package/src/tigerbeetle/src/vsr/README.md +209 -0
  58. package/src/tigerbeetle/src/vsr/client.zig +4 -4
  59. package/src/tigerbeetle/src/vsr/clock.zig +2 -0
  60. package/src/tigerbeetle/src/vsr/journal.zig +2 -0
  61. package/src/tigerbeetle/src/vsr/replica.zig +646 -578
  62. package/src/tigerbeetle/src/vsr/superblock.zig +14 -17
  63. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +5 -5
  64. package/src/tigerbeetle/src/vsr.zig +370 -37
@@ -96,6 +96,9 @@ pub const Command = enum(u8) {
96
96
  ping,
97
97
  pong,
98
98
 
99
+ ping_client,
100
+ pong_client,
101
+
99
102
  request,
100
103
  prepare,
101
104
  prepare_ok,
@@ -159,6 +162,8 @@ pub const Operation = enum(u8) {
159
162
  /// We reuse the same header for both so that prepare messages from the primary can simply be
160
163
  /// journalled as is by the backups without requiring any further modification.
161
164
  pub const Header = extern struct {
165
+ const checksum_body_empty = checksum(&.{});
166
+
162
167
  comptime {
163
168
  assert(@sizeOf(Header) == 128);
164
169
  // Assert that there is no implicit padding in the struct.
@@ -250,6 +255,7 @@ pub const Header = extern struct {
250
255
  /// * A `pong` sets this to the sender's wall clock value.
251
256
  /// * A `request_prepare` sets this to `1` when `context` is set to a checksum, and `0`
252
257
  /// otherwise.
258
+ /// * A `commit` message sets this to the replica's monotonic timestamp.
253
259
  timestamp: u64 = 0,
254
260
 
255
261
  /// The size of the Header structure (always), plus any associated body.
@@ -312,6 +318,8 @@ pub const Header = extern struct {
312
318
  .reserved => self.invalid_reserved(),
313
319
  .ping => self.invalid_ping(),
314
320
  .pong => self.invalid_pong(),
321
+ .ping_client => self.invalid_ping_client(),
322
+ .pong_client => self.invalid_pong_client(),
315
323
  .request => self.invalid_request(),
316
324
  .prepare => self.invalid_prepare(),
317
325
  .prepare_ok => self.invalid_prepare_ok(),
@@ -348,10 +356,14 @@ pub const Header = extern struct {
348
356
  fn invalid_ping(self: *const Header) ?[]const u8 {
349
357
  assert(self.command == .ping);
350
358
  if (self.parent != 0) return "parent != 0";
359
+ if (self.client != 0) return "client != 0";
351
360
  if (self.context != 0) return "context != 0";
352
361
  if (self.request != 0) return "request != 0";
362
+ if (self.view != 0) return "view != 0";
353
363
  if (self.commit != 0) return "commit != 0";
354
364
  if (self.timestamp != 0) return "timestamp != 0";
365
+ if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
366
+ if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
355
367
  if (self.operation != .reserved) return "operation != .reserved";
356
368
  return null;
357
369
  }
@@ -362,7 +374,43 @@ pub const Header = extern struct {
362
374
  if (self.client != 0) return "client != 0";
363
375
  if (self.context != 0) return "context != 0";
364
376
  if (self.request != 0) return "request != 0";
377
+ if (self.view != 0) return "view != 0";
378
+ if (self.commit != 0) return "commit != 0";
379
+ if (self.timestamp == 0) return "timestamp == 0";
380
+ if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
381
+ if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
382
+ if (self.operation != .reserved) return "operation != .reserved";
383
+ return null;
384
+ }
385
+
386
+ fn invalid_ping_client(self: *const Header) ?[]const u8 {
387
+ assert(self.command == .ping_client);
388
+ if (self.parent != 0) return "parent != 0";
389
+ if (self.client == 0) return "client == 0";
390
+ if (self.context != 0) return "context != 0";
391
+ if (self.request != 0) return "request != 0";
392
+ if (self.view != 0) return "view != 0";
393
+ if (self.op != 0) return "op != 0";
394
+ if (self.commit != 0) return "commit != 0";
395
+ if (self.timestamp != 0) return "timestamp != 0";
396
+ if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
397
+ if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
398
+ if (self.replica != 0) return "replica != 0";
399
+ if (self.operation != .reserved) return "operation != .reserved";
400
+ return null;
401
+ }
402
+
403
+ fn invalid_pong_client(self: *const Header) ?[]const u8 {
404
+ assert(self.command == .pong_client);
405
+ if (self.parent != 0) return "parent != 0";
406
+ if (self.client != 0) return "client != 0";
407
+ if (self.context != 0) return "context != 0";
408
+ if (self.request != 0) return "request != 0";
409
+ if (self.op != 0) return "op != 0";
365
410
  if (self.commit != 0) return "commit != 0";
411
+ if (self.timestamp != 0) return "timestamp != 0";
412
+ if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
413
+ if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
366
414
  if (self.operation != .reserved) return "operation != .reserved";
367
415
  return null;
368
416
  }
@@ -379,10 +427,11 @@ pub const Header = extern struct {
379
427
  .root => return "operation == .root",
380
428
  .register => {
381
429
  // The first request a client makes must be to register with the cluster:
382
- if (self.parent != 0) return "parent != 0";
383
- if (self.context != 0) return "context != 0";
384
- if (self.request != 0) return "request != 0";
430
+ if (self.parent != 0) return "register: parent != 0";
431
+ if (self.context != 0) return "register: context != 0";
432
+ if (self.request != 0) return "register: request != 0";
385
433
  // The .register operation carries no payload:
434
+ if (self.checksum_body != checksum_body_empty) return "register: checksum_body != expected";
386
435
  if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
387
436
  },
388
437
  else => {
@@ -408,6 +457,7 @@ pub const Header = extern struct {
408
457
  if (self.op != 0) return "root: op != 0";
409
458
  if (self.commit != 0) return "root: commit != 0";
410
459
  if (self.timestamp != 0) return "root: timestamp != 0";
460
+ if (self.checksum_body != checksum_body_empty) return "root: checksum_body != expected";
411
461
  if (self.size != @sizeOf(Header)) return "root: size != @sizeOf(Header)";
412
462
  if (self.replica != 0) return "root: replica != 0";
413
463
  },
@@ -430,6 +480,7 @@ pub const Header = extern struct {
430
480
 
431
481
  fn invalid_prepare_ok(self: *const Header) ?[]const u8 {
432
482
  assert(self.command == .prepare_ok);
483
+ if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
433
484
  if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
434
485
  switch (self.operation) {
435
486
  .reserved => return "operation == .reserved",
@@ -448,6 +499,7 @@ pub const Header = extern struct {
448
499
  if (self.client == 0) return "client == 0";
449
500
  if (self.op == 0) return "op == 0";
450
501
  if (self.op <= self.commit) return "op <= commit";
502
+ if (self.timestamp == 0) return "timestamp == 0";
451
503
  if (self.operation == .register) {
452
504
  if (self.request != 0) return "request != 0";
453
505
  } else {
@@ -483,7 +535,9 @@ pub const Header = extern struct {
483
535
  if (self.client != 0) return "client != 0";
484
536
  if (self.request != 0) return "request != 0";
485
537
  if (self.op != 0) return "op != 0";
486
- if (self.timestamp != 0) return "timestamp != 0";
538
+ if (self.timestamp == 0) return "timestamp == 0";
539
+ if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
540
+ if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
487
541
  if (self.operation != .reserved) return "operation != .reserved";
488
542
  return null;
489
543
  }
@@ -497,6 +551,8 @@ pub const Header = extern struct {
497
551
  if (self.op != 0) return "op != 0";
498
552
  if (self.commit != 0) return "commit != 0";
499
553
  if (self.timestamp != 0) return "timestamp != 0";
554
+ if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
555
+ if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
500
556
  if (self.operation != .reserved) return "operation != .reserved";
501
557
  return null;
502
558
  }
@@ -531,6 +587,8 @@ pub const Header = extern struct {
531
587
  if (self.op != 0) return "op != 0";
532
588
  if (self.commit != 0) return "commit != 0";
533
589
  if (self.timestamp != 0) return "timestamp != 0";
590
+ if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
591
+ if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
534
592
  if (self.operation != .reserved) return "operation != .reserved";
535
593
  return null;
536
594
  }
@@ -541,8 +599,10 @@ pub const Header = extern struct {
541
599
  if (self.client != 0) return "client != 0";
542
600
  if (self.context != 0) return "context != 0";
543
601
  if (self.request != 0) return "request != 0";
544
- if (self.timestamp != 0) return "timestamp != 0";
545
602
  if (self.commit > self.op) return "op_min > op_max";
603
+ if (self.timestamp != 0) return "timestamp != 0";
604
+ if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
605
+ if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
546
606
  if (self.operation != .reserved) return "operation != .reserved";
547
607
  return null;
548
608
  }
@@ -553,6 +613,8 @@ pub const Header = extern struct {
553
613
  if (self.client != 0) return "client != 0";
554
614
  if (self.request != 0) return "request != 0";
555
615
  if (self.commit != 0) return "commit != 0";
616
+ if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
617
+ if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
556
618
  switch (self.timestamp) {
557
619
  0 => if (self.context != 0) return "context != 0",
558
620
  1 => {}, // context is a checksum, which may be 0.
@@ -581,6 +643,8 @@ pub const Header = extern struct {
581
643
  if (self.request != 0) return "request != 0";
582
644
  if (self.commit != 0) return "commit != 0";
583
645
  if (self.timestamp != 0) return "timestamp != 0";
646
+ if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
647
+ if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
584
648
  if (self.operation != .reserved) return "operation != .reserved";
585
649
  return null;
586
650
  }
@@ -593,6 +657,8 @@ pub const Header = extern struct {
593
657
  if (self.op != 0) return "op != 0";
594
658
  if (self.commit != 0) return "commit != 0";
595
659
  if (self.timestamp != 0) return "timestamp != 0";
660
+ if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
661
+ if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
596
662
  if (self.operation != .reserved) return "operation != .reserved";
597
663
  return null;
598
664
  }
@@ -611,15 +677,7 @@ pub const Header = extern struct {
611
677
  },
612
678
  .prepare => return .unknown,
613
679
  // These messages identify the peer as either a replica or a client:
614
- // TODO Assert that pong responses from a replica do not echo the pinging client's ID.
615
- .ping, .pong => {
616
- if (self.client > 0) {
617
- assert(self.replica == 0);
618
- return .client;
619
- } else {
620
- return .replica;
621
- }
622
- },
680
+ .ping_client => return .client,
623
681
  // All other messages identify the peer as a replica:
624
682
  else => return .replica,
625
683
  }
@@ -679,7 +737,7 @@ pub const Timeout = struct {
679
737
 
680
738
  /// It's important to check that when fired() is acted on that the timeout is stopped/started,
681
739
  /// otherwise further ticks around the event loop may trigger a thundering herd of messages.
682
- pub fn fired(self: *Timeout) bool {
740
+ pub fn fired(self: *const Timeout) bool {
683
741
  if (self.ticking and self.ticks >= self.after) {
684
742
  log.debug("{}: {s} fired", .{ self.id, self.name });
685
743
  if (self.ticks > self.after) {
@@ -967,6 +1025,8 @@ pub fn sector_ceil(offset: u64) u64 {
967
1025
  }
968
1026
 
969
1027
  pub fn checksum(source: []const u8) u128 {
1028
+ @setEvalBranchQuota(4000);
1029
+
970
1030
  var target: [32]u8 = undefined;
971
1031
  std.crypto.hash.Blake3.hash(source, target[0..], .{});
972
1032
  return @bitCast(u128, target[0..@sizeOf(u128)].*);
@@ -976,22 +1036,30 @@ pub fn quorums(replica_count: u8) struct {
976
1036
  replication: u8,
977
1037
  view_change: u8,
978
1038
  } {
979
- const majority = @divFloor(replica_count, 2) + 1;
980
- assert(majority <= replica_count);
1039
+ assert(replica_count > 0);
981
1040
 
982
1041
  assert(constants.quorum_replication_max >= 2);
983
- const quorum_replication = std.math.min(constants.quorum_replication_max, majority);
1042
+ // For replica_count=2, set quorum_replication=2 even though =1 would intersect.
1043
+ // This improves durability of small clusters.
1044
+ const quorum_replication = if (replica_count == 2) 2 else std.math.min(
1045
+ constants.quorum_replication_max,
1046
+ stdx.div_ceil(replica_count, 2),
1047
+ );
1048
+ assert(quorum_replication <= replica_count);
984
1049
  assert(quorum_replication >= 2 or quorum_replication == replica_count);
985
1050
 
986
- const quorum_view_change = std.math.max(
987
- replica_count - quorum_replication + 1,
988
- majority,
989
- );
1051
+ // For replica_count=2, set quorum_view_change=2 even though =1 would intersect.
1052
+ // This avoids special cases for a single-replica view-change in Replica.
1053
+ const quorum_view_change =
1054
+ if (replica_count == 2) 2 else replica_count - quorum_replication + 1;
990
1055
  // The view change quorum may be more expensive to make the replication quorum cheaper.
991
1056
  // The insight is that the replication phase is by far more common than the view change.
992
1057
  // This trade-off allows us to optimize for the common case.
993
1058
  // See the comments in `constants.zig` for further explanation.
994
- assert(quorum_view_change >= majority);
1059
+ assert(quorum_view_change <= replica_count);
1060
+ assert(quorum_view_change >= 2 or quorum_view_change == replica_count);
1061
+ assert(quorum_view_change >= @divFloor(replica_count, 2) + 1);
1062
+ assert(quorum_view_change + quorum_replication > replica_count);
995
1063
 
996
1064
  return .{
997
1065
  .replication = quorum_replication,
@@ -999,26 +1067,43 @@ pub fn quorums(replica_count: u8) struct {
999
1067
  };
1000
1068
  }
1001
1069
 
1002
- /// The SuperBlock's persisted VSR headers.
1003
- /// One of the following:
1004
- ///
1005
- /// - SV headers (consecutive chain)
1006
- /// - DVC headers (disjoint chain)
1007
- pub const ViewChangeHeaders = struct {
1070
+ test "quorums" {
1071
+ if (constants.quorum_replication_max != 3) return error.SkipZigTest;
1072
+
1073
+ const expect_replication = [_]u8{ 1, 2, 2, 2, 3, 3, 3, 3 };
1074
+ const expect_view_change = [_]u8{ 1, 2, 2, 3, 3, 4, 5, 6 };
1075
+
1076
+ for (expect_replication[0..]) |_, i| {
1077
+ const actual = quorums(@intCast(u8, i) + 1);
1078
+ try std.testing.expectEqual(actual.replication, expect_replication[i]);
1079
+ try std.testing.expectEqual(actual.view_change, expect_view_change[i]);
1080
+ }
1081
+ }
1082
+
1083
+ pub const Headers = struct {
1084
+ pub const Array = std.BoundedArray(Header, constants.view_change_headers_max);
1085
+ /// The SuperBlock's persisted VSR headers.
1086
+ /// One of the following:
1087
+ ///
1088
+ /// - SV headers (consecutive chain)
1089
+ /// - DVC headers (disjoint chain)
1090
+ pub const ViewChangeSlice = ViewChangeHeadersSlice;
1091
+ pub const ViewChangeArray = ViewChangeHeadersArray;
1092
+ };
1093
+
1094
+ const ViewChangeHeadersSlice = struct {
1008
1095
  /// Headers are ordered from high-to-low op.
1009
1096
  slice: []const Header,
1010
1097
 
1011
- pub const BoundedArray = std.BoundedArray(Header, constants.pipeline_prepare_queue_max);
1012
-
1013
- pub fn init(slice: []const Header) ViewChangeHeaders {
1014
- ViewChangeHeaders.verify(slice);
1098
+ pub fn init(slice: []const Header) ViewChangeHeadersSlice {
1099
+ ViewChangeHeadersSlice.verify(slice);
1015
1100
 
1016
1101
  return .{ .slice = slice };
1017
1102
  }
1018
1103
 
1019
1104
  pub fn verify(slice: []const Header) void {
1020
1105
  assert(slice.len > 0);
1021
- assert(slice.len <= constants.pipeline_prepare_queue_max);
1106
+ assert(slice.len <= constants.view_change_headers_max);
1022
1107
 
1023
1108
  var child: ?*const Header = null;
1024
1109
  for (slice) |*header| {
@@ -1053,7 +1138,7 @@ pub const ViewChangeHeaders = struct {
1053
1138
  /// - When these are SV headers for a log_view=V, we can continue to add to them (by preparing
1054
1139
  /// more ops), but those ops will always be part of the log_view. If they were prepared during
1055
1140
  /// a view prior to the log_view, they would already be part of the headers.
1056
- pub fn view_for_op(headers: ViewChangeHeaders, op: u64, log_view: u32) ViewRange {
1141
+ pub fn view_for_op(headers: ViewChangeHeadersSlice, op: u64, log_view: u32) ViewRange {
1057
1142
  const header_newest = &headers.slice[0];
1058
1143
  const header_oldest = &headers.slice[headers.slice.len - 1];
1059
1144
 
@@ -1074,13 +1159,13 @@ pub const ViewChangeHeaders = struct {
1074
1159
  }
1075
1160
  };
1076
1161
 
1077
- test "ViewChangeHeaders.view_for_op" {
1162
+ test "Headers.ViewChangeSlice.view_for_op" {
1078
1163
  var headers_array = [_]Header{
1079
1164
  std.mem.zeroInit(Header, .{ .op = 9, .view = 10 }),
1080
1165
  std.mem.zeroInit(Header, .{ .op = 6, .view = 7 }),
1081
1166
  };
1082
1167
 
1083
- const headers = ViewChangeHeaders{ .slice = &headers_array };
1168
+ const headers = Headers.ViewChangeSlice{ .slice = &headers_array };
1084
1169
  try std.testing.expect(std.meta.eql(headers.view_for_op(11, 12), .{ .min = 12, .max = 12 }));
1085
1170
  try std.testing.expect(std.meta.eql(headers.view_for_op(10, 12), .{ .min = 12, .max = 12 }));
1086
1171
  try std.testing.expect(std.meta.eql(headers.view_for_op(9, 12), .{ .min = 10, .max = 10 }));
@@ -1090,3 +1175,251 @@ test "ViewChangeHeaders.view_for_op" {
1090
1175
  try std.testing.expect(std.meta.eql(headers.view_for_op(5, 12), .{ .min = 0, .max = 7 }));
1091
1176
  try std.testing.expect(std.meta.eql(headers.view_for_op(0, 12), .{ .min = 0, .max = 7 }));
1092
1177
  }
1178
+
1179
+ /// The headers of a SV or DVC message.
1180
+ const ViewChangeHeadersArray = struct {
1181
+ array: Headers.Array,
1182
+
1183
+ pub fn root(cluster: u32) ViewChangeHeadersArray {
1184
+ var array = Headers.Array{ .buffer = undefined };
1185
+ array.appendAssumeCapacity(Header.root_prepare(cluster));
1186
+ return ViewChangeHeadersArray.init(array);
1187
+ }
1188
+
1189
+ fn init(array: Headers.Array) ViewChangeHeadersArray {
1190
+ Headers.ViewChangeSlice.verify(array.constSlice());
1191
+ return .{ .array = array };
1192
+ }
1193
+
1194
+ /// This function generates either DVC headers or SV headers:
1195
+ /// - When `current.log_view < current.view`, generate headers for a DVC message.
1196
+ /// - When `current.log_view = current.view`, generate headers for a SV message.
1197
+ ///
1198
+ /// Additionally, the current log_view/view/primary state informs the sort of "faults"
1199
+ /// (gaps/breaks/etc) that we expect to find in the journal headers (`current.headers`).
1200
+ /// For example, backups generating a DVC can safely skip over gaps (if the gap is after the DVC
1201
+ /// anchor).
1202
+ ///
1203
+ /// Primaries and backups both generate DVCs and SVs.
1204
+ /// - However, SVs are broadcast only by the primary.
1205
+ /// - Backups generate a SV for persisting to the superblock.
1206
+ /// (For convenience/symmetry, not correctness).
1207
+ ///
1208
+ /// DVCs and SVs have different invariants they must abide by.
1209
+ /// - Read DVCQuorum's comments to understand DVC invariants.
1210
+ /// - SV headers are much simpler: no gaps or breaks, and all uncommitted ops must be included.
1211
+ pub fn build(
1212
+ results: *ViewChangeHeadersArray,
1213
+ options: struct {
1214
+ op_checkpoint: u64,
1215
+ /// The last view_change_headers_max headers of the journal, starting with the head op
1216
+ /// then descending, skipping over all gaps.
1217
+ current: struct {
1218
+ headers: *const Headers.Array,
1219
+ view: u32,
1220
+ log_view: u32,
1221
+ log_view_primary: bool,
1222
+ },
1223
+ // The vsr_headers from the working superblock.
1224
+ // The durable headers are useful (complementing `current.headers`) because:
1225
+ // - They simplify generation of DVCs in the case where we are recovering from a crash,
1226
+ // when we were generating the same DVC prior to the crash.
1227
+ // - They enable additional verification of header gaps/breaks based on the
1228
+ // gap's/break's position relative to the durable headers.
1229
+ durable: struct {
1230
+ headers: Headers.ViewChangeSlice,
1231
+ view: u32,
1232
+ log_view: u32,
1233
+ log_view_primary: bool,
1234
+ },
1235
+ },
1236
+ ) void {
1237
+ defer Headers.ViewChangeSlice.verify(results.array.constSlice());
1238
+
1239
+ const headers = &results.array;
1240
+ const current = options.current;
1241
+ const durable = options.durable;
1242
+
1243
+ assert(headers.len == 0);
1244
+ assert(durable.headers.slice.len > 0);
1245
+ assert(current.headers.len > 0);
1246
+ for (current.headers.constSlice()[1..]) |*header, i| {
1247
+ assert(current.headers.get(i).op > header.op);
1248
+ }
1249
+
1250
+ assert(current.view >= durable.view);
1251
+ assert(current.log_view >= durable.log_view);
1252
+ assert(current.view >= current.log_view);
1253
+ assert(durable.view >= durable.log_view);
1254
+
1255
+ const op_head_current = current.headers.get(0).op;
1256
+ const op_head_durable = durable.headers.slice[0].op;
1257
+
1258
+ // The rules for generating DVCs and SVs differ. We use the current view numbers to
1259
+ // determine which is being generated:
1260
+ // - When `log_view < view`, generate a DVC.
1261
+ // - When `log_view = view`, generate a SV.
1262
+ const command_current: enum { start_view, do_view_change } =
1263
+ if (current.log_view == current.view) .start_view else .do_view_change;
1264
+ // Likewise, the durable view numbers identify whether the durable headers were from a past
1265
+ // DVC or SV. The durable headers are only useful if they are from the same view as our
1266
+ // current headers, though.
1267
+ const command_durable: enum { start_view, do_view_change, outdated } = command: {
1268
+ if (durable.log_view == current.log_view) {
1269
+ if (durable.log_view == durable.view) {
1270
+ break :command .start_view;
1271
+ } else {
1272
+ break :command .do_view_change;
1273
+ }
1274
+ } else {
1275
+ break :command .outdated;
1276
+ }
1277
+ };
1278
+
1279
+ if (command_durable == .do_view_change and command_current == .do_view_change) {
1280
+ assert(op_head_durable == op_head_current);
1281
+ // Ensure that if we started a DVC before a crash, that we will resume sending the exact
1282
+ // same DVC after recovery. (An alternative implementation would be to load the
1283
+ // superblock's DVC headers (including gaps) into the journal during Replica.open(), but
1284
+ // that is more complicated to implement correctly).
1285
+ for (durable.headers.slice) |*header| headers.appendAssumeCapacity(header.*);
1286
+ return;
1287
+ }
1288
+
1289
+ // What is the relationship between two prepares?
1290
+ const Chain = enum {
1291
+ // The ops are sequential, and the hash-chain is valid.
1292
+ chain_sequence,
1293
+ // The ops are sequential, and the hash-chain is invalid.
1294
+ chain_break,
1295
+ // The ops are non-sequential, and belong to the same view.
1296
+ // This gap never hides a break.
1297
+ chain_view,
1298
+ // The ops are non-sequential, and belong to different views.
1299
+ // Depending on the replica state, this gap may hide a break.
1300
+ chain_gap,
1301
+ };
1302
+
1303
+ // The DVC anchor: Within the log suffix following the anchor, we have additional
1304
+ // guarantees about the state of the log headers which allow us to tolerate certain
1305
+ // gaps (by locally guaranteeing that the gap does not hide a break).
1306
+ const op_dvc_anchor = std.math.max(
1307
+ options.op_checkpoint,
1308
+ // +1: We may have a full pipeline, but not yet have performed any repair.
1309
+ // In such a case, we want to send those pipeline_prepare_queue_max headers in
1310
+ // the DVC, but not the preceding op (which may belong to a different chain).
1311
+ // This satisfies the DVC invariant because the first op in the pipeline is
1312
+ // "connected" to the canonical chain (via its "parent" checksum).
1313
+ 1 + op_head_current -| constants.pipeline_prepare_queue_max,
1314
+ );
1315
+
1316
+ // Within the "suffix" we can make additional assumptions about gaps/etc.
1317
+ // After the suffix, we just add as many extra (valid) headers as we can fit.
1318
+ var suffix_done = false;
1319
+
1320
+ for (current.headers.constSlice()) |*header, i| {
1321
+ const op = header.op;
1322
+ const chain = chain: {
1323
+ // Always include the head message.
1324
+ if (i == 0) break :chain Chain.chain_sequence;
1325
+
1326
+ const child = headers.get(i - 1);
1327
+ if (child.op == header.op + 1) {
1328
+ break :chain if (child.parent == header.checksum) Chain.chain_sequence else Chain.chain_break;
1329
+ } else {
1330
+ break :chain if (child.view == header.view) Chain.chain_view else Chain.chain_gap;
1331
+ }
1332
+ };
1333
+
1334
+ if (command_current == .start_view) {
1335
+ // Primary: Collect headers for a start_view message.
1336
+ // Backup: these headers are stored in the superblock's vsr_headers.
1337
+ switch (chain) {
1338
+ .chain_sequence => {},
1339
+ // Gaps are due to either:
1340
+ // - entries before checkpoint, which are not repaired, or
1341
+ // - backup missed prepares and has not repaired headers. (Immediately after
1342
+ // receiving a start_view this is not a concern, but the view_durable_update()
1343
+ // may be delayed if another is in progress).
1344
+ .chain_view, .chain_gap => {
1345
+ assert(op <= options.op_checkpoint or !current.log_view_primary);
1346
+ break;
1347
+ },
1348
+ // Breaks are due to:
1349
+ // - entries before checkpoint, which are not repaired
1350
+ .chain_break => {
1351
+ assert(op <= options.op_checkpoint);
1352
+ break;
1353
+ },
1354
+ }
1355
+ } else if (suffix_done) {
1356
+ // Add extra headers to the DVC. These are not required for correctness or
1357
+ // availability, but including extra (correct) headers minimizes header repair at
1358
+ // the new primary.
1359
+ switch (chain) {
1360
+ .chain_sequence => {},
1361
+ .chain_view => {},
1362
+ // Outside of the log suffix, repair may not have been finished, so gaps and
1363
+ // breaks are possible. Non-same-view gaps may hide breaks.
1364
+ .chain_gap => break,
1365
+ .chain_break => break,
1366
+ }
1367
+ } else if (current.log_view_primary and command_durable == .start_view) {
1368
+ switch (chain) {
1369
+ .chain_sequence => {},
1370
+ // Gaps to the right of the (durable) SV originate from:
1371
+ // 1. The primary (durable SV: 1,2,3) prepares several ops (4,5,6).
1372
+ // 2. However, the WAL writes are reordered such that some later ops (5,6)
1373
+ // finish before an earlier op (4).
1374
+ // 3. Crash, recover. Start sending a DVC for the next view. Either:
1375
+ // - There is a gap in the WAL at op=4, but this is to the right of the
1376
+ // durable SV, so it may be safely skipped.
1377
+ // - Same as above, except op=4 was a torn write (or bit rot).
1378
+ .chain_view, .chain_gap => assert(op + 1 > op_head_durable),
1379
+ // Breaks are impossible to the right of the durable SV — journal recovery uses
1380
+ // the durable SV to prune bad headers by their view numbers.
1381
+ .chain_break => unreachable,
1382
+ }
1383
+ suffix_done = op <= op_head_durable;
1384
+ } else if (current.log_view_primary and command_durable != .start_view) {
1385
+ switch (chain) {
1386
+ .chain_sequence => {},
1387
+ .chain_view => {},
1388
+ // The retiring primary may have gap-breaks or breaks in its suffix iff:
1389
+ // - it didn't finish repairs before the second view-change, and
1390
+ // - some uncommitted ops were truncated during the first view-change.
1391
+ // (Truncation "moves" the suffix backwards).
1392
+ .chain_gap => break,
1393
+ .chain_break => break,
1394
+ }
1395
+ suffix_done = op <= op_dvc_anchor;
1396
+ } else if (!current.log_view_primary and command_durable == .start_view) {
1397
+ switch (chain) {
1398
+ .chain_sequence => {},
1399
+ // Backups load a full suffix of headers from the view's SV message. If there
1400
+ // is now a gap in the backup's suffix, this must be due to missed prepares.
1401
+ .chain_view, .chain_gap => assert(op + 1 > op_head_durable),
1402
+ // Breaks are impossible to the right of the durable SV — journal recovery uses
1403
+ // the durable SV to prune bad headers by their view numbers.
1404
+ .chain_break => unreachable,
1405
+ }
1406
+ suffix_done = op <= op_head_durable;
1407
+ } else if (!current.log_view_primary and command_durable != .start_view) {
1408
+ switch (chain) {
1409
+ .chain_sequence => {},
1410
+ .chain_view => {},
1411
+ // Backups load a full suffix of headers from the view's SV message.
1412
+ // That SV isn't durable, but it is part of the journal, so any gaps to its
1413
+ // right must be due to missed prepares.
1414
+ .chain_gap => {},
1415
+ // Breaks are impossible to the right of the ephemeral SV, since the log was
1416
+ // truncated when the SV was installed.
1417
+ .chain_break => unreachable,
1418
+ }
1419
+ suffix_done = op <= op_dvc_anchor;
1420
+ } else unreachable;
1421
+
1422
+ headers.appendAssumeCapacity(header.*);
1423
+ }
1424
+ }
1425
+ };