tigerbeetle-node 0.11.12 → 0.11.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +212 -196
- package/dist/.client.node.sha256 +1 -1
- package/package.json +3 -2
- package/src/node.zig +1 -0
- package/src/tigerbeetle/scripts/benchmark.bat +9 -2
- package/src/tigerbeetle/scripts/benchmark.sh +1 -1
- package/src/tigerbeetle/scripts/fail_on_diff.sh +9 -0
- package/src/tigerbeetle/scripts/fuzz_loop_hash_log.sh +12 -0
- package/src/tigerbeetle/scripts/scripts/benchmark.bat +9 -2
- package/src/tigerbeetle/scripts/scripts/benchmark.sh +1 -1
- package/src/tigerbeetle/scripts/scripts/fail_on_diff.sh +9 -0
- package/src/tigerbeetle/scripts/scripts/fuzz_loop_hash_log.sh +12 -0
- package/src/tigerbeetle/src/benchmark.zig +253 -231
- package/src/tigerbeetle/src/config.zig +2 -3
- package/src/tigerbeetle/src/constants.zig +2 -10
- package/src/tigerbeetle/src/io/linux.zig +15 -6
- package/src/tigerbeetle/src/lsm/forest.zig +1 -0
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +63 -14
- package/src/tigerbeetle/src/lsm/groove.zig +134 -70
- package/src/tigerbeetle/src/lsm/level_iterator.zig +2 -2
- package/src/tigerbeetle/src/lsm/manifest_level.zig +1 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +7 -4
- package/src/tigerbeetle/src/lsm/segmented_array.zig +1 -0
- package/src/tigerbeetle/src/lsm/table.zig +29 -51
- package/src/tigerbeetle/src/lsm/table_immutable.zig +6 -17
- package/src/tigerbeetle/src/lsm/table_iterator.zig +2 -2
- package/src/tigerbeetle/src/lsm/table_mutable.zig +9 -26
- package/src/tigerbeetle/src/lsm/test.zig +1 -0
- package/src/tigerbeetle/src/lsm/tree.zig +2 -26
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +7 -2
- package/src/tigerbeetle/src/message_bus.zig +1 -0
- package/src/tigerbeetle/src/simulator.zig +14 -3
- package/src/tigerbeetle/src/state_machine/auditor.zig +1 -0
- package/src/tigerbeetle/src/state_machine.zig +402 -184
- package/src/tigerbeetle/src/stdx.zig +9 -0
- package/src/tigerbeetle/src/testing/cluster.zig +1 -0
- package/src/tigerbeetle/src/testing/packet_simulator.zig +19 -9
- package/src/tigerbeetle/src/testing/state_machine.zig +1 -0
- package/src/tigerbeetle/src/unit_tests.zig +20 -22
- package/src/tigerbeetle/src/vsr/README.md +1 -1
- package/src/tigerbeetle/src/vsr/client.zig +4 -4
- package/src/tigerbeetle/src/vsr/clock.zig +2 -0
- package/src/tigerbeetle/src/vsr/journal.zig +2 -0
- package/src/tigerbeetle/src/vsr/replica.zig +481 -246
- package/src/tigerbeetle/src/vsr.zig +104 -31
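The bulk of the change is in package/src/tigerbeetle/src/vsr/replica.zig (+481 -246), shown below: the view-change machinery moves from a single normal-status timeout to dedicated heartbeat, abdication, and view-change message timeouts, and commit heartbeats now carry the primary's monotonic timestamp so backups can discard stale ones. As a minimal sketch of that discard rule — accept_heartbeat is a hypothetical helper, not code from the package:

const assert = @import("std").debug.assert;

// Hypothetical helper distilling the rule the diff adds to on_commit():
// a backup honours a commit heartbeat only when its monotonic timestamp is
// strictly newer than the last one accepted, so delayed or duplicate
// heartbeats are ignored.
fn accept_heartbeat(heartbeat_timestamp: *u64, message_timestamp: u64) bool {
    if (message_timestamp <= heartbeat_timestamp.*) return false;
    heartbeat_timestamp.* = message_timestamp;
    return true;
}

test "delayed and duplicate heartbeats are discarded" {
    var last: u64 = 0;
    assert(accept_heartbeat(&last, 10));
    assert(!accept_heartbeat(&last, 10)); // duplicate
    assert(!accept_heartbeat(&last, 7)); // delayed
    assert(accept_heartbeat(&last, 11));
}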
|
@@ -208,8 +208,21 @@ pub fn ReplicaType(
|
|
|
208
208
|
/// we require and assert in our protocol implementation.
|
|
209
209
|
loopback_queue: ?*Message = null,
|
|
210
210
|
|
|
211
|
-
///
|
|
212
|
-
|
|
211
|
+
/// The last timestamp received on a commit heartbeat.
|
|
212
|
+
/// The timestamp originates from the primary's monotonic clock. It is used to discard
|
|
213
|
+
/// delayed or duplicate heartbeat messages.
|
|
214
|
+
/// (status=normal backup)
|
|
215
|
+
heartbeat_timestamp: u64 = 0,
|
|
216
|
+
|
|
217
|
+
/// While set, don't send commit heartbeats.
|
|
218
|
+
/// Used when the primary believes that it is partitioned and needs to step down.
|
|
219
|
+
/// In particular, guards against a deadlock in the case where small messages (e.g.
|
|
220
|
+
/// heartbeats, pings/pongs) succeed, but large messages (e.g. prepares) fail.
|
|
221
|
+
/// (status=normal primary, pipeline has prepare with !ok_quorum_received)
|
|
222
|
+
primary_abdicating: bool = false,
|
|
223
|
+
|
|
224
|
+
/// Unique start_view_change messages for the same view from ALL replicas (including ourself).
|
|
225
|
+
start_view_change_from_all_replicas: QuorumCounter = quorum_counter_null,
|
|
213
226
|
|
|
214
227
|
/// Unique do_view_change messages for the same view from ALL replicas (including ourself).
|
|
215
228
|
do_view_change_from_all_replicas: QuorumMessages = quorum_messages_null,
|
|
@@ -217,9 +230,6 @@ pub fn ReplicaType(
|
|
|
217
230
|
/// Unique nack_prepare messages for the same view from OTHER replicas (excluding ourself).
|
|
218
231
|
nack_prepare_from_other_replicas: QuorumCounter = quorum_counter_null,
|
|
219
232
|
|
|
220
|
-
/// Whether a replica has received a quorum of start_view_change messages for the view change:
|
|
221
|
-
start_view_change_quorum: bool = false,
|
|
222
|
-
|
|
223
233
|
/// Whether the primary has received a quorum of do_view_change messages for the view change:
|
|
224
234
|
/// Determines whether the primary may effect repairs according to the CTRL protocol.
|
|
225
235
|
do_view_change_quorum: bool = false,
|
|
@@ -230,28 +240,54 @@ pub fn ReplicaType(
|
|
|
230
240
|
/// The number of ticks before a primary or backup broadcasts a ping to other replicas.
|
|
231
241
|
/// TODO Explain why we need this (MessageBus handshaking, leapfrogging faulty replicas,
|
|
232
242
|
/// deciding whether starting a view change would be detrimental under some network partitions).
|
|
243
|
+
/// (status=normal replicas)
|
|
233
244
|
ping_timeout: Timeout,
|
|
234
245
|
|
|
235
246
|
/// The number of ticks without enough prepare_ok's before the primary resends a prepare.
|
|
247
|
+
/// (status=normal primary, pipeline has prepare with !ok_quorum_received)
|
|
236
248
|
prepare_timeout: Timeout,
|
|
237
249
|
|
|
250
|
+
/// The number of ticks waiting for a prepare_ok.
|
|
251
|
+
/// When triggered, set primary_abdicating=true, which pauses outgoing commit heartbeats.
|
|
252
|
+
/// (status=normal primary, pipeline has prepare with !ok_quorum_received)
|
|
253
|
+
primary_abdicate_timeout: Timeout,
|
|
254
|
+
|
|
238
255
|
/// The number of ticks before the primary sends a commit heartbeat:
|
|
239
256
|
/// The primary always sends a commit heartbeat irrespective of when it last sent a prepare.
|
|
240
257
|
/// This improves liveness when prepare messages cannot be replicated fully due to partitions.
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
///
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
///
|
|
248
|
-
|
|
258
|
+
/// (status=normal primary)
|
|
259
|
+
commit_message_timeout: Timeout,
|
|
260
|
+
|
|
261
|
+
/// The number of ticks without a heartbeat.
|
|
262
|
+
/// Reset any time the backup receives a heartbeat from the primary.
|
|
263
|
+
/// Triggers SVC messages. If an SVC quorum is achieved, we will kick off a view-change.
|
|
264
|
+
/// (status=normal backup)
|
|
265
|
+
normal_heartbeat_timeout: Timeout,
|
|
266
|
+
|
|
267
|
+
/// The number of ticks before resetting the SVC quorum.
|
|
268
|
+
/// (status=normal|view-change, SVC quorum contains message from ANY OTHER replica)
|
|
269
|
+
start_view_change_window_timeout: Timeout,
|
|
270
|
+
|
|
271
|
+
/// The number of ticks before resending a `start_view_change` message.
|
|
272
|
+
/// (status=normal|view-change)
|
|
273
|
+
start_view_change_message_timeout: Timeout,
|
|
274
|
+
|
|
275
|
+
/// The number of ticks before a view change is timed out.
|
|
276
|
+
/// When triggered, begin sending SVC messages (to attempt to increment the view and try a
|
|
277
|
+
/// different primary) — but keep trying DVCs as well.
|
|
278
|
+
/// (status=view-change)
|
|
249
279
|
view_change_status_timeout: Timeout,
|
|
250
280
|
|
|
251
|
-
/// The number of ticks before resending a `
|
|
252
|
-
|
|
281
|
+
/// The number of ticks before resending a `do_view_change` message:
|
|
282
|
+
/// (status=view-change)
|
|
283
|
+
do_view_change_message_timeout: Timeout,
|
|
284
|
+
|
|
285
|
+
/// The number of ticks before resending a `request_start_view` message.
|
|
286
|
+
/// (status=view-change backup)
|
|
287
|
+
request_start_view_message_timeout: Timeout,
|
|
253
288
|
|
|
254
289
|
/// The number of ticks before repairing missing/disconnected headers and/or dirty entries:
|
|
290
|
+
/// (status=normal or (status=view-change and primary))
|
|
255
291
|
repair_timeout: Timeout,
|
|
256
292
|
|
|
257
293
|
/// Used to provide deterministic entropy to `choose_any_other_replica()`.
|
|
@@ -479,8 +515,6 @@ pub fn ReplicaType(
|
|
|
479
515
|
time: Time,
|
|
480
516
|
storage: *Storage,
|
|
481
517
|
message_pool: *MessagePool,
|
|
482
|
-
// TODO With https://github.com/coilhq/tigerbeetle/issues/71,
|
|
483
|
-
// the separate message_bus_options won't be necessary.
|
|
484
518
|
message_bus_options: MessageBus.Options,
|
|
485
519
|
state_machine_options: StateMachine.Options,
|
|
486
520
|
};
|
|
@@ -578,26 +612,46 @@ pub fn ReplicaType(
|
|
|
578
612
|
.id = replica_index,
|
|
579
613
|
.after = 50,
|
|
580
614
|
},
|
|
581
|
-
.
|
|
582
|
-
.name = "
|
|
615
|
+
.primary_abdicate_timeout = Timeout{
|
|
616
|
+
.name = "primary_abdicate_timeout",
|
|
583
617
|
.id = replica_index,
|
|
584
|
-
.after =
|
|
618
|
+
.after = 1000,
|
|
619
|
+
},
|
|
620
|
+
.commit_message_timeout = Timeout{
|
|
621
|
+
.name = "commit_message_timeout",
|
|
622
|
+
.id = replica_index,
|
|
623
|
+
.after = 50,
|
|
624
|
+
},
|
|
625
|
+
.normal_heartbeat_timeout = Timeout{
|
|
626
|
+
.name = "normal_heartbeat_timeout",
|
|
627
|
+
.id = replica_index,
|
|
628
|
+
.after = 500,
|
|
585
629
|
},
|
|
586
|
-
.
|
|
587
|
-
.name = "
|
|
630
|
+
.start_view_change_window_timeout = Timeout{
|
|
631
|
+
.name = "start_view_change_window_timeout",
|
|
588
632
|
.id = replica_index,
|
|
589
633
|
.after = 500,
|
|
590
634
|
},
|
|
635
|
+
.start_view_change_message_timeout = Timeout{
|
|
636
|
+
.name = "start_view_change_message_timeout",
|
|
637
|
+
.id = replica_index,
|
|
638
|
+
.after = 50,
|
|
639
|
+
},
|
|
591
640
|
.view_change_status_timeout = Timeout{
|
|
592
641
|
.name = "view_change_status_timeout",
|
|
593
642
|
.id = replica_index,
|
|
594
643
|
.after = 500,
|
|
595
644
|
},
|
|
596
|
-
.
|
|
597
|
-
.name = "
|
|
645
|
+
.do_view_change_message_timeout = Timeout{
|
|
646
|
+
.name = "do_view_change_message_timeout",
|
|
598
647
|
.id = replica_index,
|
|
599
648
|
.after = 50,
|
|
600
649
|
},
|
|
650
|
+
.request_start_view_message_timeout = Timeout{
|
|
651
|
+
.name = "request_start_view_message_timeout",
|
|
652
|
+
.id = replica_index,
|
|
653
|
+
.after = 100,
|
|
654
|
+
},
|
|
601
655
|
.repair_timeout = Timeout{
|
|
602
656
|
.name = "repair_timeout",
|
|
603
657
|
.id = replica_index,
|
|
@@ -686,18 +740,26 @@ pub fn ReplicaType(
|
|
|
686
740
|
|
|
687
741
|
self.ping_timeout.tick();
|
|
688
742
|
self.prepare_timeout.tick();
|
|
689
|
-
self.
|
|
690
|
-
self.
|
|
743
|
+
self.primary_abdicate_timeout.tick();
|
|
744
|
+
self.commit_message_timeout.tick();
|
|
745
|
+
self.normal_heartbeat_timeout.tick();
|
|
746
|
+
self.start_view_change_window_timeout.tick();
|
|
747
|
+
self.start_view_change_message_timeout.tick();
|
|
691
748
|
self.view_change_status_timeout.tick();
|
|
692
|
-
self.
|
|
749
|
+
self.do_view_change_message_timeout.tick();
|
|
750
|
+
self.request_start_view_message_timeout.tick();
|
|
693
751
|
self.repair_timeout.tick();
|
|
694
752
|
|
|
695
753
|
if (self.ping_timeout.fired()) self.on_ping_timeout();
|
|
696
754
|
if (self.prepare_timeout.fired()) self.on_prepare_timeout();
|
|
697
|
-
if (self.
|
|
698
|
-
if (self.
|
|
755
|
+
if (self.primary_abdicate_timeout.fired()) self.on_primary_abdicate_timeout();
|
|
756
|
+
if (self.commit_message_timeout.fired()) self.on_commit_message_timeout();
|
|
757
|
+
if (self.normal_heartbeat_timeout.fired()) self.on_normal_heartbeat_timeout();
|
|
758
|
+
if (self.start_view_change_window_timeout.fired()) self.on_start_view_change_window_timeout();
|
|
759
|
+
if (self.start_view_change_message_timeout.fired()) self.on_start_view_change_message_timeout();
|
|
699
760
|
if (self.view_change_status_timeout.fired()) self.on_view_change_status_timeout();
|
|
700
|
-
if (self.
|
|
761
|
+
if (self.do_view_change_message_timeout.fired()) self.on_do_view_change_message_timeout();
|
|
762
|
+
if (self.request_start_view_message_timeout.fired()) self.on_request_start_view_message_timeout();
|
|
701
763
|
if (self.repair_timeout.fired()) self.on_repair_timeout();
|
|
702
764
|
|
|
703
765
|
// None of the on_timeout() functions above should send a message to this replica.
|
|
@@ -743,6 +805,7 @@ pub fn ReplicaType(
|
|
|
743
805
|
switch (message.header.command) {
|
|
744
806
|
.ping => self.on_ping(message),
|
|
745
807
|
.pong => self.on_pong(message),
|
|
808
|
+
.ping_client => self.on_ping_client(message),
|
|
746
809
|
.request => self.on_request(message),
|
|
747
810
|
.prepare => self.on_prepare(message),
|
|
748
811
|
.prepare_ok => self.on_prepare_ok(message),
|
|
@@ -757,7 +820,7 @@ pub fn ReplicaType(
|
|
|
757
820
|
.headers => self.on_headers(message),
|
|
758
821
|
.nack_prepare => self.on_nack_prepare(message),
|
|
759
822
|
// A replica should never handle misdirected messages intended for a client:
|
|
760
|
-
.eviction, .reply => {
|
|
823
|
+
.pong_client, .eviction, .reply => {
|
|
761
824
|
log.warn("{}: on_message: ignoring misdirected {s} message", .{
|
|
762
825
|
self.replica,
|
|
763
826
|
@tagName(message.header.command),
|
|
@@ -783,58 +846,34 @@ pub fn ReplicaType(
|
|
|
783
846
|
tracer.flush();
|
|
784
847
|
}
|
|
785
848
|
|
|
786
|
-
|
|
787
|
-
// - By clients, to learn about the current view.
|
|
788
|
-
// - By replicas, to synchronise cluster time and to probe for network connectivity.
|
|
789
|
-
//
|
|
790
|
-
// In the second case we avoid setting the view to make sure pings can still be sent
|
|
791
|
-
// during view changes.
|
|
849
|
+
/// Pings are used by replicas to synchronise cluster time and to probe for network connectivity.
|
|
792
850
|
fn on_ping(self: *Self, message: *const Message) void {
|
|
851
|
+
assert(message.header.command == .ping);
|
|
793
852
|
if (self.status != .normal and self.status != .view_change) return;
|
|
794
853
|
|
|
795
854
|
assert(self.status == .normal or self.status == .view_change);
|
|
796
855
|
|
|
856
|
+
if (message.header.replica == self.replica) {
|
|
857
|
+
log.warn("{}: on_ping: ignoring (self)", .{self.replica});
|
|
858
|
+
return;
|
|
859
|
+
}
|
|
860
|
+
|
|
797
861
|
// TODO Drop pings that were not addressed to us.
|
|
798
862
|
|
|
799
|
-
|
|
863
|
+
self.send_header_to_replica(message.header.replica, .{
|
|
800
864
|
.command = .pong,
|
|
801
865
|
.cluster = self.cluster,
|
|
802
866
|
.replica = self.replica,
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
// We must only ever send our view number to a client via a pong message if we are
|
|
809
|
-
// in normal status. Otherwise, we may be partitioned from the cluster with a newer
|
|
810
|
-
// view number, leak this to the client, which would then pass this to the cluster
|
|
811
|
-
// in subsequent client requests, which would then ignore these client requests with
|
|
812
|
-
// a newer view number, locking out the client. The principle here is that we must
|
|
813
|
-
// never send view numbers for views that have not yet started.
|
|
814
|
-
if (self.status == .normal) {
|
|
815
|
-
pong.view = self.view;
|
|
816
|
-
self.send_header_to_client(message.header.client, pong);
|
|
817
|
-
}
|
|
818
|
-
} else {
|
|
819
|
-
assert(message.header.view == 0);
|
|
820
|
-
|
|
821
|
-
if (message.header.replica == self.replica) {
|
|
822
|
-
log.warn("{}: on_ping: ignoring (self)", .{self.replica});
|
|
823
|
-
} else {
|
|
824
|
-
// Copy the ping's monotonic timestamp to our pong and add our wall clock sample:
|
|
825
|
-
pong.op = message.header.op;
|
|
826
|
-
pong.timestamp = @bitCast(u64, self.clock.realtime());
|
|
827
|
-
self.send_header_to_replica(message.header.replica, pong);
|
|
828
|
-
}
|
|
829
|
-
}
|
|
867
|
+
// Copy the ping's monotonic timestamp to our pong and add our wall clock sample:
|
|
868
|
+
.op = message.header.op,
|
|
869
|
+
.timestamp = @bitCast(u64, self.clock.realtime()),
|
|
870
|
+
});
|
|
830
871
|
}
|
|
831
872
|
|
|
832
873
|
fn on_pong(self: *Self, message: *const Message) void {
|
|
833
|
-
|
|
874
|
+
assert(message.header.command == .pong);
|
|
834
875
|
if (message.header.replica == self.replica) return;
|
|
835
876
|
|
|
836
|
-
assert(message.header.view == 0);
|
|
837
|
-
|
|
838
877
|
const m0 = message.header.op;
|
|
839
878
|
const t1 = @bitCast(i64, message.header.timestamp);
|
|
840
879
|
const m2 = self.clock.monotonic();
|
|
@@ -842,6 +881,27 @@ pub fn ReplicaType(
|
|
|
842
881
|
self.clock.learn(message.header.replica, m0, t1, m2);
|
|
843
882
|
}
|
|
844
883
|
|
|
884
|
+
/// Pings are used by clients to learn about the current view.
|
|
885
|
+
fn on_ping_client(self: *Self, message: *const Message) void {
|
|
886
|
+
assert(message.header.command == .ping_client);
|
|
887
|
+
assert(message.header.client != 0);
|
|
888
|
+
|
|
889
|
+
// We must only ever send our view number to a client via a pong message if we are
|
|
890
|
+
// in normal status. Otherwise, we may be partitioned from the cluster with a newer
|
|
891
|
+
// view number, leak this to the client, which would then pass this to the cluster
|
|
892
|
+
// in subsequent client requests, which would then ignore these client requests with
|
|
893
|
+
// a newer view number, locking out the client. The principle here is that we must
|
|
894
|
+
// never send view numbers for views that have not yet started.
|
|
895
|
+
if (self.status != .normal) return;
|
|
896
|
+
|
|
897
|
+
self.send_header_to_client(message.header.client, .{
|
|
898
|
+
.command = .pong_client,
|
|
899
|
+
.cluster = self.cluster,
|
|
900
|
+
.replica = self.replica,
|
|
901
|
+
.view = self.view,
|
|
902
|
+
});
|
|
903
|
+
}
|
|
904
|
+
|
|
845
905
|
/// When there is free space in the pipeline's prepare queue:
|
|
846
906
|
/// The primary advances op-number, adds the request to the end of the log, and updates the
|
|
847
907
|
/// information for this client in the client-table to contain the new request number, s.
|
|
@@ -905,6 +965,7 @@ pub fn ReplicaType(
|
|
|
905
965
|
/// If the next replica is down or partitioned, then the primary's prepare timeout will fire,
|
|
906
966
|
/// and the primary will resend but to another replica, until it receives enough prepare_ok's.
|
|
907
967
|
fn on_prepare(self: *Self, message: *Message) void {
|
|
968
|
+
assert(message.header.command == .prepare);
|
|
908
969
|
self.view_jump(message.header);
|
|
909
970
|
|
|
910
971
|
if (self.is_repair(message)) {
|
|
@@ -949,8 +1010,6 @@ pub fn ReplicaType(
|
|
|
949
1010
|
assert(message.header.op > self.commit_min);
|
|
950
1011
|
assert(message.header.op <= self.op_checkpoint_trigger());
|
|
951
1012
|
|
|
952
|
-
if (self.backup()) self.normal_status_timeout.reset();
|
|
953
|
-
|
|
954
1013
|
if (message.header.op > self.op + 1) {
|
|
955
1014
|
log.debug("{}: on_prepare: newer op", .{self.replica});
|
|
956
1015
|
self.jump_to_newer_op_in_normal_status(message.header);
|
|
@@ -988,6 +1047,7 @@ pub fn ReplicaType(
|
|
|
988
1047
|
}
|
|
989
1048
|
|
|
990
1049
|
fn on_prepare_ok(self: *Self, message: *Message) void {
|
|
1050
|
+
assert(message.header.command == .prepare_ok);
|
|
991
1051
|
if (self.ignore_prepare_ok(message)) return;
|
|
992
1052
|
|
|
993
1053
|
assert(self.status == .normal);
|
|
@@ -1020,12 +1080,20 @@ pub fn ReplicaType(
|
|
|
1020
1080
|
else
|
|
1021
1081
|
self.quorum_replication;
|
|
1022
1082
|
|
|
1083
|
+
if (!prepare.ok_from_all_replicas.isSet(message.header.replica)) {
|
|
1084
|
+
self.primary_abdicating = false;
|
|
1085
|
+
if (!prepare.ok_quorum_received) {
|
|
1086
|
+
self.primary_abdicate_timeout.reset();
|
|
1087
|
+
}
|
|
1088
|
+
}
|
|
1089
|
+
|
|
1023
1090
|
const count = self.count_message_and_receive_quorum_exactly_once(
|
|
1024
1091
|
&prepare.ok_from_all_replicas,
|
|
1025
1092
|
message,
|
|
1026
1093
|
threshold,
|
|
1027
1094
|
) orelse return;
|
|
1028
1095
|
|
|
1096
|
+
const prepare_pending = self.primary_pipeline_pending().?;
|
|
1029
1097
|
assert(count == threshold);
|
|
1030
1098
|
assert(!prepare.ok_quorum_received);
|
|
1031
1099
|
prepare.ok_quorum_received = true;
|
|
@@ -1035,6 +1103,16 @@ pub fn ReplicaType(
|
|
|
1035
1103
|
prepare.message.header.checksum,
|
|
1036
1104
|
});
|
|
1037
1105
|
|
|
1106
|
+
assert(self.prepare_timeout.ticking);
|
|
1107
|
+
assert(self.primary_abdicate_timeout.ticking);
|
|
1108
|
+
assert(!self.primary_abdicating);
|
|
1109
|
+
if (self.primary_pipeline_pending()) |_| {
|
|
1110
|
+
if (prepare_pending == prepare) self.prepare_timeout.reset();
|
|
1111
|
+
} else {
|
|
1112
|
+
self.prepare_timeout.stop();
|
|
1113
|
+
self.primary_abdicate_timeout.stop();
|
|
1114
|
+
}
|
|
1115
|
+
|
|
1038
1116
|
self.commit_pipeline();
|
|
1039
1117
|
}
|
|
1040
1118
|
|
|
@@ -1043,6 +1121,8 @@ pub fn ReplicaType(
|
|
|
1043
1121
|
/// It's possible for the network to be one-way partitioned so that backups don't see the
|
|
1044
1122
|
/// primary as down, but neither can the primary hear from the backups.
|
|
1045
1123
|
fn on_commit(self: *Self, message: *const Message) void {
|
|
1124
|
+
assert(message.header.command == .commit);
|
|
1125
|
+
|
|
1046
1126
|
self.view_jump(message.header);
|
|
1047
1127
|
|
|
1048
1128
|
if (self.status != .normal) {
|
|
@@ -1070,6 +1150,13 @@ pub fn ReplicaType(
|
|
|
1070
1150
|
assert(message.header.view == self.view);
|
|
1071
1151
|
assert(message.header.replica == self.primary_index(message.header.view));
|
|
1072
1152
|
|
|
1153
|
+
// Old/duplicate heartbeats don't count.
|
|
1154
|
+
if (self.heartbeat_timestamp < message.header.timestamp) {
|
|
1155
|
+
self.heartbeat_timestamp = message.header.timestamp;
|
|
1156
|
+
self.normal_heartbeat_timeout.reset();
|
|
1157
|
+
self.start_view_change_from_all_replicas.unset(self.replica);
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1073
1160
|
// We may not always have the latest commit entry but if we do our checksum must match:
|
|
1074
1161
|
if (self.journal.header_with_op(message.header.commit)) |commit_entry| {
|
|
1075
1162
|
if (commit_entry.checksum == message.header.context) {
|
|
@@ -1082,7 +1169,6 @@ pub fn ReplicaType(
|
|
|
1082
1169
|
}
|
|
1083
1170
|
}
|
|
1084
1171
|
|
|
1085
|
-
self.normal_status_timeout.reset();
|
|
1086
1172
|
self.commit_journal(message.header.commit);
|
|
1087
1173
|
}
|
|
1088
1174
|
|
|
@@ -1154,47 +1240,53 @@ pub fn ReplicaType(
|
|
|
1154
1240
|
}
|
|
1155
1241
|
|
|
1156
1242
|
fn on_start_view_change(self: *Self, message: *Message) void {
|
|
1157
|
-
|
|
1243
|
+
assert(message.header.command == .start_view_change);
|
|
1244
|
+
if (self.ignore_start_view_change_message(message)) return;
|
|
1158
1245
|
|
|
1246
|
+
assert(self.replica_count > 1);
|
|
1159
1247
|
assert(self.status == .normal or self.status == .view_change);
|
|
1160
1248
|
assert(message.header.view >= self.view);
|
|
1161
|
-
assert(message.header.replica != self.replica);
|
|
1162
1249
|
|
|
1163
1250
|
self.view_jump(message.header);
|
|
1164
1251
|
|
|
1165
|
-
assert(self.status == .view_change);
|
|
1166
1252
|
assert(message.header.view == self.view);
|
|
1167
1253
|
|
|
1168
|
-
// Wait until we have `f` messages (
|
|
1169
|
-
|
|
1170
|
-
|
|
1254
|
+
// Wait until we have `f + 1` messages (possibly including ourself) for quorum.
|
|
1255
|
+
// This ensures that we do not start a view-change while normal request processing
|
|
1256
|
+
// is possible.
|
|
1257
|
+
const threshold = self.quorum_view_change;
|
|
1171
1258
|
|
|
1172
|
-
|
|
1173
|
-
&self.start_view_change_from_other_replicas,
|
|
1174
|
-
message,
|
|
1175
|
-
threshold,
|
|
1176
|
-
) orelse return;
|
|
1259
|
+
self.start_view_change_from_all_replicas.set(message.header.replica);
|
|
1177
1260
|
|
|
1178
|
-
|
|
1179
|
-
|
|
1261
|
+
if (self.replica != message.header.replica and
|
|
1262
|
+
!self.start_view_change_window_timeout.ticking)
|
|
1263
|
+
{
|
|
1264
|
+
self.start_view_change_window_timeout.start();
|
|
1265
|
+
}
|
|
1266
|
+
|
|
1267
|
+
const count = self.start_view_change_from_all_replicas.count();
|
|
1268
|
+
assert(count <= threshold);
|
|
1269
|
+
|
|
1270
|
+
if (count < threshold) {
|
|
1271
|
+
log.debug("{}: on_start_view_change: view={} waiting for quorum ({}/{})", .{
|
|
1272
|
+
self.replica,
|
|
1273
|
+
self.view,
|
|
1274
|
+
count,
|
|
1275
|
+
threshold,
|
|
1276
|
+
});
|
|
1277
|
+
return;
|
|
1278
|
+
}
|
|
1180
1279
|
log.debug("{}: on_start_view_change: view={} quorum received", .{
|
|
1181
1280
|
self.replica,
|
|
1182
1281
|
self.view,
|
|
1183
1282
|
});
|
|
1184
1283
|
|
|
1185
|
-
|
|
1186
|
-
assert(
|
|
1187
|
-
self.start_view_change_quorum = true;
|
|
1188
|
-
|
|
1189
|
-
// When replica i receives start_view_change messages for its view from f other replicas,
|
|
1190
|
-
// it sends a ⟨do_view_change v, l, v’, n, k, i⟩ message to the node that will be the
|
|
1191
|
-
// primary in the new view. Here v is its view, l is its log, v′ is the view number of the
|
|
1192
|
-
// latest view in which its status was normal, n is the op number, and k is the commit
|
|
1193
|
-
// number.
|
|
1194
|
-
self.send_do_view_change();
|
|
1195
|
-
defer self.flush_loopback_queue();
|
|
1284
|
+
self.transition_to_view_change_status(self.view + 1);
|
|
1285
|
+
assert(self.start_view_change_from_all_replicas.count() == 0);
|
|
1196
1286
|
}
|
|
1197
1287
|
|
|
1288
|
+
/// DVC serves two purposes:
|
|
1289
|
+
///
|
|
1198
1290
|
/// When the new primary receives f + 1 do_view_change messages from different replicas
|
|
1199
1291
|
/// (including itself), it sets its view number to that in the messages and selects as the
|
|
1200
1292
|
/// new log the one contained in the message with the largest v′; if several messages have
|
|
@@ -1204,22 +1296,25 @@ pub fn ReplicaType(
|
|
|
1204
1296
|
/// informs the other replicas of the completion of the view change by sending
|
|
1205
1297
|
/// ⟨start_view v, l, n, k⟩ messages to the other replicas, where l is the new log, n is the
|
|
1206
1298
|
/// op number, and k is the commit number.
|
|
1299
|
+
///
|
|
1300
|
+
/// When a new backup receives a do_view_change message for a new view, it transitions to
|
|
1301
|
+
/// that new view in view-change status and begins to broadcast its own DVC.
|
|
1207
1302
|
fn on_do_view_change(self: *Self, message: *Message) void {
|
|
1303
|
+
assert(message.header.command == .do_view_change);
|
|
1208
1304
|
if (self.ignore_view_change_message(message)) return;
|
|
1209
1305
|
|
|
1210
1306
|
assert(self.status == .normal or self.status == .view_change);
|
|
1211
1307
|
assert(message.header.view >= self.view);
|
|
1212
|
-
assert(self.primary_index(message.header.view) == self.replica);
|
|
1213
1308
|
|
|
1214
1309
|
self.view_jump(message.header);
|
|
1215
1310
|
|
|
1216
1311
|
assert(self.status == .view_change);
|
|
1217
1312
|
assert(message.header.view == self.view);
|
|
1218
1313
|
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
log.debug("{}: on_do_view_change:
|
|
1314
|
+
if (self.primary_index(message.header.view) != self.replica) {
|
|
1315
|
+
for (self.do_view_change_from_all_replicas) |dvc| assert(dvc == null);
|
|
1316
|
+
|
|
1317
|
+
log.debug("{}: on_do_view_change: view={} backup awaiting start_view", .{
|
|
1223
1318
|
self.replica,
|
|
1224
1319
|
self.view,
|
|
1225
1320
|
});
|
|
@@ -1244,7 +1339,6 @@ pub fn ReplicaType(
|
|
|
1244
1339
|
self.view,
|
|
1245
1340
|
});
|
|
1246
1341
|
|
|
1247
|
-
assert(self.start_view_change_quorum);
|
|
1248
1342
|
assert(!self.do_view_change_quorum);
|
|
1249
1343
|
self.do_view_change_quorum = true;
|
|
1250
1344
|
|
|
@@ -1274,6 +1368,7 @@ pub fn ReplicaType(
|
|
|
1274
1368
|
/// they execute all operations known to be committed that they haven’t executed previously,
|
|
1275
1369
|
/// advance their commit number, and update the information in their client table.
|
|
1276
1370
|
fn on_start_view(self: *Self, message: *const Message) void {
|
|
1371
|
+
assert(message.header.command == .start_view);
|
|
1277
1372
|
if (self.ignore_view_change_message(message)) return;
|
|
1278
1373
|
|
|
1279
1374
|
if (message.header.op > self.op_checkpoint_trigger()) {
|
|
@@ -1320,6 +1415,7 @@ pub fn ReplicaType(
|
|
|
1320
1415
|
}
|
|
1321
1416
|
|
|
1322
1417
|
fn on_request_start_view(self: *Self, message: *const Message) void {
|
|
1418
|
+
assert(message.header.command == .request_start_view);
|
|
1323
1419
|
if (self.ignore_repair_message(message)) return;
|
|
1324
1420
|
|
|
1325
1421
|
assert(self.status == .normal);
|
|
@@ -1350,6 +1446,7 @@ pub fn ReplicaType(
|
|
|
1350
1446
|
/// prepare. If a guaranteed prepare is found to by faulty, the replica must repair it
|
|
1351
1447
|
/// to restore durability.
|
|
1352
1448
|
fn on_request_prepare(self: *Self, message: *const Message) void {
|
|
1449
|
+
assert(message.header.command == .request_prepare);
|
|
1353
1450
|
if (self.ignore_repair_message(message)) return;
|
|
1354
1451
|
|
|
1355
1452
|
assert(self.replica_count > 1);
|
|
@@ -1483,6 +1580,7 @@ pub fn ReplicaType(
|
|
|
1483
1580
|
}
|
|
1484
1581
|
|
|
1485
1582
|
fn on_request_headers(self: *Self, message: *const Message) void {
|
|
1583
|
+
assert(message.header.command == .request_headers);
|
|
1486
1584
|
if (self.ignore_repair_message(message)) return;
|
|
1487
1585
|
|
|
1488
1586
|
assert(self.status == .normal or self.status == .view_change);
|
|
@@ -1525,6 +1623,7 @@ pub fn ReplicaType(
|
|
|
1525
1623
|
}
|
|
1526
1624
|
|
|
1527
1625
|
fn on_nack_prepare(self: *Self, message: *Message) void {
|
|
1626
|
+
assert(message.header.command == .nack_prepare);
|
|
1528
1627
|
if (self.ignore_repair_message(message)) return;
|
|
1529
1628
|
|
|
1530
1629
|
assert(self.status == .view_change);
|
|
@@ -1622,6 +1721,7 @@ pub fn ReplicaType(
|
|
|
1622
1721
|
}
|
|
1623
1722
|
|
|
1624
1723
|
fn on_headers(self: *Self, message: *const Message) void {
|
|
1724
|
+
assert(message.header.command == .headers);
|
|
1625
1725
|
if (self.ignore_repair_message(message)) return;
|
|
1626
1726
|
|
|
1627
1727
|
assert(self.status == .normal or self.status == .view_change);
|
|
@@ -1665,12 +1765,12 @@ pub fn ReplicaType(
|
|
|
1665
1765
|
assert(self.status == .normal);
|
|
1666
1766
|
assert(self.primary());
|
|
1667
1767
|
|
|
1668
|
-
const prepare = self.
|
|
1669
|
-
assert(prepare.message.header.command == .prepare);
|
|
1670
|
-
assert(prepare.message.header.op == self.commit_min + 1);
|
|
1768
|
+
const prepare = self.primary_pipeline_pending().?;
|
|
1671
1769
|
|
|
1672
|
-
if (
|
|
1673
|
-
|
|
1770
|
+
if (self.replica_count == 1) {
|
|
1771
|
+
// Replica=1 doesn't write prepares concurrently to avoid gaps in its WAL.
|
|
1772
|
+
assert(self.journal.writes.executing() <= 1);
|
|
1773
|
+
assert(self.journal.writes.executing() == 1 or self.committing);
|
|
1674
1774
|
|
|
1675
1775
|
self.prepare_timeout.reset();
|
|
1676
1776
|
return;
|
|
@@ -1701,10 +1801,15 @@ pub fn ReplicaType(
|
|
|
1701
1801
|
}
|
|
1702
1802
|
|
|
1703
1803
|
if (waiting_len == 0) {
|
|
1704
|
-
|
|
1804
|
+
// TODO: This assert will be valid when the state-transfer is implemented and the
|
|
1805
|
+
// threshold=replica_count hack is removed from on_prepare_ok.
|
|
1806
|
+
// assert(self.quorum_replication == self.replica_count);
|
|
1807
|
+
assert(!prepare.ok_from_all_replicas.isSet(self.replica));
|
|
1808
|
+
assert(prepare.ok_from_all_replicas.count() == self.replica_count - 1);
|
|
1809
|
+
assert(prepare.message.header.op <= self.op);
|
|
1705
1810
|
|
|
1811
|
+
self.prepare_timeout.reset();
|
|
1706
1812
|
log.debug("{}: on_prepare_timeout: waiting for journal", .{self.replica});
|
|
1707
|
-
assert(!prepare.ok_from_all_replicas.isSet(self.replica));
|
|
1708
1813
|
|
|
1709
1814
|
// We may be slow and waiting for the write to complete.
|
|
1710
1815
|
//
|
|
@@ -1716,9 +1821,7 @@ pub fn ReplicaType(
|
|
|
1716
1821
|
//
|
|
1717
1822
|
// Retry the write through `on_repair()` which will work out which is which.
|
|
1718
1823
|
// We do expect that the op would have been run through `on_prepare()` already.
|
|
1719
|
-
assert(prepare.message.header.op <= self.op);
|
|
1720
1824
|
self.on_repair(prepare.message);
|
|
1721
|
-
|
|
1722
1825
|
return;
|
|
1723
1826
|
}
|
|
1724
1827
|
|
|
@@ -1746,13 +1849,39 @@ pub fn ReplicaType(
|
|
|
1746
1849
|
self.send_message_to_replica(replica, prepare.message);
|
|
1747
1850
|
}
|
|
1748
1851
|
|
|
1749
|
-
fn
|
|
1750
|
-
self.
|
|
1852
|
+
fn on_primary_abdicate_timeout(self: *Self) void {
|
|
1853
|
+
assert(self.status == .normal);
|
|
1854
|
+
assert(self.primary());
|
|
1855
|
+
assert(self.primary_pipeline_pending() != null);
|
|
1856
|
+
self.primary_abdicate_timeout.reset();
|
|
1857
|
+
if (self.replica_count == 1) return;
|
|
1858
|
+
|
|
1859
|
+
log.debug("{}: on_primary_abdicate_timeout: abdicating (view={})", .{
|
|
1860
|
+
self.replica,
|
|
1861
|
+
self.view,
|
|
1862
|
+
});
|
|
1863
|
+
self.primary_abdicating = true;
|
|
1864
|
+
}
|
|
1865
|
+
|
|
1866
|
+
fn on_commit_message_timeout(self: *Self) void {
|
|
1867
|
+
self.commit_message_timeout.reset();
|
|
1751
1868
|
|
|
1752
1869
|
assert(self.status == .normal);
|
|
1753
1870
|
assert(self.primary());
|
|
1754
1871
|
assert(self.commit_min == self.commit_max);
|
|
1755
1872
|
|
|
1873
|
+
if (self.primary_abdicating) {
|
|
1874
|
+
assert(self.primary_abdicate_timeout.ticking);
|
|
1875
|
+
assert(self.pipeline.queue.prepare_queue.count > 0);
|
|
1876
|
+
assert(self.primary_pipeline_pending() != null);
|
|
1877
|
+
|
|
1878
|
+
log.debug("{}: on_commit_message_timeout: primary abdicating (view={})", .{
|
|
1879
|
+
self.replica,
|
|
1880
|
+
self.view,
|
|
1881
|
+
});
|
|
1882
|
+
return;
|
|
1883
|
+
}
|
|
1884
|
+
|
|
1756
1885
|
const latest_committed_entry = checksum: {
|
|
1757
1886
|
if (self.commit_max == self.superblock.working.vsr_state.commit_min) {
|
|
1758
1887
|
break :checksum self.superblock.working.vsr_state.commit_min_checksum;
|
|
@@ -1768,37 +1897,86 @@ pub fn ReplicaType(
|
|
|
1768
1897
|
.replica = self.replica,
|
|
1769
1898
|
.view = self.view,
|
|
1770
1899
|
.commit = self.commit_max,
|
|
1900
|
+
.timestamp = self.clock.monotonic(),
|
|
1771
1901
|
});
|
|
1772
1902
|
}
|
|
1773
1903
|
|
|
1774
|
-
fn
|
|
1904
|
+
fn on_normal_heartbeat_timeout(self: *Self) void {
|
|
1775
1905
|
assert(self.status == .normal);
|
|
1776
1906
|
assert(self.backup());
|
|
1777
|
-
self.
|
|
1907
|
+
self.normal_heartbeat_timeout.reset();
|
|
1908
|
+
|
|
1909
|
+
if (self.replica_count == 1) return;
|
|
1910
|
+
|
|
1911
|
+
log.debug("{}: on_normal_heartbeat_timeout: heartbeat lost (view={})", .{
|
|
1912
|
+
self.replica,
|
|
1913
|
+
self.view,
|
|
1914
|
+
});
|
|
1915
|
+
self.send_start_view_change();
|
|
1916
|
+
}
|
|
1917
|
+
|
|
1918
|
+
fn on_start_view_change_window_timeout(self: *Self) void {
|
|
1919
|
+
assert(self.status == .normal or self.status == .view_change);
|
|
1920
|
+
assert(self.start_view_change_from_all_replicas.count() > 0);
|
|
1921
|
+
assert(self.replica_count > 1);
|
|
1922
|
+
self.start_view_change_window_timeout.stop();
|
|
1923
|
+
|
|
1924
|
+
// Don't reset our own SVC; it will be reset if/when we receive a heartbeat.
|
|
1925
|
+
const svc = self.start_view_change_from_all_replicas.isSet(self.replica);
|
|
1926
|
+
self.reset_quorum_start_view_change();
|
|
1927
|
+
if (svc) self.start_view_change_from_all_replicas.set(self.replica);
|
|
1928
|
+
}
|
|
1929
|
+
|
|
1930
|
+
fn on_start_view_change_message_timeout(self: *Self) void {
|
|
1931
|
+
assert(self.status == .normal or self.status == .view_change);
|
|
1932
|
+
self.start_view_change_message_timeout.reset();
|
|
1933
|
+
if (self.replica_count == 1) return;
|
|
1934
|
+
|
|
1935
|
+
if (self.start_view_change_from_all_replicas.isSet(self.replica)) {
|
|
1936
|
+
self.send_start_view_change();
|
|
1937
|
+
}
|
|
1778
1938
|
}
|
|
1779
1939
|
|
|
1780
1940
|
fn on_view_change_status_timeout(self: *Self) void {
|
|
1781
1941
|
assert(self.status == .view_change);
|
|
1782
|
-
|
|
1942
|
+
assert(self.replica_count > 1);
|
|
1943
|
+
self.view_change_status_timeout.reset();
|
|
1944
|
+
|
|
1945
|
+
self.send_start_view_change();
|
|
1783
1946
|
}
|
|
1784
1947
|
|
|
1785
|
-
fn
|
|
1786
|
-
self.view_change_message_timeout.reset();
|
|
1948
|
+
fn on_do_view_change_message_timeout(self: *Self) void {
|
|
1787
1949
|
assert(self.status == .view_change);
|
|
1950
|
+
assert(self.replica_count > 1);
|
|
1951
|
+
self.do_view_change_message_timeout.reset();
|
|
1788
1952
|
|
|
1789
|
-
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
// The primary need not retry to send a `do_view_change` message to itself:
|
|
1797
|
-
// We assume the MessageBus will not drop messages sent by a replica to itself.
|
|
1798
|
-
if (self.primary_index(self.view) != self.replica) self.send_do_view_change();
|
|
1953
|
+
if (self.primary_index(self.view) == self.replica and self.do_view_change_quorum) {
|
|
1954
|
+
// A primary in status=view_change with a complete DVC quorum must be repairing —
|
|
1955
|
+
// it does not need to signal other replicas.
|
|
1956
|
+
assert(self.view == self.log_view);
|
|
1957
|
+
} else {
|
|
1958
|
+
assert(self.view > self.log_view);
|
|
1959
|
+
self.send_do_view_change();
|
|
1799
1960
|
}
|
|
1800
1961
|
}
|
|
1801
1962
|
|
|
1963
|
+
fn on_request_start_view_message_timeout(self: *Self) void {
|
|
1964
|
+
assert(self.status == .view_change);
|
|
1965
|
+
assert(self.primary_index(self.view) != self.replica);
|
|
1966
|
+
self.request_start_view_message_timeout.reset();
|
|
1967
|
+
|
|
1968
|
+
log.debug("{}: on_request_start_view_message_timeout: view={}", .{
|
|
1969
|
+
self.replica,
|
|
1970
|
+
self.view,
|
|
1971
|
+
});
|
|
1972
|
+
self.send_header_to_replica(self.primary_index(self.view), .{
|
|
1973
|
+
.command = .request_start_view,
|
|
1974
|
+
.cluster = self.cluster,
|
|
1975
|
+
.replica = self.replica,
|
|
1976
|
+
.view = self.view,
|
|
1977
|
+
});
|
|
1978
|
+
}
|
|
1979
|
+
|
|
1802
1980
|
fn on_repair_timeout(self: *Self) void {
|
|
1803
1981
|
assert(self.status == .normal or self.status == .view_change);
|
|
1804
1982
|
self.repair();
|
|
@@ -1926,13 +2104,6 @@ pub fn ReplicaType(
|
|
|
1926
2104
|
assert(self.status == .normal);
|
|
1927
2105
|
assert(self.primary());
|
|
1928
2106
|
},
|
|
1929
|
-
.start_view_change => {
|
|
1930
|
-
assert(self.replica_count > 1);
|
|
1931
|
-
if (self.replica_count == 2) assert(threshold == 1);
|
|
1932
|
-
|
|
1933
|
-
assert(self.status == .view_change);
|
|
1934
|
-
assert(self.replica != message.header.replica);
|
|
1935
|
-
},
|
|
1936
2107
|
.nack_prepare => {
|
|
1937
2108
|
assert(self.replica_count > 1);
|
|
1938
2109
|
if (self.replica_count == 2) assert(threshold >= 1);
|
|
@@ -2270,7 +2441,6 @@ pub fn ReplicaType(
|
|
|
2270
2441
|
assert(prepare.message.header.checksum == self.commit_prepare.?.header.checksum);
|
|
2271
2442
|
assert(prepare.message.header.op == self.commit_min);
|
|
2272
2443
|
assert(prepare.message.header.op == self.commit_max);
|
|
2273
|
-
assert(self.prepare_timeout.ticking);
|
|
2274
2444
|
|
|
2275
2445
|
self.message_bus.unref(prepare.message);
|
|
2276
2446
|
|
|
@@ -2288,10 +2458,6 @@ pub fn ReplicaType(
|
|
|
2288
2458
|
});
|
|
2289
2459
|
self.write_prepare(next.message, .append);
|
|
2290
2460
|
}
|
|
2291
|
-
} else {
|
|
2292
|
-
// When the pipeline is empty, stop the prepare timeout.
|
|
2293
|
-
// The timeout will be restarted when another entry arrives for the pipeline.
|
|
2294
|
-
self.prepare_timeout.stop();
|
|
2295
2461
|
}
|
|
2296
2462
|
}
|
|
2297
2463
|
|
|
@@ -2815,7 +2981,7 @@ pub fn ReplicaType(
|
|
|
2815
2981
|
assert(
|
|
2816
2982
|
header.view == self.view or
|
|
2817
2983
|
header.command == .request_start_view or
|
|
2818
|
-
header.command == .
|
|
2984
|
+
header.command == .ping or header.command == .pong,
|
|
2819
2985
|
);
|
|
2820
2986
|
assert(header.size == @sizeOf(Header));
|
|
2821
2987
|
|
|
@@ -2864,25 +3030,20 @@ pub fn ReplicaType(
|
|
|
2864
3030
|
assert(self.journal.header_with_op(self.op) != null);
|
|
2865
3031
|
}
|
|
2866
3032
|
|
|
2867
|
-
/// Returns whether the replica is a backup for the current view.
|
|
2868
|
-
/// This may be used only when the replica status is normal.
|
|
2869
|
-
fn backup(self: *Self) bool {
|
|
2870
|
-
return !self.primary();
|
|
2871
|
-
}
|
|
2872
|
-
|
|
2873
3033
|
fn flush_loopback_queue(self: *Self) void {
|
|
2874
|
-
// There are
|
|
2875
|
-
// However, of these
|
|
3034
|
+
// There are five cases where a replica will send a message to itself:
|
|
3035
|
+
// However, of these five cases, all but one call send_message_to_replica().
|
|
2876
3036
|
//
|
|
2877
3037
|
// 1. In on_request(), the primary sends a synchronous prepare to itself, but this is
|
|
2878
3038
|
// done by calling on_prepare() directly, and subsequent prepare timeout retries will
|
|
2879
3039
|
// never resend to self.
|
|
2880
3040
|
// 2. In on_prepare(), after writing to storage, the primary sends a (typically)
|
|
2881
3041
|
// asynchronous prepare_ok to itself.
|
|
2882
|
-
// 3. In
|
|
2883
|
-
//
|
|
3042
|
+
// 3. In transition_to_view_change_status(), the new primary sends a synchronous DVC to
|
|
3043
|
+
// itself.
|
|
2884
3044
|
// 4. In primary_start_view_as_the_new_primary(), the new primary sends itself a
|
|
2885
3045
|
// prepare_ok message for each uncommitted message.
|
|
3046
|
+
// 5. In send_start_view_change(), a replica sends itself a start_view_change message.
|
|
2886
3047
|
if (self.loopback_queue) |message| {
|
|
2887
3048
|
defer self.message_bus.unref(message);
|
|
2888
3049
|
|
|
@@ -2898,6 +3059,8 @@ pub fn ReplicaType(
|
|
|
2898
3059
|
}
|
|
2899
3060
|
|
|
2900
3061
|
fn ignore_prepare_ok(self: *Self, message: *const Message) bool {
|
|
3062
|
+
assert(message.header.command == .prepare_ok);
|
|
3063
|
+
|
|
2901
3064
|
if (self.primary_index(message.header.view) == self.replica) {
|
|
2902
3065
|
assert(message.header.view <= self.view);
|
|
2903
3066
|
}
|
|
@@ -3218,9 +3381,31 @@ pub fn ReplicaType(
|
|
|
3218
3381
|
return false;
|
|
3219
3382
|
}
|
|
3220
3383
|
|
|
3221
|
-
fn
|
|
3222
|
-
assert(message.header.command == .start_view_change
|
|
3223
|
-
|
|
3384
|
+
fn ignore_start_view_change_message(self: *const Self, message: *const Message) bool {
|
|
3385
|
+
assert(message.header.command == .start_view_change);
|
|
3386
|
+
|
|
3387
|
+
switch (self.status) {
|
|
3388
|
+
.normal, .view_change => {},
|
|
3389
|
+
.recovering => unreachable, // Single node clusters don't have view changes.
|
|
3390
|
+
.recovering_head => {
|
|
3391
|
+
log.debug("{}: on_start_view_change: ignoring (status={})", .{
|
|
3392
|
+
self.replica,
|
|
3393
|
+
self.status,
|
|
3394
|
+
});
|
|
3395
|
+
return true;
|
|
3396
|
+
},
|
|
3397
|
+
}
|
|
3398
|
+
|
|
3399
|
+
if (message.header.view < self.view) {
|
|
3400
|
+
log.debug("{}: on_start_view_change: ignoring (older view)", .{self.replica});
|
|
3401
|
+
return true;
|
|
3402
|
+
}
|
|
3403
|
+
|
|
3404
|
+
return false;
|
|
3405
|
+
}
|
|
3406
|
+
|
|
3407
|
+
fn ignore_view_change_message(self: *const Self, message: *const Message) bool {
|
|
3408
|
+
assert(message.header.command == .do_view_change or
|
|
3224
3409
|
message.header.command == .start_view);
|
|
3225
3410
|
assert(message.header.view > 0); // The initial view is already zero.
|
|
3226
3411
|
assert(self.status != .recovering); // Single node clusters don't have view changes.
|
|
@@ -3228,6 +3413,7 @@ pub fn ReplicaType(
|
|
|
3228
3413
|
const command: []const u8 = @tagName(message.header.command);
|
|
3229
3414
|
|
|
3230
3415
|
if (self.status == .recovering_head and message.header.command != .start_view) {
|
|
3416
|
+
log.debug("{}: on_{s}: ignoring (recovering_head)", .{ self.replica, command });
|
|
3231
3417
|
return true;
|
|
3232
3418
|
}
|
|
3233
3419
|
|
|
@@ -3243,18 +3429,13 @@ pub fn ReplicaType(
|
|
|
3243
3429
|
|
|
3244
3430
|
// These may be caused by faults in the network topology.
|
|
3245
3431
|
switch (message.header.command) {
|
|
3246
|
-
.
|
|
3432
|
+
.start_view => {
|
|
3247
3433
|
if (message.header.replica == self.replica) {
|
|
3248
3434
|
log.warn("{}: on_{s}: ignoring (self)", .{ self.replica, command });
|
|
3249
3435
|
return true;
|
|
3250
3436
|
}
|
|
3251
3437
|
},
|
|
3252
|
-
.do_view_change => {
|
|
3253
|
-
if (self.primary_index(message.header.view) != self.replica) {
|
|
3254
|
-
log.warn("{}: on_{s}: ignoring (backup)", .{ self.replica, command });
|
|
3255
|
-
return true;
|
|
3256
|
-
}
|
|
3257
|
-
},
|
|
3438
|
+
.do_view_change => {},
|
|
3258
3439
|
else => unreachable,
|
|
3259
3440
|
}
|
|
3260
3441
|
|
|
@@ -3275,6 +3456,11 @@ pub fn ReplicaType(
|
|
|
3275
3456
|
return false;
|
|
3276
3457
|
}
|
|
3277
3458
|
|
|
3459
|
+
/// Returns the index into the configuration of the primary for a given view.
|
|
3460
|
+
fn primary_index(self: *const Self, view: u32) u8 {
|
|
3461
|
+
return @intCast(u8, @mod(view, self.replica_count));
|
|
3462
|
+
}
|
|
3463
|
+
|
|
3278
3464
|
/// Returns whether the replica is the primary for the current view.
|
|
3279
3465
|
/// This may be used only when the replica status is normal.
|
|
3280
3466
|
fn primary(self: *const Self) bool {
|
|
@@ -3282,9 +3468,10 @@ pub fn ReplicaType(
|
|
|
3282
3468
|
return self.primary_index(self.view) == self.replica;
|
|
3283
3469
|
}
|
|
3284
3470
|
|
|
3285
|
-
/// Returns
|
|
3286
|
-
|
|
3287
|
-
|
|
3471
|
+
/// Returns whether the replica is a backup for the current view.
|
|
3472
|
+
/// This may be used only when the replica status is normal.
|
|
3473
|
+
fn backup(self: *const Self) bool {
|
|
3474
|
+
return !self.primary();
|
|
3288
3475
|
}
|
|
3289
3476
|
|
|
3290
3477
|
/// Advances `op` to where we need to be before `header` can be processed as a prepare.
|
|
@@ -3512,14 +3699,17 @@ pub fn ReplicaType(
|
|
|
3512
3699
|
|
|
3513
3700
|
log.debug("{}: primary_pipeline_next: prepare {}", .{ self.replica, message.header.checksum });
|
|
3514
3701
|
|
|
3515
|
-
if (self.
|
|
3702
|
+
if (self.primary_pipeline_pending()) |_| {
|
|
3516
3703
|
// Do not restart the prepare timeout as it is already ticking for another prepare.
|
|
3517
|
-
|
|
3704
|
+
const previous = self.pipeline.queue.prepare_queue.tail_ptr().?;
|
|
3518
3705
|
assert(previous.message.header.checksum == message.header.parent);
|
|
3706
|
+
assert(self.prepare_timeout.ticking);
|
|
3707
|
+
assert(self.primary_abdicate_timeout.ticking);
|
|
3519
3708
|
} else {
|
|
3520
|
-
// We are about to add the first prepare to the pipeline, so start the timeout.
|
|
3521
3709
|
assert(!self.prepare_timeout.ticking);
|
|
3710
|
+
assert(!self.primary_abdicate_timeout.ticking);
|
|
3522
3711
|
self.prepare_timeout.start();
|
|
3712
|
+
self.primary_abdicate_timeout.start();
|
|
3523
3713
|
}
|
|
3524
3714
|
self.pipeline.queue.push_prepare(message);
|
|
3525
3715
|
self.on_prepare(message);
|
|
@@ -3529,6 +3719,24 @@ pub fn ReplicaType(
|
|
|
3529
3719
|
assert(self.op == message.header.op);
|
|
3530
3720
|
}
|
|
3531
3721
|
|
|
3722
|
+
/// Returns the next prepare in the pipeline waiting for a quorum.
|
|
3723
|
+
/// Returns null when the pipeline is empty.
|
|
3724
|
+
/// Returns null when the pipeline is nonempty but all prepares have a quorum.
|
|
3725
|
+
fn primary_pipeline_pending(self: *const Self) ?*const Prepare {
|
|
3726
|
+
assert(self.status == .normal);
|
|
3727
|
+
assert(self.primary());
|
|
3728
|
+
|
|
3729
|
+
var prepares = self.pipeline.queue.prepare_queue.iterator();
|
|
3730
|
+
while (prepares.next_ptr()) |prepare| {
|
|
3731
|
+
assert(prepare.message.header.command == .prepare);
|
|
3732
|
+
if (!prepare.ok_quorum_received) {
|
|
3733
|
+
return prepare;
|
|
3734
|
+
}
|
|
3735
|
+
} else {
|
|
3736
|
+
return null;
|
|
3737
|
+
}
|
|
3738
|
+
}
|
|
3739
|
+
|
|
3532
3740
|
fn pipeline_prepare_by_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Message {
|
|
3533
3741
|
assert(self.status == .normal or self.status == .view_change);
|
|
3534
3742
|
assert(self.replica == self.primary_index(self.view) or checksum != null);
|
|
@@ -4349,7 +4557,7 @@ pub fn ReplicaType(
|
|
|
4349
4557
|
return;
|
|
4350
4558
|
}
|
|
4351
4559
|
|
|
4352
|
-
const next = @mod(self.replica + 1,
|
|
4560
|
+
const next = @mod(self.replica + 1, self.replica_count);
|
|
4353
4561
|
if (next == self.primary_index(message.header.view)) {
|
|
4354
4562
|
log.debug("{}: replicate: not replicating (completed)", .{self.replica});
|
|
4355
4563
|
return;
|
|
@@ -4412,13 +4620,13 @@ pub fn ReplicaType(
|
|
|
4412
4620
|
}
|
|
4413
4621
|
|
|
4414
4622
|
fn reset_quorum_nack_prepare(self: *Self) void {
|
|
4623
|
+
assert(!self.nack_prepare_from_other_replicas.isSet(self.replica));
|
|
4415
4624
|
self.reset_quorum_counter(&self.nack_prepare_from_other_replicas);
|
|
4416
4625
|
self.nack_prepare_op = null;
|
|
4417
4626
|
}
|
|
4418
4627
|
|
|
4419
4628
|
fn reset_quorum_start_view_change(self: *Self) void {
|
|
4420
|
-
self.reset_quorum_counter(&self.
|
|
4421
|
-
self.start_view_change_quorum = false;
|
|
4629
|
+
self.reset_quorum_counter(&self.start_view_change_from_all_replicas);
|
|
4422
4630
|
}
|
|
4423
4631
|
|
|
4424
4632
|
fn send_prepare_ok(self: *Self, header: *const Header) void {
|
|
@@ -4506,28 +4714,28 @@ pub fn ReplicaType(
|
|
|
4506
4714
|
}
|
|
4507
4715
|
|
|
4508
4716
|
fn send_start_view_change(self: *Self) void {
|
|
4509
|
-
assert(self.status == .view_change);
|
|
4510
|
-
|
|
4511
|
-
|
|
4512
|
-
// Send only to other replicas (and not to ourself) to avoid a quorum off-by-one error:
|
|
4513
|
-
// This could happen if the replica mistakenly counts its own message in the quorum.
|
|
4514
|
-
self.send_header_to_other_replicas(.{
|
|
4717
|
+
assert(self.status == .normal or self.status == .view_change);
|
|
4718
|
+
|
|
4719
|
+
const header: Header = .{
|
|
4515
4720
|
.command = .start_view_change,
|
|
4516
4721
|
.cluster = self.cluster,
|
|
4517
4722
|
.replica = self.replica,
|
|
4518
4723
|
.view = self.view,
|
|
4519
|
-
}
|
|
4724
|
+
};
|
|
4725
|
+
|
|
4726
|
+
self.send_header_to_other_replicas(header);
|
|
4727
|
+
|
|
4728
|
+
if (!self.start_view_change_from_all_replicas.isSet(self.replica)) {
|
|
4729
|
+
self.send_header_to_replica(self.replica, header);
|
|
4730
|
+
defer self.flush_loopback_queue();
|
|
4731
|
+
}
|
|
4520
4732
|
}
|
|
4521
4733
|
|
|
4522
4734
|
fn send_do_view_change(self: *Self) void {
|
|
4523
4735
|
assert(self.status == .view_change);
|
|
4524
|
-
assert(self.
|
|
4736
|
+
assert(self.view > self.log_view);
|
|
4525
4737
|
assert(!self.do_view_change_quorum);
|
|
4526
4738
|
|
|
4527
|
-
const count_start_view_change = self.start_view_change_from_other_replicas.count();
|
|
4528
|
-
assert(count_start_view_change >= self.quorum_view_change - 1);
|
|
4529
|
-
assert(count_start_view_change <= self.replica_count - 1);
|
|
4530
|
-
|
|
4531
4739
|
const message = self.create_view_change_message(.do_view_change);
|
|
4532
4740
|
defer self.message_bus.unref(message);
|
|
4533
4741
|
|
|
@@ -4546,7 +4754,14 @@ pub fn ReplicaType(
|
|
|
4546
4754
|
assert(message.header.commit == self.commit_min);
|
|
4547
4755
|
DVCQuorum.verify_message(message);
|
|
4548
4756
|
|
|
4549
|
-
self.
|
|
4757
|
+
self.send_message_to_other_replicas(message);
|
|
4758
|
+
|
|
4759
|
+
if (self.replica == self.primary_index(self.view) and
|
|
4760
|
+
self.do_view_change_from_all_replicas[self.replica] == null)
|
|
4761
|
+
{
|
|
4762
|
+
self.send_message_to_replica(self.replica, message);
|
|
4763
|
+
defer self.flush_loopback_queue();
|
|
4764
|
+
}
|
|
4550
4765
|
}
|
|
4551
4766
|
|
|
4552
4767
|
fn send_eviction_message_to_client(self: *Self, client: u128) void {
|
|
@@ -4568,6 +4783,8 @@ pub fn ReplicaType(
|
|
|
4568
4783
|
}
|
|
4569
4784
|
|
|
4570
4785
|
fn send_header_to_client(self: *Self, client: u128, header: Header) void {
|
|
4786
|
+
assert(header.cluster == self.cluster);
|
|
4787
|
+
|
|
4571
4788
|
const message = self.create_message_from_header(header);
|
|
4572
4789
|
defer self.message_bus.unref(message);
|
|
4573
4790
|
|
|
@@ -4643,14 +4860,15 @@ pub fn ReplicaType(
|
|
|
4643
4860
|
assert(replica == self.primary_index(self.view));
|
|
4644
4861
|
assert(message.header.replica == self.replica);
|
|
4645
4862
|
},
|
|
4863
|
+
.reply => unreachable,
|
|
4646
4864
|
.start_view_change => {
|
|
4647
|
-
assert(self.status == .view_change);
|
|
4865
|
+
assert(self.status == .normal or self.status == .view_change);
|
|
4648
4866
|
assert(message.header.view == self.view);
|
|
4649
4867
|
assert(message.header.replica == self.replica);
|
|
4650
4868
|
},
|
|
4651
4869
|
.do_view_change => {
|
|
4652
4870
|
assert(self.status == .view_change);
|
|
4653
|
-
assert(self.
|
|
4871
|
+
assert(self.view > self.log_view);
|
|
4654
4872
|
assert(!self.do_view_change_quorum);
|
|
4655
4873
|
assert(message.header.view == self.view);
|
|
4656
4874
|
assert(message.header.replica == self.replica);
|
|
@@ -4658,23 +4876,12 @@ pub fn ReplicaType(
|
|
|
4658
4876
|
assert(message.header.commit == self.commit_min);
|
|
4659
4877
|
assert(message.header.timestamp == self.log_view);
|
|
4660
4878
|
},
|
|
4661
|
-
.start_view =>
|
|
4662
|
-
.
|
|
4663
|
-
|
|
4664
|
-
|
|
4665
|
-
|
|
4666
|
-
|
|
4667
|
-
assert(message.header.replica == self.replica);
|
|
4668
|
-
assert(message.header.replica != replica);
|
|
4669
|
-
},
|
|
4670
|
-
.view_change => {
|
|
4671
|
-
assert(self.start_view_change_quorum);
|
|
4672
|
-
assert(self.do_view_change_quorum);
|
|
4673
|
-
assert(message.header.view == self.view);
|
|
4674
|
-
assert(message.header.replica == self.replica);
|
|
4675
|
-
assert(message.header.replica != replica);
|
|
4676
|
-
},
|
|
4677
|
-
else => unreachable,
|
|
4879
|
+
.start_view => {
|
|
4880
|
+
assert(self.status == .normal);
|
|
4881
|
+
assert(!self.do_view_change_quorum);
|
|
4882
|
+
assert(message.header.view == self.view);
|
|
4883
|
+
assert(message.header.replica == self.replica);
|
|
4884
|
+
assert(message.header.replica != replica);
|
|
4678
4885
|
},
|
|
4679
4886
|
.headers => {
|
|
4680
4887
|
assert(self.status == .normal or self.status == .view_change);
|
|
@@ -4682,11 +4889,18 @@ pub fn ReplicaType(
|
|
|
4682
4889
|
assert(message.header.replica == self.replica);
|
|
4683
4890
|
assert(message.header.replica != replica);
|
|
4684
4891
|
},
|
|
4685
|
-
.ping
|
|
4686
|
-
assert(
|
|
4892
|
+
.ping => {
|
|
4893
|
+
assert(self.status == .normal);
|
|
4894
|
+
assert(message.header.replica == self.replica);
|
|
4895
|
+
assert(message.header.replica != replica);
|
|
4896
|
+
},
|
|
4897
|
+
.pong => {
|
|
4898
|
+
assert(self.status == .normal or self.status == .view_change);
|
|
4687
4899
|
assert(message.header.replica == self.replica);
|
|
4688
4900
|
assert(message.header.replica != replica);
|
|
4689
4901
|
},
|
|
4902
|
+
.ping_client => unreachable,
|
|
4903
|
+
.pong_client => unreachable,
|
|
4690
4904
|
.commit => {
|
|
4691
4905
|
assert(self.status == .normal);
|
|
4692
4906
|
assert(self.primary());
|
|
@@ -4716,12 +4930,14 @@ pub fn ReplicaType(
|
|
|
4716
4930
|
assert(message.header.replica != replica);
|
|
4717
4931
|
assert(self.primary_index(self.view) == replica);
|
|
4718
4932
|
},
|
|
4719
|
-
|
|
4720
|
-
|
|
4721
|
-
|
|
4722
|
-
|
|
4723
|
-
|
|
4933
|
+
.eviction => {
|
|
4934
|
+
assert(self.status == .normal);
|
|
4935
|
+
assert(self.primary());
|
|
4936
|
+
assert(message.header.view == self.view);
|
|
4937
|
+
assert(message.header.replica == self.replica);
|
|
4724
4938
|
},
|
|
4939
|
+
.request_block => unreachable,
|
|
4940
|
+
.block => unreachable,
|
|
4725
4941
|
}
|
|
4726
4942
|
|
|
4727
4943
|
if (replica != self.replica) {
|
|
@@ -4929,27 +5145,12 @@ pub fn ReplicaType(
|
|
|
4929
5145
|
}
|
|
4930
5146
|
|
|
4931
5147
|
if (self.status == .view_change and self.log_view < self.view) {
|
|
4932
|
-
if (self.
|
|
4933
|
-
self.start_view_change_quorum and !self.do_view_change_quorum)
|
|
4934
|
-
{
|
|
4935
|
-
self.send_do_view_change();
|
|
4936
|
-
}
|
|
5148
|
+
if (!self.do_view_change_quorum) self.send_do_view_change();
|
|
4937
5149
|
}
|
|
4938
5150
|
}
|
|
4939
5151
|
|
|
         fn set_op_and_commit_max(self: *Self, op: u64, commit_max: u64, method: []const u8) void {
-            assert(self.status == .view_change or self.status == .recovering);
-
-            switch (self.status) {
-                .normal => unreachable,
-                .view_change => {},
-                .recovering => {
-                    // The replica's view hasn't been set yet.
-                    // It will be set shortly, when we transition to normal status.
-                    assert(self.view == 0);
-                },
-                .recovering_head => unreachable,
-            }
+            assert(self.status == .view_change);

             // Uncommitted ops may not survive a view change so we must assert `op` against
             // `commit_max` and not `self.op`. However, committed ops (`commit_max`) must survive:
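The comment above carries the key invariant: the new `op` may regress past uncommitted ops, but never past `commit_max`. A worked check (hypothetical helper name, not the real function):

```zig
const std = @import("std");

// Uncommitted ops may be truncated by a view change, so the new `op` may
// fall below the replica's previous op, but never below commit_max:
// committed ops must survive.
fn assert_op_valid(op: u64, commit_max: u64) void {
    std.debug.assert(op >= commit_max);
}

test "op may regress past uncommitted ops but not past commit_max" {
    // Suppose the old op was 9 with commit_max = 7: ops 8 and 9 are uncommitted.
    assert_op_valid(8, 7); // ok: op 9 was truncated by the view change
    assert_op_valid(7, 7); // ok: all uncommitted ops were truncated
}
```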
@@ -5026,9 +5227,9 @@ pub fn ReplicaType(
         /// replica 0 doesn't have 6/7, then replica 1/2 must share the latest log_view. ∎)
         fn primary_set_log_from_do_view_change_messages(self: *Self) void {
             assert(self.status == .view_change);
+            assert(self.view > self.log_view);
             assert(self.primary_index(self.view) == self.replica);
             assert(self.replica_count > 1);
-            assert(self.start_view_change_quorum);
             assert(self.do_view_change_quorum);
             assert(self.do_view_change_from_all_replicas[self.replica] != null);
             DVCQuorum.verify(self.do_view_change_from_all_replicas);
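The added `assert(self.view > self.log_view)` makes the precondition explicit: the primary rebuilds its log from DVC messages only for a view newer than any view it has entered as normal. A tiny model of how `log_view` trails `view` (a hypothetical `ViewState`, not the replica's real fields):

```zig
const std = @import("std");
const assert = std.debug.assert;

// log_view trails view: it only catches up when the replica transitions
// to normal status in that view.
const ViewState = struct {
    view: u32 = 0,
    log_view: u32 = 0,

    fn start_view_change(self: *ViewState, view_new: u32) void {
        assert(view_new > self.view);
        self.view = view_new;
    }

    fn transition_to_normal(self: *ViewState) void {
        self.log_view = self.view;
    }
};

test "a view-change primary always has view > log_view" {
    var state = ViewState{};
    state.start_view_change(1);
    assert(state.view > state.log_view); // precondition of the fn above
    state.transition_to_normal();
    assert(state.view == state.log_view);
}
```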
@@ -5249,12 +5450,16 @@ pub fn ReplicaType(

                 assert(self.replica_count == 1);
                 assert(!self.prepare_timeout.ticking);
-                assert(!self.
+                assert(!self.primary_abdicate_timeout.ticking);
+                assert(!self.normal_heartbeat_timeout.ticking);
+                assert(!self.start_view_change_window_timeout.ticking);
                 assert(!self.view_change_status_timeout.ticking);
-                assert(!self.
+                assert(!self.do_view_change_message_timeout.ticking);
+                assert(!self.request_start_view_message_timeout.ticking);

                 self.ping_timeout.start();
-                self.
+                self.start_view_change_message_timeout.start();
+                self.commit_message_timeout.start();
                 self.repair_timeout.start();

                 self.pipeline.cache.deinit(self.message_bus.pool);
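These transitions follow a strict handshake: assert that a timeout is not ticking, then start it; stops are unconditional. A reduced sketch of the `start`/`stop`/`ticking` surface the code above relies on (an assumption-level reduction; the real `vsr.Timeout` also counts ticks and fires):

```zig
const std = @import("std");
const assert = std.debug.assert;

// Reduced model of the start/stop/ticking contract used above.
const Timeout = struct {
    ticking: bool = false,

    fn start(self: *Timeout) void {
        self.ticking = true;
    }

    fn stop(self: *Timeout) void {
        self.ticking = false;
    }
};

test "a transition asserts timeouts stopped before starting them" {
    var commit_message_timeout = Timeout{};
    var ping_timeout = Timeout{};

    // As in the single-replica transition above:
    assert(!commit_message_timeout.ticking);
    ping_timeout.start();
    commit_message_timeout.start();
    assert(ping_timeout.ticking and commit_message_timeout.ticking);
}
```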
@@ -5269,12 +5474,17 @@ pub fn ReplicaType(
                 );

                 assert(!self.prepare_timeout.ticking);
-                assert(!self.
+                assert(!self.primary_abdicate_timeout.ticking);
+                assert(!self.normal_heartbeat_timeout.ticking);
+                assert(!self.start_view_change_window_timeout.ticking);
+                assert(!self.commit_message_timeout.ticking);
                 assert(!self.view_change_status_timeout.ticking);
-                assert(!self.
+                assert(!self.do_view_change_message_timeout.ticking);
+                assert(!self.request_start_view_message_timeout.ticking);

                 self.ping_timeout.start();
-                self.
+                self.normal_heartbeat_timeout.start();
+                self.start_view_change_message_timeout.start();
                 self.repair_timeout.start();
             }
         }
@@ -5285,6 +5495,7 @@ pub fn ReplicaType(
             assert(self.status == .view_change);
             assert(view_new >= self.view);
             assert(self.journal.header_with_op(self.op) != null);
+            assert(!self.primary_abdicating);

             self.status = .normal;

@@ -5295,6 +5506,8 @@ pub fn ReplicaType(
                 );

                 assert(!self.prepare_timeout.ticking);
+                assert(!self.normal_heartbeat_timeout.ticking);
+                assert(!self.primary_abdicate_timeout.ticking);
                 assert(!self.pipeline_repairing);
                 assert(self.pipeline == .queue);
                 assert(self.view == view_new);
@@ -5308,14 +5521,19 @@ pub fn ReplicaType(
                 self.view_durable_update();

                 self.ping_timeout.start();
-                self.
-                self.
+                self.commit_message_timeout.start();
+                self.start_view_change_window_timeout.stop();
+                self.start_view_change_message_timeout.start();
                 self.view_change_status_timeout.stop();
-                self.
+                self.do_view_change_message_timeout.stop();
+                self.request_start_view_message_timeout.stop();
                 self.repair_timeout.start();

                 // Do not reset the pipeline as there may be uncommitted ops to drive to completion.
-                if (self.pipeline.queue.prepare_queue.count > 0)
+                if (self.pipeline.queue.prepare_queue.count > 0) {
+                    self.prepare_timeout.start();
+                    self.primary_abdicate_timeout.start();
+                }
             } else {
                 log.debug("{}: transition_to_normal_from_view_change_status: view={}..{} backup", .{
                     self.replica,
@@ -5324,6 +5542,9 @@ pub fn ReplicaType(
                 });

                 assert(!self.prepare_timeout.ticking);
+                assert(!self.normal_heartbeat_timeout.ticking);
+                assert(!self.primary_abdicate_timeout.ticking);
+                assert(self.request_start_view_message_timeout.ticking);
                 assert(self.pipeline == .cache);

                 if (self.log_view == view_new and self.view == view_new) {
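When the new primary inherits a non-empty prepare pipeline, it now arms `primary_abdicate_timeout` together with `prepare_timeout` — plausibly so that a primary which cannot replicate its inherited prepares eventually steps down rather than stalling. A sketch of the pairing (hypothetical reduction, not the real replica code):

```zig
const std = @import("std");

const Timeout = struct {
    ticking: bool = false,

    fn start(self: *Timeout) void {
        self.ticking = true;
    }
};

// Both timeouts are armed together iff the inherited pipeline is non-empty.
fn arm_primary_timeouts(
    prepare_queue_count: usize,
    prepare_timeout: *Timeout,
    primary_abdicate_timeout: *Timeout,
) void {
    if (prepare_queue_count > 0) {
        prepare_timeout.start();
        primary_abdicate_timeout.start();
    }
}

test "timeouts are armed only for a non-empty pipeline" {
    var prepare = Timeout{};
    var abdicate = Timeout{};

    arm_primary_timeouts(0, &prepare, &abdicate);
    try std.testing.expect(!prepare.ticking and !abdicate.ticking);

    arm_primary_timeouts(2, &prepare, &abdicate);
    try std.testing.expect(prepare.ticking and abdicate.ticking);
}
```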
@@ -5336,24 +5557,27 @@ pub fn ReplicaType(
                 }

                 self.ping_timeout.start();
-                self.
-                self.
+                self.commit_message_timeout.stop();
+                self.normal_heartbeat_timeout.start();
+                self.start_view_change_window_timeout.stop();
+                self.start_view_change_message_timeout.start();
                 self.view_change_status_timeout.stop();
-                self.
+                self.do_view_change_message_timeout.stop();
+                self.request_start_view_message_timeout.stop();
                 self.repair_timeout.start();
             }

+            self.heartbeat_timestamp = 0;
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();

-            assert(self.start_view_change_quorum == false);
             assert(self.do_view_change_quorum == false);
             assert(self.nack_prepare_op == null);
         }

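`heartbeat_timestamp` is zeroed on each transition, so the first heartbeat of the new view is always accepted as fresh. Assuming the field holds the newest primary-clock timestamp seen on a commit heartbeat, the de-duplication looks roughly like this (`heartbeat_fresh` is a hypothetical helper, not a replica method):

```zig
const std = @import("std");

// Accept a heartbeat only if its (primary-monotonic) timestamp is newer
// than the last one accepted; delayed or duplicate heartbeats are dropped.
fn heartbeat_fresh(heartbeat_timestamp: *u64, timestamp: u64) bool {
    if (timestamp <= heartbeat_timestamp.*) return false;
    heartbeat_timestamp.* = timestamp;
    return true;
}

test "stale and duplicate heartbeats are discarded" {
    var heartbeat_timestamp: u64 = 0;
    try std.testing.expect(heartbeat_fresh(&heartbeat_timestamp, 10));
    try std.testing.expect(!heartbeat_fresh(&heartbeat_timestamp, 10)); // duplicate
    try std.testing.expect(!heartbeat_fresh(&heartbeat_timestamp, 9)); // delayed
    try std.testing.expect(heartbeat_fresh(&heartbeat_timestamp, 11));
}
```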
         /// A replica i that notices the need for a view change advances its view, sets its status
-        /// to view_change, and sends a ⟨start_view_change v, i⟩ message to all the other replicas,
+        /// to view_change, and sends a ⟨do_view_change v, i⟩ message to all the other replicas,
         /// where v identifies the new view. A replica notices the need for a view change either
         /// based on its own timer, or because it receives a start_view_change or do_view_change
         /// message for a view with a larger number than its own view.
@@ -5390,30 +5614,40 @@ pub fn ReplicaType(
             }

             self.ping_timeout.stop();
-            self.
-            self.
+            self.commit_message_timeout.stop();
+            self.normal_heartbeat_timeout.stop();
+            self.start_view_change_window_timeout.stop();
+            self.start_view_change_message_timeout.start();
             self.view_change_status_timeout.start();
-            self.
+            self.do_view_change_message_timeout.start();
             self.repair_timeout.stop();
             self.prepare_timeout.stop();
+            self.primary_abdicate_timeout.stop();
+
+            if (self.primary_index(self.view) == self.replica) {
+                self.request_start_view_message_timeout.stop();
+            } else {
+                self.request_start_view_message_timeout.start();
+            }

             // Do not reset quorum counters only on entering a view, assuming that the view will be
             // followed only by a single subsequent view change to the next view, because multiple
             // successive view changes can fail, e.g. after a view change timeout.
             // We must therefore reset our counters here to avoid counting messages from an older
             // view, which would violate the quorum intersection property essential for correctness.
+            self.heartbeat_timestamp = 0;
+            self.primary_abdicating = false;
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();

-            assert(self.start_view_change_quorum == false);
             assert(self.do_view_change_quorum == false);
             assert(self.nack_prepare_op == null);

             if (self.log_view == self.view) {
                 assert(status_before == .recovering_head);
             } else {
-                self.
+                self.send_do_view_change();
             }
         }

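The comment in the hunk above is the heart of this change: quorum counters must be scoped to a single view, or a vote cast in view v could be counted toward view v+1, violating quorum intersection. A minimal model of that reset discipline (hypothetical `Quorum` type, not the replica's real counters):

```zig
const std = @import("std");
const assert = std.debug.assert;

// Votes are counted only for the view they were cast in; entering a new
// view resets the tally, as the transition above does for every quorum.
const Quorum = struct {
    view: u32 = 0,
    count: u8 = 0,

    fn reset(self: *Quorum, view_new: u32) void {
        assert(view_new > self.view);
        self.view = view_new;
        self.count = 0;
    }

    fn vote(self: *Quorum, view: u32) void {
        assert(view == self.view); // older-view votes must never be counted
        self.count += 1;
    }
};

test "entering a new view discards votes from the old view" {
    var quorum = Quorum{};
    quorum.vote(0);
    quorum.reset(1);
    try std.testing.expectEqual(@as(u8, 0), quorum.count);
}
```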
@@ -5590,6 +5824,7 @@ pub fn ReplicaType(
             }

             // TODO Debounce and decouple this from `on_message()` by moving into `tick()`:
+            // (Using request_start_view_message_timeout).
             log.debug("{}: view_jump: requesting start_view message", .{self.replica});
             self.send_header_to_replica(self.primary_index(header.view), .{
                 .command = .request_start_view,