tigerbeetle-node 0.11.5 → 0.11.6
- package/dist/.client.node.sha256 +1 -1
- package/dist/index.d.ts +41 -42
- package/dist/index.js +41 -42
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/src/index.ts +0 -1
- package/src/tigerbeetle/scripts/benchmark.bat +6 -1
- package/src/tigerbeetle/scripts/benchmark.sh +1 -1
- package/src/tigerbeetle/src/c/tb_client.h +42 -43
- package/src/tigerbeetle/src/cli.zig +32 -8
- package/src/tigerbeetle/src/config.zig +24 -3
- package/src/tigerbeetle/src/constants.zig +8 -5
- package/src/tigerbeetle/src/lsm/compaction.zig +9 -43
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +2 -7
- package/src/tigerbeetle/src/lsm/groove.zig +3 -0
- package/src/tigerbeetle/src/lsm/manifest_level.zig +1 -0
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +17 -9
- package/src/tigerbeetle/src/lsm/merge_iterator.zig +106 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +1 -0
- package/src/tigerbeetle/src/lsm/segmented_array.zig +1 -0
- package/src/tigerbeetle/src/lsm/table.zig +14 -0
- package/src/tigerbeetle/src/lsm/table_iterator.zig +2 -2
- package/src/tigerbeetle/src/lsm/table_mutable.zig +49 -15
- package/src/tigerbeetle/src/lsm/test.zig +8 -4
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +302 -263
- package/src/tigerbeetle/src/main.zig +22 -25
- package/src/tigerbeetle/src/message_pool.zig +2 -1
- package/src/tigerbeetle/src/simulator.zig +22 -79
- package/src/tigerbeetle/src/{test/accounting → state_machine}/auditor.zig +8 -8
- package/src/tigerbeetle/src/{test/accounting → state_machine}/workload.zig +108 -48
- package/src/tigerbeetle/src/state_machine.zig +20 -14
- package/src/tigerbeetle/src/test/cluster.zig +11 -11
- package/src/tigerbeetle/src/test/conductor.zig +2 -3
- package/src/tigerbeetle/src/test/id.zig +10 -0
- package/src/tigerbeetle/src/test/state_machine.zig +151 -46
- package/src/tigerbeetle/src/tigerbeetle.zig +0 -1
- package/src/tigerbeetle/src/unit_tests.zig +2 -2
- package/src/tigerbeetle/src/vsr/client.zig +5 -5
- package/src/tigerbeetle/src/vsr/clock.zig +2 -2
- package/src/tigerbeetle/src/vsr/journal.zig +537 -487
- package/src/tigerbeetle/src/vsr/replica.zig +324 -314
- package/src/tigerbeetle/src/vsr/replica_format.zig +7 -4
- package/src/tigerbeetle/src/vsr/superblock.zig +76 -31
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +10 -5
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +3 -3
- package/src/tigerbeetle/src/vsr.zig +5 -5
package/src/tigerbeetle/src/vsr/replica.zig:

@@ -26,22 +26,23 @@ pub const Status = enum {
 view_change,
 // Recovery (for replica_count > 1):
 //
-// 1.
-//
-//
-//
+// 1. Open the replica:
+// a. At replica start: `status=recovering`.
+// b. Recover the WAL. Mark questionable entries as faulty.
+// c. If the WAL has no entries (besides the initial commit), skip to step 3 with view 0.
+// 2. Run VSR recovery protocol:
 // a. Send a `recovery` message to every replica (except self).
 // b. Wait for f+1 `recovery_response` messages from replicas in `normal` status.
 // Each `recovery_response` includes the current view number.
 // Each `recovery_response` must include a nonce matching the `recovery` message.
-// c. Wait for a `recovery_response` from the
-//
-// * Set `op` to the highest op in the
+// c. Wait for a `recovery_response` from the primary of the highest known view.
+// 3. Transition to `status=normal` with the discovered view number:
+// * Set `op` to the highest op in the primary's recovery response.
 // * Repair faulty messages.
 // * Commit through to the discovered `commit_max`.
 // * Set `state_machine.prepare_timeout` to the current op's timestamp.
 //
-// TODO
+// TODO Document state transfer in this progression.
 recovering,
 };
 
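The progression added above boils down to: recover the WAL, then either skip straight to normal status (empty WAL) or stay in recovering until the VSR recovery protocol completes. A minimal sketch of that decision, with illustrative names rather than TigerBeetle's API:

```zig
const std = @import("std");

const Status = enum { normal, view_change, recovering };

// Step 1c above: an empty WAL (besides the initial commit) skips straight to step 3 with view 0.
// Otherwise the replica stays in .recovering until the recovery protocol (step 2) completes.
fn status_after_wal_recovery(wal_empty: bool) Status {
    if (wal_empty) return .normal;
    return .recovering;
}

test "empty WAL skips the VSR recovery protocol" {
    try std.testing.expect(status_after_wal_recovery(true) == .normal);
    try std.testing.expect(status_after_wal_recovery(false) == .recovering);
}
```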
@@ -66,7 +67,7 @@ const quorum_counter_null = QuorumCounter.initEmpty();
 
 // CRITICAL: The number of prepare headers to include in the body:
 // We must provide enough headers to cover all uncommitted headers so that the new
-//
+// primary (if we are in a view change) can decide whether to discard uncommitted headers
 // that cannot be repaired because they are gaps, and this must be relative to the
 // cluster as a whole (not relative to the difference between our op and commit number)
 // as otherwise we would break correctness.
@@ -91,7 +92,7 @@ pub fn ReplicaType(
 return struct {
 const Self = @This();
 
-const Journal = vsr.
+const Journal = vsr.JournalType(Self, Storage);
 const Clock = vsr.Clock(Time);
 
 /// We use this allocator during open/init and then disable it.
@@ -115,7 +116,7 @@ pub fn ReplicaType(
 
 time: Time,
 
-/// A distributed fault-tolerant clock for lower and upper bounds on the
+/// A distributed fault-tolerant clock for lower and upper bounds on the primary's wall clock:
 clock: Clock,
 
 /// The persistent log of hash-chained journal entries:
@@ -192,11 +193,11 @@ pub fn ReplicaType(
 /// Whether we are reading a prepare from storage in order to push to the pipeline.
 repairing_pipeline: bool = false,
 
-/// The
+/// The primary's pipeline of inflight prepares waiting to commit in FIFO order.
 /// This allows us to pipeline without the complexity of out-of-order commits.
 ///
-/// After a view change, the old
-/// help the new
+/// After a view change, the old primary's pipeline is left untouched so that it is able to
+/// help the new primary repair, even in the face of local storage faults.
 pipeline: RingBuffer(Prepare, constants.pipeline_max, .array) = .{},
 
 /// In some cases, a replica may send a message to itself. We do not submit these messages
@@ -219,27 +220,27 @@ pub fn ReplicaType(
 /// Whether a replica has received a quorum of start_view_change messages for the view change:
 start_view_change_quorum: bool = false,
 
-/// Whether the
-/// Determines whether the
+/// Whether the primary has received a quorum of do_view_change messages for the view change:
+/// Determines whether the primary may effect repairs according to the CTRL protocol.
 do_view_change_quorum: bool = false,
 
-/// Whether the
+/// Whether the primary is expecting to receive a nack_prepare and for which op:
 nack_prepare_op: ?u64 = null,
 
-/// The number of ticks before a
+/// The number of ticks before a primary or backup broadcasts a ping to other replicas.
 /// TODO Explain why we need this (MessageBus handshaking, leapfrogging faulty replicas,
 /// deciding whether starting a view change would be detrimental under some network partitions).
 ping_timeout: Timeout,
 
-/// The number of ticks without enough prepare_ok's before the
+/// The number of ticks without enough prepare_ok's before the primary resends a prepare.
 prepare_timeout: Timeout,
 
-/// The number of ticks before the
-/// The
+/// The number of ticks before the primary sends a commit heartbeat:
+/// The primary always sends a commit heartbeat irrespective of when it last sent a prepare.
 /// This improves liveness when prepare messages cannot be replicated fully due to partitions.
 commit_timeout: Timeout,
 
-/// The number of ticks without hearing from the
+/// The number of ticks without hearing from the primary before starting a view change.
 /// This transitions from .normal status to .view_change status.
 normal_status_timeout: Timeout,
 
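All four timeouts above are counted in logical ticks and are driven from tick(). A minimal sketch of how such a tick-counted timeout could behave (an assumed shape, not TigerBeetle's actual Timeout type):

```zig
const std = @import("std");

// Hypothetical tick-counted timeout, as used by ping_timeout, prepare_timeout,
// commit_timeout and normal_status_timeout above.
const Timeout = struct {
    after: u64,
    ticks: u64 = 0,
    ticking: bool = false,

    fn start(self: *Timeout) void {
        self.ticks = 0;
        self.ticking = true;
    }

    fn tick(self: *Timeout) void {
        if (self.ticking) self.ticks += 1;
    }

    fn fired(self: *const Timeout) bool {
        return self.ticking and self.ticks >= self.after;
    }

    fn reset(self: *Timeout) void {
        self.ticks = 0;
    }
};

test "timeout fires after `after` ticks and can be reset" {
    var t = Timeout{ .after = 3 };
    t.start();
    var i: u64 = 0;
    while (i < 3) : (i += 1) t.tick();
    try std.testing.expect(t.fired());
    t.reset();
    try std.testing.expect(!t.fired());
}
```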
@@ -290,6 +291,7 @@ pub fn ReplicaType(
 
 const OpenOptions = struct {
 replica_count: u8,
+storage_size_limit: u64,
 storage: *Storage,
 message_pool: *MessagePool,
 time: Time,
@@ -299,13 +301,19 @@ pub fn ReplicaType(
 
 /// Initializes and opens the provided replica using the options.
 pub fn open(self: *Self, parent_allocator: std.mem.Allocator, options: OpenOptions) !void {
+assert(options.storage_size_limit <= constants.storage_size_max);
+assert(options.storage_size_limit % constants.sector_size == 0);
+
 self.static_allocator = StaticAllocator.init(parent_allocator);
 const allocator = self.static_allocator.allocator();
 
 self.superblock = try SuperBlock.init(
 allocator,
-
-
+.{
+.storage = options.storage,
+.message_pool = options.message_pool,
+.storage_size_limit = options.storage_size_limit,
+},
 );
 
 // Once initialzed, the replica is in charge of calling superblock.deinit()
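open() now asserts two properties of the new options.storage_size_limit: it may not exceed constants.storage_size_max, and it must be a multiple of constants.sector_size. An illustrative check with placeholder constants (not the package's real values):

```zig
const std = @import("std");

// Placeholder values for illustration only.
const sector_size: u64 = 4096;
const storage_size_max: u64 = 16 * 1024 * 1024 * 1024;

// Mirrors the two assertions at the top of open().
fn storage_size_limit_valid(limit: u64) bool {
    return limit <= storage_size_max and limit % sector_size == 0;
}

test "storage_size_limit must be sector-aligned and within the maximum" {
    try std.testing.expect(storage_size_limit_valid(8 * sector_size));
    try std.testing.expect(!storage_size_limit_valid(sector_size + 1)); // not sector-aligned
}
```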
@@ -326,7 +334,7 @@ pub fn ReplicaType(
 return error.NoAddress;
 }
 
-//
+// Initialize the replica:
 try self.init(allocator, .{
 .cluster = self.superblock.working.cluster,
 .replica_index = self.superblock.working.replica,
@@ -351,6 +359,30 @@ pub fn ReplicaType(
 self.grid.tick();
 self.superblock.storage.tick();
 }
+
+self.opened = false;
+self.journal.recover(journal_recover_callback);
+while (!self.opened) self.superblock.storage.tick();
+
+if (self.journal.is_empty()) {
+// The data file is brand new — no messages have ever been written.
+// Transition to normal status; no need to run the VSR recovery protocol.
+assert(self.journal.dirty.count == 0);
+assert(self.journal.faulty.count == 0);
+assert(self.commit_min == 0);
+assert(self.commit_max == 0);
+assert(self.op_checkpoint == 0);
+assert(self.op == 0);
+assert(self.view == 0);
+
+log.debug("{}: open: empty data file", .{self.replica});
+self.transition_to_normal_from_recovering_status(0);
+assert(self.status == .normal);
+} else if (self.replica_count == 1) {
+if (self.journal.faulty.count != 0) @panic("journal is corrupt");
+} else {
+assert(self.status == .recovering);
+}
 }
 
 fn superblock_open_callback(superblock_context: *SuperBlock.Context) void {
@@ -365,6 +397,12 @@ pub fn ReplicaType(
 self.opened = true;
 }
 
+fn journal_recover_callback(journal: *Journal) void {
+const self = @fieldParentPtr(Self, "journal", journal);
+assert(!self.opened);
+self.opened = true;
+}
+
 const Options = struct {
 cluster: u32,
 replica_count: u8,
@@ -592,46 +630,27 @@ pub fn ReplicaType(
 /// Time is measured in logical ticks that are incremented on every call to tick().
 /// This eliminates a dependency on the system time and enables deterministic testing.
 pub fn tick(self: *Self) void {
+assert(self.opened);
 // Ensure that all asynchronous IO callbacks flushed the loopback queue as needed.
 // If an IO callback queues a loopback message without flushing the queue then this will
-// delay the delivery of messages (e.g. a prepare_ok from the
+// delay the delivery of messages (e.g. a prepare_ok from the primary to itself) and
 // decrease throughput significantly.
 assert(self.loopback_queue == null);
 
 // TODO Replica owns Time; should it tick() here instead of Clock?
 self.clock.tick();
-
-// Storage/IO is ticked by top-level in case of multiple replicas sharing the same IO.
-// self.journal.storage.tick();
-
 self.grid.tick();
 self.message_bus.tick();
 
-switch (self.journal.status) {
-.init => return self.journal.recover(),
-.recovering => return,
-.recovered => {},
-}
-
 if (self.status == .recovering) {
 if (self.recovery_timeout.ticking) {
 // Continue running the VSR recovery protocol.
 self.recovery_timeout.tick();
 if (self.recovery_timeout.fired()) self.on_recovery_timeout();
-} else if (self.journal.is_empty()) {
-// The data file is brand new — no messages have ever been written.
-// Transition to normal status; no need to run the VSR recovery protocol.
-assert(self.journal.faulty.count == 0);
-assert(self.commit_min == 0);
-assert(self.commit_max == 0);
-assert(self.op_checkpoint == 0);
-assert(self.op == 0);
-self.transition_to_normal_from_recovering_status(0);
-assert(self.status == .normal);
 } else if (self.replica_count == 1) {
 // A cluster-of-one does not run the VSR recovery protocol.
-if (self.journal.faulty.count != 0) @panic("journal is corrupt");
 if (self.committing) return;
+assert(self.journal.faulty.count == 0);
 assert(self.op == 0);
 // TODO Assert that this path isn't taken more than once.
 self.op = self.journal.op_maximum();
@@ -677,6 +696,7 @@ pub fn ReplicaType(
 }
 
 pub fn on_message(self: *Self, message: *Message) void {
+assert(self.opened);
 assert(self.loopback_queue == null);
 assert(message.references > 0);
 
@@ -704,11 +724,6 @@ pub fn ReplicaType(
 return;
 }
 
-if (self.journal.status != .recovered) {
-log.debug("{}: on_message: waiting for journal to recover", .{self.replica});
-return;
-}
-
 assert(message.header.replica < self.replica_count);
 switch (message.header.command) {
 .ping => self.on_ping(message),
@@ -811,7 +826,7 @@ pub fn ReplicaType(
 if (self.ignore_request_message(message)) return;
 
 assert(self.status == .normal);
-assert(self.
+assert(self.primary());
 assert(self.commit_min == self.commit_max);
 assert(self.commit_max + self.pipeline.count == self.op);
 
@@ -828,7 +843,7 @@ pub fn ReplicaType(
 // Guard against the wall clock going backwards by taking the max with timestamps issued:
 self.state_machine.prepare_timestamp = std.math.max(
 // The cluster `commit_timestamp` may be ahead of our `prepare_timestamp` because this
-// may be our first prepare as a recently elected
+// may be our first prepare as a recently elected primary:
 std.math.max(
 self.state_machine.prepare_timestamp,
 self.state_machine.commit_timestamp,
@@ -873,27 +888,27 @@ pub fn ReplicaType(
 
 self.on_prepare(message);
 
-// We expect `on_prepare()` to increment `self.op` to match the
+// We expect `on_prepare()` to increment `self.op` to match the primary's latest prepare:
 // This is critical to ensure that pipelined prepares do not receive the same op number.
 assert(self.op == message.header.op);
 }
 
-/// Replication is simple, with a single code path for the
+/// Replication is simple, with a single code path for the primary and backups.
 ///
-/// The
+/// The primary starts by sending a prepare message to itself.
 ///
-/// Each replica (including the
+/// Each replica (including the primary) then forwards this prepare message to the next
 /// replica in the configuration, in parallel to writing to its own journal, closing the
-/// circle until the next replica is back to the
+/// circle until the next replica is back to the primary, in which case the replica does not
 /// forward.
 ///
-/// This keeps the
-/// since the
-/// need to replicate to multiple
+/// This keeps the primary's outgoing bandwidth limited (one-for-one) to incoming bandwidth,
+/// since the primary need only replicate to the next replica. Otherwise, the primary would
+/// need to replicate to multiple backups, dividing available bandwidth.
 ///
 /// This does not impact latency, since with Flexible Paxos we need only one remote
 /// prepare_ok. It is ideal if this synchronous replication to one remote replica is to the
-/// next replica, since that is the replica next in line to be
+/// next replica, since that is the replica next in line to be primary, which will need to
 /// be up-to-date before it can start the next view.
 ///
 /// At the same time, asynchronous replication keeps going, so that if our local disk is
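The chained replication described above forwards each prepare around the ring and stops when the next replica would be the primary again. A sketch of that ring arithmetic, using hypothetical helper names (the real logic lives in replicate()):

```zig
const std = @import("std");

// The next replica in the configuration, wrapping around the ring.
fn next_in_ring(replica: u8, replica_count: u8) u8 {
    return (replica + 1) % replica_count;
}

// A replica forwards the prepare unless the next replica in the ring is the primary.
fn forwards(replica: u8, replica_count: u8, primary: u8) bool {
    return next_in_ring(replica, replica_count) != primary;
}

test "the ring closes at the primary" {
    // With 3 replicas and replica 0 as primary: 0 -> 1 -> 2, and 2 does not forward back to 0.
    try std.testing.expect(forwards(0, 3, 0));
    try std.testing.expect(forwards(1, 3, 0));
    try std.testing.expect(!forwards(2, 3, 0));
}
```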
@@ -901,8 +916,8 @@ pub fn ReplicaType(
 /// come in. This gives automatic tail latency tolerance for storage latency spikes.
 ///
 /// The remaining problem then is tail latency tolerance for network latency spikes.
-/// If the next replica is down or partitioned, then the
-/// and the
+/// If the next replica is down or partitioned, then the primary's prepare timeout will fire,
+/// and the primary will resend but to another replica, until it receives enough prepare_ok's.
 fn on_prepare(self: *Self, message: *Message) void {
 self.view_jump(message.header);
 
@@ -934,21 +949,21 @@ pub fn ReplicaType(
 message.header.op,
 self.op_checkpoint,
 });
-// When we are the
-assert(self.
+// When we are the primary, `on_request` enforces this invariant.
+assert(self.backup());
 return;
 }
 
 assert(self.status == .normal);
 assert(message.header.view == self.view);
-assert(self.
-assert(message.header.replica == self.
+assert(self.primary() or self.backup());
+assert(message.header.replica == self.primary_index(message.header.view));
 assert(message.header.op > self.op_checkpoint);
 assert(message.header.op > self.op);
 assert(message.header.op > self.commit_min);
 assert(message.header.op <= self.op_checkpoint_trigger());
 
-if (self.
+if (self.backup()) self.normal_status_timeout.reset();
 
 if (message.header.op > self.op + 1) {
 log.debug("{}: on_prepare: newer op", .{self.replica});
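Several of the assertions above compare message.header.replica against primary_index(view). In Viewstamped Replication the primary rotates round-robin with the view number; a sketch of that mapping (illustrative only, not the replica's actual helper):

```zig
const std = @import("std");

// The primary of a view is chosen round-robin across the configuration.
fn primary_index(view: u32, replica_count: u32) u32 {
    return view % replica_count;
}

test "the primary rotates with the view" {
    try std.testing.expect(primary_index(0, 3) == 0);
    try std.testing.expect(primary_index(1, 3) == 1);
    try std.testing.expect(primary_index(4, 3) == 1);
}
```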
@@ -964,7 +979,7 @@ pub fn ReplicaType(
 }
 
 // We must advance our op and set the header as dirty before replicating and journalling.
-// The
+// The primary needs this before its journal is outrun by any prepare_ok quorum:
 log.debug("{}: on_prepare: advancing: op={}..{} checksum={}..{}", .{
 self.replica,
 self.op,
@@ -979,7 +994,7 @@ pub fn ReplicaType(
 self.replicate(message);
 self.append(message);
 
-if (self.
+if (self.backup()) {
 // A prepare may already be committed if requested by repair() so take the max:
 self.commit_journal(std.math.max(message.header.commit, self.commit_max));
 assert(self.commit_max >= message.header.commit);
@@ -991,7 +1006,7 @@ pub fn ReplicaType(
 
 assert(self.status == .normal);
 assert(message.header.view == self.view);
-assert(self.
+assert(self.primary());
 
 const prepare = self.pipeline_prepare_for_prepare_ok(message) orelse return;
 
@@ -1029,9 +1044,9 @@ pub fn ReplicaType(
 }
 
 /// Known issue:
-/// TODO The
-/// It's possible for the network to be one-way partitioned so that
-///
+/// TODO The primary should stand down if it sees too many retries in on_prepare_timeout().
+/// It's possible for the network to be one-way partitioned so that backups don't see the
+/// primary as down, but neither can the primary hear from the backups.
 fn on_commit(self: *Self, message: *const Message) void {
 self.view_jump(message.header);
 
@@ -1050,15 +1065,15 @@ pub fn ReplicaType(
 return;
 }
 
-if (self.
-log.warn("{}: on_commit: ignoring (
+if (self.primary()) {
+log.warn("{}: on_commit: ignoring (primary)", .{self.replica});
 return;
 }
 
 assert(self.status == .normal);
-assert(self.
+assert(self.backup());
 assert(message.header.view == self.view);
-assert(message.header.replica == self.
+assert(message.header.replica == self.primary_index(message.header.view));
 
 // We may not always have the latest commit entry but if we do our checksum must match:
 if (self.journal.header_with_op(message.header.commit)) |commit_entry| {
@@ -1094,8 +1109,8 @@ pub fn ReplicaType(
 return;
 }
 
-if (self.status == .view_change and self.
-log.debug("{}: on_repair: ignoring (view change,
+if (self.status == .view_change and self.primary_index(self.view) != self.replica) {
+log.debug("{}: on_repair: ignoring (view change, backup)", .{self.replica});
 return;
 }
 
@@ -1199,7 +1214,7 @@ pub fn ReplicaType(
 ///
 /// * The headers must all belong to the same hash chain. (Gaps are allowed).
 /// (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
-/// loaded into the new
+/// loaded into the new primary with `replace_header()`, not `repair_header()`).
 ///
 /// Across all DVCs in the quorum:
 ///
@@ -1207,7 +1222,7 @@ pub fn ReplicaType(
 /// dvc₁.headers[i].op == dvc₂.headers[j].op implies
 /// dvc₁.headers[i].checksum == dvc₂.headers[j].checksum.
 /// (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
-/// loaded into the new
+/// loaded into the new primary with `replace_header()`, not `repair_header()`).
 ///
 /// Perhaps unintuitively, it is safe to advertise a header before its message is prepared
 /// (e.g. the write is still queued). The header is either:
@@ -1221,7 +1236,7 @@ pub fn ReplicaType(
 
 assert(self.status == .normal or self.status == .view_change);
 assert(message.header.view >= self.view);
-assert(self.
+assert(self.primary_index(message.header.view) == self.replica);
 
 self.view_jump(message.header);
 
@@ -1259,7 +1274,7 @@ pub fn ReplicaType(
 assert(!self.do_view_change_quorum);
 self.do_view_change_quorum = true;
 
-self.
+self.primary_set_log_from_do_view_change_messages();
 assert(self.op >= self.commit_max);
 assert(self.state_machine.prepare_timestamp >=
 self.journal.header_with_op(self.op).?.timestamp);
@@ -1286,14 +1301,14 @@ pub fn ReplicaType(
 // precluding recovery.
 //
 // TODO State transfer. Currently this is unreachable because the
-//
+// primary won't checkpoint until all replicas are caught up.
 unreachable;
 }
 
 assert(self.status == .view_change or self.status == .normal);
 assert(message.header.view >= self.view);
 assert(message.header.replica != self.replica);
-assert(message.header.replica == self.
+assert(message.header.replica == self.primary_index(message.header.view));
 
 self.view_jump(message.header);
 
@@ -1313,7 +1328,7 @@ pub fn ReplicaType(
 
 assert(self.status == .normal);
 assert(message.header.view == self.view);
-assert(self.
+assert(self.backup());
 
 self.commit_journal(self.commit_max);
 
@@ -1326,7 +1341,7 @@ pub fn ReplicaType(
 assert(self.status == .normal);
 assert(message.header.view == self.view);
 assert(message.header.replica != self.replica);
-assert(self.
+assert(self.primary());
 
 const start_view = self.create_view_change_message(.start_view);
 defer self.message_bus.unref(start_view);
@@ -1380,13 +1395,13 @@ pub fn ReplicaType(
 // replica_count 3
 // do_view_change.headers.len 3 (= pipeline_max)
 // recovery_response.headers.len 2 (!)
-// replica 0 log 3, 4a, 5a, 6a, 7a, 8a (status=normal,
-// replica 1 log 3, 4a, 5a, --, --, -- (status=normal,
+// replica 0 log 3, 4a, 5a, 6a, 7a, 8a (status=normal, primary)
+// replica 1 log 3, 4a, 5a, --, --, -- (status=normal, backup)
 // replica 2 log 3, 4b, 5b, --, --, -- (status=recovering)
 //
 // 1. Replica 2 receives a recovery_response quorum.
 // 2. Replica 2 sets `replica.op` to 8a.
-// 3. Replica 2 sets its headers from the
+// 3. Replica 2 sets its headers from the primary's recovery_response (8a, 7a)
 // (via `replace_header()`).
 // 4. Replica 2 transitions to status=normal.
 // 5. Replica 0 fails (before replica 2 has a chance to repair its hash chain.)
@@ -1447,9 +1462,6 @@ pub fn ReplicaType(
 return;
 }
 
-// Recovery messages with our nonce are not sent until after the journal is recovered.
-assert(self.journal.status == .recovered);
-
 var responses: *QuorumMessages = &self.recovery_response_from_other_replicas;
 if (responses[message.header.replica]) |existing| {
 assert(message.header.replica == existing.header.replica);
@@ -1520,7 +1532,7 @@ pub fn ReplicaType(
 
 // Wait until we have:
 // * at least `f + 1` messages for quorum (not including ourself), and
-// * a response from the
+// * a response from the primary of the highest discovered view.
 const count = self.count_quorum(responses, .recovery_response, self.recovery_nonce);
 assert(count <= self.replica_count - 1);
 
@@ -1548,10 +1560,10 @@ pub fn ReplicaType(
 break :blk view;
 };
 
-const
-if (
+const primary_response = responses[self.primary_index(view)];
+if (primary_response == null) {
 log.debug(
-"{}: on_recovery_response: ignoring (awaiting response from
+"{}: on_recovery_response: ignoring (awaiting response from primary of view={})",
 .{
 self.replica,
 view,
@@ -1560,14 +1572,14 @@ pub fn ReplicaType(
 return;
 }
 
-if (
-// The
+if (primary_response.?.header.view != view) {
+// The primary (according to the view quorum) isn't the primary (according to itself).
 // The `recovery_timeout` will retry shortly with another round.
 log.debug(
-"{}: on_recovery_response: ignoring (
+"{}: on_recovery_response: ignoring (primary view={} != quorum view={})",
 .{
 self.replica,
-
+primary_response.?.header.view,
 view,
 },
 );
@@ -1578,16 +1590,16 @@ pub fn ReplicaType(
 // All further `recovery_response` messages are ignored.
 
 // TODO When the view is recovered from the superblock (instead of via the VSR recovery
-// protocol), if the view number indicates that this replica is a
+// protocol), if the view number indicates that this replica is a primary, it must
 // transition to status=view_change instead of status=normal.
 
-const
-assert(
+const primary_headers = message_body_as_headers(primary_response.?);
+assert(primary_headers.len > 0);
 
-const commit =
+const commit = primary_response.?.header.commit;
 {
-const op = op_highest(
-assert(op ==
+const op = op_highest(primary_headers);
+assert(op == primary_response.?.header.op);
 
 self.set_op_and_commit_max(op, commit, "on_recovery_response");
 
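on_recovery_response() above takes the highest op among the headers in the primary's recovery_response. A self-contained sketch of that op_highest step (Header here is a stand-in struct, not vsr.Header):

```zig
const std = @import("std");

const Header = struct { op: u64 };

// Returns the maximum op across a non-empty slice of headers.
fn op_highest(headers: []const Header) u64 {
    std.debug.assert(headers.len > 0);
    var max: u64 = 0;
    for (headers) |header| {
        if (header.op > max) max = header.op;
    }
    return max;
}

test "op_highest returns the maximum op" {
    const headers = [_]Header{ .{ .op = 7 }, .{ .op = 9 }, .{ .op = 8 } };
    try std.testing.expect(op_highest(&headers) == 9);
}
```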
@@ -1596,7 +1608,7 @@ pub fn ReplicaType(
|
|
|
1596
1608
|
// use the hash chain to figure out which headers to request. Maybe include our
|
|
1597
1609
|
// `op_checkpoint` in the recovery (request) message so that the response can give
|
|
1598
1610
|
// more useful (i.e. older) headers.
|
|
1599
|
-
self.replace_headers(
|
|
1611
|
+
self.replace_headers(primary_headers);
|
|
1600
1612
|
|
|
1601
1613
|
if (self.op < constants.journal_slot_count) {
|
|
1602
1614
|
if (self.journal.header_with_op(0)) |header| {
|
|
@@ -1622,15 +1634,15 @@ pub fn ReplicaType(
|
|
|
1622
1634
|
assert(self.status == .recovering);
|
|
1623
1635
|
self.transition_to_normal_from_recovering_status(view);
|
|
1624
1636
|
assert(self.status == .normal);
|
|
1625
|
-
assert(self.
|
|
1637
|
+
assert(self.backup());
|
|
1626
1638
|
|
|
1627
1639
|
log.info("{}: on_recovery_response: recovery done responses={} view={} headers={}..{}" ++
|
|
1628
1640
|
" commit={} dirty={} faulty={}", .{
|
|
1629
1641
|
self.replica,
|
|
1630
1642
|
count,
|
|
1631
1643
|
view,
|
|
1632
|
-
|
|
1633
|
-
|
|
1644
|
+
primary_headers[primary_headers.len - 1].op,
|
|
1645
|
+
primary_headers[0].op,
|
|
1634
1646
|
commit,
|
|
1635
1647
|
self.journal.dirty.count,
|
|
1636
1648
|
self.journal.faulty.count,
|
|
@@ -1669,8 +1681,8 @@ pub fn ReplicaType(
|
|
|
1669
1681
|
else => unreachable,
|
|
1670
1682
|
};
|
|
1671
1683
|
|
|
1672
|
-
// Only the
|
|
1673
|
-
assert(checksum != null or self.
|
|
1684
|
+
// Only the primary may respond to `request_prepare` messages without a checksum.
|
|
1685
|
+
assert(checksum != null or self.primary_index(self.view) == self.replica);
|
|
1674
1686
|
|
|
1675
1687
|
// Try to serve the message directly from the pipeline.
|
|
1676
1688
|
// This saves us from going to disk. And we don't need to worry that the WAL's copy
|
|
@@ -1741,7 +1753,7 @@ pub fn ReplicaType(
|
|
|
1741
1753
|
// they should also be in view change status, waiting for the new primary to start
|
|
1742
1754
|
// the view.
|
|
1743
1755
|
if (self.status == .view_change) {
|
|
1744
|
-
assert(message.header.replica == self.
|
|
1756
|
+
assert(message.header.replica == self.primary_index(self.view));
|
|
1745
1757
|
assert(checksum != null);
|
|
1746
1758
|
|
|
1747
1759
|
if (self.journal.header_with_op_and_checksum(op, checksum)) |_| {
|
|
@@ -1834,7 +1846,7 @@ pub fn ReplicaType(
|
|
|
1834
1846
|
assert(self.status == .view_change);
|
|
1835
1847
|
assert(message.header.view == self.view);
|
|
1836
1848
|
assert(message.header.replica != self.replica);
|
|
1837
|
-
assert(self.
|
|
1849
|
+
assert(self.primary_index(self.view) == self.replica);
|
|
1838
1850
|
assert(self.do_view_change_quorum);
|
|
1839
1851
|
assert(self.repairs_allowed());
|
|
1840
1852
|
|
|
@@ -1859,7 +1871,7 @@ pub fn ReplicaType(
|
|
|
1859
1871
|
return;
|
|
1860
1872
|
}
|
|
1861
1873
|
|
|
1862
|
-
//
|
|
1874
|
+
// backups may not send a `nack_prepare` for a different checksum:
|
|
1863
1875
|
// However our op may change in between sending the request and getting the nack.
|
|
1864
1876
|
assert(message.header.op == op);
|
|
1865
1877
|
assert(message.header.context == checksum);
|
|
@@ -1918,7 +1930,7 @@ pub fn ReplicaType(
|
|
|
1918
1930
|
assert(!self.nack_prepare_from_other_replicas.isSet(self.replica));
|
|
1919
1931
|
log.debug("{}: on_nack_prepare: quorum received op={}", .{ self.replica, op });
|
|
1920
1932
|
|
|
1921
|
-
self.
|
|
1933
|
+
self.primary_discard_uncommitted_ops_from(op, checksum);
|
|
1922
1934
|
self.reset_quorum_nack_prepare();
|
|
1923
1935
|
self.repair();
|
|
1924
1936
|
}
|
|
@@ -1950,7 +1962,7 @@ pub fn ReplicaType(
|
|
|
1950
1962
|
|
|
1951
1963
|
// TODO We may want to ping for connectivity during a view change.
|
|
1952
1964
|
assert(self.status == .normal);
|
|
1953
|
-
assert(self.
|
|
1965
|
+
assert(self.primary() or self.backup());
|
|
1954
1966
|
|
|
1955
1967
|
var ping = Header{
|
|
1956
1968
|
.command = .ping,
|
|
@@ -1966,7 +1978,7 @@ pub fn ReplicaType(
|
|
|
1966
1978
|
fn on_prepare_timeout(self: *Self) void {
|
|
1967
1979
|
// We will decide below whether to reset or backoff the timeout.
|
|
1968
1980
|
assert(self.status == .normal);
|
|
1969
|
-
assert(self.
|
|
1981
|
+
assert(self.primary());
|
|
1970
1982
|
|
|
1971
1983
|
const prepare = self.pipeline.head_ptr().?;
|
|
1972
1984
|
assert(prepare.message.header.command == .prepare);
|
|
@@ -2017,7 +2029,7 @@ pub fn ReplicaType(
|
|
|
2017
2029
|
// We may even have maxed out our IO depth and been unable to initiate the write,
|
|
2018
2030
|
// which can happen if `constants.pipeline_max` exceeds `constants.journal_iops_write_max`.
|
|
2019
2031
|
// This can lead to deadlock for a cluster of one or two (if we do not retry here),
|
|
2020
|
-
// since there is no other way for the
|
|
2032
|
+
// since there is no other way for the primary to repair the dirty op because no
|
|
2021
2033
|
// other replica has it.
|
|
2022
2034
|
//
|
|
2023
2035
|
// Retry the write through `on_repair()` which will work out which is which.
|
|
@@ -2056,7 +2068,7 @@ pub fn ReplicaType(
|
|
|
2056
2068
|
self.commit_timeout.reset();
|
|
2057
2069
|
|
|
2058
2070
|
assert(self.status == .normal);
|
|
2059
|
-
assert(self.
|
|
2071
|
+
assert(self.primary());
|
|
2060
2072
|
assert(self.commit_min == self.commit_max);
|
|
2061
2073
|
|
|
2062
2074
|
// TODO Snapshots: Use snapshot checksum if commit is no longer in journal.
|
|
@@ -2074,7 +2086,7 @@ pub fn ReplicaType(
|
|
|
2074
2086
|
|
|
2075
2087
|
fn on_normal_status_timeout(self: *Self) void {
|
|
2076
2088
|
assert(self.status == .normal);
|
|
2077
|
-
assert(self.
|
|
2089
|
+
assert(self.backup());
|
|
2078
2090
|
self.transition_to_view_change_status(self.view + 1);
|
|
2079
2091
|
}
|
|
2080
2092
|
|
|
@@ -2089,14 +2101,14 @@ pub fn ReplicaType(
|
|
|
2089
2101
|
|
|
2090
2102
|
// Keep sending `start_view_change` messages:
|
|
2091
2103
|
// We may have a `start_view_change_quorum` but other replicas may not.
|
|
2092
|
-
// However, the
|
|
2104
|
+
// However, the primary may stop sending once it has a `do_view_change_quorum`.
|
|
2093
2105
|
if (!self.do_view_change_quorum) self.send_start_view_change();
|
|
2094
2106
|
|
|
2095
2107
|
// It is critical that a `do_view_change` message implies a `start_view_change_quorum`:
|
|
2096
2108
|
if (self.start_view_change_quorum) {
|
|
2097
|
-
// The
|
|
2109
|
+
// The primary need not retry to send a `do_view_change` message to itself:
|
|
2098
2110
|
// We assume the MessageBus will not drop messages sent by a replica to itself.
|
|
2099
|
-
if (self.
|
|
2111
|
+
if (self.primary_index(self.view) != self.replica) self.send_do_view_change();
|
|
2100
2112
|
}
|
|
2101
2113
|
}
|
|
2102
2114
|
|
|
@@ -2131,7 +2143,7 @@ pub fn ReplicaType(
|
|
|
2131
2143
|
if (self.replica_count == 2) assert(threshold == 2);
|
|
2132
2144
|
|
|
2133
2145
|
assert(self.status == .view_change);
|
|
2134
|
-
assert(self.
|
|
2146
|
+
assert(self.primary_index(self.view) == self.replica);
|
|
2135
2147
|
},
|
|
2136
2148
|
else => unreachable,
|
|
2137
2149
|
}
|
|
@@ -2232,7 +2244,7 @@ pub fn ReplicaType(
|
|
|
2232
2244
|
if (self.replica_count <= 2) assert(threshold == self.replica_count);
|
|
2233
2245
|
|
|
2234
2246
|
assert(self.status == .normal);
|
|
2235
|
-
assert(self.
|
|
2247
|
+
assert(self.primary());
|
|
2236
2248
|
},
|
|
2237
2249
|
.start_view_change => {
|
|
2238
2250
|
assert(self.replica_count > 1);
|
|
@@ -2246,7 +2258,7 @@ pub fn ReplicaType(
|
|
|
2246
2258
|
if (self.replica_count == 2) assert(threshold >= 1);
|
|
2247
2259
|
|
|
2248
2260
|
assert(self.status == .view_change);
|
|
2249
|
-
assert(self.
|
|
2261
|
+
assert(self.primary_index(self.view) == self.replica);
|
|
2250
2262
|
assert(message.header.replica != self.replica);
|
|
2251
2263
|
assert(message.header.op == self.nack_prepare_op.?);
|
|
2252
2264
|
},
|
|
@@ -2362,7 +2374,7 @@ pub fn ReplicaType(
|
|
|
2362
2374
|
/// A function which calls `commit_journal()` to set `commit_max` must first call
|
|
2363
2375
|
/// `view_jump()`. Otherwise, we may fork the log.
|
|
2364
2376
|
fn commit_journal(self: *Self, commit: u64) void {
|
|
2365
|
-
// TODO Restrict `view_change` status only to the
|
|
2377
|
+
// TODO Restrict `view_change` status only to the primary purely as defense-in-depth.
|
|
2366
2378
|
// Be careful of concurrency when doing this, as successive view changes can happen quickly.
|
|
2367
2379
|
assert(self.status == .normal or self.status == .view_change or
|
|
2368
2380
|
(self.status == .recovering and self.replica_count == 1));
|
|
@@ -2466,21 +2478,21 @@ pub fn ReplicaType(
|
|
|
2466
2478
|
switch (self.status) {
|
|
2467
2479
|
.normal => {},
|
|
2468
2480
|
.view_change => {
|
|
2469
|
-
if (self.
|
|
2481
|
+
if (self.primary_index(self.view) != self.replica) {
|
|
2470
2482
|
self.commit_ops_done();
|
|
2471
|
-
log.debug("{}: commit_journal_next_callback: no longer
|
|
2483
|
+
log.debug("{}: commit_journal_next_callback: no longer primary view={}", .{
|
|
2472
2484
|
self.replica,
|
|
2473
2485
|
self.view,
|
|
2474
2486
|
});
|
|
2475
2487
|
assert(self.replica_count > 1);
|
|
2476
2488
|
return;
|
|
2477
2489
|
}
|
|
2478
|
-
// Only the
|
|
2490
|
+
// Only the primary may commit during a view change before starting the new view.
|
|
2479
2491
|
// Fall through if this is indeed the case.
|
|
2480
2492
|
},
|
|
2481
2493
|
.recovering => {
|
|
2482
2494
|
assert(self.replica_count == 1);
|
|
2483
|
-
assert(self.
|
|
2495
|
+
assert(self.primary_index(self.view) == self.replica);
|
|
2484
2496
|
},
|
|
2485
2497
|
}
|
|
2486
2498
|
|
|
@@ -2548,7 +2560,7 @@ pub fn ReplicaType(
|
|
|
2548
2560
|
assert(self.commit_min == self.commit_prepare.?.header.op);
|
|
2549
2561
|
assert(self.commit_min <= self.commit_max);
|
|
2550
2562
|
|
|
2551
|
-
if (self.status == .normal and self.
|
|
2563
|
+
if (self.status == .normal and self.primary()) {
|
|
2552
2564
|
const prepare = self.pipeline.pop().?;
|
|
2553
2565
|
assert(self.commit_min == self.commit_max);
|
|
2554
2566
|
assert(prepare.message.header.checksum == self.commit_prepare.?.header.checksum);
|
|
@@ -2712,7 +2724,7 @@ pub fn ReplicaType(
|
|
|
2712
2724
|
assert(prepare.header.op == self.commit_min + 1);
|
|
2713
2725
|
assert(prepare.header.op <= self.op);
|
|
2714
2726
|
|
|
2715
|
-
// If we are a
|
|
2727
|
+
// If we are a backup committing through `commit_journal()` then a view change may
|
|
2716
2728
|
// have happened since we last checked in `commit_journal_next()`. However, this would
|
|
2717
2729
|
// relate to subsequent ops, since by now we have already verified the hash chain for
|
|
2718
2730
|
// this commit.
|
|
@@ -2728,10 +2740,10 @@ pub fn ReplicaType(
|
|
|
2728
2740
|
self.journal.header_with_op(self.commit_min).?.checksum);
|
|
2729
2741
|
}
|
|
2730
2742
|
|
|
2731
|
-
log.debug("{}: commit_op: executing view={}
|
|
2743
|
+
log.debug("{}: commit_op: executing view={} primary={} op={} checksum={} ({s})", .{
|
|
2732
2744
|
self.replica,
|
|
2733
2745
|
self.view,
|
|
2734
|
-
self.
|
|
2746
|
+
self.primary_index(self.view) == self.replica,
|
|
2735
2747
|
prepare.header.op,
|
|
2736
2748
|
prepare.header.checksum,
|
|
2737
2749
|
@tagName(prepare.header.operation.cast(StateMachine)),
|
|
@@ -2807,18 +2819,18 @@ pub fn ReplicaType(
|
|
|
2807
2819
|
}
|
|
2808
2820
|
}
|
|
2809
2821
|
|
|
2810
|
-
if (self.
|
|
2822
|
+
if (self.primary_index(self.view) == self.replica) {
|
|
2811
2823
|
log.debug("{}: commit_op: replying to client: {}", .{ self.replica, reply.header });
|
|
2812
2824
|
self.message_bus.send_message_to_client(reply.header.client, reply);
|
|
2813
2825
|
}
|
|
2814
2826
|
}
|
|
2815
2827
|
|
|
2816
2828
|
/// Commits, frees and pops as many prepares at the head of the pipeline as have quorum.
|
|
2817
|
-
/// Can be called only when the replica is the
|
|
2829
|
+
/// Can be called only when the replica is the primary.
|
|
2818
2830
|
/// Can be called only when the pipeline has at least one prepare.
|
|
2819
2831
|
fn commit_pipeline(self: *Self) void {
|
|
2820
2832
|
assert(self.status == .normal);
|
|
2821
|
-
assert(self.
|
|
2833
|
+
assert(self.primary());
|
|
2822
2834
|
assert(self.pipeline.count > 0);
|
|
2823
2835
|
|
|
2824
2836
|
// Guard against multiple concurrent invocations of commit_journal()/commit_pipeline():
|
|
@@ -2834,7 +2846,7 @@ pub fn ReplicaType(
|
|
|
2834
2846
|
fn commit_pipeline_next(self: *Self) void {
|
|
2835
2847
|
assert(self.committing);
|
|
2836
2848
|
assert(self.status == .normal);
|
|
2837
|
-
assert(self.
|
|
2849
|
+
assert(self.primary());
|
|
2838
2850
|
|
|
2839
2851
|
if (self.pipeline.head_ptr()) |prepare| {
|
|
2840
2852
|
assert(self.commit_min == self.commit_max);
|
|
@@ -2864,7 +2876,7 @@ pub fn ReplicaType(
|
|
|
2864
2876
|
assert(self.commit_min <= self.commit_max);
|
|
2865
2877
|
assert(self.commit_min <= self.op);
|
|
2866
2878
|
|
|
2867
|
-
if (self.status == .normal and self.
|
|
2879
|
+
if (self.status == .normal and self.primary()) {
|
|
2868
2880
|
if (self.pipeline.head_ptr()) |pipeline_head| {
|
|
2869
2881
|
assert(pipeline_head.message.header.op == self.commit_min + 1);
|
|
2870
2882
|
}
|
|
@@ -3023,7 +3035,7 @@ pub fn ReplicaType(
|
|
|
3023
3035
|
fn create_view_change_message(self: *Self, command: Command) *Message {
|
|
3024
3036
|
assert(command == .do_view_change or command == .start_view);
|
|
3025
3037
|
|
|
3026
|
-
// We may send a start_view message in normal status to resolve a
|
|
3038
|
+
// We may send a start_view message in normal status to resolve a backup's view jump:
|
|
3027
3039
|
assert(self.status == .normal or self.status == .view_change);
|
|
3028
3040
|
|
|
3029
3041
|
const message = self.message_bus.get_message();
|
|
@@ -3078,7 +3090,7 @@ pub fn ReplicaType(
|
|
|
3078
3090
|
}
|
|
3079
3091
|
|
|
3080
3092
|
/// Returns the op of the highest canonical message, according to this replica (the new
|
|
3081
|
-
///
|
|
3093
|
+
/// primary) prior to loading the current view change's DVC quorum headers.
|
|
3082
3094
|
/// When this replica participated in the last `view_normal`, this is just `replica.op`.
|
|
3083
3095
|
///
|
|
3084
3096
|
/// - A *canonical* message was part of the last view_normal.
|
|
@@ -3089,19 +3101,19 @@ pub fn ReplicaType(
|
|
|
3089
3101
|
///
|
|
3090
3102
|
/// Consider these logs:
|
|
3091
3103
|
///
|
|
3092
|
-
/// replica 0: 4, 5, 6b, 7b, 8b (commit_min=6b,
|
|
3093
|
-
/// replica 1: 4, 5, 6b, --, -- (commit_min=5,
|
|
3104
|
+
/// replica 0: 4, 5, 6b, 7b, 8b (commit_min=6b, primary, status=normal, view=X)
|
|
3105
|
+
/// replica 1: 4, 5, 6b, --, -- (commit_min=5, backup, status=normal, view=X)
|
|
3094
3106
|
/// replica 2: 4, 5, 6a, --, 8b (view<X)
|
|
3095
3107
|
///
|
|
3096
3108
|
/// 1. Replica 0 crashes immediately after committing 6b.
|
|
3097
3109
|
/// 2. Replicas 1 and 2 must determine the new chain HEAD.
|
|
3098
3110
|
/// 3. 8b is discarded due to the gap in 7.
|
|
3099
|
-
/// 4. To distinguish between 6a and 6b (and safely discard 6a), the new
|
|
3111
|
+
/// 4. To distinguish between 6a and 6b (and safely discard 6a), the new primary trusts ops
|
|
3100
3112
|
/// from the DVC(s) with the greatest `view_normal`.
|
|
3101
|
-
fn
|
|
3113
|
+
fn primary_op_canonical_max(self: *const Self, view_normal_canonical: u64) usize {
|
|
3102
3114
|
assert(self.replica_count > 1);
|
|
3103
3115
|
assert(self.status == .view_change);
|
|
3104
|
-
assert(self.
|
|
3116
|
+
assert(self.primary_index(self.view) == self.replica);
|
|
3105
3117
|
assert(self.do_view_change_quorum);
|
|
3106
3118
|
assert(!self.repair_timeout.ticking);
|
|
3107
3119
|
assert(self.journal.header_with_op(self.op) != null);
|
|
@@ -3141,12 +3153,12 @@ pub fn ReplicaType(
|
|
|
3141
3153
|
/// Discards uncommitted ops during a view change from after and including `op`.
|
|
3142
3154
|
/// This is required to maximize availability in the presence of storage faults.
|
|
3143
3155
|
/// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
|
|
3144
|
-
fn
|
|
3156
|
+
fn primary_discard_uncommitted_ops_from(self: *Self, op: u64, checksum: u128) void {
|
|
3145
3157
|
assert(self.status == .view_change);
|
|
3146
|
-
assert(self.
|
|
3158
|
+
assert(self.primary_index(self.view) == self.replica);
|
|
3147
3159
|
assert(self.repairs_allowed());
|
|
3148
3160
|
|
|
3149
|
-
assert(self.valid_hash_chain("
|
|
3161
|
+
assert(self.valid_hash_chain("primary_discard_uncommitted_ops_from"));
|
|
3150
3162
|
|
|
3151
3163
|
const slot = self.journal.slot_with_op(op).?;
|
|
3152
3164
|
assert(op > self.commit_max);
|
|
@@ -3154,7 +3166,7 @@ pub fn ReplicaType(
|
|
|
3154
3166
|
assert(self.journal.header_with_op_and_checksum(op, checksum) != null);
|
|
3155
3167
|
assert(self.journal.dirty.bit(slot));
|
|
3156
3168
|
|
|
3157
|
-
log.debug("{}:
|
|
3169
|
+
log.debug("{}: primary_discard_uncommitted_ops_from: ops={}..{} view={}", .{
|
|
3158
3170
|
self.replica,
|
|
3159
3171
|
op,
|
|
3160
3172
|
self.op,
|
|
@@ -3169,28 +3181,28 @@ pub fn ReplicaType(
|
|
|
3169
3181
|
assert(!self.journal.faulty.bit(slot));
|
|
3170
3182
|
|
|
3171
3183
|
// We require that `self.op` always exists. Rewinding `self.op` could change that.
|
|
3172
|
-
// However, we do this only as the
|
|
3184
|
+
// However, we do this only as the primary within a view change, with all headers intact.
|
|
3173
3185
|
assert(self.journal.header_with_op(self.op) != null);
|
|
3174
3186
|
}
|
|
3175
3187
|
|
|
3176
|
-
/// Returns whether the replica is a
|
|
3188
|
+
/// Returns whether the replica is a backup for the current view.
|
|
3177
3189
|
/// This may be used only when the replica status is normal.
|
|
3178
|
-
fn
|
|
3179
|
-
return !self.
|
|
3190
|
+
fn backup(self: *Self) bool {
|
|
3191
|
+
return !self.primary();
|
|
3180
3192
|
}
|
|
3181
3193
|
|
|
3182
3194
|
fn flush_loopback_queue(self: *Self) void {
|
|
3183
3195
|
// There are four cases where a replica will send a message to itself:
|
|
3184
3196
|
// However, of these four cases, all but one call send_message_to_replica().
|
|
3185
3197
|
//
|
|
3186
|
-
// 1. In on_request(), the
|
|
3198
|
+
// 1. In on_request(), the primary sends a synchronous prepare to itself, but this is
|
|
3187
3199
|
// done by calling on_prepare() directly, and subsequent prepare timeout retries will
|
|
3188
3200
|
// never resend to self.
|
|
3189
|
-
// 2. In on_prepare(), after writing to storage, the
|
|
3201
|
+
// 2. In on_prepare(), after writing to storage, the primary sends a (typically)
|
|
3190
3202
|
// asynchronous prepare_ok to itself.
|
|
3191
3203
|
// 3. In on_start_view_change(), after receiving a quorum of start_view_change
|
|
3192
|
-
// messages, the new
|
|
3193
|
-
// 4. In
|
|
3204
|
+
// messages, the new primary sends a synchronous do_view_change to itself.
|
|
3205
|
+
// 4. In start_view_as_the_new_primary(), the new primary sends itself a prepare_ok
|
|
3194
3206
|
// message for each uncommitted message.
|
|
3195
3207
|
if (self.loopback_queue) |message| {
|
|
3196
3208
|
defer self.message_bus.unref(message);
|
|
@@ -3218,15 +3230,15 @@ pub fn ReplicaType(
|
|
|
3218
3230
|
}
|
|
3219
3231
|
|
|
3220
3232
|
if (message.header.view > self.view) {
|
|
3221
|
-
// Another replica is treating us as the
|
|
3233
|
+
// Another replica is treating us as the primary for a view we do not know about.
|
|
3222
3234
|
// This may be caused by a fault in the network topology.
|
|
3223
3235
|
log.warn("{}: on_prepare_ok: ignoring (newer view)", .{self.replica});
|
|
3224
3236
|
return true;
|
|
3225
3237
|
}
|
|
3226
3238
|
|
|
3227
|
-
if (self.
|
|
3239
|
+
if (self.backup()) {
|
|
3228
3240
|
// This may be caused by a fault in the network topology.
|
|
3229
|
-
log.warn("{}: on_prepare_ok: ignoring (
|
|
3241
|
+
log.warn("{}: on_prepare_ok: ignoring (backup)", .{self.replica});
|
|
3230
3242
|
return true;
|
|
3231
3243
|
}
|
|
3232
3244
|
|
|
@@ -3264,14 +3276,14 @@ pub fn ReplicaType(
|
|
|
3264
3276
|
return true;
|
|
3265
3277
|
}
|
|
3266
3278
|
|
|
3267
|
-
if (self.
|
|
3279
|
+
if (self.primary_index(self.view) != self.replica) {
|
|
3268
3280
|
switch (message.header.command) {
|
|
3269
|
-
// Only the
|
|
3281
|
+
// Only the primary may receive these messages:
|
|
3270
3282
|
.request_start_view, .nack_prepare => {
|
|
3271
|
-
log.warn("{}: on_{s}: ignoring (
|
|
3283
|
+
log.warn("{}: on_{s}: ignoring (backup)", .{ self.replica, command });
|
|
3272
3284
|
return true;
|
|
3273
3285
|
},
|
|
3274
|
-
// Only the
|
|
3286
|
+
// Only the primary may answer a request for a prepare without a context:
|
|
3275
3287
|
.request_prepare => if (message.header.timestamp == 0) {
|
|
3276
3288
|
log.warn("{}: on_{s}: ignoring (no context)", .{ self.replica, command });
|
|
3277
3289
|
return true;
|
|
@@ -3301,8 +3313,8 @@ pub fn ReplicaType(
|
|
|
3301
3313
|
return true;
|
|
3302
3314
|
},
|
|
3303
3315
|
.request_headers, .request_prepare => {
|
|
3304
|
-
if (self.
|
|
3305
|
-
log.debug("{}: on_{s}: ignoring (view change, requested by
|
|
3316
|
+
if (self.primary_index(self.view) != message.header.replica) {
|
|
3317
|
+
log.debug("{}: on_{s}: ignoring (view change, requested by backup)", .{
|
|
3306
3318
|
self.replica,
|
|
3307
3319
|
command,
|
|
3308
3320
|
});
|
|
@@ -3310,8 +3322,8 @@ pub fn ReplicaType(
|
|
|
3310
3322
|
}
|
|
3311
3323
|
},
|
|
3312
3324
|
.headers, .nack_prepare => {
|
|
3313
|
-
if (self.
|
|
3314
|
-
log.debug("{}: on_{s}: ignoring (view change, received by
|
|
3325
|
+
if (self.primary_index(self.view) != self.replica) {
|
|
3326
|
+
log.debug("{}: on_{s}: ignoring (view change, received by backup)", .{
|
|
3315
3327
|
self.replica,
|
|
3316
3328
|
command,
|
|
3317
3329
|
});
|
|
@@ -3338,7 +3350,7 @@ pub fn ReplicaType(
|
|
|
3338
3350
|
return true;
|
|
3339
3351
|
}
|
|
3340
3352
|
|
|
3341
|
-
if (self.
|
|
3353
|
+
if (self.ignore_request_message_backup(message)) return true;
|
|
3342
3354
|
if (self.ignore_request_message_duplicate(message)) return true;
|
|
3343
3355
|
if (self.ignore_request_message_preparing(message)) return true;
|
|
3344
3356
|
|
|
@@ -3360,11 +3372,11 @@ pub fn ReplicaType(
|
|
|
3360
3372
|
/// Resends the reply to the latest request if the request has been committed.
|
|
3361
3373
|
fn ignore_request_message_duplicate(self: *Self, message: *const Message) bool {
|
|
3362
3374
|
assert(self.status == .normal);
|
|
3363
|
-
assert(self.
|
|
3375
|
+
assert(self.primary());
|
|
3364
3376
|
|
|
3365
3377
|
assert(message.header.command == .request);
|
|
3366
3378
|
assert(message.header.client > 0);
|
|
3367
|
-
assert(message.header.view <= self.view); // See
|
|
3379
|
+
assert(message.header.view <= self.view); // See ignore_request_message_backup().
|
|
3368
3380
|
assert(message.header.context == 0 or message.header.operation != .register);
|
|
3369
3381
|
assert(message.header.request == 0 or message.header.operation != .register);
|
|
3370
3382
|
|
|
@@ -3418,16 +3430,16 @@ pub fn ReplicaType(
|
|
|
3418
3430
|
log.debug("{}: on_request: new session", .{self.replica});
|
|
3419
3431
|
return false;
|
|
3420
3432
|
} else if (self.pipeline_prepare_for_client(message.header.client)) |_| {
|
|
3421
|
-
// The client registered with the previous
|
|
3433
|
+
// The client registered with the previous primary, which committed and replied back
|
|
3422
3434
|
// to the client before the view change, after which the register operation was
|
|
3423
|
-
// reloaded into the pipeline to be driven to completion by the new
|
|
3435
|
+
// reloaded into the pipeline to be driven to completion by the new primary, which
|
|
3424
3436
|
// now receives a request from the client that appears to have no session.
|
|
3425
3437
|
// However, the session is about to be registered, so we must wait for it to commit.
|
|
3426
3438
|
log.debug("{}: on_request: waiting for session to commit", .{self.replica});
|
|
3427
3439
|
return true;
|
|
3428
3440
|
} else {
|
|
3429
3441
|
// We must have all commits to know whether a session has been evicted. For example,
|
|
3430
|
-
// there is the risk of sending an eviction message (even as the
|
|
3442
|
+
// there is the risk of sending an eviction message (even as the primary) if we are
|
|
3431
3443
|
// partitioned and don't yet know about a session. We solve this by having clients
|
|
3432
3444
|
// include the view number and rejecting messages from clients with newer views.
|
|
3433
3445
|
log.err("{}: on_request: no session", .{self.replica});
|
|
@@ -3436,58 +3448,58 @@ pub fn ReplicaType(
            }
        }

-    /// Returns whether the replica is eligible to process this request as the
+    /// Returns whether the replica is eligible to process this request as the primary.
    /// Takes the client's perspective into account if the client is aware of a newer view.
-    /// Forwards requests to the
-    fn
+    /// Forwards requests to the primary if the client has an older view.
+    fn ignore_request_message_backup(self: *Self, message: *Message) bool {
        assert(self.status == .normal);
        assert(message.header.command == .request);

        // The client is aware of a newer view:
-        // Even if we think we are the
+        // Even if we think we are the primary, we may be partitioned from the rest of the cluster.
        // We therefore drop the message rather than flood our partition with traffic.
        if (message.header.view > self.view) {
            log.debug("{}: on_request: ignoring (newer view)", .{self.replica});
            return true;
-        } else if (self.
+        } else if (self.primary()) {
            return false;
        }

        if (message.header.operation == .register) {
            // We do not forward `.register` requests for the sake of `Header.peer_type()`.
            // This enables the MessageBus to identify client connections on the first message.
-            log.debug("{}: on_request: ignoring (
+            log.debug("{}: on_request: ignoring (backup, register)", .{self.replica});
        } else if (message.header.view < self.view) {
-            // The client may not know who the
-            // We forward to the new
+            // The client may not know who the primary is, or may be retrying after a primary failure.
+            // We forward to the new primary ahead of any client retry timeout to reduce latency.
            // Since the client is already connected to all replicas, the client may yet receive the
-            // reply from the new
-            log.debug("{}: on_request: forwarding (
-            self.send_message_to_replica(self.
+            // reply from the new primary directly.
+            log.debug("{}: on_request: forwarding (backup)", .{self.replica});
+            self.send_message_to_replica(self.primary_index(self.view), message);
        } else {
            assert(message.header.view == self.view);
-            // The client has the correct view, but has retried against a
-            // This may mean that the
-            // There is also not much we can do as the client already knows who the
+            // The client has the correct view, but has retried against a backup.
+            // This may mean that the primary is down and that we are about to do a view change.
+            // There is also not much we can do as the client already knows who the primary is.
            // We do not forward as this would amplify traffic on the network.

-            // TODO This may also indicate a client-
-            // should we trigger a view change to select a
+            // TODO This may also indicate a client-primary partition. If we see enough of these,
+            // should we trigger a view change to select a primary that clients can reach?
            // This is a question of weighing the probability of a partition vs routing error.
-            log.debug("{}: on_request: ignoring (
+            log.debug("{}: on_request: ignoring (backup, same view)", .{self.replica});
        }

-        assert(self.
+        assert(self.backup());
        return true;
    }

    fn ignore_request_message_preparing(self: *Self, message: *const Message) bool {
        assert(self.status == .normal);
-        assert(self.
+        assert(self.primary());

        assert(message.header.command == .request);
        assert(message.header.client > 0);
-        assert(message.header.view <= self.view); // See
+        assert(message.header.view <= self.view); // See ignore_request_message_backup().

        if (self.pipeline_prepare_for_client(message.header.client)) |prepare| {
            assert(prepare.message.header.command == .prepare);
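The new ignore_request_message_backup above reduces to a small decision table. A minimal standalone Zig sketch of that logic (hypothetical names, not code from this package):

    const RequestAction = enum { process, drop, forward_to_primary };

    // Mirrors the branches in the hunk above: drop requests from clients that know a
    // newer view, process as the primary, never forward .register, forward stale-view
    // requests to the primary, and drop the rest.
    fn request_action(
        is_primary: bool,
        is_register: bool,
        message_view: u32,
        current_view: u32,
    ) RequestAction {
        if (message_view > current_view) return .drop; // We may be partitioned.
        if (is_primary) return .process;
        if (is_register) return .drop; // Keeps first-message peer identification working.
        if (message_view < current_view) return .forward_to_primary;
        return .drop; // Same view: the client already knows who the primary is.
    }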
@@ -3547,8 +3559,8 @@ pub fn ReplicaType(
                }
            },
            .do_view_change => {
-                if (self.
-                    log.warn("{}: on_{s}: ignoring (
+                if (self.primary_index(message.header.view) != self.replica) {
+                    log.warn("{}: on_{s}: ignoring (backup)", .{ self.replica, command });
                    return true;
                }
            },
@@ -3572,15 +3584,15 @@ pub fn ReplicaType(
        return false;
    }

-    /// Returns whether the replica is the
+    /// Returns whether the replica is the primary for the current view.
    /// This may be used only when the replica status is normal.
-    fn
+    fn primary(self: *const Self) bool {
        assert(self.status == .normal);
-        return self.
+        return self.primary_index(self.view) == self.replica;
    }

-    /// Returns the index into the configuration of the
-    fn
+    /// Returns the index into the configuration of the primary for a given view.
+    fn primary_index(self: *const Self, view: u32) u8 {
        return @intCast(u8, @mod(view, self.replica_count));
    }

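primary() and primary_index() above pin down how leadership is assigned: the primary for a view is the view number modulo the replica count, so it rotates round-robin as views advance. A standalone sketch with a test (hypothetical code, mirroring the body of primary_index shown above):

    const std = @import("std");

    fn primary_index(view: u32, replica_count: u8) u8 {
        return @intCast(u8, @mod(view, replica_count));
    }

    test "primary rotates round-robin with the view" {
        try std.testing.expectEqual(@as(u8, 0), primary_index(0, 3));
        try std.testing.expectEqual(@as(u8, 2), primary_index(2, 3));
        try std.testing.expectEqual(@as(u8, 0), primary_index(3, 3));
        try std.testing.expectEqual(@as(u8, 1), primary_index(7, 3));
    }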
@@ -3589,7 +3601,7 @@ pub fn ReplicaType(
    /// This function temporarily violates the "replica.op must exist in WAL" invariant.
    fn jump_to_newer_op_in_normal_status(self: *Self, header: *const Header) void {
        assert(self.status == .normal);
-        assert(self.
+        assert(self.backup());
        assert(header.view == self.view);
        assert(header.op > self.op + 1);
        // We may have learned of a higher `commit_max` through a commit message before jumping
@@ -3662,11 +3674,10 @@ pub fn ReplicaType(
    /// * ` ✓ = o `: View change is unsafe if any slots are faulty.
    ///   (`replica.op_checkpoint` == `replica.op`).
    // TODO Use this function once we switch from recovery protocol to the superblock.
-    // If there is an "unsafe" fault, we will need to request a start_view from the
+    // If there is an "unsafe" fault, we will need to request a start_view from the primary to
    // learn the op.
    fn op_certain(self: *const Self) bool {
        assert(self.status == .recovering);
-        assert(self.journal.status == .recovered);
        assert(self.op_checkpoint <= self.op);

        const slot_op_checkpoint = self.journal.slot_for_op(self.op_checkpoint).index;
@@ -3815,7 +3826,7 @@ pub fn ReplicaType(
    /// Searches the pipeline for a prepare for a given client.
    fn pipeline_prepare_for_client(self: *Self, client: u128) ?*Prepare {
        assert(self.status == .normal);
-        assert(self.
+        assert(self.primary());
        assert(self.commit_min == self.commit_max);

        var op = self.commit_max + 1;
@@ -3827,7 +3838,7 @@ pub fn ReplicaType(
            assert(prepare.message.header.parent == parent);

            // A client may have multiple requests in the pipeline if these were committed by
-            // the previous
+            // the previous primary and were reloaded into the pipeline after a view change.
            if (prepare.message.header.client == client) return prepare;

            parent = prepare.message.header.checksum;
@@ -3848,7 +3859,7 @@ pub fn ReplicaType(
        assert(ok.header.command == .prepare_ok);

        assert(self.status == .normal);
-        assert(self.
+        assert(self.primary());

        const prepare = self.pipeline_prepare_for_client(ok.header.client) orelse {
            log.debug("{}: pipeline_prepare_for_prepare_ok: not preparing", .{self.replica});
@@ -3881,7 +3892,6 @@ pub fn ReplicaType(
    fn recover(self: *Self) void {
        assert(self.status == .recovering);
        assert(self.replica_count > 1);
-        assert(self.journal.status == .recovered);

        log.debug("{}: recover: sending recovery messages nonce={}", .{
            self.replica,
@@ -3922,14 +3932,14 @@ pub fn ReplicaType(
        const commit_max_limit = std.math.min(self.commit_max, self.op_checkpoint_trigger());

        // Request outstanding committed prepares to advance our op number:
-        // This handles the case of an idle cluster, where a
+        // This handles the case of an idle cluster, where a backup will not otherwise advance.
        // This is not required for correctness, but for durability.
        if (self.op < commit_max_limit) {
-            // If the
+            // If the primary repairs during a view change, it will have already advanced
            // `self.op` to the latest op according to the quorum of `do_view_change` messages
-            // received, so we must therefore be a
+            // received, so we must therefore be a backup in normal status:
            assert(self.status == .normal);
-            assert(self.
+            assert(self.backup());
            log.debug("{}: repair: op={} < commit_max_limit={}, commit_max={}", .{
                self.replica,
                self.op,
@@ -3938,10 +3948,10 @@ pub fn ReplicaType(
            });
            // We need to advance our op number and therefore have to `request_prepare`,
            // since only `on_prepare()` can do this, not `repair_header()` in `on_headers()`.
-            self.send_header_to_replica(self.
+            self.send_header_to_replica(self.primary_index(self.view), .{
                .command = .request_prepare,
                // We cannot yet know the checksum of the prepare so we set the context and
-                // timestamp to 0: Context is optional when requesting from the
+                // timestamp to 0: Context is optional when requesting from the primary but
                // required otherwise.
                .context = 0,
                .timestamp = 0,
@@ -4002,10 +4012,10 @@ pub fn ReplicaType(
            return;
        }

-        if (self.status == .view_change and self.
-            if (self.
-            // Start the view as the new
-            self.
+        if (self.status == .view_change and self.primary_index(self.view) == self.replica) {
+            if (self.primary_repair_pipeline_op() != null) return self.primary_repair_pipeline();
+            // Start the view as the new primary:
+            self.start_view_as_the_new_primary();
        }
    }

@@ -4014,7 +4024,7 @@ pub fn ReplicaType(
    /// A repair may never advance or replace `self.op` (critical for correctness):
    ///
    /// Repairs must always backfill in behind `self.op` but may never advance `self.op`.
-    /// Otherwise, a split-brain
+    /// Otherwise, a split-brain primary may reapply an op that was removed through a view
    /// change, which could be committed by a higher `commit_max` number in a commit message.
    ///
    /// See this commit message for an example:
@@ -4023,7 +4033,7 @@ pub fn ReplicaType(
    /// Our guiding principles around repairs in general:
    ///
    /// * The latest op makes sense of everything else and must not be replaced with a different
-    ///   op or advanced except by the
+    ///   op or advanced except by the primary in the current view.
    ///
    /// * Do not jump to a view in normal status without receiving a start_view message.
    ///
@@ -4174,7 +4184,7 @@ pub fn ReplicaType(
    /// 2. We do a stale prepare to the right, ignoring the hash chain break to the left.
    /// 3. We do another stale prepare that replaces the first since it connects to the second.
    ///
-    /// This would violate our quorum replication commitment to the
+    /// This would violate our quorum replication commitment to the primary.
    /// The mistake in this example was not that we ignored the break to the left, which we must
    /// do to repair reordered ops, but that we did not check for connection to the right.
    fn repair_header_would_connect_hash_chain(self: *Self, header: *const Header) bool {
@@ -4199,33 +4209,33 @@ pub fn ReplicaType(
        return true;
    }

-    /// Reads prepares into the pipeline (before we start the view as the new
-    fn
+    /// Reads prepares into the pipeline (before we start the view as the new primary).
+    fn primary_repair_pipeline(self: *Self) void {
        assert(self.status == .view_change);
-        assert(self.
+        assert(self.primary_index(self.view) == self.replica);
        assert(self.commit_max < self.op);
        assert(self.journal.dirty.count == 0);

        if (self.repairing_pipeline) {
-            log.debug("{}:
+            log.debug("{}: primary_repair_pipeline: already repairing...", .{self.replica});
            return;
        }

-        log.debug("{}:
+        log.debug("{}: primary_repair_pipeline: repairing", .{self.replica});

        assert(!self.repairing_pipeline);
        self.repairing_pipeline = true;

-        self.
+        self.primary_repair_pipeline_read();
    }

    /// Discard messages from the prepare pipeline.
    /// Retain uncommitted messages that belong in the current view to maximize durability.
-    fn
+    fn primary_repair_pipeline_diff(self: *Self) void {
        assert(self.status == .view_change);
-        assert(self.
+        assert(self.primary_index(self.view) == self.replica);

-        // Discard messages from the front of the pipeline that committed since we were
+        // Discard messages from the front of the pipeline that committed since we were primary.
        while (self.pipeline.head_ptr()) |prepare| {
            if (prepare.message.header.op > self.commit_max) break;

@@ -4251,7 +4261,7 @@ pub fn ReplicaType(
            self.message_bus.unref(self.pipeline.pop_tail().?.message);
        }

-        log.debug("{}:
+        log.debug("{}: primary_repair_pipeline_diff: {} prepare(s)", .{
            self.replica,
            self.pipeline.count,
        });
@@ -4259,16 +4269,16 @@ pub fn ReplicaType(
        self.verify_pipeline();

        // Do not reset `repairing_pipeline` here as this must be reset by the read callback.
-        // Otherwise, we would be making `
+        // Otherwise, we would be making `primary_repair_pipeline()` reentrant.
    }

    /// Returns the next `op` number that needs to be read into the pipeline.
-    fn
+    fn primary_repair_pipeline_op(self: *Self) ?u64 {
        assert(self.status == .view_change);
-        assert(self.
+        assert(self.primary_index(self.view) == self.replica);

        // We cannot rely on `pipeline.count` below unless the pipeline has first been diffed.
-        self.
+        self.primary_repair_pipeline_diff();

        const op = self.commit_max + self.pipeline.count + 1;
        if (op <= self.op) return op;
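The renamed primary_repair_pipeline_op above computes which prepare to read back next: ops at or below commit_max are committed, pipeline.count prepares are already reloaded, so the next op to fetch is the one after both, provided it does not pass the head op. A hypothetical standalone sketch of that arithmetic:

    // Next op to read into the pipeline, or null once the pipeline has caught up to the head op.
    fn next_pipeline_op(commit_max: u64, pipeline_count: u64, op_head: u64) ?u64 {
        const op = commit_max + pipeline_count + 1;
        return if (op <= op_head) op else null;
    }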
@@ -4277,19 +4287,19 @@ pub fn ReplicaType(
        return null;
    }

-    fn
+    fn primary_repair_pipeline_read(self: *Self) void {
        assert(self.repairing_pipeline);
        assert(self.status == .view_change);
-        assert(self.
+        assert(self.primary_index(self.view) == self.replica);

-        if (self.
+        if (self.primary_repair_pipeline_op()) |op| {
            assert(op > self.commit_max);
            assert(op <= self.op);
            assert(self.commit_max + self.pipeline.count + 1 == op);

            const checksum = self.journal.header_with_op(op).?.checksum;

-            log.debug("{}:
+            log.debug("{}: primary_repair_pipeline_read: op={} checksum={}", .{
                self.replica,
                op,
                checksum,
@@ -4297,7 +4307,7 @@ pub fn ReplicaType(

            self.journal.read_prepare(repair_pipeline_push, op, checksum, null);
        } else {
-            log.debug("{}:
+            log.debug("{}: primary_repair_pipeline_read: repaired", .{self.replica});
            self.repairing_pipeline = false;
            self.repair();
        }
@@ -4326,13 +4336,13 @@ pub fn ReplicaType(
            return;
        }

-        if (self.
-            log.debug("{}: repair_pipeline_push: no longer
+        if (self.primary_index(self.view) != self.replica) {
+            log.debug("{}: repair_pipeline_push: no longer primary", .{self.replica});
            return;
        }

        // We may even be several views ahead and may now have a completely different pipeline.
-        const op = self.
+        const op = self.primary_repair_pipeline_op() orelse {
            log.debug("{}: repair_pipeline_push: pipeline changed", .{self.replica});
            return;
        };
@@ -4352,7 +4362,7 @@ pub fn ReplicaType(
        }

        assert(self.status == .view_change);
-        assert(self.
+        assert(self.primary_index(self.view) == self.replica);

        log.debug("{}: repair_pipeline_push: op={} checksum={}", .{
            self.replica,
@@ -4368,7 +4378,7 @@ pub fn ReplicaType(
        assert(self.pipeline.count >= 1);

        self.repairing_pipeline = true;
-        self.
+        self.primary_repair_pipeline_read();
    }

    fn repair_prepares(self: *Self) void {
@@ -4446,7 +4456,7 @@ pub fn ReplicaType(
                continue;
            }

            // If this is an uncommitted op, and we are the
+            // If this is an uncommitted op, and we are the primary in `view_change` status,
            // then we will `request_prepare` from the cluster, set `nack_prepare_op`,
            // and stop repairing any further prepares:
            // This will also rebroadcast any `request_prepare` every `repair_timeout` tick.
@@ -4454,7 +4464,7 @@ pub fn ReplicaType(
            if (self.nack_prepare_op) |nack_prepare_op| {
                assert(nack_prepare_op == op);
                assert(self.status == .view_change);
-                assert(self.
+                assert(self.primary_index(self.view) == self.replica);
                assert(op > self.commit_max);
                return;
            }
@@ -4474,7 +4484,7 @@ pub fn ReplicaType(

    /// During a view change, for uncommitted ops, which are few, we optimize for latency:
    ///
-    /// * request a `prepare` or `nack_prepare` from all
+    /// * request a `prepare` or `nack_prepare` from all backups in parallel,
    /// * repair as soon as we get a `prepare`, or
    /// * discard as soon as we get a majority of `nack_prepare` messages for the same checksum.
    ///
@@ -4509,9 +4519,9 @@ pub fn ReplicaType(

        // The message may be available in the local pipeline.
        // For example (replica_count=3):
-        // 1. View=1: Replica 1 is
+        // 1. View=1: Replica 1 is primary, and prepares op 5. The local write fails.
        // 2. Time passes. The view changes (e.g. due to a timeout)…
-        // 3. View=4: Replica 1 is
+        // 3. View=4: Replica 1 is primary again, and is repairing op 5
        //    (which is still in the pipeline).
        //
        // Using the pipeline to repair is faster than a `request_prepare`.
@@ -4543,7 +4553,7 @@ pub fn ReplicaType(

        const request_prepare = Header{
            .command = .request_prepare,
-            // If we request a prepare from a
+            // If we request a prepare from a backup, as below, it is critical to pass a
            // checksum: Otherwise we could receive different prepares for the same op number.
            .context = checksum,
            .timestamp = 1, // The checksum is included in context.
@@ -4554,8 +4564,8 @@ pub fn ReplicaType(
        };

        if (self.status == .view_change and op > self.commit_max) {
-            // Only the
-            assert(self.
+            // Only the primary is allowed to do repairs in a view change:
+            assert(self.primary_index(self.view) == self.replica);

            const reason = if (self.journal.faulty.bit(slot)) "faulty" else "dirty";
            log.debug(
@@ -4570,13 +4580,13 @@ pub fn ReplicaType(

            if (self.replica_count == 2 and !self.journal.faulty.bit(slot)) {
                // This is required to avoid a liveness issue for a cluster-of-two where a new
-                //
-                // the old
-                // been committed by the old
-                // the old
+                // primary learns of an op during a view change but where the op is faulty on
+                // the old primary. We must immediately roll back the op since it could not have
+                // been committed by the old primary if we know we do not have it, and because
+                // the old primary cannot send a nack_prepare for its faulty copy.
                // For this to be correct, the recovery protocol must set all headers as faulty,
                // not only as dirty.
-                self.
+                self.primary_discard_uncommitted_ops_from(op, checksum);
                return false;
            }

@@ -4612,7 +4622,7 @@ pub fn ReplicaType(
            // We expect that `repair_prepare()` is called in reverse chronological order:
            // Any uncommitted ops should have already been dealt with.
            // We never roll back committed ops, and thus never regard `nack_prepare` responses.
-            // Alternatively, we may not be the
+            // Alternatively, we may not be the primary, in which case we do distinguish anyway.
            assert(self.nack_prepare_op == null);
            assert(request_prepare.context == checksum);
            if (self.choose_any_other_replica()) |replica| {
@@ -4627,7 +4637,7 @@ pub fn ReplicaType(
        switch (self.status) {
            .view_change => {
                if (self.do_view_change_quorum) {
-                    assert(self.
+                    assert(self.primary_index(self.view) == self.replica);
                    return true;
                } else {
                    return false;
@@ -4687,8 +4697,8 @@ pub fn ReplicaType(
        if (!self.journal.has(header)) self.journal.set_header_as_dirty(header);
    }

-    /// Replicates to the next replica in the configuration (until we get back to the
-    /// Replication starts and ends with the
+    /// Replicates to the next replica in the configuration (until we get back to the primary):
+    /// Replication starts and ends with the primary, we never forward back to the primary.
    /// Does not flood the network with prepares that have already committed.
    /// TODO Use recent heartbeat data for next replica to leapfrog if faulty (optimization).
    fn replicate(self: *Self, message: *Message) void {
@@ -4703,7 +4713,7 @@ pub fn ReplicaType(
        }

        const next = @mod(self.replica + 1, @intCast(u8, self.replica_count));
-        if (next == self.
+        if (next == self.primary_index(message.header.view)) {
            log.debug("{}: replicate: not replicating (completed)", .{self.replica});
            return;
        }
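replicate above forwards each prepare to the next replica in the ring and stops once the successor would be the primary again, since replication starts and ends with the primary. A hypothetical standalone sketch of the next-hop rule:

    // Returns the replica to forward a prepare to, or null when the ring wraps back to the primary.
    fn next_hop(replica: u8, replica_count: u8, primary: u8) ?u8 {
        const next = @mod(replica + 1, replica_count);
        return if (next == primary) null else next;
    }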
@@ -4801,7 +4811,7 @@ pub fn ReplicaType(
    fn send_prepare_ok(self: *Self, header: *const Header) void {
        assert(header.command == .prepare);
        assert(header.cluster == self.cluster);
-        assert(header.replica == self.
+        assert(header.replica == self.primary_index(header.view));
        assert(header.view <= self.view);
        assert(header.op <= self.op or header.view < self.view);

@@ -4819,7 +4829,7 @@ pub fn ReplicaType(

        assert(self.status == .normal);
        // After a view change, replicas send prepare_oks for uncommitted ops with older views:
-        // However, we only send to the
+        // However, we only send to the primary of the current view (see below where we send).
        assert(header.view <= self.view);
        assert(header.op <= self.op);

@@ -4843,9 +4853,9 @@ pub fn ReplicaType(
        // sending its log to the new one, the old primary might commit an operation that
        // the new primary doesn't learn about in the do_view_change messages.

-        // We therefore only ever send to the
-        //
-        self.send_header_to_replica(self.
+        // We therefore only ever send to the primary of the current view, never to the
+        // primary of the prepare header's view:
+        self.send_header_to_replica(self.primary_index(self.view), .{
            .command = .prepare_ok,
            .parent = header.parent,
            .client = header.client,
@@ -4873,8 +4883,8 @@ pub fn ReplicaType(
        while (op <= self.op) : (op += 1) {
            // We may have breaks or stale headers in our uncommitted chain here. However:
            // * being able to send what we have will allow the pipeline to commit earlier, and
-            // * the
-            // This is safe only because the
+            // * the primary will drop any prepare_ok for a prepare not in the pipeline.
+            // This is safe only because the primary can verify against the prepare checksum.
            if (self.journal.header_with_op(op)) |header| {
                self.send_prepare_ok(header);
                defer self.flush_loopback_queue();
@@ -4921,12 +4931,12 @@ pub fn ReplicaType(
        // primary crashed. The new primary will use the NACK protocol to be sure of a discard.
        assert(message.header.commit == self.commit_min);

-        self.send_message_to_replica(self.
+        self.send_message_to_replica(self.primary_index(self.view), message);
    }

    fn send_eviction_message_to_client(self: *Self, client: u128) void {
        assert(self.status == .normal);
-        assert(self.
+        assert(self.primary());

        log.err("{}: too many sessions, sending eviction message to client={}", .{
            self.replica,
@@ -5012,10 +5022,10 @@ pub fn ReplicaType(
                assert(self.status == .normal);
                assert(message.header.view == self.view);
                assert(message.header.op <= self.op_checkpoint_trigger());
-                // We must only ever send a prepare_ok to the latest
-                // We must never straddle views by sending to a
-                // Otherwise, we would be enabling a partitioned
-                assert(replica == self.
+                // We must only ever send a prepare_ok to the latest primary of the active view:
+                // We must never straddle views by sending to a primary in an older view.
+                // Otherwise, we would be enabling a partitioned primary to commit.
+                assert(replica == self.primary_index(self.view));
                assert(message.header.replica == self.replica);
            },
            .start_view_change => {
@@ -5031,11 +5041,11 @@ pub fn ReplicaType(
                assert(message.header.replica == self.replica);
                assert(message.header.op == self.op);
                assert(message.header.commit == self.commit_min);
-                assert(replica == self.
+                assert(replica == self.primary_index(self.view));
            },
            .start_view => switch (self.status) {
                .normal => {
-                    // A
+                    // A backup may ask the primary to resend the start_view message.
                    assert(!self.start_view_change_quorum);
                    assert(!self.do_view_change_quorum);
                    assert(message.header.view == self.view);
@@ -5076,7 +5086,7 @@ pub fn ReplicaType(
            },
            .commit => {
                assert(self.status == .normal);
-                assert(self.
+                assert(self.primary());
                assert(message.header.view == self.view);
                assert(message.header.replica == self.replica);
                assert(message.header.replica != replica);
@@ -5085,7 +5095,7 @@ pub fn ReplicaType(
                assert(message.header.view >= self.view);
                assert(message.header.replica == self.replica);
                assert(message.header.replica != replica);
-                assert(self.
+                assert(self.primary_index(message.header.view) == replica);
            },
            .request_headers => {
                assert(message.header.view == self.view);
@@ -5101,7 +5111,7 @@ pub fn ReplicaType(
                assert(message.header.view == self.view);
                assert(message.header.replica == self.replica);
                assert(message.header.replica != replica);
-                assert(self.
+                assert(self.primary_index(self.view) == replica);
            },
            else => {
                log.info("{}: send_message_to_replica: TODO {s}", .{
@@ -5121,7 +5131,6 @@ pub fn ReplicaType(

    fn set_op_and_commit_max(self: *Self, op: u64, commit_max: u64, method: []const u8) void {
        assert(self.status == .view_change or self.status == .recovering);
-        assert(self.journal.status == .recovered);

        switch (self.status) {
            .normal => unreachable,
@@ -5141,13 +5150,13 @@ pub fn ReplicaType(
        assert(op <= self.op_checkpoint_trigger());

        // We expect that our commit numbers may also be greater even than `commit_max` because
-        // we may be the old
+        // we may be the old primary joining towards the end of the view change and we may have
        // committed `op` already.
        // However, this is bounded by pipelining.
        // The intersection property only requires that all possibly committed operations must
-        // survive into the new view so that they can then be committed by the new
-        // This guarantees that if the old
-        // new
+        // survive into the new view so that they can then be committed by the new primary.
+        // This guarantees that if the old primary possibly committed the operation, then the
+        // new primary will also commit the operation.
        if (commit_max < self.commit_max and self.commit_min == self.commit_max) {
            log.debug("{}: {s}: k={} < commit_max={} and commit_min == commit_max", .{
                self.replica,
@@ -5193,13 +5202,13 @@ pub fn ReplicaType(
    /// In other words, you can't end up in a situation with a DVC quorum like:
    ///
    /// replica   headers        commit_min
-    /// 0         4 5 _ _ 8      4 (new
+    /// 0         4 5 _ _ 8      4 (new primary; handling DVC quorum)
    /// 1         4 _ 6 _ 8      4
    /// 2         4 _ _ 7 8      4
    /// 3         (4 5 6 7 8)    8 (didn't participate in view change)
    /// 4         (4 5 6 7 8)    8 (didn't participate in view change)
    ///
-    /// where the new
+    /// where the new primary's headers depends on which of replica 1 and 2's DVC is used
    /// for repair before the other (i.e. whether they repair op 6 or 7 first).
    ///
    /// For the above case to occur, replicas 0, 1, and 2 must all share the highest `view_normal`.
@@ -5209,9 +5218,9 @@ pub fn ReplicaType(
    /// (If replica 0's view_normal was greater than 1/2's, then replica 0 must have all
    /// headers from previous views. Which means 6,7 are from the current view. But since
    /// replica 0 doesn't have 6/7, then replica 1/2 must share the latest view_normal. ∎)
-    fn
+    fn primary_set_log_from_do_view_change_messages(self: *Self) void {
        assert(self.status == .view_change);
-        assert(self.
+        assert(self.primary_index(self.view) == self.replica);
        assert(self.replica_count > 1);
        assert(self.start_view_change_quorum);
        assert(self.do_view_change_quorum);
@@ -5238,7 +5247,7 @@ pub fn ReplicaType(
        // Don't remove the uncanonical headers yet — even though the removed headers are
        // a subset of the DVC headers, removing and then adding them back would cause clean
        // headers to become dirty.
-        const op_canonical = self.
+        const op_canonical = self.primary_op_canonical_max(view_normal_canonical);
        assert(op_canonical <= self.op);
        assert(op_canonical >= self.op -| constants.pipeline_max);
        assert(op_canonical >= self.commit_min);
@@ -5249,7 +5258,7 @@ pub fn ReplicaType(
            // precluding recovery.
            //
            // TODO State transfer. Currently this is unreachable because the
-            //
+            // primary won't checkpoint until all replicas are caught up.
            unreachable;
        }

@@ -5330,7 +5339,7 @@ pub fn ReplicaType(
        assert(op_max <= self.op);
        assert(op_max >= self.commit_min);
        if (op_max != self.op) {
-            log.debug("{}:
+            log.debug("{}: primary_set_log_from_do_view_change_messages: discard op={}..{}", .{
                self.replica,
                op_max + 1,
                self.op,
@@ -5355,7 +5364,7 @@ pub fn ReplicaType(
        timestamp: u64,
    } {
        assert(self.status == .view_change);
-        assert(self.
+        assert(self.primary_index(self.view) == self.replica);
        assert(self.replica_count > 1);
        assert(self.start_view_change_quorum);
        assert(self.do_view_change_quorum);
@@ -5478,7 +5487,7 @@ pub fn ReplicaType(
    fn do_view_change_op_max(self: *const Self, op_canonical: u64) u64 {
        assert(self.replica_count > 1);
        assert(self.status == .view_change);
-        assert(self.
+        assert(self.primary_index(self.view) == self.replica);
        assert(self.do_view_change_quorum);
        assert(!self.repair_timeout.ticking);
        assert(self.op >= self.commit_max);
@@ -5530,14 +5539,14 @@ pub fn ReplicaType(
        return std.math.min(op_before_break, op_before_gap);
    }

-    fn
+    fn start_view_as_the_new_primary(self: *Self) void {
        assert(self.status == .view_change);
-        assert(self.
+        assert(self.primary_index(self.view) == self.replica);
        assert(self.do_view_change_quorum);
        assert(!self.repairing_pipeline);

        assert(self.commit_min == self.commit_max);
-        assert(self.
+        assert(self.primary_repair_pipeline_op() == null);
        self.verify_pipeline();
        assert(self.commit_max + self.pipeline.count == self.op);
        assert(self.valid_hash_chain_between(self.commit_min, self.op));
@@ -5554,7 +5563,7 @@ pub fn ReplicaType(
        assert(self.commit_max + self.pipeline.count == self.op);

        assert(self.status == .normal);
-        assert(self.
+        assert(self.primary());

        assert(start_view.references == 1);
        assert(start_view.header.command == .start_view);
@@ -5578,9 +5587,9 @@ pub fn ReplicaType(
        self.view_normal = new_view;
        self.status = .normal;

-        if (self.
+        if (self.primary()) {
            log.debug(
-                "{}: transition_to_normal_from_recovering_status: view={}
+                "{}: transition_to_normal_from_recovering_status: view={} primary",
                .{
                    self.replica,
                    self.view,
@@ -5599,7 +5608,7 @@ pub fn ReplicaType(
            self.recovery_timeout.stop();
        } else {
            log.debug(
-                "{}: transition_to_normal_from_recovering_status: view={}
+                "{}: transition_to_normal_from_recovering_status: view={} backup",
                .{
                    self.replica,
                    self.view,
@@ -5628,9 +5637,9 @@ pub fn ReplicaType(
        self.view_normal = new_view;
        self.status = .normal;

-        if (self.
+        if (self.primary()) {
            log.debug(
-                "{}: transition_to_normal_from_view_change_status: view={}
+                "{}: transition_to_normal_from_view_change_status: view={} primary",
                .{
                    self.replica,
                    self.view,
@@ -5650,7 +5659,7 @@ pub fn ReplicaType(
            // Do not reset the pipeline as there may be uncommitted ops to drive to completion.
            if (self.pipeline.count > 0) self.prepare_timeout.start();
        } else {
-            log.debug("{}: transition_to_normal_from_view_change_status: view={}
+            log.debug("{}: transition_to_normal_from_view_change_status: view={} backup", .{
                self.replica,
                self.view,
            });
@@ -5789,7 +5798,7 @@ pub fn ReplicaType(
        assert(op_min >= self.op_checkpoint);

        // If we use anything less than self.op then we may commit ops for a forked hash chain
-        // that have since been reordered by a new
+        // that have since been reordered by a new primary.
        assert(op_max == self.op);
        var b = self.journal.header_with_op(op_max).?;

@@ -5917,7 +5926,7 @@ pub fn ReplicaType(

        // TODO Debounce and decouple this from `on_message()` by moving into `tick()`:
        log.debug("{}: view_jump: requesting start_view message", .{self.replica});
-        self.send_header_to_replica(self.
+        self.send_header_to_replica(self.primary_index(header.view), .{
            .command = .request_start_view,
            .cluster = self.cluster,
            .replica = self.replica,
@@ -5989,6 +5998,7 @@ pub fn ReplicaType(
                // This is an optimization to eliminate waiting until the next repair timeout.
                .repair => self.repair(),
                .pipeline => self.repair(),
+                .fix => unreachable,
            }
        }
    };