tigerbeetle-node 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -38
- package/package.json +1 -1
- package/scripts/download_node_headers.sh +3 -1
- package/src/node.zig +1 -1
- package/src/tigerbeetle/scripts/benchmark.bat +46 -46
- package/src/tigerbeetle/scripts/install_zig.bat +109 -109
- package/src/tigerbeetle/scripts/install_zig.sh +6 -2
- package/src/tigerbeetle/scripts/vopr.bat +47 -47
- package/src/tigerbeetle/src/config.zig +17 -13
- package/src/tigerbeetle/src/demo.zig +2 -2
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +3 -3
- package/src/tigerbeetle/src/demo_04_create_pending_transfers.zig +10 -10
- package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +7 -7
- package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +3 -3
- package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +1 -1
- package/src/tigerbeetle/src/main.zig +18 -2
- package/src/tigerbeetle/src/message_pool.zig +4 -1
- package/src/tigerbeetle/src/ring_buffer.zig +48 -3
- package/src/tigerbeetle/src/simulator.zig +104 -8
- package/src/tigerbeetle/src/state_machine.zig +13 -23
- package/src/tigerbeetle/src/test/cluster.zig +165 -32
- package/src/tigerbeetle/src/test/packet_simulator.zig +14 -1
- package/src/tigerbeetle/src/test/state_checker.zig +3 -1
- package/src/tigerbeetle/src/test/state_machine.zig +8 -7
- package/src/tigerbeetle/src/test/storage.zig +99 -40
- package/src/tigerbeetle/src/vsr/journal.zig +1387 -459
- package/src/tigerbeetle/src/vsr/replica.zig +1199 -412
- package/src/tigerbeetle/src/vsr.zig +203 -49
@@ -18,6 +18,24 @@ const log = std.log.scoped(.replica);
 pub const Status = enum {
     normal,
     view_change,
+    // Recovery (for replica_count > 1):
+    //
+    // 1. At replica start: `status=recovering` and `journal.recovered=false`
+    // 2. Load the WAL. Mark questionable entries as faulty.
+    // 3. If the WAL has no entries (besides the initial commit), skip to step 5 with view 0.
+    // 4. Run VSR recovery protocol:
+    //    a. Send a `recovery` message to every replica (except self).
+    //    b. Wait for f+1 `recovery_response` messages from replicas in `normal` status.
+    //       Each `recovery_response` includes the current view number.
+    //       Each `recovery_response` must include a nonce matching the `recovery` message.
+    //    c. Wait for a `recovery_response` from the leader of the highest known view.
+    // 5. Transition to `status=normal` with the discovered view number:
+    //    * Set `op` to the highest op in the leader's recovery response.
+    //    * Repair faulty messages.
+    //    * Commit through to the discovered `commit_max`.
+    //    * Set `state_machine.prepare_timestamp` to the current op's timestamp.
+    //
+    // TODO document snapshot recovery in this progression
     recovering,
 };
 
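The recovery progression documented above gates the recovering→normal transition on two conditions from step 4: at least f+1 distinct responses, and a response from the leader of the highest view seen so far that confirms that view. Below is a minimal, standalone sketch of that completion check; the `Response` type, the `recovery_quorum_reached` helper, and the round-robin `view % replica_count` leader rule are assumptions for illustration, not code from this diff.

    const std = @import("std");

    const Response = struct {
        replica: u8,
        view: u32,
    };

    // Recovery finishes only once (a) at least f+1 distinct responses arrived, and
    // (b) the leader of the highest view seen is among the responders and reports
    // that same view (otherwise the recovery timeout retries another round).
    fn recovery_quorum_reached(responses: []const ?Response, f: u8, replica_count: u8) bool {
        var count: u8 = 0;
        var view_max: u32 = 0;
        for (responses) |received| {
            if (received) |response| {
                count += 1;
                view_max = std.math.max(view_max, response.view);
            }
        }
        if (count < f + 1) return false; // Step 4b: wait for a quorum.
        // Step 4c: wait for the leader of the highest known view.
        const leader = @intCast(u8, view_max % replica_count);
        const leader_response = responses[leader] orelse return false;
        return leader_response.view == view_max;
    }

    pub fn main() void {
        // Cluster of three (f = 1): responses from replicas 1 and 2, and the leader
        // of view 7 is replica 7 % 3 = 1, which did respond with view 7.
        var responses = [_]?Response{ null, .{ .replica = 1, .view = 7 }, .{ .replica = 2, .view = 5 } };
        std.debug.assert(recovery_quorum_reached(&responses, 1, 3));
    }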
@@ -47,22 +65,24 @@ const ClientTableEntry = struct {
     reply: *Message,
 };
 
+const Nonce = u128;
+
 const Prepare = struct {
     /// The current prepare message (used to cross-check prepare_ok messages, and for resending).
     message: *Message,
 
     /// Unique prepare_ok messages for the same view, op number and checksum from ALL replicas.
-    ok_from_all_replicas: QuorumCounter =
+    ok_from_all_replicas: QuorumCounter = quorum_counter_null,
 
     /// Whether a quorum of prepare_ok messages has been received for this prepare.
     ok_quorum_received: bool = false,
 };
 
 const QuorumMessages = [config.replicas_max]?*Message;
-const
+const quorum_messages_null = [_]?*Message{null} ** config.replicas_max;
 
 const QuorumCounter = std.StaticBitSet(config.replicas_max);
-const
+const quorum_counter_null = QuorumCounter.initEmpty();
 
 pub fn Replica(
     comptime StateMachine: type,
@@ -114,12 +134,17 @@ pub fn Replica(
         view_normal: u32,
 
         /// The current status, either normal, view_change, or recovering:
-
-        status: Status = .normal,
+        status: Status = .recovering,
 
         /// The op number assigned to the most recently prepared operation:
         op: u64,
 
+        /// The op of the highest checkpointed message.
+        // TODO Update this to use LSM storage.
+        // TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
+        // TODO Enforce invariant op≥op_checkpoint.
+        op_checkpoint: u64 = 0,
+
         /// The op number of the latest committed and executed operation (according to the replica):
         /// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
         commit_min: u64,
@@ -136,6 +161,9 @@ pub fn Replica(
 
         /// The leader's pipeline of inflight prepares waiting to commit in FIFO order.
         /// This allows us to pipeline without the complexity of out-of-order commits.
+        ///
+        /// After a view change, the old leader's pipeline is left untouched so that it is able to
+        /// help the new leader repair, even in the face of local storage faults.
         pipeline: RingBuffer(Prepare, config.pipeline_max) = .{},
 
         /// In some cases, a replica may send a message to itself. We do not submit these messages
@@ -144,13 +172,16 @@ pub fn Replica(
         loopback_queue: ?*Message = null,
 
         /// Unique start_view_change messages for the same view from OTHER replicas (excluding ourself).
-        start_view_change_from_other_replicas: QuorumCounter =
+        start_view_change_from_other_replicas: QuorumCounter = quorum_counter_null,
 
         /// Unique do_view_change messages for the same view from ALL replicas (including ourself).
-        do_view_change_from_all_replicas: QuorumMessages =
+        do_view_change_from_all_replicas: QuorumMessages = quorum_messages_null,
 
         /// Unique nack_prepare messages for the same view from OTHER replicas (excluding ourself).
-        nack_prepare_from_other_replicas: QuorumCounter =
+        nack_prepare_from_other_replicas: QuorumCounter = quorum_counter_null,
+
+        /// Unique recovery_response messages from OTHER replicas (excluding ourself).
+        recovery_response_from_other_replicas: QuorumMessages = quorum_messages_null,
 
         /// Whether a replica has received a quorum of start_view_change messages for the view change:
         start_view_change_quorum: bool = false,
@@ -189,6 +220,12 @@ pub fn Replica(
         /// The number of ticks before repairing missing/disconnected headers and/or dirty entries:
         repair_timeout: Timeout,
 
+        /// The number of ticks before attempting to send another set of `recovery` messages.
+        recovery_timeout: Timeout,
+
+        /// The nonce of the `recovery` messages.
+        recovery_nonce: Nonce,
+
         /// Used to provide deterministic entropy to `choose_any_other_replica()`.
         /// Incremented whenever `choose_any_other_replica()` is called.
         choose_any_other_replica_ticks: u64 = 0,
@@ -245,25 +282,27 @@ pub fn Replica(
             try client_table.ensureTotalCapacity(allocator, @intCast(u32, config.clients_max));
             assert(client_table.capacity() >= config.clients_max);
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            const root_prepare = Header.root_prepare(cluster);
+
+            var clock = try Clock.init(
+                allocator,
+                replica_count,
+                replica,
+                time,
+            );
+            errdefer clock.deinit(allocator);
+
+            const journal = try Journal.init(allocator, storage, replica);
+            errdefer journal.deinit(allocator);
+
+            const recovery_nonce = blk: {
+                var nonce: [@sizeOf(Nonce)]u8 = undefined;
+                var hash = std.crypto.hash.Blake3.init(.{});
+                hash.update(std.mem.asBytes(&clock.monotonic()));
+                hash.update(&[_]u8{replica});
+                hash.final(&nonce);
+                break :blk @bitCast(Nonce, nonce);
             };
-            init_prepare.set_checksum_body(&[0]u8{});
-            init_prepare.set_checksum();
 
             var self = Self{
                 .cluster = cluster,
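The `recovery` request carries a nonce so that the recovering replica can tell which round of requests a given response answers, and can drop network replays. The sketch below derives a nonce the same way the new `init` code above does (Blake3 over the monotonic clock and the replica index); the `recovery_nonce` function name and its inputs are stand-ins for illustration.

    const std = @import("std");

    fn recovery_nonce(monotonic_time: u64, replica: u8) u128 {
        var nonce: [@sizeOf(u128)]u8 = undefined;
        var hash = std.crypto.hash.Blake3.init(.{});
        hash.update(std.mem.asBytes(&monotonic_time));
        hash.update(&[_]u8{replica});
        hash.final(&nonce);
        return @bitCast(u128, nonce);
    }

    pub fn main() void {
        // A responder echoes the nonce in `recovery_response.context`; the recovering
        // replica discards any response whose nonce does not match its current round.
        const nonce = recovery_nonce(123456789, 2);
        const response_context: u128 = nonce; // As echoed by a responder.
        std.debug.assert(response_context == nonce);
    }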
@@ -271,28 +310,16 @@ pub fn Replica(
                 .replica = replica,
                 .quorum_replication = quorum_replication,
                 .quorum_view_change = quorum_view_change,
-                .clock =
-
-                    replica_count,
-                    replica,
-                    time,
-                ),
-                .journal = try Journal.init(
-                    allocator,
-                    storage,
-                    replica,
-                    config.journal_size_max,
-                    config.journal_headers_max,
-                    &init_prepare,
-                ),
+                .clock = clock,
+                .journal = journal,
                 .message_bus = message_bus,
                 .state_machine = state_machine,
                 .client_table = client_table,
-                .view =
-                .view_normal =
-                .op =
-                .commit_min =
-                .commit_max =
+                .view = root_prepare.view,
+                .view_normal = root_prepare.view,
+                .op = root_prepare.op,
+                .commit_min = root_prepare.commit,
+                .commit_max = root_prepare.commit,
                 .ping_timeout = Timeout{
                     .name = "ping_timeout",
                     .id = replica,
@@ -328,6 +355,12 @@ pub fn Replica(
                     .id = replica,
                     .after = 50,
                 },
+                .recovery_timeout = Timeout{
+                    .name = "recovery_timeout",
+                    .id = replica,
+                    .after = 200,
+                },
+                .recovery_nonce = recovery_nonce,
                 .prng = std.rand.DefaultPrng.init(replica),
             };
 
@@ -346,20 +379,7 @@ pub fn Replica(
                 config.clients_max,
             });
 
-
-            // can race with tick()... before timeouts have been initialized:
-            assert(self.status == .normal);
-            if (self.leader()) {
-                log.debug("{}: init: leader", .{self.replica});
-                self.ping_timeout.start();
-                self.commit_timeout.start();
-                self.repair_timeout.start();
-            } else {
-                log.debug("{}: init: follower", .{self.replica});
-                self.ping_timeout.start();
-                self.normal_status_timeout.start();
-                self.repair_timeout.start();
-            }
+            assert(self.status == .recovering);
 
             return self;
         }
@@ -378,12 +398,7 @@ pub fn Replica(
                 self.client_table.deinit(allocator);
             }
 
-
-            var it = self.pipeline.iterator();
-            while (it.next()) |prepare| {
-                self.message_bus.unref(prepare.message);
-            }
-            }
+            while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
 
             if (self.loopback_queue) |loopback_message| {
                 assert(loopback_message.next == null);
@@ -394,6 +409,10 @@ pub fn Replica(
             for (self.do_view_change_from_all_replicas) |message| {
                 if (message) |m| self.message_bus.unref(m);
             }
+
+            for (self.recovery_response_from_other_replicas) |message| {
+                if (message) |m| self.message_bus.unref(m);
+            }
         }
 
         /// Time is measured in logical ticks that are incremented on every call to tick().
@@ -408,12 +427,40 @@ pub fn Replica(
             self.clock.tick();
 
             if (!self.journal.recovered) {
-                self.journal.recover();
+                if (!self.journal.recovering) self.journal.recover();
                 return;
             } else {
                 assert(!self.journal.recovering);
            }
 
+            if (self.status == .recovering) {
+                if (self.recovery_timeout.ticking) {
+                    // Continue running the VSR recovery protocol.
+                    self.recovery_timeout.tick();
+                    if (self.recovery_timeout.fired()) self.on_recovery_timeout();
+                } else if (self.journal.is_empty()) {
+                    // The data file is brand new - no messages have ever been written.
+                    // Transition to normal status; no need to run the VSR recovery protocol.
+                    assert(self.journal.faulty.count == 0);
+                    self.transition_to_normal_from_recovering_status(0);
+                    assert(self.status == .normal);
+                } else if (self.replica_count == 1) {
+                    // A cluster-of-one does not run the VSR recovery protocol.
+                    if (self.journal.faulty.count != 0) @panic("journal is corrupt");
+                    if (self.committing) return;
+                    assert(self.op == 0);
+                    self.op = self.journal.op_maximum();
+                    self.commit_ops(self.op);
+                    // The recovering→normal transition is deferred until all ops are committed.
+                } else {
+                    // The journal just finished recovery.
+                    // Now try to learn the current view via the VSR recovery protocol.
+                    self.recovery_timeout.start();
+                    self.recover();
+                }
+                return;
+            }
+
             self.ping_timeout.tick();
             self.prepare_timeout.tick();
             self.commit_timeout.tick();
@@ -437,11 +484,12 @@ pub fn Replica(
         /// Called by the MessageBus to deliver a message to the replica.
         pub fn on_message(self: *Self, message: *Message) void {
             assert(self.loopback_queue == null);
+            assert(message.references > 0);
 
-            log.debug("{}: on_message: view={} status={
+            log.debug("{}: on_message: view={} status={} {}", .{
                 self.replica,
                 self.view,
-
+                self.status,
                 message.header,
             });
 
@@ -463,7 +511,6 @@ pub fn Replica(
             }
 
             if (!self.journal.recovered) {
-                self.journal.recover();
                 log.debug("{}: on_message: waiting for journal to recover", .{self.replica});
                 return;
             } else {
@@ -482,7 +529,7 @@ pub fn Replica(
                 .do_view_change => self.on_do_view_change(message),
                 .start_view => self.on_start_view(message),
                 .recovery => self.on_recovery(message),
-                .recovery_response =>
+                .recovery_response => self.on_recovery_response(message),
                 .request_start_view => self.on_request_start_view(message),
                 .request_prepare => self.on_request_prepare(message),
                 .request_headers => self.on_request_headers(message),
@@ -542,7 +589,7 @@ pub fn Replica(
             } else {
                 // Copy the ping's monotonic timestamp to our pong and add our wall clock sample:
                 pong.op = message.header.op;
-                pong.
+                pong.timestamp = @bitCast(u64, self.clock.realtime());
                 self.send_header_to_replica(message.header.replica, pong);
             }
         }
@@ -552,7 +599,7 @@ pub fn Replica(
             if (message.header.replica == self.replica) return;
 
             const m0 = message.header.op;
-            const t1 = @bitCast(i64, message.header.
+            const t1 = @bitCast(i64, message.header.timestamp);
             const m2 = self.clock.monotonic();
 
             self.clock.learn(message.header.replica, m0, t1, m2);
@@ -560,9 +607,9 @@ pub fn Replica(
 
         /// The primary advances op-number, adds the request to the end of the log, and updates the
         /// information for this client in the client-table to contain the new request number, s.
-        /// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the
-        /// view-number, m is the message it received from the client, n is the op-number
-        /// the request, and k is the commit-number.
+        /// Then it sends a ⟨PREPARE v, m, n, k⟩ message to the other replicas, where v is the
+        /// current view-number, m is the message it received from the client, n is the op-number
+        /// it assigned to the request, and k is the commit-number.
         fn on_request(self: *Self, message: *Message) void {
             if (self.ignore_request_message(message)) return;
 
@@ -581,19 +628,30 @@ pub fn Replica(
 
             log.debug("{}: on_request: request {}", .{ self.replica, message.header.checksum });
 
-
-
+            // Guard against the wall clock going backwards by taking the max with timestamps issued:
+            self.state_machine.prepare_timestamp = std.math.max(
+                // The cluster `commit_timestamp` may be ahead of our `prepare_timestamp` because this
+                // may be our first prepare as a recently elected leader:
+                std.math.max(
+                    self.state_machine.prepare_timestamp,
+                    self.state_machine.commit_timestamp,
+                ) + 1,
+                @intCast(u64, realtime),
+            );
+            assert(self.state_machine.prepare_timestamp > self.state_machine.commit_timestamp);
+
+            const prepare_timestamp = self.state_machine.prepare(
                 message.header.operation.cast(StateMachine),
                 message.body(),
             );
 
-
+            const latest_entry = self.journal.header_with_op(self.op).?;
             message.header.parent = latest_entry.checksum;
             message.header.context = message.header.checksum;
             message.header.view = self.view;
             message.header.op = self.op + 1;
             message.header.commit = self.commit_max;
-            message.header.
+            message.header.timestamp = prepare_timestamp;
             message.header.replica = self.replica;
             message.header.command = .prepare;
 
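The nested `std.math.max` above enforces two rules at once: prepare timestamps must advance strictly even if the wall clock steps backwards, and a newly elected leader must jump past the cluster's `commit_timestamp` before issuing its first prepare. A standalone restatement with worked values (not the replica's code):

    const std = @import("std");

    fn next_prepare_timestamp(prepare: u64, commit: u64, realtime: u64) u64 {
        return std.math.max(std.math.max(prepare, commit) + 1, realtime);
    }

    pub fn main() void {
        // Wall clock ahead of logical time: adopt it.
        std.debug.assert(next_prepare_timestamp(10, 9, 50) == 50);
        // Wall clock went backwards: still advance strictly past prior timestamps.
        std.debug.assert(next_prepare_timestamp(10, 9, 5) == 11);
        // A recently elected leader may see commit_timestamp ahead of prepare_timestamp.
        std.debug.assert(next_prepare_timestamp(3, 7, 5) == 8);
    }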
@@ -612,6 +670,8 @@ pub fn Replica(
             } else {
                 // Do not restart the prepare timeout as it is already ticking for another prepare.
                 assert(self.prepare_timeout.ticking);
+                const previous = self.pipeline.get_ptr(self.pipeline.count - 2).?;
+                assert(previous.message.header.checksum == message.header.parent);
             }
 
             self.on_prepare(message);
@@ -625,22 +685,23 @@ pub fn Replica(
         ///
         /// The leader starts by sending a prepare message to itself.
         ///
-        /// Each replica (including the leader) then forwards this prepare message to the next
-        /// in the configuration, in parallel to writing to its own journal, closing the
-        /// the next replica is back to the leader, in which case the replica does not
+        /// Each replica (including the leader) then forwards this prepare message to the next
+        /// replica in the configuration, in parallel to writing to its own journal, closing the
+        /// circle until the next replica is back to the leader, in which case the replica does not
+        /// forward.
         ///
         /// This keeps the leader's outgoing bandwidth limited (one-for-one) to incoming bandwidth,
-        /// since the leader need only replicate to the next replica. Otherwise, the leader would
-        /// to replicate to multiple followers, dividing available bandwidth.
+        /// since the leader need only replicate to the next replica. Otherwise, the leader would
+        /// need to replicate to multiple followers, dividing available bandwidth.
         ///
-        /// This does not impact latency, since with Flexible Paxos we need only one remote
-        /// It is ideal if this synchronous replication to one remote replica is to the
-        /// since that is the replica next in line to be leader, which will need to
-        /// it can start the next view.
+        /// This does not impact latency, since with Flexible Paxos we need only one remote
+        /// prepare_ok. It is ideal if this synchronous replication to one remote replica is to the
+        /// next replica, since that is the replica next in line to be leader, which will need to
+        /// be up-to-date before it can start the next view.
         ///
-        /// At the same time, asynchronous replication keeps going, so that if our local disk is
-        /// then any latency spike will be masked by more remote prepare_ok messages as they
-        /// This gives automatic tail latency tolerance for storage latency spikes.
+        /// At the same time, asynchronous replication keeps going, so that if our local disk is
+        /// slow, then any latency spike will be masked by more remote prepare_ok messages as they
+        /// come in. This gives automatic tail latency tolerance for storage latency spikes.
         ///
         /// The remaining problem then is tail latency tolerance for network latency spikes.
         /// If the next replica is down or partitioned, then the leader's prepare timeout will fire,
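A minimal sketch of the forwarding rule described in the comment above, assuming a hypothetical `next_hop` helper and a ring ordered by replica index (not the replica's actual code): each replica relays a prepare to its successor, except when the successor is the leader, which closes the circle.

    const std = @import("std");

    fn next_hop(replica: u8, replica_count: u8, leader: u8) ?u8 {
        const next = (replica + 1) % replica_count;
        // Forwarding stops once the ring would return to the leader.
        return if (next == leader) null else next;
    }

    pub fn main() void {
        // Cluster of three with leader 0: 0 forwards to 1, 1 to 2, and 2 stops.
        std.debug.assert(next_hop(0, 3, 0).? == 1);
        std.debug.assert(next_hop(1, 3, 0).? == 2);
        std.debug.assert(next_hop(2, 3, 0) == null);
    }

This is what keeps the leader's outgoing bandwidth one-for-one with incoming bandwidth: it always sends each prepare to exactly one peer, regardless of cluster size.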
@@ -669,12 +730,26 @@ pub fn Replica(
                 return;
             }
 
+            // Verify that the new request will fit in the WAL.
+            if (message.header.op >= self.op_checkpoint + config.journal_slot_count) {
+                log.debug("{}: on_prepare: ignoring op={} (too far ahead, checkpoint={})", .{
+                    self.replica,
+                    message.header.op,
+                    self.op_checkpoint,
+                });
+                // When we are the leader, `on_request` enforces this invariant.
+                assert(self.follower());
+                return;
+            }
+
             assert(self.status == .normal);
             assert(message.header.view == self.view);
             assert(self.leader() or self.follower());
             assert(message.header.replica == self.leader_index(message.header.view));
+            assert(message.header.op > self.op_checkpoint);
             assert(message.header.op > self.op);
             assert(message.header.op > self.commit_min);
+            assert(message.header.op < self.op_checkpoint + config.journal_slot_count);
 
             if (self.follower()) self.normal_status_timeout.reset();
 
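The new checks above bound every accepted op to the WAL's circular window. A standalone restatement of that window predicate, with `journal_slot_count` passed as a parameter (in the real code it comes from `config`); accepting an op at or beyond `op_checkpoint + journal_slot_count` would wrap the circular WAL onto a slot whose prepare has not yet been checkpointed.

    const std = @import("std");

    fn op_fits_in_wal(op: u64, op_checkpoint: u64, journal_slot_count: u64) bool {
        return op > op_checkpoint and op < op_checkpoint + journal_slot_count;
    }

    pub fn main() void {
        // With checkpoint 0 and 8 slots, ops 1..7 are storable; op 8 would wrap
        // onto the slot still holding the root prepare.
        std.debug.assert(op_fits_in_wal(1, 0, 8));
        std.debug.assert(op_fits_in_wal(7, 0, 8));
        std.debug.assert(!op_fits_in_wal(8, 0, 8));
    }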
@@ -685,7 +760,7 @@ pub fn Replica(
 
             if (self.journal.previous_entry(message.header)) |previous| {
                 // Any previous entry may be a whole journal's worth of ops behind due to wrapping.
-                // We therefore do not do any further op
+                // We therefore do not do any further op or checksum assertions beyond this:
                 self.panic_if_hash_chain_would_break_in_the_same_view(previous, message.header);
             }
 
@@ -700,7 +775,7 @@ pub fn Replica(
             });
             assert(message.header.op == self.op + 1);
             self.op = message.header.op;
-            self.journal.
+            self.journal.set_header_as_dirty(message.header);
 
             self.replicate(message);
             self.append(message);
@@ -780,7 +855,7 @@ pub fn Replica(
             assert(message.header.replica == self.leader_index(message.header.view));
 
             // We may not always have the latest commit entry but if we do our checksum must match:
-            if (self.journal.
+            if (self.journal.header_with_op(message.header.commit)) |commit_entry| {
                 if (commit_entry.checksum == message.header.context) {
                     log.debug("{}: on_commit: checksum verified", .{self.replica});
                 } else if (self.valid_hash_chain("on_commit")) {
@@ -792,7 +867,6 @@ pub fn Replica(
             }
 
             self.normal_status_timeout.reset();
-
             self.commit_ops(message.header.commit);
         }
 
@@ -951,7 +1025,7 @@ pub fn Replica(
 
             var v: ?u32 = null;
             var k: ?u64 = null;
-            var latest = Header.reserved();
+            var latest = Header.reserved(self.cluster, 0);
 
             for (self.do_view_change_from_all_replicas) |received, replica| {
                 if (received) |m| {
@@ -962,10 +1036,10 @@ pub fn Replica(
 
                     // The latest normal view experienced by this replica:
                     // This may be higher than the view in any of the prepare headers.
-                    var replica_view_normal = @intCast(u32, m.header.
+                    var replica_view_normal = @intCast(u32, m.header.timestamp);
                     assert(replica_view_normal < m.header.view);
 
-                    var replica_latest = Header.reserved();
+                    var replica_latest = Header.reserved(self.cluster, 0);
                     set_latest_op(self.message_body_as_headers(m), &replica_latest);
                     assert(replica_latest.op == m.header.op);
 
@@ -1005,7 +1079,7 @@ pub fn Replica(
             }
 
             // Verify that the repairs above have not replaced or advanced the latest op:
-            assert(self.journal.
+            assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
 
             assert(self.start_view_change_quorum);
             assert(!self.do_view_change_quorum);
@@ -1013,7 +1087,11 @@ pub fn Replica(
 
             self.discard_uncommitted_headers();
             assert(self.op >= self.commit_max);
-
+
+            const prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
+            if (self.state_machine.prepare_timestamp < prepare_timestamp) {
+                self.state_machine.prepare_timestamp = prepare_timestamp;
+            }
 
             // Start repairs according to the CTRL protocol:
             assert(!self.repair_timeout.ticking);
@@ -1041,7 +1119,7 @@ pub fn Replica(
             assert(self.status == .view_change);
             assert(message.header.view == self.view);
 
-            var latest = Header.reserved();
+            var latest = Header.reserved(self.cluster, 0);
             set_latest_op(self.message_body_as_headers(message), &latest);
             assert(latest.op == message.header.op);
 
@@ -1053,10 +1131,10 @@ pub fn Replica(
             }
 
             // Verify that the repairs above have not replaced or advanced the latest op:
-            assert(self.journal.
+            assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
 
             if (self.status == .view_change) {
-                self.
+                self.transition_to_normal_from_view_change_status(message.header.view);
                 self.send_prepare_oks_after_view_change();
             }
 
@@ -1089,8 +1167,9 @@ pub fn Replica(
             self.send_message_to_replica(message.header.replica, start_view);
         }
 
-        /// TODO This is a work in progress (out of scope for the bounty)
         fn on_recovery(self: *Self, message: *const Message) void {
+            assert(self.replica_count > 1);
+
             if (self.status != .normal) {
                 log.debug("{}: on_recovery: ignoring ({})", .{ self.replica, self.status });
                 return;
@@ -1104,34 +1183,28 @@ pub fn Replica(
             const response = self.message_bus.get_message();
             defer self.message_bus.unref(response);
 
+            log.debug("{}: on_recovery: view={} op={} commit={} nonce={}", .{
+                self.replica,
+                self.view,
+                self.op,
+                self.commit_max,
+                message.header.context,
+            });
+
             response.header.* = .{
                 .command = .recovery_response,
                 .cluster = self.cluster,
-                .context = message.header.context,
+                .context = message.header.context, // Echo the request's nonce.
                 .replica = self.replica,
                 .view = self.view,
                 .op = self.op,
                 .commit = self.commit_max,
             };
 
-            const count_max = 8; // The number of prepare headers to include in the body.
-
-
-
-                1 + count_max,
-            );
-            assert(size_max > @sizeOf(Header));
-
-            const count = self.journal.copy_latest_headers_between(
-                0,
-                self.op,
-                std.mem.bytesAsSlice(Header, response.buffer[@sizeOf(Header)..size_max]),
-            );
-
-            // We expect that self.op always exists.
-            assert(count > 0);
-
-            response.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
+            const count_max = 8; // The maximum number of prepare headers to include in the body.
+            const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, response);
+            assert(count > 0); // We expect that self.op always exists.
+            assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
 
             response.header.set_checksum_body(response.body());
             response.header.set_checksum();
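The new `copy_latest_headers_and_set_size` call packs at most `count_max` prepare headers after the message's own header, which is what the `@divExact` assertion checks. A worked example of that size arithmetic, assuming a 128-byte header (the header size is an assumption here, not stated in this diff):

    const std = @import("std");

    const header_size: u32 = 128;

    fn response_size(count: u32) u32 {
        // One header for the message itself, plus `count` prepare headers in the body.
        return header_size * (1 + count);
    }

    pub fn main() void {
        const count_max = 8;
        std.debug.assert(response_size(count_max) == 1152); // 128 * 9.
        std.debug.assert(response_size(1) / header_size == 1 + 1);
    }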
@@ -1143,68 +1216,339 @@ pub fn Replica(
             self.send_message_to_replica(message.header.replica, response);
         }
 
-        /// TODO This is a work in progress (out of scope for the bounty)
         fn on_recovery_response(self: *Self, message: *Message) void {
-
-
+            assert(self.replica_count > 1);
+
+            if (self.status != .recovering) {
+                log.debug("{}: on_recovery_response: ignoring ({})", .{
+                    self.replica,
+                    self.status,
+                });
+                return;
+            }
+
+            if (message.header.replica == self.replica) {
+                log.warn("{}: on_recovery_response: ignoring (self)", .{self.replica});
+                return;
+            }
+
+            if (message.header.context != self.recovery_nonce) {
+                log.warn("{}: on_recovery_response: ignoring (different nonce)", .{self.replica});
+                return;
+            }
+
+            // Recovery messages with our nonce are not sent until after the journal is recovered.
+            assert(self.journal.recovered);
+
+            var responses: *QuorumMessages = &self.recovery_response_from_other_replicas;
+            if (responses[message.header.replica]) |existing| {
+                assert(message.header.replica == existing.header.replica);
+
+                if (message.header.checksum == existing.header.checksum) {
+                    // The response was replayed by the network; ignore it.
+                    log.debug("{}: on_recovery_response: ignoring (duplicate message)", .{
+                        self.replica,
+                    });
+                    return;
+                }
+
+                // We received a second (distinct) response from a replica. Possible causes:
+                // * We retried the `recovery` message, because we had not yet received a quorum.
+                // * The `recovery` message was duplicated/misdirected by the network, and the
+                //   receiver's state changed in the meantime.
+
+                log.debug(
+                    "{}: on_recovery_response: replica={} view={}..{} op={}..{} commit={}..{}",
+                    .{
+                        self.replica,
+                        existing.header.replica,
+                        existing.header.view,
+                        message.header.view,
+                        existing.header.op,
+                        message.header.op,
+                        existing.header.commit,
+                        message.header.commit,
+                    },
+                );
+
+                if (message.header.view < existing.header.view or
+                    (message.header.view == existing.header.view and
+                    message.header.op < existing.header.op) or
+                    (message.header.view == existing.header.view and
+                    message.header.op == existing.header.op and
+                    message.header.commit < existing.header.commit))
+                {
+                    // The second message is older than the first one (reordered packets).
+                    log.debug("{}: on_recovery_response: ignoring (older)", .{self.replica});
+                    return;
+                }
+
+                // The second message is newer than the first one.
+                assert(message.header.view >= existing.header.view);
+                // The op number may regress if an uncommitted op was discarded in a higher view.
+                assert(message.header.op >= existing.header.op or
+                    message.header.view > existing.header.view);
+                assert(message.header.commit >= existing.header.commit);
+
+                self.message_bus.unref(existing);
+                responses[message.header.replica] = null;
+            } else {
+                log.debug(
+                    "{}: on_recovery_response: replica={} view={} op={} commit={}",
+                    .{
+                        self.replica,
+                        message.header.replica,
+                        message.header.view,
+                        message.header.op,
+                        message.header.commit,
+                    },
+                );
+            }
+
+            assert(responses[message.header.replica] == null);
+            responses[message.header.replica] = message.ref();
+
+            // Wait until we have:
+            // * at least `f + 1` messages for quorum (not including ourself), and
+            // * a response from the leader of the highest discovered view.
+            const count = self.count_quorum(responses, .recovery_response, self.recovery_nonce);
+            assert(count <= self.replica_count - 1);
+
+            const threshold = self.quorum_view_change;
+            if (count < threshold) {
+                log.debug("{}: on_recovery_response: waiting for quorum ({}/{})", .{
+                    self.replica,
+                    count,
+                    threshold,
+                });
+                return;
+            }
+
+            const view = blk: { // The latest known view.
+                var view: u32 = 0;
+                for (self.recovery_response_from_other_replicas) |received, replica| {
+                    if (received) |response| {
+                        assert(replica != self.replica);
+                        assert(response.header.replica == replica);
+                        assert(response.header.context == self.recovery_nonce);
+
+                        view = std.math.max(view, response.header.view);
+                    }
+                }
+                break :blk view;
+            };
+
+            const leader_response = responses[self.leader_index(view)];
+            if (leader_response == null) {
+                log.debug(
+                    "{}: on_recovery_response: ignoring (awaiting response from leader of view={})",
+                    .{
+                        self.replica,
+                        view,
+                    },
+                );
+                return;
+            }
+
+            if (leader_response.?.header.view != view) {
+                // The leader (according to the view quorum) isn't the leader (according to itself).
+                // The `recovery_timeout` will retry shortly with another round.
+                log.debug(
+                    "{}: on_recovery_response: ignoring (leader view={} != quorum view={})",
+                    .{
+                        self.replica,
+                        leader_response.?.header.view,
+                        view,
+                    },
+                );
+                return;
+            }
+
+            // This recovering→normal status transition occurs exactly once.
+            // All further `recovery_response` messages are ignored.
+
+            // TODO When the view is recovered from the superblock (instead of via the VSR recovery
+            // protocol), if the view number indicates that this replica is a leader, it must
+            // transition to status=view_change instead of status=normal.
+
+            const leader_headers = self.message_body_as_headers(leader_response.?);
+            assert(leader_headers.len > 0);
+
+            const commit = leader_response.?.header.commit;
+            {
+                var latest = Header.reserved(self.cluster, 0);
+                set_latest_op(leader_headers, &latest);
+                assert(latest.op == leader_response.?.header.op);
+
+                self.set_latest_op_and_k(&latest, commit, "on_recovery_response");
+                assert(self.op == latest.op);
+                assert(self.journal.header_with_op(self.op) != null);
+            }
+
+            assert(self.status == .recovering);
+            self.transition_to_normal_from_recovering_status(view);
+            assert(self.status == .normal);
+            assert(self.follower());
+
+            // TODO If the view's primary is >1 WAL ahead of us, these headers could cause
+            // problems. We don't want to jump this far ahead to repair, but we still need to use
+            // the hash chain to figure out which headers to request. Maybe include our
+            // `op_checkpoint` in the recovery (request) message so that the response can give more
+            // useful (i.e. older) headers.
+            for (leader_headers) |*header| {
+                _ = self.repair_header(header);
+            }
+
+            if (self.op < config.journal_slot_count) {
+                if (self.journal.header_with_op(0)) |header| {
+                    assert(header.command == .prepare);
+                    assert(header.operation == .root);
+                } else {
+                    // This is the first wrap of the log, and the root prepare is corrupt.
+                    // Repair the root prepare. This is necessary to maintain the invariant that
+                    // the op=commit_min exists in-memory.
+                    const header = Header.root_prepare(self.cluster);
+                    self.journal.set_header_as_dirty(&header);
+                    log.debug("{}: on_recovery_response: repair root op", .{self.replica});
+                }
+            }
+
+            log.debug("{}: on_recovery_response: responses={} view={} headers={}..{}" ++
+                " commit={} dirty={} faulty={}", .{
+                self.replica,
+                count,
+                view,
+                leader_headers[leader_headers.len - 1].op,
+                leader_headers[0].op,
+                commit,
+                self.journal.dirty.count,
+                self.journal.faulty.count,
+            });
+
+            self.state_machine.prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
+            // `state_machine.commit_timestamp` is updated as messages are committed.
+
+            self.reset_quorum_recovery_response();
+            self.commit_ops(commit);
+            self.repair();
         }
 
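When a replica answers twice with distinct responses, the handler above keeps the fresher one by comparing (view, op, commit) lexicographically. Note that op alone is not a freshness signal: an uncommitted op may be discarded in a higher view, so a higher view wins even with a lower op. A standalone sketch, with a hypothetical key type rather than the real message headers:

    const std = @import("std");

    const ResponseKey = struct {
        view: u32,
        op: u64,
        commit: u64,
    };

    // True if `a` is strictly older than `b` under the (view, op, commit) ordering.
    fn older(a: ResponseKey, b: ResponseKey) bool {
        if (a.view != b.view) return a.view < b.view;
        if (a.op != b.op) return a.op < b.op;
        return a.commit < b.commit;
    }

    pub fn main() void {
        // A higher view wins even though its op regressed from 10 to 9.
        const first = ResponseKey{ .view = 4, .op = 10, .commit = 8 };
        const second = ResponseKey{ .view = 5, .op = 9, .commit = 8 };
        std.debug.assert(older(first, second));
        std.debug.assert(!older(second, first));
    }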
+        /// If the requested prepare has been guaranteed by this replica:
+        /// * Read the prepare from storage, and forward it to the replica that requested it.
+        /// * Otherwise send no reply — it isn't safe to nack.
+        /// If the requested prepare has *not* been guaranteed by this replica, then send a nack.
+        ///
+        /// A prepare is considered "guaranteed" by a replica if that replica has acknowledged it
+        /// to the cluster. The cluster sees the replica as an underwriter of a guaranteed
+        /// prepare. If a guaranteed prepare is found to be faulty, the replica must repair it
+        /// to restore durability.
         fn on_request_prepare(self: *Self, message: *const Message) void {
             if (self.ignore_repair_message(message)) return;
 
+            assert(self.replica_count > 1);
             assert(self.status == .normal or self.status == .view_change);
             assert(message.header.view == self.view);
             assert(message.header.replica != self.replica);
 
             const op = message.header.op;
-
-
+            const slot = self.journal.slot_for_op(op);
+            const checksum: ?u128 = switch (message.header.timestamp) {
+                0 => null,
+                1 => message.header.context,
+                else => unreachable,
+            };
 
-
-
-            assert(checksum == null or entry.checksum == checksum.?);
+            // Only the leader may respond to `request_prepare` messages without a checksum.
+            assert(checksum != null or self.leader_index(self.view) == self.replica);
 
-
-
+            // Try to serve the message directly from the pipeline.
+            // This saves us from going to disk. And we don't need to worry that the WAL's copy
+            // of an uncommitted prepare is lost/corrupted.
+            if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
+                log.debug("{}: on_request_prepare: op={} checksum={} reply from pipeline", .{
+                    self.replica,
+                    op,
+                    checksum,
+                });
+                self.send_message_to_replica(message.header.replica, prepare.message);
+                return;
+            }
 
+            if (self.journal.prepare_inhabited[slot.index]) {
+                const prepare_checksum = self.journal.prepare_checksums[slot.index];
+                // Consult `journal.prepare_checksums` (rather than `journal.headers`):
+                // the former may have the prepare we want — even if journal recovery marked the
+                // slot as faulty and left the in-memory header as reserved.
+                if (checksum == null or checksum.? == prepare_checksum) {
                     log.debug("{}: on_request_prepare: op={} checksum={} reading", .{
                         self.replica,
                         op,
                         checksum,
                     });
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    if (self.journal.header_with_op_and_checksum(op, checksum)) |_| {
+                        // The header for the target prepare is already in-memory.
+                        // This is preferable to the `else` case since we have the prepare's
+                        // `header.size` in-memory, so the read can be (potentially) shorter.
+                        // TODO Do not reissue the read if we are already reading in order to send
+                        // to this particular destination replica.
+                        self.journal.read_prepare(
+                            on_request_prepare_read,
+                            op,
+                            prepare_checksum,
+                            message.header.replica,
+                        );
+                    } else {
+                        // TODO Do not reissue the read if we are already reading in order to send to
+                        // this particular destination replica.
+                        self.journal.read_prepare_with_op_and_checksum(
+                            on_request_prepare_read,
+                            op,
+                            prepare_checksum,
+                            message.header.replica,
+                        );
+                    }
 
-                // We have
+                    // We have guaranteed the prepare (not safe to nack).
+                    // Our copy may or may not be valid, but we will try to read & forward it.
                     return;
                 }
+            }
 
-
-            //
+            {
+                // We may have guaranteed the prepare but our copy is faulty (not safe to nack).
+                if (self.journal.faulty.bit(slot)) return;
+                if (self.journal.header_with_op_and_checksum(message.header.op, checksum)) |_| {
+                    if (self.journal.dirty.bit(slot)) {
+                        // We know of the prepare but have yet to write it (safe to nack).
+                        // Continue through below...
+                    } else {
+                        // We have guaranteed the prepare and our copy is clean (not safe to nack).
+                        return;
+                    }
+                }
             }
 
+            // Protocol-Aware Recovery's CTRL protocol only runs during the view change, when the
+            // new primary needs to repair its own WAL before starting the new view.
+            //
+            // This branch is the only place where the backup doesn't have the prepare and could
+            // possibly send a nack as part of the CTRL protocol. Nacks only get sent during a view
+            // change to help the new primary trim uncommitted ops that couldn't otherwise be
+            // repaired. Without doing this, the cluster would become permanently unavailable. So
+            // backups shouldn't respond to the `request_prepare` if the new view has already
+            // started; they should also be in view_change status, waiting for the new primary to
+            // start the view.
             if (self.status == .view_change) {
                 assert(message.header.replica == self.leader_index(self.view));
                 assert(checksum != null);
-
-
+
+                if (self.journal.header_with_op_and_checksum(op, checksum)) |_| {
+                    assert(self.journal.dirty.bit(slot) and !self.journal.faulty.bit(slot));
+                }
+
+                if (self.journal.prepare_inhabited[slot.index]) {
+                    assert(self.journal.prepare_checksums[slot.index] != checksum.?);
                 }
             }
 
             log.debug("{}: on_request_prepare: op={} checksum={} nacking", .{
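The branches above reduce to a small decision: forward when the prepare can be served from the pipeline or from an inhabited slot with a matching checksum; stay silent when the slot is faulty or a clean copy exists (either way the replica may be an underwriter); nack only when the prepare was never acknowledged, or is known but not yet written. A condensed sketch of that decision, assuming a hypothetical journal-state summary rather than the replica's actual control flow:

    const std = @import("std");

    const Action = enum { forward, silence, nack };

    const SlotState = struct {
        in_pipeline: bool,
        inhabited_with_checksum: bool,
        faulty: bool,
        header_known: bool,
        dirty: bool,
    };

    fn request_prepare_action(state: SlotState) Action {
        if (state.in_pipeline) return .forward; // Serve from memory; skip the disk read.
        if (state.inhabited_with_checksum) return .forward; // Guaranteed: read & forward.
        if (state.faulty) return .silence; // Possibly guaranteed, copy corrupt: not safe to nack.
        if (state.header_known and !state.dirty) return .silence; // Clean copy: not safe to nack.
        return .nack; // Never acknowledged (or known but unwritten): safe to nack.
    }

    pub fn main() void {
        std.debug.assert(request_prepare_action(.{
            .in_pipeline = false,
            .inhabited_with_checksum = false,
            .faulty = true,
            .header_known = false,
            .dirty = false,
        }) == .silence);
    }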
@@ -1264,21 +1608,9 @@ pub fn Replica(
             const op_max = message.header.op;
             assert(op_max >= op_min);
 
-
-
-            assert(
-
-            const size_max = @sizeOf(Header) * std.math.min(
-                std.math.max(@divFloor(message.buffer.len, @sizeOf(Header)), 2),
-                1 + count_max,
-            );
-            assert(size_max > @sizeOf(Header));
-
-            const count = self.journal.copy_latest_headers_between(
-                op_min,
-                op_max,
-                std.mem.bytesAsSlice(Header, response.buffer[@sizeOf(Header)..size_max]),
-            );
+            const count = self.copy_latest_headers_and_set_size(op_min, op_max, null, response);
+            assert(count >= 0);
+            assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
 
             if (count == 0) {
                 log.debug("{}: on_request_headers: ignoring (op={}..{}, no headers)", .{
@@ -1289,8 +1621,6 @@ pub fn Replica(
                 return;
             }
 
-            response.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
-
             response.header.set_checksum_body(response.body());
             response.header.set_checksum();
 
@@ -1313,7 +1643,8 @@ pub fn Replica(
             }
 
             const op = self.nack_prepare_op.?;
-            const checksum = self.journal.
+            const checksum = self.journal.header_with_op(op).?.checksum;
+            const slot = self.journal.slot_with_op(op).?;
 
             if (message.header.op != op) {
                 log.debug("{}: on_nack_prepare: ignoring (repairing another op)", .{self.replica});
@@ -1348,14 +1679,14 @@ pub fn Replica(
             // Otherwise, if we know we do not have the op, then we can exclude ourselves.
             assert(self.replica_count > 1);
 
-            const threshold = if (self.journal.faulty.bit(
+            const threshold = if (self.journal.faulty.bit(slot))
                 self.replica_count - self.quorum_replication + 1
             else
                 self.replica_count - self.quorum_replication;
 
             if (threshold == 0) {
                 assert(self.replica_count == 2);
-                assert(!self.journal.faulty.bit(
+                assert(!self.journal.faulty.bit(slot));
 
                 // This is a special case for a cluster-of-two, handled in `repair_prepare()`.
                 log.debug("{}: on_nack_prepare: ignoring (cluster-of-two, not faulty)", .{
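The threshold above counts how many nacks prove a prepare uncommitted: a committed prepare has prepare_oks from `quorum_replication` replicas, so at most `replica_count - quorum_replication` replicas can honestly nack it, plus one more when our own slot is faulty, since we may have been one of the guarantors ourselves. A worked restatement (standalone, not the replica's code):

    const std = @import("std");

    fn nack_threshold(replica_count: u8, quorum_replication: u8, own_slot_faulty: bool) u8 {
        const base = replica_count - quorum_replication;
        return if (own_slot_faulty) base + 1 else base;
    }

    pub fn main() void {
        // Cluster of three with a replication quorum of two:
        std.debug.assert(nack_threshold(3, 2, false) == 1);
        std.debug.assert(nack_threshold(3, 2, true) == 2);
        // Cluster of two: a threshold of zero is the special case handled above.
        std.debug.assert(nack_threshold(2, 2, false) == 0);
    }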
@@ -1364,10 +1695,11 @@ pub fn Replica(
                 return;
             }
 
-            log.debug("{}: on_nack_prepare: quorum_replication={} threshold={}", .{
+            log.debug("{}: on_nack_prepare: quorum_replication={} threshold={} op={}", .{
                 self.replica,
                 self.quorum_replication,
                 threshold,
+                op,
             });
 
             // We should never expect to receive a nack from ourselves:
@@ -1383,7 +1715,7 @@ pub fn Replica(
 
             assert(count == threshold);
             assert(!self.nack_prepare_from_other_replicas.isSet(self.replica));
-            log.debug("{}: on_nack_prepare: quorum received", .{self.replica});
+            log.debug("{}: on_nack_prepare: quorum received op={}", .{ self.replica, op });
 
             self.discard_uncommitted_ops_from(op, checksum);
             self.reset_quorum_nack_prepare();
@@ -1512,7 +1844,10 @@ pub fn Replica(
             const replica = waiting[self.prepare_timeout.attempts % waiting_len];
             assert(replica != self.replica);
 
-            log.debug("{}: on_prepare_timeout: replicating to replica {}", .{
+            log.debug("{}: on_prepare_timeout: replicating to replica {}", .{
+                self.replica,
+                replica,
+            });
             self.send_message_to_replica(replica, prepare.message);
         }
 
@@ -1524,7 +1859,7 @@ pub fn Replica(
             assert(self.commit_min == self.commit_max);
 
             // TODO Snapshots: Use snapshot checksum if commit is no longer in journal.
-            const latest_committed_entry = self.journal.
+            const latest_committed_entry = self.journal.header_with_op(self.commit_max).?;
 
             self.send_header_to_other_replicas(.{
                 .command = .commit,
@@ -1569,6 +1904,13 @@ pub fn Replica(
             self.repair();
         }
 
+        fn on_recovery_timeout(self: *Self) void {
+            assert(self.status == .recovering);
+            assert(self.replica_count > 1);
+            self.recovery_timeout.reset();
+            self.recover();
+        }
+
         fn reference_message_and_receive_quorum_exactly_once(
             self: *Self,
             messages: *QuorumMessages,
@@ -1625,7 +1967,10 @@ pub fn Replica(
 
             // This is not the first time we have had quorum, the state transition has already happened:
             if (count > threshold) {
-                log.debug("{}: on_{s}: ignoring (quorum received already)", .{
+                log.debug("{}: on_{s}: ignoring (quorum received already)", .{
+                    self.replica,
+                    command,
+                });
                 return null;
             }
 
@@ -1674,7 +2019,11 @@ pub fn Replica(
 
             // Do not allow duplicate messages to trigger multiple passes through a state transition:
             if (counter.isSet(message.header.replica)) {
-                log.debug("{}: on_{s}: ignoring (duplicate message)", .{
+                log.debug("{}: on_{s}: ignoring (duplicate message replica={})", .{
+                    self.replica,
+                    command,
+                    message.header.replica,
+                });
                 return null;
             }
 
@@ -1695,7 +2044,10 @@ pub fn Replica(
 
             // This is not the first time we have had quorum, the state transition has already happened:
             if (count > threshold) {
-                log.debug("{}: on_{s}: ignoring (quorum received already)", .{
+                log.debug("{}: on_{s}: ignoring (quorum received already)", .{
+                    self.replica,
+                    command,
+                });
                 return null;
             }
 
@@ -1709,8 +2061,15 @@ pub fn Replica(
             assert(message.header.view == self.view);
             assert(message.header.op == self.op);
 
-
-
+            if (self.replica_count == 1 and self.pipeline.count > 1) {
+                // In a cluster-of-one, the prepares must always be written to the WAL sequentially
+                // (never concurrently). This ensures that there will be no gaps in the WAL during
+                // crash recovery.
+                log.debug("{}: append: serializing append op={}", .{ self.replica, message.header.op });
+            } else {
+                log.debug("{}: append: appending to journal", .{self.replica});
+                self.write_prepare(message, .append);
+            }
         }
 
         /// Returns whether `b` succeeds `a` by having a newer view or same view and newer op.
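The cluster-of-one branch above serializes WAL appends because concurrent writes can complete out of order, and after a crash a hole below `op_maximum` is unrepairable when there are no peers to fetch the missing prepare from. A standalone illustration of the hazard, using a hypothetical durability bitmap (`ops[i]` true when the prepare for op i is durable):

    const std = @import("std");

    fn wal_gap(ops: []const bool) ?usize {
        var top: ?usize = null;
        for (ops) |durable, op| {
            if (durable) top = op;
        }
        const max = top orelse return null;
        var op: usize = 0;
        while (op <= max) : (op += 1) {
            if (!ops[op]) return op; // A hole below op_maximum: unrecoverable alone.
        }
        return null;
    }

    pub fn main() void {
        // Op 3 landed before op 2 did; a crash here leaves a gap at op 2.
        std.debug.assert(wal_gap(&[_]bool{ true, true, false, true }).? == 2);
        std.debug.assert(wal_gap(&[_]bool{ true, true, true }) == null);
    }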
@@ -1761,7 +2120,8 @@ pub fn Replica(
         fn commit_ops(self: *Self, commit: u64) void {
             // TODO Restrict `view_change` status only to the leader purely as defense-in-depth.
             // Be careful of concurrency when doing this, as successive view changes can happen quickly.
-            assert(self.status == .normal or self.status == .view_change
+            assert(self.status == .normal or self.status == .view_change or
+                (self.status == .recovering and self.replica_count == 1));
             assert(self.commit_min <= self.commit_max);
             assert(self.commit_min <= self.op);
             assert(self.commit_max <= self.op or self.commit_max > self.op);
@@ -1805,12 +2165,14 @@ pub fn Replica(

         fn commit_ops_read(self: *Self) void {
             assert(self.committing);
-            assert(self.status == .normal or self.status == .view_change
+            assert(self.status == .normal or self.status == .view_change or
+                (self.status == .recovering and self.replica_count == 1));
             assert(self.commit_min <= self.commit_max);
             assert(self.commit_min <= self.op);

             if (!self.valid_hash_chain("commit_ops_read")) {
                 self.committing = false;
+                assert(self.replica_count > 1);
                 return;
             }
             assert(self.op >= self.commit_max);
@@ -1819,12 +2181,22 @@ pub fn Replica(
             // Even a naive state transfer may fail to correct for this.
             if (self.commit_min < self.commit_max and self.commit_min < self.op) {
                 const op = self.commit_min + 1;
-                const checksum = self.journal.
+                const checksum = self.journal.header_with_op(op).?.checksum;
                 self.journal.read_prepare(commit_ops_commit, op, checksum, null);
             } else {
                 self.committing = false;
                 // This is an optimization to expedite the view change before the `repair_timeout`:
                 if (self.status == .view_change and self.repairs_allowed()) self.repair();
+
+                if (self.status == .recovering) {
+                    assert(self.replica_count == 1);
+                    assert(self.commit_min == self.commit_max);
+                    assert(self.commit_min == self.op);
+                    self.transition_to_normal_from_recovering_status(0);
+                } else {
+                    // We expect that a cluster-of-one only calls commit_ops() in recovering status.
+                    assert(self.replica_count > 1);
+                }
             }
         }
@@ -1836,31 +2208,39 @@ pub fn Replica(

             if (prepare == null) {
                 log.debug("{}: commit_ops_commit: prepare == null", .{self.replica});
+                if (self.replica_count == 1) @panic("cannot recover corrupt prepare");
                 return;
             }

-
-
-
-
-
+            switch (self.status) {
+                .normal => {},
+                .view_change => {
+                    if (self.leader_index(self.view) != self.replica) {
+                        log.debug("{}: commit_ops_commit: no longer leader", .{self.replica});
+                        assert(self.replica_count > 1);
+                        return;
+                    }

-
-
-
-
-
+                    // Only the leader may commit during a view change before starting the new view.
+                    // Fall through if this is indeed the case.
+                },
+                .recovering => {
+                    assert(self.replica_count == 1);
+                    assert(self.leader_index(self.view) == self.replica);
+                },
             }

             const op = self.commit_min + 1;

             if (prepare.?.header.op != op) {
                 log.debug("{}: commit_ops_commit: op changed", .{self.replica});
+                assert(self.replica_count > 1);
                 return;
             }

-            if (prepare.?.header.checksum != self.journal.
+            if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
                 log.debug("{}: commit_ops_commit: checksum changed", .{self.replica});
+                assert(self.replica_count > 1);
                 return;
             }

@@ -1876,9 +2256,10 @@ pub fn Replica(

         fn commit_op(self: *Self, prepare: *const Message) void {
             // TODO Can we add more checks around allowing commit_op() during a view change?
-            assert(self.status == .normal or self.status == .view_change
+            assert(self.status == .normal or self.status == .view_change or
+                (self.status == .recovering and self.replica_count == 1));
             assert(prepare.header.command == .prepare);
-            assert(prepare.header.operation != .
+            assert(prepare.header.operation != .root);
             assert(prepare.header.op == self.commit_min + 1);
             assert(prepare.header.op <= self.op);

@@ -1886,7 +2267,7 @@ pub fn Replica(
             // happened since we last checked in `commit_ops_read()`. However, this would relate to
             // subsequent ops, since by now we have already verified the hash chain for this commit.

-            assert(self.journal.
+            assert(self.journal.header_with_op(self.commit_min).?.checksum ==
                 prepare.header.parent);

             log.debug("{}: commit_op: executing view={} {} op={} checksum={} ({s})", .{
@@ -1901,6 +2282,8 @@ pub fn Replica(
             const reply = self.message_bus.get_message();
             defer self.message_bus.unref(reply);

+            assert(self.state_machine.commit_timestamp < prepare.header.timestamp);
+
             const reply_body_size = @intCast(u32, self.state_machine.commit(
                 prepare.header.client,
                 prepare.header.operation.cast(StateMachine),
@@ -1908,6 +2291,9 @@ pub fn Replica(
                 reply.buffer[@sizeOf(Header)..],
             ));

+            assert(self.state_machine.commit_timestamp <= prepare.header.timestamp);
+            self.state_machine.commit_timestamp = prepare.header.timestamp;
+
             self.commit_min += 1;
             assert(self.commit_min == prepare.header.op);
             if (self.commit_min > self.commit_max) self.commit_max = self.commit_min;
@@ -1927,10 +2313,10 @@ pub fn Replica(
                 .commit = prepare.header.op,
                 .size = @sizeOf(Header) + reply_body_size,
             };
-            assert(reply.header.
+            assert(reply.header.timestamp == 0);
             assert(reply.header.epoch == 0);

-            reply.header.set_checksum_body(reply.
+            reply.header.set_checksum_body(reply.body());
             reply.header.set_checksum();

             if (reply.header.operation == .register) {
@@ -1974,8 +2360,16 @@ pub fn Replica(
             assert(self.commit_min == self.commit_max);
             assert(self.commit_max == prepare.message.header.op);

-            self.message_bus.unref(
-
+            self.message_bus.unref(self.pipeline.pop().?.message);
+
+            if (self.replica_count == 1) {
+                if (self.pipeline.head_ptr()) |head| {
+                    // Write the next message in the queue.
+                    // A cluster-of-one writes prepares sequentially to avoid gaps in the WAL.
+                    self.write_prepare(head.message, .append);
+                    // The loop will wrap around and exit when `!ok_quorum_received`.
+                }
+            }
         }

         assert(self.prepare_timeout.ticking);
@@ -1983,6 +2377,39 @@ pub fn Replica(
             if (self.pipeline.count == 0) self.prepare_timeout.stop();
         }

+        fn copy_latest_headers_and_set_size(
+            self: *const Self,
+            op_min: u64,
+            op_max: u64,
+            count_max: ?usize,
+            message: *Message,
+        ) usize {
+            assert(op_max >= op_min);
+            assert(count_max == null or count_max.? > 0);
+            assert(message.header.command == .do_view_change or
+                message.header.command == .start_view or
+                message.header.command == .headers or
+                message.header.command == .recovery_response);
+
+            const body_size_max = @sizeOf(Header) * std.math.min(
+                @divExact(message.buffer.len - @sizeOf(Header), @sizeOf(Header)),
+                // We must add 1 because op_max and op_min are both inclusive:
+                count_max orelse std.math.min(64, op_max - op_min + 1),
+            );
+            assert(body_size_max >= @sizeOf(Header));
+            assert(count_max == null or body_size_max == count_max.? * @sizeOf(Header));
+
+            const count = self.journal.copy_latest_headers_between(
+                op_min,
+                op_max,
+                std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..][0..body_size_max]),
+            );
+
+            message.header.size = @intCast(u32, @sizeOf(Header) * (1 + count));
+
+            return count;
+        }
+
         fn count_quorum(
             self: *Self,
             messages: *QuorumMessages,
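To make the sizing arithmetic in `copy_latest_headers_and_set_size` concrete, a standalone Zig sketch under assumed constants (the 128-byte header and 4096-byte buffer are illustrative only, not the package's real sizes):

    const std = @import("std");
    const assert = std.debug.assert;

    const header_size: usize = 128; // Assumed size, for illustration only.

    fn min(a: usize, b: usize) usize {
        return if (a < b) a else b;
    }

    fn body_size_max(buffer_len: usize, op_min: usize, op_max: usize, count_max: ?usize) usize {
        assert(op_max >= op_min);
        // How many headers physically fit in the body (after the message's own header):
        const fit = (buffer_len - header_size) / header_size;
        // We must add 1 because op_max and op_min are both inclusive:
        const want = count_max orelse min(64, op_max - op_min + 1);
        return header_size * min(fit, want);
    }

    pub fn main() void {
        // A 4096-byte buffer fits (4096 - 128) / 128 = 31 headers, which binds first:
        assert(body_size_max(4096, 0, 100, null) == 31 * header_size);
        // With only ops 10..12 in range, the inclusive count (3) binds instead:
        assert(body_size_max(4096, 10, 12, null) == 3 * header_size);
    }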
@@ -2004,6 +2431,7 @@ pub fn Replica(
                     assert(m.header.view == self.view);
                 },
                 .do_view_change => assert(m.header.view == self.view),
+                .recovery_response => assert(m.header.replica != self.replica),
                 .nack_prepare => {
                     // TODO See if we can restrict this branch further.
                     assert(m.header.replica != self.replica);
@@ -2032,7 +2460,8 @@ pub fn Replica(
             const session = reply.header.commit; // The commit number becomes the session number.
             const request = reply.header.request;

-
+            // We reserved the `0` commit number for the cluster `.root` operation.
+            assert(session > 0);
             assert(request == 0);

             // For correctness, it's critical that all replicas evict deterministically:
@@ -2113,8 +2542,8 @@ pub fn Replica(
                 // The latest normal view (as specified in the 2012 paper) is different to the view
                 // number contained in the prepare headers we include in the body. The former shows
                 // how recent a view change the replica participated in, which may be much higher.
-                // We use the `
-                .
+                // We use the `timestamp` field to send this in addition to the current view number:
+                .timestamp = if (command == .do_view_change) self.view_normal else 0,
                 .op = self.op,
                 .commit = self.commit_max,
             };
@@ -2128,22 +2557,9 @@ pub fn Replica(
             const count_max = config.pipeline_max;
             assert(count_max > 0);

-            const
-
-
-            );
-            assert(size_max > @sizeOf(Header));
-
-            const count = self.journal.copy_latest_headers_between(
-                0,
-                self.op,
-                std.mem.bytesAsSlice(Header, message.buffer[@sizeOf(Header)..size_max]),
-            );
-
-            // We expect that self.op always exists.
-            assert(count > 0);
-
-            message.header.size = @intCast(u32, @sizeOf(Header) + @sizeOf(Header) * count);
+            const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, message);
+            assert(count > 0); // We expect that self.op always exists.
+            assert(@divExact(message.header.size, @sizeOf(Header)) == 1 + count);

             message.header.set_checksum_body(message.body());
             message.header.set_checksum();
@@ -2154,7 +2570,9 @@ pub fn Replica(
         /// The caller owns the returned message, if any, which has exactly 1 reference.
         fn create_message_from_header(self: *Self, header: Header) *Message {
             assert(header.replica == self.replica);
-            assert(header.view == self.view or
+            assert(header.view == self.view or
+                header.command == .request_start_view or
+                header.command == .recovery);
             assert(header.size == @sizeOf(Header));

             const message = self.message_bus.pool.get_message();
@@ -2181,6 +2599,12 @@ pub fn Replica(
         /// uncommitted header gaps and compare them with the quorum of do_view_change messages
         /// received from other replicas, before starting the new view, to discard any that may be
         /// impossible to repair.
+        ///
+        /// For example, if the old primary replicates ops=7,8,9 (all uncommitted) but only op=9 is
+        /// prepared on another replica before the old primary crashes, then this function finds a
+        /// gap for ops=7,8 and will attempt to discard ops 7,8,9.
+        // TODO To improve availability, potentially call this before the local headers are
+        // repaired during the view change, so that we can participate in nacking headers.
         fn discard_uncommitted_headers(self: *Self) void {
             assert(self.status == .view_change);
             assert(self.leader_index(self.view) == self.replica);
@@ -2188,6 +2612,7 @@ pub fn Replica(
             assert(!self.repair_timeout.ticking);
             assert(self.op >= self.commit_max);
             assert(self.replica_count > 1);
+            assert(self.op - self.commit_max <= config.journal_slot_count);

             const threshold = self.replica_count - self.quorum_replication;
             if (threshold == 0) {
@@ -2195,9 +2620,13 @@ pub fn Replica(
                 return;
             }

+            // Iterating > commit_max does not in itself guarantee that the header is uncommitted.
+            // We must also count nacks from the quorum, since the old primary may have committed
+            // another op just before crashing, if there was sufficient quorum. Counting nacks
+            // ensures that the old primary could not possibly have committed the header.
             var op = self.op;
             while (op > self.commit_max) : (op -= 1) {
-                if (self.journal.
+                if (self.journal.header_with_op(op) != null) continue;

                 log.debug("{}: discard_uncommitted_headers: op={} gap", .{ self.replica, op });

@@ -2208,14 +2637,30 @@ pub fn Replica(
                     assert(m.header.cluster == self.cluster);
                     assert(m.header.replica == replica);
                     assert(m.header.view == self.view);
+                    assert(m.header.commit <= self.commit_max);

                     if (replica != self.replica) {
-
-
-
+                        // Check for a gap in the uncommitted headers from this replica.
+                        const received_headers = self.message_body_as_headers(m);
+                        assert(received_headers.len >= 1);
+
+                        const received_op_min = received_headers[received_headers.len - 1].op;
+                        const received_op_max = received_headers[0].op;
+                        assert(received_op_max >= received_op_min);
+
+                        const nack = for (received_headers) |*h| {
+                            if (h.op == op) break false;
+                        } else nack: {
+                            // Don't nack ops that didn't fit in the message's attached headers.
+                            break :nack op >= received_op_min;
+                        };
+
+                        if (nack) nacks += 1;
+                        log.debug("{}: discard_uncommitted_headers: replica={} op={} nack={}", .{
                             self.replica,
                             m.header.replica,
-
+                            op,
+                            nack,
                         });
                     }
                 }
@@ -2229,12 +2674,15 @@ pub fn Replica(
                 });

                 if (nacks >= threshold) {
+                    assert(op > self.commit_max);
+
                     self.journal.remove_entries_from(op);
                     self.op = op - 1;

-
-                    assert(
-                    assert(!self.journal.
+                    const slot = self.journal.slot_for_op(op);
+                    assert(self.journal.header_for_op(op) == null);
+                    assert(!self.journal.dirty.bit(slot));
+                    assert(!self.journal.faulty.bit(slot));
                 }
             }
         }
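The nack rule above can be isolated as follows. A replica's attached headers arrive newest-first; an op is nacked only when it is absent from them and not older than the oldest attached header (anything older simply did not fit in the message). A standalone Zig sketch, simplified to bare op numbers:

    const std = @import("std");
    const assert = std.debug.assert;

    fn nacks(op: u64, received_ops: []const u64) bool {
        // received_ops is ordered newest-first, as in the message body.
        const received_op_min = received_ops[received_ops.len - 1];
        return for (received_ops) |h| {
            if (h == op) break false; // Present: the replica prepared this op.
        } else op >= received_op_min; // Absent and within range: a genuine gap.
    }

    pub fn main() void {
        const headers = [_]u64{ 9, 8, 6 }; // op=7 is a gap.
        assert(nacks(7, &headers)); // Missing within the attached range: nack.
        assert(!nacks(8, &headers)); // Present: no nack.
        assert(!nacks(5, &headers)); // Older than what was attached: unknown, no nack.
    }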
@@ -2249,10 +2697,11 @@ pub fn Replica(

             assert(self.valid_hash_chain("discard_uncommitted_ops_from"));

+            const slot = self.journal.slot_with_op(op).?;
             assert(op > self.commit_max);
             assert(op <= self.op);
-            assert(self.journal.
-            assert(self.journal.dirty.bit(
+            assert(self.journal.header_with_op_and_checksum(op, checksum) != null);
+            assert(self.journal.dirty.bit(slot));

             log.debug("{}: discard_uncommitted_ops_from: ops={}..{} view={}", .{
                 self.replica,
@@ -2264,13 +2713,13 @@ pub fn Replica(
             self.journal.remove_entries_from(op);
             self.op = op - 1;

-            assert(self.journal.
-            assert(!self.journal.dirty.bit(
-            assert(!self.journal.faulty.bit(
+            assert(self.journal.header_for_op(op) == null);
+            assert(!self.journal.dirty.bit(slot));
+            assert(!self.journal.faulty.bit(slot));

             // We require that `self.op` always exists. Rewinding `self.op` could change that.
             // However, we do this only as the leader within a view change, with all headers intact.
-            assert(self.journal.
+            assert(self.journal.header_with_op(self.op) != null);
         }

         /// Returns whether the replica is a follower for the current view.
@@ -2370,7 +2819,7 @@ pub fn Replica(
                 return true;
             },
             // Only the leader may answer a request for a prepare without a context:
-            .request_prepare => if (message.header.
+            .request_prepare => if (message.header.timestamp == 0) {
                 log.warn("{}: on_{s}: ignoring (no context)", .{ self.replica, command });
                 return true;
             },
@@ -2439,6 +2888,18 @@ pub fn Replica(
             if (self.ignore_request_message_follower(message)) return true;
             if (self.ignore_request_message_duplicate(message)) return true;
             if (self.ignore_request_message_preparing(message)) return true;
+
+            // Verify that the new request will fit in the WAL.
+            // The message's op hasn't been assigned yet, but it will be `self.op + 1`.
+            if (self.op + 1 >= self.op_checkpoint + config.journal_slot_count) {
+                log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint={})", .{
+                    self.replica,
+                    message.header.op,
+                    self.op_checkpoint,
+                });
+                return true;
+            }
+
             return false;
         }

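A standalone sketch of the WAL-fit bound above, with a hypothetical slot count of 8 (the real bound comes from `config.journal_slot_count`):

    const std = @import("std");
    const assert = std.debug.assert;

    const journal_slot_count: u64 = 8; // Hypothetical; for illustration only.

    fn request_fits(op_next: u64, op_checkpoint: u64) bool {
        // The next op may only be assigned if it stays within one WAL of the checkpoint.
        return op_next < op_checkpoint + journal_slot_count;
    }

    pub fn main() void {
        assert(request_fits(7, 0)); // Fits within the first WAL cycle.
        assert(!request_fits(8, 0)); // Would overwrite a slot not yet checkpointed.
        assert(request_fits(8, 1)); // Fits again once the checkpoint advances.
    }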
@@ -2491,7 +2952,9 @@ pub fn Replica(
                     return false;
                 } else {
                     // The client may have only one request inflight at a time.
-                    log.err("{}: on_request: ignoring new request (client bug)", .{
+                    log.err("{}: on_request: ignoring new request (client bug)", .{
+                        self.replica,
+                    });
                     return true;
                 }
             } else {
@@ -2642,7 +3105,71 @@ pub fn Replica(
             return false;
         }

-
+        /// Returns whether the highest known op is certain.
+        ///
+        /// After recovering the WAL, there are 2 possible outcomes:
+        /// * All entries valid. The highest op is certain, and safe to set as `replica.op`.
+        /// * One or more entries are faulty. The highest op isn't certain — it may be one of the
+        ///   broken entries.
+        ///
+        /// The replica must refrain from repairing any faulty slots until the highest op is known.
+        /// Otherwise, if we were to repair a slot while uncertain of `replica.op`:
+        ///
+        /// * we may nack an op that we shouldn't, or
+        /// * we may replace a prepared op that we were guaranteeing for the primary, potentially
+        ///   forking the log.
+        ///
+        ///
+        /// Test for a fault to the right of the current op. The fault might be our true op, and
+        /// sharing our current `replica.op` might cause the cluster's op to likewise regress.
+        ///
+        /// Note that for our purposes here, we only care about entries that were faulty during
+        /// WAL recovery, not ones that were found to be faulty after the fact (e.g. due to
+        /// `request_prepare`).
+        ///
+        /// Cases (`✓`: `replica.op_checkpoint`, `✗`: faulty, `o`: `replica.op`):
+        /// * ` ✓ o ✗ `: View change is unsafe.
+        /// * ` ✗ ✓ o `: View change is unsafe.
+        /// * ` ✓ ✗ o `: View change is safe.
+        /// * ` ✓ = o `: View change is unsafe if any slots are faulty.
+        ///   (`replica.op_checkpoint` == `replica.op`).
+        // TODO Use this function once we switch from recovery protocol to the superblock.
+        // If there is an "unsafe" fault, we will need to request a start_view from the leader to
+        // learn the op.
+        fn op_certain(self: *const Self) bool {
+            assert(self.status == .recovering);
+            assert(self.journal.recovered);
+            assert(self.op_checkpoint <= self.op);
+
+            const slot_op_checkpoint = self.journal.slot_for_op(self.op_checkpoint).index;
+            const slot_op = self.journal.slot_with_op(self.op).?.index;
+            const slot_known_range = vsr.SlotRange{
+                .head = slot_op_checkpoint,
+                .tail = slot_op,
+            };
+
+            var iterator = self.journal.faulty.bits.iterator(.{ .kind = .set });
+            while (iterator.next()) |slot| {
+                // The command is `reserved` when the entry was found faulty during WAL recovery.
+                // Faults found after WAL recovery are not relevant, because we know their op.
+                if (self.journal.headers[slot.index].command == .reserved) {
+                    if (slot_op_checkpoint == slot_op or
+                        !slot_known_range.contains(slot))
+                    {
+                        log.warn("{}: op_certain: op not known (faulty_slot={}, op={}, op_checkpoint={})", .{
+                            self.replica,
+                            slot.index,
+                            self.op,
+                            self.op_checkpoint,
+                        });
+                        return false;
+                    }
+                }
+            }
+            return true;
+        }
+
+        fn is_repair(self: *const Self, message: *const Message) bool {
             assert(message.header.command == .prepare);

             if (self.status == .normal) {
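`slot_known_range.contains(slot)` above must cope with a head..tail range that wraps around the circular WAL. A standalone Zig sketch of one plausible inclusive-bounds reading (an assumption; the package's actual `SlotRange.contains` may define the boundaries differently):

    const std = @import("std");
    const assert = std.debug.assert;

    fn contains(head: usize, tail: usize, slot: usize) bool {
        if (head <= tail) return slot >= head and slot <= tail;
        // The range wraps around the end of the ring.
        return slot >= head or slot <= tail;
    }

    pub fn main() void {
        assert(contains(2, 5, 3)); // Plain range: 2,3,4,5.
        assert(!contains(2, 5, 7));
        assert(contains(6, 1, 7)); // Wrapped range: 6,7,0,1.
        assert(contains(6, 1, 0));
        assert(!contains(6, 1, 3));
    }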
@@ -2674,15 +3201,17 @@ pub fn Replica(
             assert(self.follower());
             assert(header.view == self.view);
             assert(header.op > self.op + 1);
-            // We may have learned of a higher `commit_max` through a commit message before jumping
-            // newer op that is less than `commit_max` but greater than `commit_min`:
+            // We may have learned of a higher `commit_max` through a commit message before jumping
+            // to a newer op that is less than `commit_max` but greater than `commit_min`:
             assert(header.op > self.commit_min);
+            // Never overwrite an op that still needs to be checkpointed.
+            assert(header.op - self.op_checkpoint < config.journal_slot_count);

             log.debug("{}: jump_to_newer_op: advancing: op={}..{} checksum={}..{}", .{
                 self.replica,
                 self.op,
                 header.op - 1,
-                self.journal.
+                self.journal.header_with_op(self.op).?.checksum,
                 header.parent,
             });

@@ -2694,7 +3223,10 @@ pub fn Replica(
         fn message_body_as_headers(_: *Self, message: *const Message) []Header {
             // TODO Assert message commands that we expect this to be called for.
             assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
-            return std.mem.bytesAsSlice(
+            return std.mem.bytesAsSlice(
+                Header,
+                message.buffer[@sizeOf(Header)..message.header.size],
+            );
         }

         /// Panics if immediate neighbors in the same view would have a broken hash chain.
@@ -2716,6 +3248,29 @@ pub fn Replica(
             }
         }

+        /// Searches the pipeline for a prepare for a given op and checksum.
+        /// When `checksum` is `null`, match any checksum.
+        fn pipeline_prepare_for_op_and_checksum(self: *Self, op: u64, checksum: ?u128) ?*Prepare {
+            assert(self.status == .normal or self.status == .view_change);
+
+            // To optimize the search, we can leverage the fact that the pipeline is ordered and
+            // continuous.
+            if (self.pipeline.count == 0) return null;
+            const head_op = self.pipeline.head_ptr().?.message.header.op;
+            const tail_op = self.pipeline.tail_ptr().?.message.header.op;
+            if (op < head_op) return null;
+            if (op > tail_op) return null;
+
+            const pipeline_prepare = self.pipeline.get_ptr(op - head_op).?;
+            assert(pipeline_prepare.message.header.op == op);
+
+            if (checksum == null or pipeline_prepare.message.header.checksum == checksum.?) {
+                return pipeline_prepare;
+            } else {
+                return null;
+            }
+        }
+
         /// Searches the pipeline for a prepare for a given client.
         fn pipeline_prepare_for_client(self: *Self, client: u128) ?*Prepare {
             assert(self.status == .normal);
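Because the pipeline's ops are ordered and continuous, the lookup above is O(1): the prepare for `op`, if present, sits exactly at index `op - head_op`. A standalone Zig sketch over a plain slice:

    const std = @import("std");
    const assert = std.debug.assert;

    fn find(ops: []const usize, op: usize) ?usize {
        if (ops.len == 0) return null;
        const head_op = ops[0];
        const tail_op = ops[ops.len - 1];
        if (op < head_op or op > tail_op) return null;
        const index = op - head_op; // Direct index instead of a scan.
        assert(ops[index] == op); // Holds because the ops are continuous.
        return index;
    }

    pub fn main() void {
        const ops = [_]usize{ 4, 5, 6, 7 };
        assert(find(&ops, 6).? == 2);
        assert(find(&ops, 9) == null);
    }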
@@ -2723,7 +3278,7 @@ pub fn Replica(
             assert(self.commit_min == self.commit_max);

             var op = self.commit_max + 1;
-            var parent = self.journal.
+            var parent = self.journal.header_with_op(self.commit_max).?.checksum;
             var iterator = self.pipeline.iterator();
             while (iterator.next_ptr()) |prepare| {
                 assert(prepare.message.header.command == .prepare);
@@ -2776,15 +3331,33 @@ pub fn Replica(
             assert(prepare.message.header.view <= ok.header.view);
             assert(prepare.message.header.op == ok.header.op);
             assert(prepare.message.header.commit == ok.header.commit);
-            assert(prepare.message.header.
+            assert(prepare.message.header.timestamp == ok.header.timestamp);
             assert(prepare.message.header.operation == ok.header.operation);

             return prepare;
         }

+        fn recover(self: *Self) void {
+            assert(self.status == .recovering);
+            assert(self.replica_count > 1);
+            assert(self.journal.recovered);
+
+            log.debug("{}: recover: sending recovery messages nonce={}", .{
+                self.replica,
+                self.recovery_nonce,
+            });
+
+            self.send_header_to_other_replicas(.{
+                .command = .recovery,
+                .cluster = self.cluster,
+                .context = self.recovery_nonce,
+                .replica = self.replica,
+            });
+        }
+
         /// Starting from the latest journal entry, backfill any missing or disconnected headers.
-        /// A header is disconnected if it breaks the
-        /// Since we work
+        /// A header is disconnected if it breaks the chain with its newer neighbor to the right.
+        /// Since we work back from the latest entry, we should always be able to fix the chain.
         /// Once headers are connected, backfill any dirty or faulty prepares.
         fn repair(self: *Self) void {
             if (!self.repair_timeout.ticking) {
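The nonce carried in the `recovery` message above lets the recovering replica ignore responses that belong to an earlier recovery attempt. A standalone Zig sketch of the matching rule (the message type here is simplified):

    const std = @import("std");
    const assert = std.debug.assert;

    const Response = struct { replica: u8, nonce: u128 };

    fn count_matching(responses: []const Response, nonce: u128) usize {
        var count: usize = 0;
        for (responses) |r| {
            // Only responses echoing our current nonce count towards the quorum.
            if (r.nonce == nonce) count += 1;
        }
        return count;
    }

    pub fn main() void {
        const nonce: u128 = 42;
        const responses = [_]Response{
            .{ .replica = 1, .nonce = 42 },
            .{ .replica = 2, .nonce = 7 }, // Stale: from a previous recovery attempt.
        };
        assert(count_matching(&responses, nonce) == 1);
    }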
@@ -2796,38 +3369,50 @@ pub fn Replica(

             assert(self.status == .normal or self.status == .view_change);
             assert(self.repairs_allowed());
+
+            assert(self.op_checkpoint <= self.op);
+            assert(self.op_checkpoint <= self.commit_min);
             assert(self.commit_min <= self.op);
             assert(self.commit_min <= self.commit_max);

-
-            assert(self.journal.
-
+            assert(self.journal.header_with_op(self.commit_min) != null);
+            assert(self.journal.header_with_op(self.op) != null);
+
+            // The replica repairs backwards from `commit_max`. But if `commit_max` is too high
+            // (>1 WAL ahead), then bound it such that uncommitted WAL entries are not overwritten.
+            const commit_max_limit = std.math.min(
+                self.commit_max,
+                self.op_checkpoint + config.journal_slot_count,
+            );

             // Request outstanding committed prepares to advance our op number:
             // This handles the case of an idle cluster, where a follower will not otherwise advance.
             // This is not required for correctness, but for durability.
-            if (self.op <
+            if (self.op < commit_max_limit) {
                 // If the leader repairs during a view change, it will have already advanced
                 // `self.op` to the latest op according to the quorum of `do_view_change` messages
                 // received, so we must therefore be a follower in normal status:
                 assert(self.status == .normal);
                 assert(self.follower());
-                log.debug("{}: repair: op={} < commit_max={}", .{
+                log.debug("{}: repair: op={} < commit_max_limit={}, commit_max={}", .{
                     self.replica,
                     self.op,
+                    commit_max_limit,
                     self.commit_max,
                 });
                 // We need to advance our op number and therefore have to `request_prepare`,
                 // since only `on_prepare()` can do this, not `repair_header()` in `on_headers()`.
                 self.send_header_to_replica(self.leader_index(self.view), .{
                     .command = .request_prepare,
-                    // We cannot yet know the checksum of the prepare so we set the context
-                    // Context is optional when requesting from the leader but
+                    // We cannot yet know the checksum of the prepare so we set the context and
+                    // timestamp to 0: Context is optional when requesting from the leader but
+                    // required otherwise.
                     .context = 0,
+                    .timestamp = 0,
                     .cluster = self.cluster,
                     .replica = self.replica,
                     .view = self.view,
-                    .op =
+                    .op = commit_max_limit,
                 });
                 return;
             }
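A standalone sketch of the `commit_max_limit` bound above, again with a hypothetical slot count of 8: repair never targets more than one WAL of ops past the checkpoint, however far `commit_max` has run ahead:

    const std = @import("std");
    const assert = std.debug.assert;

    const journal_slot_count: u64 = 8; // Hypothetical; for illustration only.

    fn commit_max_limit(commit_max: u64, op_checkpoint: u64) u64 {
        const bound = op_checkpoint + journal_slot_count;
        return if (commit_max < bound) commit_max else bound;
    }

    pub fn main() void {
        assert(commit_max_limit(5, 0) == 5); // Already within one WAL: unchanged.
        assert(commit_max_limit(20, 0) == 8); // Bounded: slots 0..7 are still uncheckpointed.
        assert(commit_max_limit(20, 16) == 20); // The checkpoint caught up: unbounded again.
    }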
@@ -2848,9 +3433,10 @@ pub fn Replica(
             assert(range.op_min > self.commit_min);
             assert(range.op_max < self.op);
             // A range of `op_min=0` or `op_max=0` should be impossible as a header break:
-            // This is the
+            // This is the root op that is prepared when the cluster is initialized.
             assert(range.op_min > 0);
             assert(range.op_max > 0);
+
             if (self.choose_any_other_replica()) |replica| {
                 self.send_header_to_replica(replica, .{
                     .command = .request_headers,
@@ -2869,10 +3455,14 @@ pub fn Replica(
             assert(self.valid_hash_chain_between(self.commit_min, self.op));

             // Request and repair any dirty or faulty prepares:
-            if (self.journal.dirty.
+            if (self.journal.dirty.count > 0) return self.repair_prepares();

             // Commit ops, which may in turn discover faulty prepares and drive more repairs:
-            if (self.commit_min < self.commit_max)
+            if (self.commit_min < self.commit_max) {
+                assert(self.replica_count > 1);
+                self.commit_ops(self.commit_max);
+                return;
+            }

             if (self.status == .view_change and self.leader_index(self.view) == self.replica) {
                 if (self.repair_pipeline_op() != null) return self.repair_pipeline();
@@ -2927,10 +3517,13 @@ pub fn Replica(
             }

             if (header.op > self.op) {
-                log.debug("{}: repair_header: false (advances self.op)", .{
+                log.debug("{}: repair_header: false (advances self.op={})", .{
+                    self.replica,
+                    self.op,
+                });
                 return false;
             } else if (header.op == self.op) {
-                if (self.journal.
+                if (self.journal.header_with_op_and_checksum(self.op, header.checksum)) |_| {
                     // Fall through below to check if self.op is uncommitted AND reordered,
                     // which we would see by the presence of an earlier op with higher view number,
                     // that breaks the chain with self.op. In this case, we must skip the repair to
@@ -2944,27 +3537,42 @@ pub fn Replica(
                 }
             }

-            if (self.journal.
+            if (self.journal.header_for_entry(header)) |existing| {
+                assert(existing.op == header.op);
+
                 // Do not replace any existing op lightly as doing so may impair durability and even
                 // violate correctness by undoing a prepare already acknowledged to the leader:
                 if (existing.checksum == header.checksum) {
-
-
+                    const slot = self.journal.slot_with_header(header).?;
+                    if (!self.journal.dirty.bit(slot)) {
+                        log.debug("{}: repair_header: op={} false (checksum clean)", .{
+                            self.replica,
+                            header.op,
+                        });
                         return false;
                     }

-                    log.debug("{}: repair_header: exists, checksum dirty", .{
+                    log.debug("{}: repair_header: op={} exists, checksum dirty", .{
+                        self.replica,
+                        header.op,
+                    });
                 } else if (existing.view == header.view) {
                     // The journal must have wrapped:
                     // We expect that the same view and op will have the same checksum.
                     assert(existing.op != header.op);

                     if (existing.op > header.op) {
-                        log.debug("{}: repair_header: false (view has newer op)", .{
+                        log.debug("{}: repair_header: op={} false (view has newer op)", .{
+                            self.replica,
+                            header.op,
+                        });
                         return false;
                     }

-                    log.debug("{}: repair_header: exists, view has older op", .{
+                    log.debug("{}: repair_header: op={} exists, view has older op", .{
+                        self.replica,
+                        header.op,
+                    });
                 } else {
                     assert(existing.view != header.view);
                     assert(existing.op == header.op or existing.op != header.op);
@@ -2972,38 +3580,37 @@ pub fn Replica(
                     if (!self.repair_header_would_connect_hash_chain(header)) {
                         // We cannot replace this op until we are sure that doing so would not
                         // violate any prior commitments made to the leader.
-                        log.debug("{}: repair_header: false (exists)", .{
+                        log.debug("{}: repair_header: op={} false (exists)", .{
+                            self.replica,
+                            header.op,
+                        });
                         return false;
                     }

-                    log.debug("{}: repair_header: exists, connects hash chain", .{
+                    log.debug("{}: repair_header: op={} exists, connects hash chain", .{
+                        self.replica,
+                        header.op,
+                    });
                 }
             } else {
-                log.debug("{}: repair_header: gap", .{self.replica});
+                log.debug("{}: repair_header: op={} gap", .{ self.replica, header.op });
             }

             // Caveat: Do not repair an existing op or gap if doing so would break the hash chain:
             if (self.repair_header_would_break_hash_chain_with_next_entry(header)) {
-                log.debug("{}: repair_header: false (breaks hash chain)", .{
+                log.debug("{}: repair_header: op={} false (breaks hash chain)", .{
+                    self.replica,
+                    header.op,
+                });
                 return false;
             }

-            // Caveat: Do not repair an existing op or gap if doing so would overlap another:
-            if (self.repair_header_would_overlap_another(header)) {
-                if (!self.repair_header_would_connect_hash_chain(header)) {
-                    log.debug("{}: repair_header: false (overlap)", .{self.replica});
-                    return false;
-                }
-                // We may have to overlap previous entries in order to connect the hash chain:
-                log.debug("{}: repair_header: overlap, connects hash chain", .{self.replica});
-            }
-
             // TODO Snapshots: Skip if this header is already snapshotted.

             assert(header.op < self.op or
-                self.journal.
+                self.journal.header_with_op(self.op).?.checksum == header.checksum);

-            self.journal.
+            self.journal.set_header_as_dirty(header);
             return true;
         }

@@ -3024,10 +3631,12 @@ pub fn Replica(
                     if (header.checksum == next.parent) {
                         assert(header.view <= next.view);
                         assert(header.op + 1 == next.op);
-                        // We don't break with `next` but this is no guarantee that `next` does not
+                        // We don't break with `next` but this is no guarantee that `next` does not
+                        // break.
                         return false;
                     } else {
-                        // If the journal has wrapped, then err in favor of a break regardless of op
+                        // If the journal has wrapped, then err in favor of a break regardless of op
+                        // order:
                         return true;
                     }
                 }
@@ -3036,14 +3645,17 @@ pub fn Replica(
             return false;
         }

-        /// If we repair this header, then would this connect the hash chain through to the latest
-        /// This offers a strong guarantee that may be used to replace or overlap an existing
+        /// If we repair this header, then would this connect the hash chain through to the latest
+        /// op? This offers a strong guarantee that may be used to replace or overlap an existing
+        /// op.
         ///
         /// Here is an example of what could go wrong if we did not check for complete connection:
         ///
         /// 1. We do a prepare that's going to be committed.
-        /// 2. We do a stale prepare to the right of this, ignoring the hash chain break to the
-        ///
+        /// 2. We do a stale prepare to the right of this, ignoring the hash chain break to the
+        ///    left.
+        /// 3. We do another stale prepare that replaces the first op because it connects to the
+        ///    second.
         ///
         /// This would violate our quorum replication commitment to the leader.
         /// The mistake in this example was not that we ignored the break to the left, which we must
@@ -3066,43 +3678,16 @@ pub fn Replica(
             }

             assert(entry.op == self.op);
-            assert(entry.checksum == self.journal.
+            assert(entry.checksum == self.journal.header_with_op(self.op).?.checksum);
             return true;
         }

-        /// If we repair this header, then would this overlap and overwrite part of another batch?
-        /// Journal entries have variable-sized batches that may overlap if entries are disconnected.
-        fn repair_header_would_overlap_another(self: *Self, header: *const Header) bool {
-            // TODO Snapshots: Handle journal wrap around.
-            {
-                // Look behind this entry for any preceeding entry that this would overlap:
-                var op: u64 = header.op;
-                while (op > 0) {
-                    op -= 1;
-                    if (self.journal.entry_for_op(op)) |neighbor| {
-                        if (Journal.next_offset(neighbor) > header.offset) return true;
-                        break;
-                    }
-                }
-            }
-            {
-                // Look beyond this entry for any succeeding entry that this would overlap:
-                var op: u64 = header.op + 1;
-                while (op <= self.op) : (op += 1) {
-                    if (self.journal.entry_for_op(op)) |neighbor| {
-                        if (Journal.next_offset(header) > neighbor.offset) return true;
-                        break;
-                    }
-                }
-            }
-            return false;
-        }
-
         /// Reads prepares into the pipeline (before we start the view as the new leader).
         fn repair_pipeline(self: *Self) void {
             assert(self.status == .view_change);
             assert(self.leader_index(self.view) == self.replica);
             assert(self.commit_max < self.op);
+            assert(self.journal.dirty.count == 0);

             if (self.repairing_pipeline) {
                 log.debug("{}: repair_pipeline: already repairing...", .{self.replica});
@@ -3117,11 +3702,57 @@ pub fn Replica(
             self.repair_pipeline_read();
         }

+        /// Discard messages from the prepare pipeline.
+        /// Retain uncommitted messages that belong in the current view to maximize durability.
+        fn repair_pipeline_diff(self: *Self) void {
+            assert(self.status == .view_change);
+            assert(self.leader_index(self.view) == self.replica);
+
+            // Discard messages from the front of the pipeline that committed since we were leader.
+            while (self.pipeline.head_ptr()) |prepare| {
+                if (prepare.message.header.op > self.commit_max) break;
+
+                self.message_bus.unref(self.pipeline.pop().?.message);
+            }
+
+            // Discard the whole pipeline if it is now disconnected from the WAL's hash chain.
+            if (self.pipeline.head_ptr()) |pipeline_head| {
+                const parent = self.journal.header_with_op_and_checksum(
+                    pipeline_head.message.header.op - 1,
+                    pipeline_head.message.header.parent,
+                );
+                if (parent == null) {
+                    while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
+                    assert(self.pipeline.count == 0);
+                }
+            }
+
+            // Discard messages from the back of the pipeline that are not part of this view.
+            while (self.pipeline.tail_ptr()) |prepare| {
+                if (self.journal.has(prepare.message.header)) break;
+
+                self.message_bus.unref(self.pipeline.pop_tail().?.message);
+            }
+
+            log.debug("{}: repair_pipeline_diff: {} prepare(s)", .{
+                self.replica,
+                self.pipeline.count,
+            });
+
+            self.verify_pipeline();
+
+            // Do not reset `repairing_pipeline` here as this must be reset by the read callback.
+            // Otherwise, we would be making `repair_pipeline()` reentrant.
+        }
+
         /// Returns the next `op` number that needs to be read into the pipeline.
         fn repair_pipeline_op(self: *Self) ?u64 {
             assert(self.status == .view_change);
             assert(self.leader_index(self.view) == self.replica);

+            // We cannot rely on `pipeline.count` below unless the pipeline has first been diffed.
+            self.repair_pipeline_diff();
+
             const op = self.commit_max + self.pipeline.count + 1;
             if (op <= self.op) return op;

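`repair_pipeline_diff` above trims a ring buffer from both ends. A standalone Zig sketch over a plain slice (the real pipeline holds messages and checks the journal's hash chain; here `journal_max` stands in for "still present in the journal"):

    const std = @import("std");
    const assert = std.debug.assert;

    fn diff(pipeline: []const u64, commit_max: u64, journal_max: u64) []const u64 {
        var head: usize = 0;
        var tail: usize = pipeline.len;
        // Front: drop ops that committed since we were last leader.
        while (head < tail and pipeline[head] <= commit_max) head += 1;
        // Back: drop ops that are no longer part of this view's journal.
        while (tail > head and pipeline[tail - 1] > journal_max) tail -= 1;
        return pipeline[head..tail];
    }

    pub fn main() void {
        const pipeline = [_]u64{ 4, 5, 6, 7 };
        const kept = diff(&pipeline, 4, 6); // op 4 committed; op 7 left the journal.
        assert(kept.len == 2 and kept[0] == 5 and kept[1] == 6);
    }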
@@ -3139,7 +3770,7 @@ pub fn Replica(
             assert(op <= self.op);
             assert(self.commit_max + self.pipeline.count + 1 == op);

-            const checksum = self.journal.
+            const checksum = self.journal.header_with_op(op).?.checksum;

             log.debug("{}: repair_pipeline_read: op={} checksum={}", .{
                 self.replica,
@@ -3198,7 +3829,7 @@ pub fn Replica(
                 return;
             }

-            if (prepare.?.header.checksum != self.journal.
+            if (prepare.?.header.checksum != self.journal.header_with_op(op).?.checksum) {
                 log.debug("{}: repair_pipeline_push: checksum changed", .{self.replica});
                 return;
             }
@@ -3212,6 +3843,10 @@ pub fn Replica(
                 prepare.?.header.checksum,
             });

+            if (self.pipeline.tail_ptr()) |parent| {
+                assert(prepare.?.header.parent == parent.message.header.checksum);
+            }
+
             self.pipeline.push_assume_capacity(.{ .message = prepare.?.ref() });
             assert(self.pipeline.count >= 1);

@@ -3222,7 +3857,7 @@ pub fn Replica(
         fn repair_prepares(self: *Self) void {
             assert(self.status == .normal or self.status == .view_change);
             assert(self.repairs_allowed());
-            assert(self.journal.dirty.
+            assert(self.journal.dirty.count > 0);

             // Request enough prepares to utilize our max IO depth:
             var budget = self.journal.writes.available();
@@ -3231,11 +3866,34 @@ pub fn Replica(
                 return;
             }

+            if (self.op < config.journal_slot_count) {
+                // The op is known, and this is the first WAL cycle.
+                // Therefore, any faulty ops to the right of `replica.op` are corrupt reserved
+                // entries from the initial format.
+                var op: usize = self.op + 1;
+                while (op < config.journal_slot_count) : (op += 1) {
+                    const slot = self.journal.slot_for_op(op);
+                    assert(slot.index == op);
+
+                    if (self.journal.faulty.bit(slot)) {
+                        assert(self.journal.headers[op].command == .reserved);
+                        self.journal.dirty.clear(slot);
+                        self.journal.faulty.clear(slot);
+                        log.debug("{}: repair_prepares: op={} (op known, first cycle)", .{
+                            self.replica,
+                            op,
+                        });
+                    }
+                }
+            }
+
             var op = self.op + 1;
-
+            const op_min = op -| config.journal_slot_count;
+            while (op > op_min) {
                 op -= 1;

-
+                const slot = self.journal.slot_for_op(op);
+                if (self.journal.dirty.bit(slot)) {
                     // If this is an uncommitted op, and we are the leader in `view_change` status,
                     // then we will `request_prepare` from the cluster, set `nack_prepare_op`,
                     // and stop repairing any further prepares:
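A standalone sketch of the repair window above: the backwards scan covers at most one WAL of ops, and Zig's saturating subtraction (`-|`) pins the lower bound at zero during the first cycle:

    const std = @import("std");
    const assert = std.debug.assert;

    const journal_slot_count: u64 = 8; // Hypothetical; for illustration only.

    fn repair_window(op_top: u64) [2]u64 {
        const op_min = op_top -| journal_slot_count; // Saturates at zero.
        return .{ op_min, op_top };
    }

    pub fn main() void {
        assert(repair_window(3)[0] == 0); // First WAL cycle (op_top=self.op+1=3): scan ops 0..2.
        assert(repair_window(20)[0] == 12); // Later: exactly one WAL of ops, 12..19.
    }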
@@ -3257,7 +3915,7 @@ pub fn Replica(
|
|
|
3257
3915
|
}
|
|
3258
3916
|
}
|
|
3259
3917
|
} else {
|
|
3260
|
-
assert(!self.journal.faulty.bit(
|
|
3918
|
+
assert(!self.journal.faulty.bit(slot));
|
|
3261
3919
|
}
|
|
3262
3920
|
}
|
|
3263
3921
|
}
|
|
@@ -3279,16 +3937,17 @@ pub fn Replica(
|
|
|
3279
3937
|
/// This is effectively "many-to-one" repair, where a single replica recovers using the
|
|
3280
3938
|
/// resources of many replicas, for faster recovery.
|
|
3281
3939
|
fn repair_prepare(self: *Self, op: u64) bool {
|
|
3940
|
+
const slot = self.journal.slot_with_op(op).?;
|
|
3941
|
+
const checksum = self.journal.header_with_op(op).?.checksum;
|
|
3942
|
+
|
|
3282
3943
|
assert(self.status == .normal or self.status == .view_change);
|
|
3283
3944
|
assert(self.repairs_allowed());
|
|
3284
|
-
assert(self.journal.dirty.bit(
|
|
3285
|
-
|
|
3286
|
-
const checksum = self.journal.entry_for_op_exact(op).?.checksum;
|
|
3945
|
+
assert(self.journal.dirty.bit(slot));
|
|
3287
3946
|
|
|
3288
3947
|
// We may be appending to or repairing the journal concurrently.
|
|
3289
3948
|
// We do not want to re-request any of these prepares unnecessarily.
|
|
3290
3949
|
if (self.journal.writing(op, checksum)) {
|
|
3291
|
-
log.debug("{}: repair_prepare:
|
|
3950
|
+
log.debug("{}: repair_prepare: op={} checksum={} (already writing)", .{
|
|
3292
3951
|
self.replica,
|
|
3293
3952
|
op,
|
|
3294
3953
|
checksum,
|
|
@@ -3296,11 +3955,46 @@ pub fn Replica(
|
|
|
3296
3955
|
return false;
|
|
3297
3956
|
}
|
|
3298
3957
|
|
|
3958
|
+
// The message may be available in the local pipeline.
|
|
3959
|
+
// For example (replica_count=3):
|
|
3960
|
+
// 1. View=1: Replica 1 is leader, and prepares op 5. The local write fails.
|
|
3961
|
+
// 2. Time passes. The view changes (e.g. due to a timeout)…
|
|
3962
|
+
// 3. View=4: Replica 1 is leader again, and is repairing op 5
|
|
3963
|
+
// (which is still in the pipeline).
|
|
3964
|
+
//
|
|
3965
|
+
// Using the pipeline to repair is faster than a `request_prepare`.
|
|
3966
|
+
// Also, messages in the pipeline are never corrupt.
|
|
3967
|
+
if (self.pipeline_prepare_for_op_and_checksum(op, checksum)) |prepare| {
|
|
3968
|
+
assert(prepare.message.header.op == op);
|
|
3969
|
+
assert(prepare.message.header.checksum == checksum);
|
|
3970
|
+
|
|
3971
|
+
if (self.replica_count == 1) {
|
|
3972
|
+
// This op won't start writing until all ops in the pipeline preceding it have
|
|
3973
|
+
// been written.
|
|
3974
|
+
log.debug("{}: repair_prepare: op={} checksum={} (serializing append)", .{
|
|
3975
|
+
self.replica,
|
|
3976
|
+
op,
|
|
3977
|
+
checksum,
|
|
3978
|
+
});
|
|
3979
|
+
assert(op > self.pipeline.head_ptr().?.message.header.op);
|
|
3980
|
+
return false;
|
|
3981
|
+
}
|
|
3982
|
+
|
|
3983
|
+
log.debug("{}: repair_prepare: op={} checksum={} (from pipeline)", .{
|
|
3984
|
+
self.replica,
|
|
3985
|
+
op,
|
|
3986
|
+
checksum,
|
|
3987
|
+
});
|
|
3988
|
+
self.write_prepare(prepare.message, .pipeline);
|
|
3989
|
+
return true;
|
|
3990
|
+
}
|
|
3991
|
+
|
|
3299
3992
|
const request_prepare = Header{
|
|
3300
3993
|
.command = .request_prepare,
|
|
3301
|
-
// If we request a prepare from a follower, as below, it is critical to pass a
|
|
3302
|
-
// Otherwise we could receive different prepares for the same op number.
|
|
3994
|
+
// If we request a prepare from a follower, as below, it is critical to pass a
|
|
3995
|
+
// checksum: Otherwise we could receive different prepares for the same op number.
|
|
3303
3996
|
.context = checksum,
|
|
3997
|
+
.timestamp = 1, // The checksum is included in context.
|
|
3304
3998
|
.cluster = self.cluster,
|
|
3305
3999
|
.replica = self.replica,
|
|
3306
4000
|
.view = self.view,
|
|
@@ -3311,7 +4005,7 @@ pub fn Replica(
|
|
|
3311
4005
|
// Only the leader is allowed to do repairs in a view change:
|
|
3312
4006
|
assert(self.leader_index(self.view) == self.replica);
|
|
3313
4007
|
|
|
3314
|
-
const reason = if (self.journal.faulty.bit(
|
|
4008
|
+
const reason = if (self.journal.faulty.bit(slot)) "faulty" else "dirty";
|
|
3315
4009
|
log.debug(
|
|
3316
4010
|
"{}: repair_prepare: op={} checksum={} (uncommitted, {s}, view_change)",
|
|
3317
4011
|
.{
|
|
@@ -3322,7 +4016,7 @@ pub fn Replica(
|
|
|
3322
4016
|
},
|
|
3323
4017
|
);
|
|
3324
4018
|
|
|
3325
|
-
if (self.replica_count == 2 and !self.journal.faulty.bit(
|
|
4019
|
+
if (self.replica_count == 2 and !self.journal.faulty.bit(slot)) {
|
|
3326
4020
|
// This is required to avoid a liveness issue for a cluster-of-two where a new
|
|
3327
4021
|
// leader learns of an op during a view change but where the op is faulty on
|
|
3328
4022
|
// the old leader. We must immediately roll back the op since it could not have
|
|
@@ -3354,7 +4048,7 @@ pub fn Replica(
|
|
|
3354
4048
|
self.send_header_to_other_replicas(request_prepare);
|
|
3355
4049
|
} else {
|
|
3356
4050
|
const nature = if (op > self.commit_max) "uncommitted" else "committed";
|
|
3357
|
-
const reason = if (self.journal.faulty.bit(
|
|
4051
|
+
const reason = if (self.journal.faulty.bit(slot)) "faulty" else "dirty";
|
|
3358
4052
|
log.debug("{}: repair_prepare: op={} checksum={} ({s}, {s})", .{
|
|
3359
4053
|
self.replica,
|
|
3360
4054
|
op,
|
|
@@ -3417,22 +4111,6 @@ pub fn Replica(
|
|
|
3417
4111
|
self.send_message_to_replica(next, message);
|
|
3418
4112
|
}
|
|
3419
4113
|
|
|
3420
|
-
/// Empties the prepare pipeline, unreffing all prepare and prepare_ok messages.
|
|
3421
|
-
/// Stops the prepare timeout and resets the timeouts counter.
|
|
3422
|
-
fn reset_pipeline(self: *Self) void {
|
|
3423
|
-
while (self.pipeline.pop()) |prepare| {
|
|
3424
|
-
self.message_bus.unref(prepare.message);
|
|
3425
|
-
}
|
|
3426
|
-
|
|
3427
|
-
self.prepare_timeout.stop();
|
|
3428
|
-
|
|
3429
|
-
assert(self.pipeline.count == 0);
|
|
3430
|
-
assert(self.prepare_timeout.ticking == false);
|
|
3431
|
-
|
|
3432
|
-
// Do not reset `repairing_pipeline` here as this must be reset by the read callback.
|
|
3433
|
-
// Otherwise, we would be making `repair_pipeline()` reentrant.
|
|
3434
|
-
}
|
|
3435
|
-
|
|
3436
4114
|
fn reset_quorum_messages(self: *Self, messages: *QuorumMessages, command: Command) void {
|
|
3437
4115
|
assert(messages.len == config.replicas_max);
|
|
3438
4116
|
var view: ?u32 = null;
|
|
@@ -3457,7 +4135,12 @@ pub fn Replica(
|
|
|
3457
4135
|
received.* = null;
|
|
3458
4136
|
}
|
|
3459
4137
|
assert(count <= self.replica_count);
|
|
3460
|
-
log.debug("{}: reset {} {s} message(s)", .{
|
|
4138
|
+
log.debug("{}: reset {} {s} message(s) from view={}", .{
|
|
4139
|
+
self.replica,
|
|
4140
|
+
count,
|
|
4141
|
+
@tagName(command),
|
|
4142
|
+
view,
|
|
4143
|
+
});
|
|
3461
4144
|
}
|
|
3462
4145
|
|
|
3463
4146
|
fn reset_quorum_counter(self: *Self, counter: *QuorumCounter) void {
|
|
@@ -3466,7 +4149,7 @@ pub fn Replica(
|
|
|
3466
4149
|
assert(replica < self.replica_count);
|
|
3467
4150
|
}
|
|
3468
4151
|
|
|
3469
|
-
counter.setIntersection(
|
|
4152
|
+
counter.setIntersection(quorum_counter_null);
|
|
3470
4153
|
assert(counter.count() == 0);
|
|
3471
4154
|
|
|
3472
4155
|
var replica: usize = 0;
|
|
@@ -3490,6 +4173,16 @@ pub fn Replica(
|
|
|
3490
4173
|
self.start_view_change_quorum = false;
|
|
3491
4174
|
}
|
|
3492
4175
|
|
|
4176
|
+
fn reset_quorum_recovery_response(self: *Self) void {
|
|
4177
|
+
for (self.recovery_response_from_other_replicas) |*received, replica| {
|
|
4178
|
+
if (received.*) |message| {
|
|
4179
|
+
assert(replica != self.replica);
|
|
4180
|
+
self.message_bus.unref(message);
|
|
4181
|
+
received.* = null;
|
|
4182
|
+
}
|
|
4183
|
+
}
|
|
4184
|
+
}
|
|
4185
|
+
|
|
3493
4186
|
fn send_prepare_ok(self: *Self, header: *const Header) void {
|
|
3494
4187
|
assert(header.command == .prepare);
|
|
3495
4188
|
assert(header.cluster == self.cluster);
|
|
@@ -3549,7 +4242,7 @@ pub fn Replica(
|
|
|
3549
4242
|
.view = self.view,
|
|
3550
4243
|
.op = header.op,
|
|
3551
4244
|
.commit = header.commit,
|
|
3552
|
-
.
|
|
4245
|
+
.timestamp = header.timestamp,
|
|
3553
4246
|
.operation = header.operation,
|
|
3554
4247
|
});
|
|
3555
4248
|
} else {
|
|
@@ -3567,7 +4260,7 @@ pub fn Replica(
|
|
|
3567
4260
|
// * being able to send what we have will allow the pipeline to commit earlier, and
|
|
3568
4261
|
// * the leader will drop any prepare_ok for a prepare not in the pipeline.
|
|
3569
4262
|
// This is safe only because the leader can verify against the prepare checksum.
|
|
3570
|
-
if (self.journal.
|
|
4263
|
+
if (self.journal.header_with_op(op)) |header| {
|
|
3571
4264
|
self.send_prepare_ok(header);
|
|
3572
4265
|
defer self.flush_loopback_queue();
|
|
3573
4266
|
}
|
|
@@ -3603,8 +4296,8 @@ pub fn Replica(
|
|
|
3603
4296
|
assert(message.header.command == .do_view_change);
|
|
3604
4297
|
assert(message.header.view == self.view);
|
|
3605
4298
|
assert(message.header.op == self.op);
|
|
4299
|
+
assert(message.header.op == self.message_body_as_headers(message)[0].op);
|
|
3606
4300
|
assert(message.header.commit == self.commit_max);
|
|
3607
|
-
// TODO Assert that latest header in message body matches self.op.
|
|
3608
4301
|
|
|
3609
4302
|
self.send_message_to_replica(self.leader_index(self.view), message);
|
|
3610
4303
|
}
|
|
@@ -3679,6 +4372,7 @@ pub fn Replica(

             // TODO According to message.header.command, assert on the destination replica.
             switch (message.header.command) {
+                .reserved => unreachable,
                 .request => {
                     // Do not assert message.header.replica because we forward .request messages.
                     assert(self.status == .normal);
@@ -3731,6 +4425,16 @@ pub fn Replica(
                     },
                     else => unreachable,
                 },
+                .recovery => {
+                    assert(self.status == .recovering);
+                    assert(message.header.replica == self.replica);
+                    assert(message.header.context == self.recovery_nonce);
+                },
+                .recovery_response => {
+                    assert(self.status == .normal);
+                    assert(message.header.view == self.view);
+                    assert(message.header.replica == self.replica);
+                },
                 .headers => {
                     assert(self.status == .normal or self.status == .view_change);
                     assert(message.header.view == self.view);
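The `.recovery` arm asserts that an outbound recovery request carries `self.recovery_nonce` in `header.context`, so a replica can tell responses to its current recovery attempt apart from stale ones. A sketch of the nonce echo check, with a deliberately reduced `Header` (the field set here is assumed for illustration):

    const std = @import("std");

    const Nonce = u128;

    const Command = enum { recovery, recovery_response };

    const Header = struct {
        command: Command,
        context: u128, // For recovery messages, this carries the nonce.
        replica: u8,
    };

    /// A response counts toward the recovery quorum only if it echoes the
    /// nonce of our own recovery request; responses left over from an earlier
    /// recovery attempt are thereby ignored.
    fn response_matches(recovery_nonce: Nonce, response: *const Header) bool {
        if (response.command != .recovery_response) return false;
        return response.context == recovery_nonce;
    }

    test "stale nonces are rejected" {
        const nonce: Nonce = 0xDEAD;
        const good = Header{ .command = .recovery_response, .context = 0xDEAD, .replica = 2 };
        const stale = Header{ .command = .recovery_response, .context = 0xBEEF, .replica = 3 };
        try std.testing.expect(response_matches(nonce, &good));
        try std.testing.expect(!response_matches(nonce, &stale));
    }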
@@ -3757,7 +4461,7 @@ pub fn Replica(
                 .nack_prepare => {
                     assert(message.header.view == self.view);
                     assert(message.header.replica == self.replica);
-                    assert(
+                    assert(self.leader_index(self.view) == replica);
                 },
                 else => {
                     log.info("{}: send_message_to_replica: TODO {s}", .{
@@ -3776,8 +4480,8 @@ pub fn Replica(
         }

         /// Finds the header with the highest op number in a slice of headers from a replica.
-        /// Searches only by op number to find the highest `self.op for the replica.
-        fn set_latest_op(headers: []Header, latest: *Header) void {
+        /// Searches only by op number to find the highest `self.op` for the replica.
+        fn set_latest_op(headers: []const Header, latest: *Header) void {
             switch (latest.command) {
                 .reserved, .prepare => assert(latest.valid_checksum()),
                 else => unreachable,
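Since the doc comment says `set_latest_op` searches only by op number, its core is a single max-scan over the headers. A self-contained sketch with a pared-down `Header`; the real function additionally validates checksums and commands, as the surrounding asserts show:

    const std = @import("std");

    const Header = struct { op: u64, checksum: u128 };

    /// Keeps `latest` pointing at the header with the highest op seen so far,
    /// comparing by op number only.
    fn set_latest_op(headers: []const Header, latest: *Header) void {
        for (headers) |header| {
            if (header.op > latest.op) latest.* = header;
        }
    }

    test "set_latest_op picks the highest op" {
        const headers = [_]Header{
            .{ .op = 7, .checksum = 0xa },
            .{ .op = 9, .checksum = 0xb },
            .{ .op = 8, .checksum = 0xc },
        };
        var latest = Header{ .op = 0, .checksum = 0 };
        set_latest_op(&headers, &latest);
        try std.testing.expect(latest.op == 9);
    }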
@@ -3802,17 +4506,27 @@ pub fn Replica(
             k: u64,
             method: []const u8,
         ) void {
-            assert(self.status == .view_change);
-
+            assert(self.status == .view_change or self.status == .recovering);
+            assert(self.journal.recovered);
             assert(latest.valid_checksum());
             assert(latest.invalid() == null);
             assert(latest.command == .prepare);
             assert(latest.cluster == self.cluster);

-
-
+            switch (self.status) {
+                .normal => unreachable,
+                .view_change => {
+                    // The view may have started already, so we can have a prepare in the same view:
+                    assert(latest.view <= self.view);
+                },
+                .recovering => {
+                    // The replica's view hasn't been set yet.
+                    // It will be set shortly, when we transition to normal status.
+                    assert(self.view == 0);
+                },
+            }

-            log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={}
+            log.debug("{}: {s}: view={} op={}..{} commit={}..{} checksum={}", .{
                 self.replica,
                 method,
                 self.view,
@@ -3821,7 +4535,6 @@ pub fn Replica(
                 self.commit_max,
                 k,
                 latest.checksum,
-                latest.offset,
             });

             // Uncommitted ops may not survive a view change so we must assert `latest.op` against
@@ -3863,15 +4576,15 @@ pub fn Replica(
             // Do not set the latest op as dirty if we already have it exactly:
             // Otherwise, this would trigger a repair and delay the view change, or worse, it would
             // prevent us from assisting another replica to recover when we do in fact have the op.
-            if (self.journal.
+            if (self.journal.has(latest)) {
                 log.debug("{}: {s}: latest op exists exactly", .{ self.replica, method });
             } else {
-                self.journal.
+                self.journal.set_header_as_dirty(latest);
             }

             assert(self.op == latest.op);
             self.journal.remove_entries_from(self.op + 1);
-            assert(self.journal.
+            assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
         }

         fn start_view_as_the_new_leader(self: *Self) void {
@@ -3884,31 +4597,18 @@ pub fn Replica(

             assert(self.commit_min == self.commit_max);
             assert(self.repair_pipeline_op() == null);
+            self.verify_pipeline();
             assert(self.commit_max + self.pipeline.count == self.op);
             assert(self.valid_hash_chain_between(self.commit_min, self.op));

-
-
-            var iterator = self.pipeline.iterator();
-            while (iterator.next_ptr()) |prepare| {
-                assert(prepare.message.header.command == .prepare);
-                assert(prepare.message.header.op == pipeline_op);
-                assert(prepare.message.header.parent == pipeline_parent);
-
-                pipeline_parent = prepare.message.header.checksum;
-                pipeline_op += 1;
-            }
-            assert(self.pipeline.count <= config.pipeline_max);
-            assert(self.commit_max + self.pipeline.count == pipeline_op - 1);
-
-            assert(self.journal.dirty.len == 0);
-            assert(self.journal.faulty.len == 0);
+            assert(self.journal.dirty.count == 0);
+            assert(self.journal.faulty.count == 0);
             assert(self.nack_prepare_op == null);

             const start_view = self.create_view_change_message(.start_view);
             defer self.message_bus.unref(start_view);

-            self.
+            self.transition_to_normal_from_view_change_status(self.view);
             // Detect if the transition to normal status above accidentally resets the pipeline:
             assert(self.commit_max + self.pipeline.count == self.op);

@@ -3927,17 +4627,73 @@ pub fn Replica(
             self.send_message_to_other_replicas(start_view);
         }

-        fn
-
+        fn transition_to_normal_from_recovering_status(self: *Self, new_view: u32) void {
+            assert(self.status == .recovering);
+            assert(self.view == 0);
+            self.view = new_view;
+            self.view_normal = new_view;
+            self.status = .normal;
+
+            if (self.leader()) {
+                log.debug(
+                    "{}: transition_to_normal_from_recovering_status: view={} leader",
+                    .{
+                        self.replica,
+                        self.view,
+                    },
+                );
+
+                assert(self.journal.is_empty() or self.replica_count == 1);
+                assert(!self.prepare_timeout.ticking);
+                assert(!self.normal_status_timeout.ticking);
+                assert(!self.view_change_status_timeout.ticking);
+                assert(!self.view_change_message_timeout.ticking);
+
+                self.ping_timeout.start();
+                self.commit_timeout.start();
+                self.repair_timeout.start();
+                self.recovery_timeout.stop();
+            } else {
+                log.debug(
+                    "{}: transition_to_normal_from_recovering_status: view={} follower",
+                    .{
+                        self.replica,
+                        self.view,
+                    },
+                );
+
+                assert(!self.prepare_timeout.ticking);
+                assert(!self.commit_timeout.ticking);
+                assert(!self.view_change_status_timeout.ticking);
+                assert(!self.view_change_message_timeout.ticking);
+
+                self.ping_timeout.start();
+                self.normal_status_timeout.start();
+                self.repair_timeout.start();
+                self.recovery_timeout.stop();
+            }
+        }
+
+        fn transition_to_normal_from_view_change_status(self: *Self, new_view: u32) void {
             // In the VRR paper it's possible to transition from normal to normal for the same view.
             // For example, this could happen after a state transfer triggered by an op jump.
+            assert(self.status == .view_change);
             assert(new_view >= self.view);
             self.view = new_view;
             self.view_normal = new_view;
             self.status = .normal;

             if (self.leader()) {
-                log.debug(
+                log.debug(
+                    "{}: transition_to_normal_from_view_change_status: view={} leader",
+                    .{
+                        self.replica,
+                        self.view,
+                    },
+                );
+
+                assert(!self.prepare_timeout.ticking);
+                assert(!self.recovery_timeout.ticking);

                 self.ping_timeout.start();
                 self.commit_timeout.start();
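The two transition functions above are careful to `start()` exactly the timeouts that are valid for the replica's new role, and to assert that all others are not `ticking`. A minimal tick-driven timeout in that spirit; the package's real timeouts carry more machinery (for example backoff), so treat this as an illustrative shape only:

    const std = @import("std");

    const Timeout = struct {
        after: u64,
        ticks: u64 = 0,
        ticking: bool = false,

        fn start(self: *Timeout) void {
            self.ticks = 0;
            self.ticking = true;
        }

        fn stop(self: *Timeout) void {
            self.ticks = 0;
            self.ticking = false;
        }

        fn tick(self: *Timeout) void {
            if (self.ticking) self.ticks += 1;
        }

        fn fired(self: *const Timeout) bool {
            return self.ticking and self.ticks >= self.after;
        }
    };

    test "timeout fires only while ticking" {
        var timeout = Timeout{ .after = 2 };
        timeout.tick(); // Not started yet: no effect.
        timeout.start();
        timeout.tick();
        timeout.tick();
        try std.testing.expect(timeout.fired());
        timeout.stop();
        try std.testing.expect(!timeout.fired());
    }

Stopping a timeout resets its tick count, which is why the asserts above can treat `ticking` as the single source of truth for whether a role-specific timeout is active.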
@@ -3947,12 +4703,15 @@ pub fn Replica(
                 self.repair_timeout.start();

                 // Do not reset the pipeline as there may be uncommitted ops to drive to completion.
-                if (self.pipeline.count > 0)
-                    assert(!self.prepare_timeout.ticking);
-                    self.prepare_timeout.start();
-                }
+                if (self.pipeline.count > 0) self.prepare_timeout.start();
             } else {
-                log.debug("{}:
+                log.debug("{}: transition_to_normal_from_view_change_status: view={} follower", .{
+                    self.replica,
+                    self.view,
+                });
+
+                assert(!self.prepare_timeout.ticking);
+                assert(!self.recovery_timeout.ticking);

                 self.ping_timeout.start();
                 self.commit_timeout.stop();
@@ -3960,8 +4719,6 @@ pub fn Replica(
                 self.view_change_status_timeout.stop();
                 self.view_change_message_timeout.stop();
                 self.repair_timeout.start();
-
-                self.reset_pipeline();
             }

             self.reset_quorum_start_view_change();
@@ -3973,17 +4730,18 @@ pub fn Replica(
             assert(self.nack_prepare_op == null);
         }

-        /// A replica i that notices the need for a view change advances its view, sets its status
-        /// view_change, and sends a ⟨start_view_change v, i⟩ message to all the other replicas,
-        /// where v identifies the new view. A replica notices the need for a view change either
-        /// on its own timer, or because it receives a start_view_change or do_view_change
-        /// a view with a larger number than its own view.
+        /// A replica i that notices the need for a view change advances its view, sets its status
+        /// to view_change, and sends a ⟨start_view_change v, i⟩ message to all the other replicas,
+        /// where v identifies the new view. A replica notices the need for a view change either
+        /// based on its own timer, or because it receives a start_view_change or do_view_change
+        /// message for a view with a larger number than its own view.
         fn transition_to_view_change_status(self: *Self, new_view: u32) void {
             log.debug("{}: transition_to_view_change_status: view={}..{}", .{
                 self.replica,
                 self.view,
                 new_view,
             });
+            assert(self.status == .normal or self.status == .view_change);
             assert(new_view > self.view);
             self.view = new_view;
             self.status = .view_change;
@@ -3994,13 +4752,14 @@ pub fn Replica(
             self.view_change_status_timeout.start();
             self.view_change_message_timeout.start();
             self.repair_timeout.stop();
+            self.prepare_timeout.stop();
+            assert(!self.recovery_timeout.ticking);

             // Do not reset quorum counters only on entering a view, assuming that the view will be
             // followed only by a single subsequent view change to the next view, because multiple
             // successive view changes can fail, e.g. after a view change timeout.
-            // We must therefore reset our counters here to avoid counting messages from an older
-            // which would violate the quorum intersection property essential for correctness.
-            self.reset_pipeline();
+            // We must therefore reset our counters here to avoid counting messages from an older
+            // view, which would violate the quorum intersection property essential for correctness.
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();
@@ -4075,21 +4834,21 @@ pub fn Replica(
             return true;
         }

-        /// Returns true if all operations are present, correctly ordered and connected by hash
-        /// between `op_min` and `op_max` (both inclusive).
+        /// Returns true if all operations are present, correctly ordered and connected by hash
+        /// chain, between `op_min` and `op_max` (both inclusive).
         fn valid_hash_chain_between(self: *Self, op_min: u64, op_max: u64) bool {
             assert(op_min <= op_max);

-            // If we use anything less than self.op then we may commit ops for a forked hash chain
-            // have since been reordered by a new leader.
+            // If we use anything less than self.op then we may commit ops for a forked hash chain
+            // that have since been reordered by a new leader.
             assert(op_max == self.op);
-            var b = self.journal.
+            var b = self.journal.header_with_op(op_max).?;

             var op = op_max;
             while (op > op_min) {
                 op -= 1;

-                if (self.journal.
+                if (self.journal.header_with_op(op)) |a| {
                     assert(a.op + 1 == b.op);
                     if (a.checksum == b.parent) {
                         assert(ascending_viewstamps(a, b));
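The loop in `valid_hash_chain_between` checks the same linkage that the new `verify_pipeline` (added in the next hunk) checks for in-flight prepares: the header at op n must name the checksum of the header at op n-1 as its `parent`. A standalone sketch of that invariant over a contiguous slice of simplified headers:

    const std = @import("std");

    const Header = struct { op: u64, checksum: u128, parent: u128 };

    /// Returns true if consecutive headers are linked: each header's `parent`
    /// must equal the previous header's `checksum`, and ops must be contiguous.
    fn valid_hash_chain(headers: []const Header) bool {
        var i: usize = 1;
        while (i < headers.len) : (i += 1) {
            const a = headers[i - 1];
            const b = headers[i];
            if (b.op != a.op + 1) return false;
            if (b.parent != a.checksum) return false;
        }
        return true;
    }

    test "hash chain links parent to checksum" {
        const chain = [_]Header{
            .{ .op = 1, .checksum = 0x11, .parent = 0x00 },
            .{ .op = 2, .checksum = 0x22, .parent = 0x11 },
            .{ .op = 3, .checksum = 0x33, .parent = 0x22 },
        };
        try std.testing.expect(valid_hash_chain(&chain));
    }

Because each checksum covers the parent field, a single intact suffix of the chain authenticates every prepare before it, which is what makes repairing from `self.op` backwards safe.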
@@ -4108,6 +4867,33 @@ pub fn Replica(
             return true;
         }

+        fn verify_pipeline(self: *Self) void {
+            var op = self.commit_max + 1;
+            var parent = self.journal.header_with_op(self.commit_max).?.checksum;
+
+            var iterator = self.pipeline.iterator();
+            while (iterator.next_ptr()) |prepare| {
+                assert(prepare.message.header.command == .prepare);
+
+                log.debug("{}: verify_pipeline: op={} checksum={x} parent={x}", .{
+                    self.replica,
+                    prepare.message.header.op,
+                    prepare.message.header.checksum,
+                    prepare.message.header.parent,
+                });
+
+                assert(self.journal.has(prepare.message.header));
+                assert(prepare.message.header.op == op);
+                assert(prepare.message.header.op <= self.op);
+                assert(prepare.message.header.parent == parent);
+
+                parent = prepare.message.header.checksum;
+                op += 1;
+            }
+            assert(self.pipeline.count <= config.pipeline_max);
+            assert(self.commit_max + self.pipeline.count == op - 1);
+        }
+
         fn view_jump(self: *Self, header: *const Header) void {
             const to: Status = switch (header.command) {
                 .prepare, .commit => .normal,
@@ -4203,10 +4989,10 @@ pub fn Replica(
                 return;
             }

-            self.journal.write_prepare(
+            self.journal.write_prepare(write_prepare_callback, message, trigger);
         }

-        fn
+        fn write_prepare_callback(
             self: *Self,
             wrote: ?*Message,
             trigger: Journal.Write.Trigger,
@@ -4222,6 +5008,7 @@ pub fn Replica(
                 // If this was a repair, continue immediately to repair the next prepare:
                 // This is an optimization to eliminate waiting until the next repair timeout.
                 .repair => self.repair(),
+                .pipeline => self.repair(),
             }
         }
     };
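The new `.pipeline` arm makes a completed write for a pipelined prepare chain straight into `repair()`, exactly as a `.repair`-triggered write does. A small sketch of that dispatch; the `.append` variant is an assumption for illustration, since only `.repair` and `.pipeline` are visible in this hunk:

    const std = @import("std");

    const Trigger = enum { append, repair, pipeline };

    /// Mirrors the switch in write_prepare_callback: repair- and
    /// pipeline-triggered writes continue repairing immediately instead of
    /// waiting for the next repair timeout.
    fn continue_repair_after_write(trigger: Trigger) bool {
        return switch (trigger) {
            .append => false, // Hypothetical: the ordinary append path stops here.
            .repair, .pipeline => true,
        };
    }

    test "pipeline writes chain into repair" {
        try std.testing.expect(continue_repair_after_write(.pipeline));
        try std.testing.expect(continue_repair_after_write(.repair));
        try std.testing.expect(!continue_repair_after_write(.append));
    }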